In [7]:
import numpy as np
import os
from threading import Thread
# pyspark
from pyspark.ml import PipelineModel 
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit, regexp_replace, lower
from pyspark.sql.types import StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql.functions import col
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql import types

In [2]:
import import_ipynb
import pipeline1
%run "pipeline1.ipynb"

importing Jupyter notebook from pipeline1.ipynb


In [3]:
# Switch the JVM default locale to en-US and print it before and after the
# change. `sc` is not defined in this notebook's visible cells; presumably it
# is the SparkContext brought in by running pipeline1.ipynb -- confirm.
locale = sc._jvm.java.util.Locale
print(locale.getDefault()) ##en_CN
# A single setDefault call is sufficient; the original repeated it below.
locale.setDefault(locale.forLanguageTag("en-US"))
print(dir(locale))  # lists the members exposed on the Locale class
print(locale.getDefault()) #en_US

en_CN
['Builder', 'CANADA', 'CANADA_FRENCH', 'CHINA', 'CHINESE', 'Category', 'ENGLISH', 'FRANCE', 'FRENCH', 'FilteringMode', 'GERMAN', 'GERMANY', 'ITALIAN', 'ITALY', 'JAPAN', 'JAPANESE', 'KOREA', 'KOREAN', 'LanguageRange', 'PRC', 'PRIVATE_USE_EXTENSION', 'ROOT', 'SIMPLIFIED_CHINESE', 'TAIWAN', 'TRADITIONAL_CHINESE', 'UK', 'UNICODE_LOCALE_EXTENSION', 'US', 'filter', 'filterTags', 'forLanguageTag', 'getAvailableLocales', 'getDefault', 'getISOCountries', 'getISOLanguages', 'lookup', 'lookupTag', 'setDefault']
en_US


In [4]:
# global variables in CAPITAL letters
PATH_LOAD_MODEL = "../output/models/logistic_regression" # saved pipeline model location
SAVE_PREDICTIONS = True # save predictions
# Where to save the predictions. Relative to the notebook, like PATH_LOAD_MODEL;
# the previous value "/..output/..." pointed at a root-level "..output" directory.
PATH_SAVE_PREDICTIONS = "../output/predictions/logistic_regression.parquet"
VERBOSE = True # print intermediate output

In [8]:
class StreamingThread(Thread):
    """Thread that starts a streaming context and waits for it to terminate.

    Parameters
    ----------
    ssc : object exposing ``start()``, ``awaitTermination()`` and
        ``stop(stopSparkContext=..., stopGraceFully=...)`` -- in this notebook
        a pyspark ``StreamingContext``.
    """
    def __init__(self, ssc):
        Thread.__init__(self)
        self.ssc = ssc
    # Start stream
    def run(self):
        # Use the context handed to the constructor rather than whatever global
        # happens to be named `ssc` in the notebook namespace.
        self.ssc.start()
        self.ssc.awaitTermination()
    # Stop stream
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        # Keep the SparkContext alive; only the streaming context is stopped.
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [9]:
# Kept for reference -- presumably how the saved model was produced (confirm):
# df_pipe = pipeline.fit(df)
# df_pipe.write().overwrite().save("../output/models/logistic_regression")
# Load the fitted pipeline from the location configured in PATH_LOAD_MODEL
# instead of repeating the path literal here.
pipeline2 = PipelineModel.load(PATH_LOAD_MODEL)

In [10]:
def final_pipeline(time, rdd, save_predictions=SAVE_PREDICTIONS, path=PATH_SAVE_PREDICTIONS, verbose=VERBOSE):
    """Clean one streamed batch, run the prediction pipeline on it, and optionally save the result.

    Registered with ``foreachRDD`` in this notebook, which supplies ``time`` and ``rdd``.

    Parameters
    ----------
    time : batch time; only used in the verbose header line.
    rdd : RDD for the batch; read with ``spark.read.json``. Empty RDDs are skipped.
    save_predictions : when true, append the predictions to ``path``.
    path : destination passed to ``DataFrameWriter.save``.
    verbose : when true, print the batch header, the section headers and both
        intermediate DataFrames.

    Returns
    -------
    None

    Notes
    -----
    Relies on ``spark``, ``get_final_df`` and ``pipeline2`` existing in the
    notebook namespace; ``spark`` and ``get_final_df`` are not defined in the
    visible cells (presumably they come from pipeline1.ipynb -- confirm).
    """
    if rdd.isEmpty():
        return
    if verbose:
        print("========= %s =========" % str(time))

    # -------------------------------------------------------------------------
    # 1) Pipeline 1: Mostly data cleaning and getting data in the right shape
    # -------------------------------------------------------------------------
    df = spark.read.json(rdd)
    final_df = get_final_df(df)

    # Header and table are printed together so that verbose=False is silent here.
    if verbose:
        print("Result after pipeline 1:")
        final_df.show()

    # ------------------------------------------------------------------------
    # 2) Pipeline 2: Feature engineering, and Predicting
    # ------------------------------------------------------------------------
    final_df_pipe = pipeline2.transform(final_df)
    if verbose:
        print("Result after pipeline 2 (predictions):")
        final_df_pipe.show()
    # save predictions
    if save_predictions:
        print(f"saving predictions of batch in: {path}")
        # "append" keeps the output of earlier batches at the same path.
        final_df_pipe.write.mode("append").save(path)

# Begin online stream and make prediction per wiki edit

In [11]:
# Get minibatches every 10 seconds
ssc = StreamingContext(sparkContext=sc, batchDuration=10)

In [12]:
# Open a text stream on the remote socket and pass every batch RDD to final_pipeline.
lines = ssc.socketTextStream(hostname="seppe.net", port=7778)
lines.foreachRDD(final_pipeline)

In [13]:
# Wrap the streaming context in a StreamingThread and start it on that thread.
ssc_t = StreamingThread(ssc=ssc)
ssc_t.start()

Result after pipeline 1:
+-----+--------------------+
|label|                diff|
+-----+--------------------+
| safe|{{short descripti...|
+-----+--------------------+

Result after pipeline 2 (predictions):
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
|label|                diff|               words|            filtered|            features|labelIndex|       rawPrediction|         probability|prediction|PredictedLabel|
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
| safe|{{short descripti...|[short, descripti...|[short, descripti...|(7181,[1,2,9,11,1...|       0.0|[0.77124625084286...|[0.59378458258075...|       0.0|          safe|
+-----+--------------------+--------------------+--------------------+--------------------+----------+----

Exception in thread Thread-6:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "<ipython-input-8-7618930d6cd3>", line 8, in run
    ssc.awaitTermination()
  File "/Users/zhengxin/Desktop/spark/spark-2.4.5-bin-hadoop2.7/python/pyspark/streaming/context.py", line 192, in awaitTermination
    self._jssc.awaitTermination()
  File "/Users/zhengxin/Desktop/spark/spark-2.4.5-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/Users/zhengxin/Desktop/spark/spark-2.4.5-bin-hadoop2.7/python/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
  File "/Users/zhengxin/Desktop/spark/spark-2.4.5-bin-hadoop2.7/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
    format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o309.awa

Result after pipeline 1:
+-----+--------------------+
|label|                diff|
+-----+--------------------+
| safe|{{for|the heated ...|
| safe|{{for|the heated ...|
| safe|{{for|the heated ...|
| safe|{{for|the heated ...|
+-----+--------------------+

Result after pipeline 2 (predictions):
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
|label|                diff|               words|            filtered|            features|labelIndex|       rawPrediction|         probability|prediction|PredictedLabel|
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
| safe|{{for|the heated ...|[for, the, heated...|[heated, device, ...|(7181,[1,2,4,6,9,...|       0.0|[0.77124625084286...|[0.59378458258075...|       0.0|          safe|
| safe|{{for|the he

Result after pipeline 1:
+-----+--------------------+
|label|                diff|
+-----+--------------------+
| safe|{{short descripti...|
| safe|{{short descripti...|
+-----+--------------------+

Result after pipeline 2 (predictions):
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
|label|                diff|               words|            filtered|            features|labelIndex|       rawPrediction|         probability|prediction|PredictedLabel|
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
| safe|{{short descripti...|[short, descripti...|[short, descripti...|(7181,[1,2,4,9,10...|       0.0|[0.77124625084286...|[0.59378458258075...|       0.0|          safe|
| safe|{{short descripti...|[short, descripti...|[short, descripti...|(7181,[

Result after pipeline 1:
+-----+--------------------+
|label|                diff|
+-----+--------------------+
| safe|After [https://ww...|
| safe|After [https://ww...|
| safe|After [https://ww...|
+-----+--------------------+

Result after pipeline 2 (predictions):
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
|label|                diff|               words|            filtered|            features|labelIndex|       rawPrediction|         probability|prediction|PredictedLabel|
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
| safe|After [https://ww...|[after, https, ww...|[https, www, suer...|(7181,[0,2,5,6,9,...|       0.0|[0.77124625084286...|[0.59378458258075...|       0.0|          safe|
| safe|After [https://ww...|[after, https, ww...

Result after pipeline 1:
+-----+--------------------+
|label|                diff|
+-----+--------------------+
| safe|{{Infobox rugby l...|
| safe|{{Infobox rugby l...|
+-----+--------------------+

Result after pipeline 2 (predictions):
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
|label|                diff|               words|            filtered|            features|labelIndex|       rawPrediction|         probability|prediction|PredictedLabel|
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
| safe|{{Infobox rugby l...|[infobox, rugby, ...|[infobox, rugby, ...|(7181,[1,3,4,6,9,...|       0.0|[0.77124625084286...|[0.59378458258075...|       0.0|          safe|
| safe|{{Infobox rugby l...|[infobox, rugby, ...|[infobox, rugby, ...|(7181,[

Result after pipeline 1:
+-----+--------------------+
|label|                diff|
+-----+--------------------+
| safe|{{Use dmy dates|d...|
| safe|{{Use dmy dates|d...|
| safe|{{Use dmy dates|d...|
+-----+--------------------+

Result after pipeline 2 (predictions):
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
|label|                diff|               words|            filtered|            features|labelIndex|       rawPrediction|         probability|prediction|PredictedLabel|
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
| safe|{{Use dmy dates|d...|[use, dmy, dates,...|[use, dmy, dates,...|(7181,[1,2,4,6,9,...|       0.0|[0.77124625084286...|[0.59378458258075...|       0.0|          safe|
| safe|{{Use dmy dates|d...|[use, dmy, dates,...

Result after pipeline 1:
+------+--------------------+
| label|                diff|
+------+--------------------+
|unsafe|'''Simple Knowled...|
|  safe|'''Simple Knowled...|
|  safe|'''Simple Knowled...|
|unsafe|'''Simple Knowled...|
+------+--------------------+

Result after pipeline 2 (predictions):
+------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
| label|                diff|               words|            filtered|            features|labelIndex|       rawPrediction|         probability|prediction|PredictedLabel|
+------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
|unsafe|'''Simple Knowled...|[simple, knowledg...|[simple, knowledg...|(7181,[1,2,3,4,9,...|       1.0|[0.77124625084286...|[0.59378458258075...|       0.0|          safe|
|  safe

Result after pipeline 1:
+-----+--------------------+
|label|                diff|
+-----+--------------------+
| safe|Located in [[San ...|
| safe|Located in [[San ...|
| safe|Located in [[San ...|
| safe|Located in [[San ...|
+-----+--------------------+

Result after pipeline 2 (predictions):
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
|label|                diff|               words|            filtered|            features|labelIndex|       rawPrediction|         probability|prediction|PredictedLabel|
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
| safe|Located in [[San ...|[located, in, san...|[located, san, be...|(7181,[1,4,9,11,1...|       0.0|[0.77124625084286...|[0.59378458258075...|       0.0|          safe|
| safe|Located in [

+-----+--------------------+
|label|                diff|
+-----+--------------------+
| safe|{{short descripti...|
| safe|{{short descripti...|
| safe|{{short descripti...|
| safe|{{short descripti...|
| safe|{{short descripti...|
+-----+--------------------+

Result after pipeline 2 (predictions):
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
|label|                diff|               words|            filtered|            features|labelIndex|       rawPrediction|         probability|prediction|PredictedLabel|
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
| safe|{{short descripti...|[short, descripti...|[short, descripti...|(7181,[1,2,3,4,5,...|       0.0|[0.77124625084286...|[0.59378458258075...|       0.0|          safe|
| safe|{{short 

Result after pipeline 1:
+-----+--------------------+
|label|                diff|
+-----+--------------------+
| safe|{{good article}}
...|
| safe|{{good article}}
...|
+-----+--------------------+

Result after pipeline 2 (predictions):
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
|label|                diff|               words|            filtered|            features|labelIndex|       rawPrediction|         probability|prediction|PredictedLabel|
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
| safe|{{good article}}
...|[good, article, u...|[good, article, u...|(7181,[1,2,3,4,5,...|       0.0|[0.77124625084286...|[0.59378458258075...|       0.0|          safe|
| safe|{{good article}}
...|[good, article, u...|[good, article, u...|(7181,[

Result after pipeline 1:
+-----+--------------------+
|label|                diff|
+-----+--------------------+
| safe|{{confused||Peopl...|
| safe|{{confused||Peopl...|
+-----+--------------------+

Result after pipeline 2 (predictions):
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
|label|                diff|               words|            filtered|            features|labelIndex|       rawPrediction|         probability|prediction|PredictedLabel|
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
| safe|{{confused||Peopl...|[confused, people...|[confused, people...|(7181,[1,2,4,5,6,...|       0.0|[0.77124625084286...|[0.59378458258075...|       0.0|          safe|
| safe|{{confused||Peopl...|[confused, people...|[confused, people...|(7181,[

Result after pipeline 1:
+-----+--------------------+
|label|                diff|
+-----+--------------------+
| safe|{{DISPLAYTITLE:Ra...|
| safe|{{DISPLAYTITLE:Ra...|
+-----+--------------------+

Result after pipeline 2 (predictions):
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
|label|                diff|               words|            filtered|            features|labelIndex|       rawPrediction|         probability|prediction|PredictedLabel|
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
| safe|{{DISPLAYTITLE:Ra...|[displaytitle, ra...|[displaytitle, ra...|(7181,[1,6,9,10,1...|       0.0|[0.77124625084286...|[0.59378458258075...|       0.0|          safe|
| safe|{{DISPLAYTITLE:Ra...|[displaytitle, ra...|[displaytitle, ra...|(7181,[

Result after pipeline 1:
+-----+--------------------+
|label|                diff|
+-----+--------------------+
| safe|{{for|the America...|
| safe|{{for|the America...|
| safe|{{for|the America...|
+-----+--------------------+

Result after pipeline 2 (predictions):
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
|label|                diff|               words|            filtered|            features|labelIndex|       rawPrediction|         probability|prediction|PredictedLabel|
+-----+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+--------------+
| safe|{{for|the America...|[for, the, americ...|[american, politi...|(7181,[0,1,2,3,4,...|       0.0|[0.77124625084286...|[0.59378458258075...|       0.0|          safe|
| safe|{{for|the America...|[for, the, americ...

In [None]:
# Stop the stream through StreamingThread.stop, which asks for a graceful stop
# and leaves the SparkContext running.
ssc_t.stop()