In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
from threading import Thread

class StreamingThread(Thread):
    def __init__(self, ssc):
        Thread.__init__(self)
        self.ssc = ssc
    def run(self):
        ssc.start()
        ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [3]:
sc

In [4]:
spark

In [5]:
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

In [6]:
from difflib import unified_diff

def make_diff(old, new):
    return '\n'.join([ l for l in unified_diff(old.split('\n'), new.split('\n')) if l.startswith('+') or l.startswith('-') ])

In [7]:
globals()['models_loaded'] = False

def predict(df):
    if any([x in df.diff.lower() for x in ['bad', 'lol', 'joke']]):
        return 'vandal'
    else:
        return 'safe'

predict_udf = udf(predict, StringType())

def process(time, rdd):
    if rdd.isEmpty():
        return
    
    print("========= %s =========" % str(time))
    
    # Convert to data frame
    df = spark.read.json(rdd)
    df.show()
    
    # Tip: making a diff will probably help a lot as a feature in any model:
    diff = make_diff(df.first().text_old, df.first().text_new)
    df_withdiff = df.withColumn("diff", lit(diff))
    df_withdiff.select('diff').show()
    
    # Utilize our predict function
    df_withpreds = df_withdiff.withColumn("pred", predict_udf(
        struct([df_withdiff[x] for x in df_withdiff.columns])
    ))
    df_withpreds.show()
    
    # Normally, you wouldn't use a UDF (User Defined Function) Python function to predict (you can)
    # But an MLlib model you've built and saved with Spark
    # In this case, you need to prevent loading your model in every call to "process" as follows:
    
    # Load in the model if not yet loaded:
    if not globals()['models_loaded']:
        # load in your models here
        globals()['my_model'] = '***' # Replace '***' with:    [...].load('my_logistic_regression')
        globals()['models_loaded'] = True
        
    # And then predict using the loaded model: 
    # df_result = globals()['my_model'].transform(df)
    # df_result.show()

In [8]:
ssc = StreamingContext(sc, 10)

In [9]:
lines = ssc.socketTextStream("seppe.net", 7778)
lines.foreachRDD(process)

In [10]:
ssc_t = StreamingThread(ssc)
ssc_t.start()

+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+
|             comment|label|   name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+
|Fixed a typo foun...| safe| Ira Leviton|{{Infobox company...|{{Infobox company...|  Ebony Fashion Fair|//en.wikipedia.or...|
| maintainedMOS:ORDER| safe|Assem Khidhr|{{Infobox philoso...|{{Infobox philoso...|Mohammed Abed al-...|//en.wikipedia.or...|
+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+

+--------------------+
|                diff|
+--------------------+
|--- 

+++ 

-In 1...|
|--- 

+++ 

-In 1...|
+--------------------+

+--------------------+-----+------------+--------------------+--------------------+--------------------+

In [11]:
ssc_t.stop()

----- Stopping... this may take a few seconds -----
+--------------------+-----+--------------+--------------------+--------------------+--------------------+--------------------+
|             comment|label|     name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+-----+--------------+--------------------+--------------------+--------------------+--------------------+
|                    | safe|AnotherWhisker|{{short descripti...|{{short descripti...|2019–20 coronavir...|//en.wikipedia.or...|
|syntax, per Webst...| safe|    Nedhartley|{{BLP sources|dat...|{{BLP sources|dat...|           Jem Cohen|//en.wikipedia.or...|
|→‎Discography:del...| safe|        Michig|{{Use American En...|{{Use American En...|      Ivan & Alyosha|//en.wikipedia.or...|
+--------------------+-----+--------------+--------------------+--------------------+--------------------+--------------------+

+--------------------+
|                diff|
+----