In [9]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [10]:
from threading import Thread

class StreamingThread(Thread):
    def __init__(self, ssc):
        Thread.__init__(self)
        self.ssc = ssc
    def run(self):
        ssc.start()
        ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [11]:
sc

In [12]:
spark

In [13]:
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

In [14]:
from difflib import unified_diff

def make_diff(old, new):
    return '\n'.join([ l for l in unified_diff(old.split('\n'), new.split('\n')) if l.startswith('+') or l.startswith('-') ])

In [15]:
globals()['models_loaded'] = False

def predict(df):
    if any([x in df.diff.lower() for x in ['bad', 'lol', 'joke']]):
        return 'vandal'
    else:
        return 'safe'

predict_udf = udf(predict, StringType())

def process(time, rdd):
    if rdd.isEmpty():
        return
    
    print("========= %s =========" % str(time))
    
    # Convert to data frame
    df = spark.read.json(rdd)
    df.show()
    
    # Tip: making a diff will probably help a lot as a feature in any model:
    diff = make_diff(df.first().text_old, df.first().text_new)
    df_withdiff = df.withColumn("diff", lit(diff))
    df_withdiff.select('diff').show()
    
    # Utilize our predict function
    df_withpreds = df_withdiff.withColumn("pred", predict_udf(
        struct([df_withdiff[x] for x in df_withdiff.columns])
    ))
    df_withpreds.show()
    
    # Normally, you wouldn't use a UDF (User Defined Function) Python function to predict (you can)
    # But an MLlib model you've built and saved with Spark
    # In this case, you need to prevent loading your model in every call to "process" as follows:
    
    # Load in the model if not yet loaded:
    if not globals()['models_loaded']:
        # load in your models here
        globals()['my_model'] = '***' # Replace '***' with:    [...].load('my_logistic_regression')
        globals()['models_loaded'] = True
        
    # And then predict using the loaded model: 
    # df_result = globals()['my_model'].transform(df)
    # df_result.show()

In [16]:
ssc = StreamingContext(sc, 10)

In [17]:
lines = ssc.socketTextStream("seppe.net", 7778)
lines.foreachRDD(process)

In [18]:
ssc_t = StreamingThread(ssc)
ssc_t.start()

+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+
|             comment|label|   name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+
|clean up, remove ...| safe|GiantSnowman|{{Infobox footbal...|{{Infobox footbal...|           Marco Sas|//en.wikipedia.or...|
|        →‎References| safe|     Moe1810|{{italic title}}
...|{{italic title}}
...|The Parent Trap (...|//en.wikipedia.or...|
|Removed old logo ...| safe|  GOiSMTPwik|{{Infobox award
|...|{{Infobox award
|...|Sami Rohr Prize f...|//en.wikipedia.or...|
+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+

+--------------------+
|                diff|
+--------------------+
|--- 

+++ 

-|   ...|
|--- 

+++ 

-|   ...|
|-

+--------------------+------+--------------+--------------------+--------------------+--------------------+--------------------+
|             comment| label|     name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+------+--------------+--------------------+--------------------+--------------------+--------------------+
|→‎Places and geog...|  safe|       MattCHB|{{wiktionary|fram...|{{wiktionary|fram...|Fram (disambiguat...|//en.wikipedia.or...|
|                    |vandal|172.10.235.186|{{short descripti...|{{short descripti...|Subnautica Below ...|//en.wikipedia.or...|
|            clean up|  safe|  GiantSnowman|{{Use British Eng...|{{Use British Eng...|         Major Booth|//en.wikipedia.or...|
+--------------------+------+--------------+--------------------+--------------------+--------------------+--------------------+

+--------------------+
|                diff|
+--------------------+
|--- 

+++ 

+*[[F...|
|---

+------------+-----+------------+--------------------+--------------------+------------------+--------------------+
|     comment|label|   name_user|            text_new|            text_old|        title_page|            url_page|
+------------+-----+------------+--------------------+--------------------+------------------+--------------------+
|            | safe|TheBlinkster|{{Divine Mercy}}
...|{{Divine Mercy}}
...|Divine Mercy image|//en.wikipedia.or...|
|added a link| safe|    Blairall|{{distinguish|tex...|[[Image:SirWillia...|    Young baronets|//en.wikipedia.or...|
|  →‎See also| safe|Yewtharaptor|{{Infobox rockuni...|{{Infobox rockuni...|    Kota Formation|//en.wikipedia.or...|
+------------+-----+------------+--------------------+--------------------+------------------+--------------------+

+--------------------+
|                diff|
+--------------------+
|--- 

+++ 

-Hyła...|
|--- 

+++ 

-Hyła...|
|--- 

+++ 

-Hyła...|
+--------------------+

+------------+-----+-----

+--------------------+
|                diff|
+--------------------+
|--- 

+++ 

-Arth...|
|--- 

+++ 

-Arth...|
|--- 

+++ 

-Arth...|
+--------------------+

+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+--------------------+----+
|             comment|label|   name_user|            text_new|            text_old|          title_page|            url_page|                diff|pred|
+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+--------------------+----+
|        →‎Early life| safe| FeanorStar7|{{Use Irish Engli...|{{Use Irish Engli...|Arthur Wolfe, 1st...|//en.wikipedia.or...|--- 

+++ 

-Arth...|safe|
|   typo in city name| safe|  Runner1928|Urbanized areas i...|Urbanized areas i...|List of Midwester...|//en.wikipedia.or...|--- 

+++ 

-Arth...|safe|
|→‎Newport County:...| safe|GiantSnowman|{{EngvarB|date=Ju...|{{EngvarB|date=J

+--------------------+
|                diff|
+--------------------+
|--- 

+++ 

-Hurr...|
|--- 

+++ 

-Hurr...|
+--------------------+

+------------------+-----+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+----+
|           comment|label|       name_user|            text_new|            text_old|          title_page|            url_page|                diff|pred|
+------------------+-----+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+----+
|Reversed vandalism| safe|     Librarian42|{{short descripti...|{{short descripti...|     Hurricane David|//en.wikipedia.or...|--- 

+++ 

-Hurr...|safe|
|                  | safe|YatesTucker00090|{{Infobox serial ...|{{Infobox serial ...|Alexander Spesivtsev|//en.wikipedia.or...|--- 

+++ 

-Hurr...|safe|
+------------------+-----+----------------+--------------------+--------------------+------

In [19]:
ssc_t.stop()

----- Stopping... this may take a few seconds -----
+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+
|             comment|label|   name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+
|               →‎Out| safe|       Bobly|{{Infobox footbal...|{{Infobox footbal...|1987–88 Real Madr...|//en.wikipedia.or...|
|→‎References:clea...| safe|GiantSnowman|{{Use dmy dates|d...|{{Use dmy dates|d...|Michael McHugh (f...|//en.wikipedia.or...|
|         →‎top:comma| safe|CaroleHenson|{{Use dmy dates|d...|{{Use dmy dates|d...|      Lund Cathedral|//en.wikipedia.or...|
|→‎Major intersect...| safe|MuzikMachine|{{Infobox road
|p...|{{Infobox road
|p...|Nova Scotia Trunk 22|//en.wikipedia.or...|
|+Category:Recycli...| safe|  Wil540 art|{{Use mdy dates|d...|{{Us