In [1]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that widens the notebook's `.container` element to 95%.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
from threading import Thread

class StreamingThread(Thread):
    """Drive a streaming context from a background thread.

    The thread starts the context it was given and then blocks on its
    termination, so the calling code (e.g. a notebook cell) is not blocked.
    """

    def __init__(self, ssc):
        """Remember the streaming context to run.

        Args:
            ssc: Streaming context object; it must provide ``start``,
                ``awaitTermination`` and ``stop``.
        """
        Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Start the stored context and wait until it terminates."""
        # Always use the context passed to the constructor rather than a
        # global named `ssc`, which may be missing or a different context.
        self.ssc.start()
        self.ssc.awaitTermination()

    def stop(self):
        """Ask the stored context to stop while keeping the SparkContext alive."""
        print('----- Stopping... this may take a few seconds -----')
        # `stopGraceFully` (capital F) follows the keyword spelling of
        # PySpark's StreamingContext.stop signature.
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [3]:
# Display the SparkContext. `sc` is not created anywhere in this notebook, so it
# is presumably injected by the PySpark kernel/shell — verify before running elsewhere.
sc

In [4]:
# Display the SparkSession. `spark` is not created anywhere in this notebook, so it
# is presumably injected by the PySpark kernel/shell — verify before running elsewhere.
spark

In [5]:
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [6]:
# DIFFERENCE BETWEEN OLD AND NEW TEXT

import difflib

def make_diff(old, new):
    """Return the pieces that differ between ``old`` and ``new``, concatenated.

    The two inputs are compared element by element with ``difflib.ndiff``;
    for strings that means character by character. Every element that is only
    in ``old`` (``'- '`` entries) or only in ``new`` (``'+ '`` entries) is
    kept, in the order ndiff reports them, and the kept elements are joined
    into a single string.

    Args:
        old: Previous text, or None.
        new: New text, or None.

    Returns:
        str: The removed and added elements joined together; ``''`` when the
        inputs are identical.
    """
    # A NULL column reaches a Python UDF as None; ndiff would raise TypeError
    # on it, so a missing text is treated as empty text instead.
    if old is None:
        old = ''
    if new is None:
        new = ''

    # Each ndiff entry starts with a two-character code; drop it and keep the payload.
    changed = (
        entry[2:]
        for entry in difflib.ndiff(old, new)
        if entry.startswith(('- ', '+ '))
    )
    return ''.join(changed)

In [7]:
# LOAD IN THE SAVED PIPELINE

from pyspark.ml import PipelineModel

# Bind the saved pipeline model to the module-level name `my_tfidf`.
my_tfidf = PipelineModel.load('models/best/tfidf')

In [8]:
# LOAD IN THE MODEL IF NOT YET LOADED

from pyspark.ml.classification import LogisticRegressionModel

# Load the classifier only once per kernel session. The flag is read with a
# default instead of being reset first, so re-running this cell does not
# reload the model.
if not globals().get('models_loaded', False):
    globals()['my_model'] = LogisticRegressionModel.load('models/best/logistic_regression_with_weights_and_comment')
    globals()['models_loaded'] = True

In [9]:
# DEFINE PREDICTION FUNCTION

def process(time, rdd):
    """Classify one micro-batch of streamed edits and print the results.

    Steps: parse the batch as JSON, add a ``difference`` column computed by
    ``make_diff`` from ``text_old`` and ``text_new``, rename the incoming
    ``label`` column to ``label_string``, run the loaded pipeline
    (``my_tfidf``) and the loaded model (``my_model``), and show the
    intermediate and final tables.

    Args:
        time: Batch time passed by ``foreachRDD``; only used in the banner.
        rdd: RDD holding the JSON records of this batch.

    Returns:
        None. Empty batches are skipped without printing anything.
    """
    # Nothing arrived during this interval.
    if rdd.isEmpty():
        return

    print("========= %s =========" % str(time))

    # Parse the JSON records into a DataFrame and show the raw batch.
    edits = spark.read.json(rdd)
    edits.show()

    # Add the difference between the old and the new text.
    diff_udf = F.udf(make_diff, StringType())
    edits_with_diff = edits.withColumn("difference", diff_udf("text_old", "text_new"))

    # Rename the incoming 'label' column to 'label_string' for convenience.
    edits_renamed = edits_with_diff.withColumnRenamed('label', 'label_string')

    # Apply the loaded pipeline (transform only; nothing is fitted here).
    pipeline = globals()['my_tfidf']
    featurized = pipeline.transform(edits_renamed)
    featurized.select(
        "difference", "comment", "features_diff", "features_comment", "features", "label"
    ).show()

    # Score the batch with the loaded model.
    classifier = globals()['my_model']
    scored = classifier.transform(featurized)

    # Map the numeric prediction to a readable name: 0 -> safe, 1 -> unsafe, else vandal.
    prediction_name = (
        F.when(scored.prediction == 0, "safe")
        .when(scored.prediction == 1, "unsafe")
        .otherwise("vandal")
    )
    labelled = scored.withColumn("prediction_string", prediction_name)
    labelled.select(
        "comment", "difference", "probability", "label", "label_string", "prediction", "prediction_string"
    ).show()

In [10]:
# SELECT STREAMING INTERVAL
# 10 is passed as the batch duration (seconds, per the PySpark StreamingContext API).
ssc = StreamingContext(sc, 10)

In [11]:
# Read text from the TCP socket at seppe.net:7778 and pass every batch RDD to `process`.
lines = ssc.socketTextStream("seppe.net", 7778)
lines.foreachRDD(process)

In [12]:
# CLASSIFY INCOMING EDITS
# Run the streaming context on a StreamingThread so this cell returns immediately;
# the handle `ssc_t` is kept so the stream can be stopped later.
ssc_t = StreamingThread(ssc)
ssc_t.start()

+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+
|             comment|label|   name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+
|Adding localshort...| safe|Red Director|{{short descripti...|{{Infobox footbal...|        Masaya Honda|//en.wikipedia.or...|
|→‎Descriptions in...| safe|Gre regiment|[[File:Fight Pygm...|[[File:Fight Pygm...|Pygmy (Greek myth...|//en.wikipedia.or...|
+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+

+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|          difference|             comment|       features_diff|    features_comment|            features|label|
+----------------

+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|          difference|             comment|       features_diff|    features_comment|            features|label|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|{{short descripti...|Adding localshort...|(10000,[205,964,8...|(10000,[205,2626,...|(20000,[205,2626,...|  0.0|
|{{short descripti...|Adding localshort...|(10000,[205,964,8...|(10000,[205,2626,...|(20000,[205,2626,...|  0.0|
|                [[]]|Removed Taro hype...|       (10000,[],[])|(10000,[1512,3779...|(20000,[1512,3779...|  0.0|
|     l'Reddke Reddi'|                    |(10000,[4006,4848...|       (10000,[],[])|(20000,[14006,148...|  0.0|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+

+--------------------+--------------------+--------------------+-----+------------+----------+-

+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+
|             comment|label|   name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------------+
|cleanup,typo(s) f...| safe| Kaltenmeyer|{{short descripti...|{{short descripti...|Enclosed religiou...|//en.wikipedia.or...|
|Adding localshort...| safe|Red Director|{{short descripti...|{{Use dmy dates|d...|      Daiki Matsuoka|//en.wikipedia.or...|
|Fixed a typo foun...| safe| Ira Leviton|{{About|the Frenc...|{{About|the Frenc...|      Accolay, Yonne|//en.wikipedia.or...|
|→‎Criticism:chang...| safe|  Geysirhead|{{short descripti...|{{short descripti...|             H-index|//en.wikipedia.or...|
+--------------------+-----+------------+--------------------+--------------------+--------------------+--------------

+--------------------+-----+---------+--------------------+--------------------+--------------------+--------------------+
|             comment|label|name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+-----+---------+--------------------+--------------------+--------------------+--------------------+
|added references ...| safe|   Lime44|{{Infobox album
|...|{{Infobox album
|...|Last Flight Home ...|//en.wikipedia.or...|
+--------------------+-----+---------+--------------------+--------------------+--------------------+--------------------+

+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|          difference|             comment|       features_diff|    features_comment|            features|label|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|[[|Tabla]]
== Ref...|added references ...|(1

+--------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+
|             comment|label| name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+
|Just have a wait ...| safe|LancedSoul|{{short descripti...|{{short descripti...|         Wish Dragon|//en.wikipedia.or...|
|→‎Example:Replace...| safe| Bender235|[[File:SubpixelCi...|[[File:SubpixelCi...|Sub-pixel resolution|//en.wikipedia.or...|
+--------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+

+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|          difference|             comment|       features_diff|    features_comment|            features|label|
+--------------------+-------

+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             comment| label|           name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|(→‎{{anchor|Histo...|unsafe|       49.205.34.203|{{cleanup|reason=...|{{cleanup|reason=...|History of Andhra...|//en.wikipedia.or...|
|→‎External links:...|  safe|Elizabeth Linden ...|{{for|the Radio S...|{{for|the Radio S...|Wright-Patterson ...|//en.wikipedia.or...|
+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+

+----------+--------------------+--------------------+--------------------+--------------------+-----+
|difference|             comment|       features_diff|    features_comment|           

+--------------------+--------------------+--------------------+-----+------------+----------+-----------------+
|             comment|          difference|         probability|label|label_string|prediction|prediction_string|
+--------------------+--------------------+--------------------+-----+------------+----------+-----------------+
| (Everything  so so)|              

Oo00|[0.99999641936500...|  1.0|      unsafe|       0.0|             safe|
|→‎References:Repl...|}}[|url=}}The Mem...|[1.0,3.3707493781...|  0.0|        safe|       0.0|             safe|
+--------------------+--------------------+--------------------+-----+------------+----------+-----------------+

+--------------------+-----+------------+--------------------+--------------------+-----------------+--------------------+
|             comment|label|   name_user|            text_new|            text_old|       title_page|            url_page|
+--------------------+-----+------------+--------------------+-------------

+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|          difference|             comment|       features_diff|    features_comment|            features|label|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|{{short descripti...|Adding localshort...|(10000,[205,964,8...|(10000,[205,2626,...|(20000,[205,2626,...|  0.0|
|}}
{{For|the play...|                    |(10000,[15,1583,2...|       (10000,[],[])|(20000,[10015,115...|  0.0|
|proessrsonal acin...|→‎Game of Thrones...|(10000,[3919,4566...|(10000,[307,1125,...|(20000,[307,1125,...|  0.0|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+

+--------------------+--------------------+--------------------+-----+------------+----------+-----------------+
|             comment|          difference|         probability|label|label_string|prediction|p

+--------------------+-----+------------+--------------------+--------------------+-------------+--------------------+
|             comment|label|   name_user|            text_new|            text_old|   title_page|            url_page|
+--------------------+-----+------------+--------------------+--------------------+-------------+--------------------+
|Adding localshort...| safe|Red Director|{{short descripti...|{{Infobox footbal...|Naohiko Okada|//en.wikipedia.or...|
+--------------------+-----+------------+--------------------+--------------------+-------------+--------------------+

+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|          difference|             comment|       features_diff|    features_comment|            features|label|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|{{short descripti...|Adding localshort...|(10000,[205,964,8...|(

+--------------------+-----+-------------------+--------------------+--------------------+--------------------+--------------------+
|             comment|label|          name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+-----+-------------------+--------------------+--------------------+--------------------+--------------------+
|Adding localshort...| safe|       Red Director|{{short descripti...|{{Infobox footbal...|    Matsuichi Yamada|//en.wikipedia.or...|
|→‎Interim NXT Cru...| safe|Malcolm L. Mitchell|{{short descripti...|{{short descripti...|List of WWE tourn...|//en.wikipedia.or...|
|Adding localshort...| safe|       Red Director|{{short descripti...|{{Infobox footbal...|    Mitsuru Hasegawa|//en.wikipedia.or...|
|Adding localshort...| safe|       Red Director|{{short descripti...|{{Infobox footbal...|       Megumu Tamura|//en.wikipedia.or...|
+--------------------+-----+-------------------+--------------------+

+--------------------+--------------------+--------------------+-----+------------+----------+-----------------+
|             comment|          difference|         probability|label|label_string|prediction|prediction_string|
+--------------------+--------------------+--------------------+-----+------------+----------+-----------------+
|Adding localshort...|{{short descripti...|[1.0,1.3932052190...|  0.0|        safe|       0.0|             safe|
|Adding localshort...|{{short descripti...|[1.0,1.3932052190...|  0.0|        safe|       0.0|             safe|
|added citation fo...|<ref>Cirefneeded|...|[0.99999999999992...|  0.0|        safe|       0.0|             safe|
+--------------------+--------------------+--------------------+-----+------------+----------+-----------------+

+--------------------+-----+------------+--------------------+--------------------+--------------+--------------------+
|             comment|label|   name_user|            text_new|            text_old|    t

+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|          difference|             comment|       features_diff|    features_comment|            features|label|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|{{short descripti...|Adding localshort...|(10000,[205,964,8...|(10000,[205,2626,...|(20000,[205,2626,...|  0.0|
|                  
 |Remove space in w...|       (10000,[],[])|(10000,[1164,2288...|(20000,[1164,2288...|  0.0|
|  heavy metal music||v2.02 - Repaired ...|(10000,[353,2729,...|(10000,[353,650,2...|(20000,[353,650,2...|  0.0|
|{{short descripti...|Adding localshort...|(10000,[205,964,8...|(10000,[205,2626,...|(20000,[205,2626,...|  0.0|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+

+--------------------+--------------------+--------------------+-----+------------+----------+-

+--------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+
|             comment|label| name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+
|v2.02 - Repaired ...| safe|Tassedethe|{{update|the 3 so...|{{update|the 3 so...|        Serj Tankian|//en.wikipedia.or...|
|→‎Compliance of n...| safe| Bender235|{{Electoral syste...|{{Electoral syste...|Comparison of ele...|//en.wikipedia.or...|
+--------------------+-----+----------+--------------------+--------------------+--------------------+--------------------+

+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|          difference|             comment|       features_diff|    features_comment|            features|label|
+--------------------+-------

+--------------------+-----+---------+--------------------+--------------------+----------+--------------------+
|             comment|label|name_user|            text_new|            text_old|title_page|            url_page|
+--------------------+-----+---------+--------------------+--------------------+----------+--------------------+
|→‎Programming:Rem...| safe|  Keith D|{{Use dmy dates|d...|{{Use dmy dates|d...|Ridings FM|//en.wikipedia.or...|
+--------------------+-----+---------+--------------------+--------------------+----------+--------------------+

+----------+--------------------+-------------+--------------------+--------------------+-----+
|difference|             comment|features_diff|    features_comment|            features|label|
+----------+--------------------+-------------+--------------------+--------------------+-----+
|          |→‎Programming:Rem...|(10000,[],[])|(10000,[1164,2050...|(20000,[1164,2050...|  0.0|
+----------+--------------------+-------------+---

+----------------+--------------------+--------------------+-----+------------+----------+-----------------+
|         comment|          difference|         probability|label|label_string|prediction|prediction_string|
+----------------+--------------------+--------------------+-----+------------+----------+-----------------+
|→‎April–May:refs|| first1=Bryann| ...|[0.99999997422069...|  0.0|        safe|       0.0|             safe|
|                |[[Football League...|[0.48748384410233...|  1.0|      unsafe|       1.0|           unsafe|
+----------------+--------------------+--------------------+-----+------------+----------+-----------------+

+------------+-----+----------------+--------------------+--------------------+------------+--------------------+
|     comment|label|       name_user|            text_new|            text_old|  title_page|            url_page|
+------------+-----+----------------+--------------------+--------------------+------------+--------------------+
|→‎

+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|          difference|             comment|       features_diff|    features_comment|            features|label|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|          ]]]]   ...|Cleaned up usingA...|(10000,[7456],[4....|(10000,[1792,5943...|(20000,[1792,5943...|  0.0|
|soternatelys grea...|CE (haven't touch...|(10000,[47,80,220...|(10000,[3964,6335...|(20000,[3964,6335...|  0.0|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+

+--------------------+--------------------+--------------------+-----+------------+----------+-----------------+
|             comment|          difference|         probability|label|label_string|prediction|prediction_string|
+--------------------+--------------------+--------------------+-----+------------+----------+-

+--------------------+-------------------+--------------------+--------------------+--------------------+-----+
|          difference|            comment|       features_diff|    features_comment|            features|label|
+--------------------+-------------------+--------------------+--------------------+--------------------+-----+
|==People==
*[[Ala...|(→‎Popular culture)|(10000,[6169,7243...|(10000,[4692,6827...|(20000,[4692,6827...|  1.0|
|                   =|                   |       (10000,[],[])|       (10000,[],[])|       (20000,[],[])|  0.0|
+--------------------+-------------------+--------------------+--------------------+--------------------+-----+

+-------------------+--------------------+--------------------+-----+------------+----------+-----------------+
|            comment|          difference|         probability|label|label_string|prediction|prediction_string|
+-------------------+--------------------+--------------------+-----+------------+----------+----------

+--------------------+-------------------+--------------------+-----+------------+----------+-----------------+
|             comment|         difference|         probability|label|label_string|prediction|prediction_string|
+--------------------+-------------------+--------------------+-----+------------+----------+-----------------+
|→‎Aircraft on dis...|           -–-–-–-–|[1.0,4.2946988290...|  0.0|        safe|       0.0|             safe|
|→‎Winton train:bo...|ftended lsl inended|[0.99999999999239...|  0.0|        safe|       0.0|             safe|
+--------------------+-------------------+--------------------+-----+------------+----------+-----------------+

+--------------------+-----+-------------------+--------------------+--------------------+--------------------+--------------------+
|             comment|label|          name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+-----+-------------------+-------------

+--------------------+-----+---------+--------------------+--------------------+------------+--------------------+
|             comment|label|name_user|            text_new|            text_old|  title_page|            url_page|
+--------------------+-----+---------+--------------------+--------------------+------------+--------------------+
|→‎Archaeological ...| safe| BMacZero|{{pp-protected|sm...|{{pp-protected|sm...|Qumran Caves|//en.wikipedia.or...|
+--------------------+-----+---------+--------------------+--------------------+------------+--------------------+

+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|          difference|             comment|       features_diff|    features_comment|            features|label|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|[[]][[ dynasty|He...|→‎Archaeological ...|(10000,[419,4756]...|(10000,[3219,6952...|

In [13]:
# Stop the streaming context through StreamingThread.stop, which requests a
# graceful stop and leaves the SparkContext running.
ssc_t.stop()

----- Stopping... this may take a few seconds -----
