In [36]:
# Widen the notebook's main container to 95% of the browser window by
# injecting a CSS override into the page.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [48]:
from threading import Thread

class StreamingThread(Thread):
    """Run a streaming context on a background thread.

    Starting the context from a separate thread keeps the notebook cell from
    blocking on ``awaitTermination()``; call :meth:`stop` to shut it down.

    Parameters
    ----------
    ssc
        The streaming context to drive. It must provide ``start()``,
        ``awaitTermination()`` and ``stop(stopSparkContext, stopGraceFully)``.
    """

    def __init__(self, ssc):
        Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Start the wrapped context and block this thread until it ends."""
        # Use the context handed to the constructor, not whatever global
        # happens to be called `ssc` when the thread starts.
        self.ssc.start()
        self.ssc.awaitTermination()

    def stop(self):
        """Stop the wrapped streaming context gracefully.

        The underlying Spark context is kept alive (``stopSparkContext=False``)
        so that a new streaming context can be created afterwards.
        """
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [49]:
# Bare expression: echoes `sc` as the cell output.
# NOTE(review): `sc` is not defined in any visible cell -- presumably the
# SparkContext injected by the PySpark kernel; confirm before a fresh run.
sc

In [50]:
# Bare expression: echoes `spark` as the cell output.
# NOTE(review): `spark` is not defined in any visible cell -- presumably the
# SparkSession injected by the PySpark kernel; confirm before a fresh run.
spark

In [51]:
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

In [52]:
from difflib import unified_diff

def make_diff(old, new):
    """Return only the added and removed lines between two texts.

    Both texts are split on newlines and compared with
    ``difflib.unified_diff``. Lines present only in ``new`` are returned
    prefixed with ``+``, lines present only in ``old`` with ``-``; context
    lines and ``@@`` hunk markers are dropped.

    Parameters
    ----------
    old, new : str or None
        The previous and current text. ``None`` is treated as an empty string.

    Returns
    -------
    str
        The changed lines joined with newlines, or ``''`` when the texts are
        identical.
    """
    old_lines = (old or '').split('\n')
    new_lines = (new or '').split('\n')
    diff_lines = list(unified_diff(old_lines, new_lines))
    # unified_diff always emits the '---'/'+++' file headers as its first two
    # lines whenever there is any difference. They start with '-'/'+' too, so
    # skip them by position rather than by prefix (a changed line may itself
    # legitimately start with '--' or '++').
    changed = [line for line in diff_lines[2:] if line.startswith(('+', '-'))]
    return '\n'.join(changed)

In [53]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LogisticRegressionModel
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.feature import HashingTF, IDF
    
# --- Text features -----------------------------------------------------------
# Tokenise the "diff" column, using non-word characters as the delimiter.
regexTokenizer = RegexTokenizer(
    inputCol="diff",
    outputCol="words",
    pattern="\\W",
)

# Words dropped from the token list. The list is passed as the remover's full
# `stopWords` parameter, so only these words are filtered.
add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the"]
stopwordsRemover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=add_stopwords,
)

# Bag-of-words counts. Defined here but not part of `pipeline` below.
countVectors = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    vocabSize=10000,
    minDF=0,
)

# TF-IDF: hashed term frequencies followed by inverse-document-frequency
# weighting; minDocFreq=2 removes sparse terms.
hashingTF = HashingTF(
    inputCol="filtered",
    outputCol="rawFeatures",
    numFeatures=10000,
)
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
    minDocFreq=2,
)

# --- Label -------------------------------------------------------------------
# Index the string label column into a numeric "label" column.
label_stringIdx = StringIndexer(
    inputCol="label_string",
    outputCol="label",
)

# --- Pipeline ----------------------------------------------------------------
pipeline = Pipeline(
    stages=[
        regexTokenizer,
        stopwordsRemover,
        hashingTF,
        idf,
        label_stringIdx,
    ]
)

# Location of the pre-trained model.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
outpath = 'C:/Pr/AA_Big_Data/Assignment_3/spark/output'

In [54]:
# Module-level flag: False until the pre-trained model has been loaded.
# `process` reads and flips it through globals() so the model is loaded once.
models_loaded = False

#def predict(df):
#    if any([x in df.diff.lower() for x in ['bad', 'lol', 'joke']]):
#        return 'vandal'
#    else:
#        return 'safe'

#predict_udf = udf(predict, StringType())

def process(time, rdd):
    """Score one micro-batch of the stream and print the results.

    Registered with ``foreachRDD``. For every non-empty batch it:

    1. parses the JSON records into a DataFrame,
    2. adds a per-row ``diff`` column built from ``text_old`` / ``text_new``,
    3. runs the feature ``pipeline`` on the batch,
    4. loads the pre-trained logistic-regression model on first use, and
    5. shows the rows whose ``prediction`` is 0.

    Parameters
    ----------
    time
        Batch timestamp supplied by Spark Streaming; only used for the banner.
    rdd
        RDD of JSON strings for this batch.
    """
    if rdd.isEmpty():
        return

    print("========= %s =========" % str(time))

    # Convert to data frame
    df = spark.read.json(rdd)
    df.show()

    # Compute the diff for every row. The previous version computed the diff
    # of the first row only and broadcast it to all rows with lit().
    # Nulls are mapped to '' here so a missing text does not fail the UDF.
    diff_udf = udf(
        lambda old, new: make_diff(old or '', new or ''),
        StringType(),
    )
    df_withdiff = df.withColumn(
        "diff", diff_udf(col("text_old"), col("text_new"))
    )
    df_withdiff.select('diff').show()

    # Free up the name "label" for the numeric column the StringIndexer emits.
    df_wd = df_withdiff.withColumnRenamed('label', 'label_string')

    # NOTE(review): the pipeline is re-fitted on every micro-batch, so the IDF
    # weights and the label indices depend on the batch content and may not
    # match what the pre-trained model saw -- consider loading a saved
    # PipelineModel instead.
    pipelineFit = pipeline.fit(df_wd)
    dataset = pipelineFit.transform(df_wd)
    # DataFrame has no colnames() method; the column names are in `.columns`.
    print(dataset.columns)

    # Load the model only once; reloading it on every call would be wasteful.
    if not globals()['models_loaded']:
        globals()['my_model'] = LogisticRegressionModel.load(outpath)
        globals()['models_loaded'] = True

    # Predict with the loaded model and show the rows predicted as class 0.
    df_result = globals()['my_model'].transform(dataset)
    # Filter on df_result itself; `predictions` was never defined.
    df_result.filter(df_result['prediction'] == 0) \
        .select("diff", "label_string", "probability", "label", "prediction") \
        .orderBy("probability", ascending=False) \
        .show()
    #df_result.show()

In [55]:
# Streaming context on top of the existing Spark context, with a batch
# duration of 10 (seconds).
ssc = StreamingContext(sc, batchDuration=10)

In [56]:
# Read text lines from the remote socket and hand every micro-batch RDD
# to `process`.
lines = ssc.socketTextStream(hostname="seppe.net", port=7778)
lines.foreachRDD(process)

In [57]:
# Run the streaming context on a background thread so this cell returns
# immediately instead of blocking on awaitTermination().
ssc_t = StreamingThread(ssc)
ssc_t.start()

In [59]:
# Stop the stream gracefully; StreamingThread.stop() keeps the Spark context alive.
ssc_t.stop()

----- Stopping... this may take a few seconds -----
