In [95]:
# import data and transfer into dataframe
import os
rootdir = 'C:/material/Courses/stage 2/big data/assignment 3/data 2'

files_list = []


In [96]:
for subdir, dirs, files in os.walk(rootdir):
    for name in files:
        if "part" in name.lower() and not ".crc" in name.lower():
            files_list.append(os.path.join(subdir,name))
df = spark.read.json(sc.textFile(','.join(files_list)))


In [161]:
# frequency table
from pyspark.sql.functions import col
df.groupBy('label').count().orderBy(col('count').desc()).show()

+------+-----+
| label|count|
+------+-----+
|  safe|   18|
|vandal|   12|
|unsafe|    6|
+------+-----+



In [97]:
from difflib import unified_diff

def make_diff(old, new):
    return '\n'.join([ l for l in unified_diff(old.split('\n'), new.split('\n')) if l.startswith('+') or l.startswith('-') ])

In [117]:

# making a diff between text_old and text_new
diff = make_diff(df.first().text_old, df.first().text_new)
df = df.withColumn("diff", lit(diff))

# drop useless features
drop_list = ['comment', 'name_user', 'title_page', 'url_page',"text_old", "text_new"]
df = df.select([column for column in df.columns if column not in drop_list])


In [114]:
#replace empty string  with 'null' 
from pyspark.sql.functions import col, when
df = df.withColumn("diff", when(col('diff') != ' ', col('diff')).otherwise(None))

In [119]:
# remove missing values
df = df.na.drop(subset = 'diff')

In [120]:
####################################################################
##################  count vector features ##########################
####################################################################

###### preprocessing pipeline ##############

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
import nltk

In [121]:
# split data into train and test data
(train, test) = df.randomSplit([0.8,0.2], seed = 775346)
print('train Data Count:' + str(train.count()))
print('test data count:' + str(test.count()))

train Data Count:26
test data count:10


In [124]:
## feature transformer: 
# tokenizer (split sentences into words)
regexTokenizer = RegexTokenizer(inputCol = 'diff', outputCol = 'words', pattern = '\\W')   

In [125]:
# remove the stop words
import nltk
nltk.download('stopwords')
stopwordList = nltk.corpus.stopwords.words('english')
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords = stopwordList)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fanqi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [136]:
# count bag of words
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=20000, minDF=0)

In [127]:
# define pipeline
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

In [128]:
# encoding dependent feature 'label'
label_stringIdx = StringIndexer(inputCol="label", outputCol="labelIndex")
labels_stars = label_stringIdx.fit(train).labels
indexed = label_stringIdx.fit(df).transform(df)

In [137]:
# create a pipeline
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

In [138]:
# fit the pipeline to train and test data seperately
pipelineFit = pipeline.fit(train)
train_pipe = pipelineFit.transform(train)
#train_pipe.show()
test_pipe = pipelineFit.transform(test)

In [22]:
#################### off-line model training #######################
########### logistic regression by using count vector features #####
####################################################################
from pyspark.ml.feature import IndexToString

In [14]:
# fit a model 
lr = LogisticRegression(featuresCol = 'features', labelCol = 'labelIndex', maxIter = 10, regParam = 0, elasticNetParam=0)
lrModel = lr.fit(train_pipe)

In [15]:
# predict with test_pipe data
predictions = lrModel.transform(test_pipe)
# transform back from index to original coding
labelConverter = IndexToString(inputCol = "prediction", outputCol = 'PredictedLabel', labels = labels_stars)
predictions = labelConverter.transform(predictions)


In [147]:
#predictions.filter(predictions['prediction'] == 0)\
# #          .select('text_new','probability','label','predictedLabel')\
 ##          .orderBy('probability', ascending = False)\
 #          .show(n=20, truncate = 45)

In [17]:
# for multiple classification evaluation
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol = 'prediction')
evaluator.evaluate(predictions) # accuracy

evaluator_F1= MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='labelIndex', metricName='f1')
evaluator_F1.evaluate(predictions) # F1 score       

0.6923076923076923

In [18]:
# crossvalidation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
lr = LogisticRegression(featuresCol = 'features', labelCol = 'labelIndex', maxIter = 10, regParam = 0, elasticNetParam=0)



In [19]:
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
             .build())

In [20]:
# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, 
                    estimatorParamMaps=paramGrid, 
                    evaluator=evaluator, 
                    numFolds=5)

In [21]:
cvModel = cv.fit(train_pipe)  # be careful, takes very long time

In [22]:
# predict with test_pipe data
predictions = cvModel.transform(test_pipe)
# transform back from index to original coding
labelConverter = IndexToString(inputCol = "prediction", outputCol = 'PredictedLabel', labels = labels_stars)
predictions = labelConverter.transform(predictions)

In [23]:
# evaluate best model---accuracy
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol = 'prediction'，metricName='accuracy')
accuracy = evaluator_accuracy.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

# F1 score
evaluator_F1= MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='labelIndex', metricName='f1')
F1 = evaluator_F1.evaluate(predictions)
print("Test set F1 score = " + str(F1))

Test set accuracy = 0.7217675941080196
Test set F1 score = 0.7217675941080196


In [146]:
#predictions.filter(predictions['prediction'] == 0)\
#          .select('text_new','probability','label','predictedLabel')\
#           .orderBy('probability', ascending = False)\
#           .show(n=20, truncate = 45)

In [24]:
####################################################################
########### Naive Bayes by using count vector features #############
####################################################################

from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(smoothing=1, modelType="multinomial", featuresCol = 'features', labelCol = 'labelIndex' )

In [25]:
# train model with train_pipe data 
model = nb.fit(train_pipe)

In [26]:
# make predictions
predictions = model.transform(test_pipe)
predictions = labelConverter.transform(predictions)

# take a while to run this code, run it only if needed
# predictions.filter(predictions['prediction'] == 0)\
#           .select('text_new','probability','label','predictedLabel')\
#           .orderBy("probability", ascending=False)\
#          .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------+--------------+
|                      text_new|                   probability| label|predictedLabel|
+------------------------------+------------------------------+------+--------------+
|{{Use dmy dates|date=April ...|[1.0,6.972695125448627E-19,...|  safe|          safe|
|{{For|the federal election|...|[1.0,9.191506784201759E-113...|  safe|          safe|
|{{short description|Wikiped...|[0.9999990001846086,9.99815...|vandal|          safe|
|{{notability|1=Biographies|...|[0.9999954372568007,1.19131...|  safe|          safe|
|

{{short description||bot=...|[0.987731771298586,5.169991...|  safe|          safe|
|{{Infobox software
| name  ...|[0.8938136428475434,0.01979...|  safe|          safe|
|{{redirect|Patrick Lumumba|...|[0.5419811683574965,5.46365...|  safe|          safe|
|{{short description|GRID is...|[0.5293616161103452,2.42185...|  safe|          safe|
|'''Tswana''' may refer to:
...|[0.5149007383910856,0.

In [27]:
# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

# F1 score
evaluator_BYF1= MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='labelIndex', metricName='f1')
F1 = evaluator_BYF1.evaluate(predictions)
print("Test set F1 score = " + str(F1))

Test set accuracy = 0.34615384615384615
Test set F1 score = 0.41698717948717945


In [28]:
####################################################################
########### Random Forrest by using count vector features ##########
####################################################################
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="labelIndex", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)


In [29]:
# Train model with Train_pipe Data
rfModel = rf.fit(train_pipe)

In [30]:
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, [100, 200, 500]) 
             .addGrid(rf.maxDepth, [4, 10, 20])
             .build())
              

In [31]:
#  Create 5-fold CrossValidator
cv = CrossValidator(estimator=rf, 
                    estimatorParamMaps=paramGrid, 
                    evaluator=evaluator, 
                    numFolds=5)

In [32]:
# Cross_validation train
cvmodel = cv.fit(train_pipe)  # be careful, takes very long time

In [33]:
# predict with test_pipe data
predictions = cvModel.transform(test_pipe)
# transform back from index to original coding
labelConverter = IndexToString(inputCol = "prediction", outputCol = 'PredictedLabel', labels = labels_stars)
predictions = labelConverter.transform(predictions)

# evaluate best model---accuracy
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol = 'prediction')
accuracy = evaluator_accuracy.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

# F1 score
evaluator_F1= MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='labelIndex', metricName='f1')
F1 = evaluator_F1.evaluate(predictions)
print("Test set F1 score = " + str(F1))


Test set accuracy = 0.7217675941080196
Test set F1 score = 0.7217675941080196


In [18]:
####################################################################
###################### TF—IDF Features  ############################
####################################################################

from pyspark.ml.feature import HashingTF, IDF


In [19]:
# TF-IDF features
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=20000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq = 5)

pipeline_tfidf = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF , idf, label_stringIdx])

In [20]:
# appply pipeline to train and test data
pipelineFit_tfidf = pipeline_tfidf.fit(train)
train_pipe_tfidf = pipelineFit_tfidf.transform(train)

test_pipe_tfidf = pipelineFit_tfidf.transform(test)

In [23]:
#################### off-line model training #######################
########### logistic regression by using TF_IDF features ###########
####################################################################

# fit a model 
lr = LogisticRegression(featuresCol = 'features', labelCol = 'labelIndex', maxIter = 10, regParam = 0, elasticNetParam=0)
lrModel = lr.fit(train_pipe_tfidf)

# predict with test data
predictions = lrModel.transform(test_pipe_tfidf)
# transform back from index to original coding
labelConverter = IndexToString(inputCol = "prediction", outputCol = 'PredictedLabel', labels = labels_stars)
predictions = labelConverter.transform(predictions)

#predictions.filter(predictions['prediction'] == 0)\
#           .select('text_new','probability','label','predictedLabel')\
#           .orderBy('probability', ascending = False)\
#           .show(n=20, truncate = 45)

+---------------------------------------------+---------------------------------------------+------+--------------+
|                                     text_new|                                  probability| label|predictedLabel|
+---------------------------------------------+---------------------------------------------+------+--------------+
|[[File:Merton London UK labelled ward map ...|[1.0,2.949127344484627E-21,3.0564044018527...|  safe|          safe|
|{{For|the federal election|2020 United Sta...|[1.0,4.0061517241228436E-63,1.046061002925...|  safe|          safe|
|{{short description|Wikipedia list article...|[0.9999999999999998,9.677946128900213E-18,...|vandal|          safe|
|Cole sprouse, was born 2007-2020
Cole went...|[0.9999999988016179,1.1593483902299663E-9,...|vandal|          safe|
|{{Use dmy dates|date=April 2020}}
The [[Au...|[0.9999999966944775,1.0558498232836677E-11...|  safe|          safe|
|{{Infobox song
| name       = Dirt on My B...|[0.9998412206078008,2.896

In [26]:
# for multiple classification evaluation
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol = 'prediction')
evaluator.evaluate(predictions) # accuracy

evaluator_F1= MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='labelIndex', metricName='f1')
evaluator_F1.evaluate(predictions) # F1 score 

# crossvalidation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
lr = LogisticRegression(featuresCol = 'features', labelCol = 'labelIndex', maxIter = 10, regParam = 0, elasticNetParam=0)

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, 
                    estimatorParamMaps=paramGrid, 
                    evaluator=evaluator, 
                    numFolds=5)
# cvModel = cv.fit(train_pipe_tfidf)  # be careful, takes very long time

In [None]:
cvModel = cv.fit(train_pipe_tfidf)  # be careful, takes very long time

In [28]:
# predict with test data
predictions = cvModel.transform(test_pipe_tfidf)
# transform back from index to original coding
labelConverter = IndexToString(inputCol = "prediction", outputCol = 'PredictedLabel', labels = labels_stars)
predictions = labelConverter.transform(predictions)

In [31]:
# evaluate best model---accuracy
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol = 'prediction')
accuracy = evaluator_accuracy.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
# F1 score
evaluator_F1= MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='labelIndex', metricName='f1')
F1 = evaluator_F1.evaluate(predictions)
print("Test set F1 score = " + str(F1))

#predictions.filter(predictions['prediction'] == 0)\
#           .select('text_new','probability','label','predictedLabel')\
#           .orderBy('probability', ascending = False)\
#          .show(n=20, truncate = 45)


Test set accuracy = 0.6820512820512821
Test set F1 score = 0.6820512820512821
+---------------------------------------------+---------------------------------------------+------+--------------+
|                                     text_new|                                  probability| label|predictedLabel|
+---------------------------------------------+---------------------------------------------+------+--------------+
|{{For|the federal election|2020 United Sta...|[1.0,6.611809568529717E-19,2.0697655966562...|  safe|          safe|
|[[File:Merton London UK labelled ward map ...|[0.9999999147509985,3.6806845261450523E-8,...|  safe|          safe|
|{{Use dmy dates|date=April 2020}}
The [[Au...|[0.9928664912641214,0.002933667246850246,0...|  safe|          safe|
|{{short description|Wikipedia list article...|[0.9897320934267761,0.010231605693713394,3...|vandal|          safe|
|Cole sprouse, was born 2007-2020
Cole went...|[0.9875916052387613,0.010001452968845116,0...|vandal|          

In [32]:
####################################################################
########### Naive Bayes by using TF-IDF features #############
####################################################################

from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(smoothing=1, modelType="multinomial", featuresCol = 'features', labelCol = 'labelIndex' )

# train model with train data 
model = nb.fit(train_pipe_tfidf)

# make predictions
predictions = model.transform(test_pipe_tfidf)
predictions = labelConverter.transform(predictions)

predictions.filter(predictions['prediction'] == 0)\
           .select('text_new','probability','label','predictedLabel')\
           .orderBy("probability", ascending=False)\
           .show(n = 10, truncate = 30)

+------------------------------+------------------------------+------+--------------+
|                      text_new|                   probability| label|predictedLabel|
+------------------------------+------------------------------+------+--------------+
|{{Use dmy dates|date=May 20...|[1.0,1.7243874070756806E-30...|unsafe|          safe|
|{{Expand Vietnamese|Lấp Vò|...|[1.0,2.3215327594100197E-40...|  safe|          safe|
|{{more citations needed|dat...|[1.0,4.763501085087394E-49,...|  safe|          safe|
|{{Infobox software
| name  ...|[1.0,7.83105209213923E-98,9...|  safe|          safe|
|{{Use dmy dates|date=Septem...|[1.0,4.7421126313870003E-13...|  safe|          safe|
|{{short description|GRID is...|[1.0,3.249697534681906E-163...|  safe|          safe|
|{{notability|1=Biographies|...|[1.0,1.2317670434709667E-21...|  safe|          safe|
|{{Use Australian English|da...|[1.0,1.13340600465693E-222,...|  safe|          safe|
|'''Atom transfer radical po...|[1.0,1.313441720845676

In [33]:
# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

# F1 score
evaluator_BYF1= MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='labelIndex', metricName='f1')
F1 = evaluator_BYF1.evaluate(predictions)
print("Test set F1 score = " + str(F1))

Test set accuracy = 0.6153846153846154
Test set F1 score = 0.6303939962476548


In [34]:
####################################################################
########### Random Forrest by using DF-IDF features ################
####################################################################
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="labelIndex", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

# Train model with Train Data
rfModel = rf.fit(train_pipe_tfidf)

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, [100, 200, 500]) 
             .addGrid(rf.maxDepth, [4, 10, 20])
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=rf, 
                    estimatorParamMaps=paramGrid, 
                    evaluator=evaluator, 
                    numFolds=5)

In [35]:
# Cross_validation train
cvmodel = cv.fit(train_pipe_tfidf)  # be careful, takes very long time

In [36]:
# predict with test data
predictions = cvModel.transform(test_pipe_tfidf)
# transform back from index to original coding
labelConverter = IndexToString(inputCol = "prediction", outputCol = 'PredictedLabel', labels = labels_stars)
predictions = labelConverter.transform(predictions)

In [37]:
# evaluate best model---accuracy
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol = 'prediction')
accuracy = evaluator_accuracy.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

# F1 score
evaluator_F1= MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='labelIndex', metricName='f1')
F1 = evaluator_F1.evaluate(predictions)
print("Test set F1 score = " + str(F1))

Test set accuracy = 0.6820512820512821
Test set F1 score = 0.6820512820512821


In [139]:
####################################################################
###################### Word2Vec Features  ##########################
####################################################################

from pyspark.ml.feature import Word2Vec

In [140]:
# word2Vec
w2v = Word2Vec(vectorSize=3, minCount=0, inputCol="filtered", outputCol="features")

pipeline_w2v = Pipeline(stages=[regexTokenizer, stopwordsRemover, w2v, label_stringIdx])

In [141]:
# appply pipeline to train and test data
pipelineFit_w2v = pipeline_w2v.fit(train)
train_pipe_w2v = pipelineFit_w2v.transform(train)

test_pipe_w2v = pipelineFit_w2v.transform(test)

In [41]:
#################### off-line model training #######################
########### logistic regression by using Word2Vec features #########
####################################################################

# fit a model 
lr = LogisticRegression(featuresCol = 'features', labelCol = 'labelIndex', maxIter = 10, regParam = 0, elasticNetParam=0)
lrModel = lr.fit(train_pipe_w2v)

# predict with test data
predictions = lrModel.transform(test_pipe_w2v)
# transform back from index to original coding
labelConverter = IndexToString(inputCol = "prediction", outputCol = 'PredictedLabel', labels = labels_stars)
predictions = labelConverter.transform(predictions)

predictions.filter(predictions['prediction'] == 0)\
           .select('text_new','probability','label','predictedLabel')\
           .orderBy('probability', ascending = False)\
           .show(n=20, truncate = 45)

+---------------------------------------------+---------------------------------------------+------+--------------+
|                                     text_new|                                  probability| label|predictedLabel|
+---------------------------------------------+---------------------------------------------+------+--------------+
|{{Use dmy dates|date=April 2020}}
The [[Au...|[0.9606433524623734,0.03922165361234626,1....|  safe|          safe|
|{{Use dmy dates|date=May 2016}}
{{Use Brit...|[0.743546245253475,0.20646003457521092,0.0...|unsafe|          safe|
|[[File:Merton London UK labelled ward map ...|[0.7127614380612451,0.18800563205504853,0....|  safe|          safe|
|{{Infobox book 
  | name           =  Coll...|[0.7006490816116543,0.22582590247146758,0....|  safe|          safe|
|{{Drugbox
| Verifiedfields = changed
| Wat...|[0.6838535250703675,0.22862336277769776,0....|  safe|          safe|
|{{For|the federal election|2020 United Sta...|[0.6770435561599338,0.193

In [42]:
# for multiple classification evaluation
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol = 'prediction')
evaluator.evaluate(predictions) # accuracy

evaluator_F1= MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='labelIndex', metricName='f1')
evaluator_F1.evaluate(predictions) # F1 score 

# crossvalidation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
lr = LogisticRegression(featuresCol = 'features', labelCol = 'labelIndex', maxIter = 10, regParam = 0, elasticNetParam=0)

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, 
                    estimatorParamMaps=paramGrid, 
                    evaluator=evaluator, 
                    numFolds=5)

In [43]:
cvModel = cv.fit(train_pipe_w2v)  # be careful, takes very long time

In [45]:
# predict with test data
predictions = cvModel.transform(test_pipe_w2v)
# transform back from index to original coding
labelConverter = IndexToString(inputCol = "prediction", outputCol = 'PredictedLabel', labels = labels_stars)
predictions = labelConverter.transform(predictions)

# evaluate best model---accuracy
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol = 'prediction')
accuracy = evaluator_accuracy.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
# F1 score
evaluator_F1= MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='labelIndex', metricName='f1')
F1 = evaluator_F1.evaluate(predictions)
print("Test set F1 score = " + str(F1))

predictions.filter(predictions['prediction'] == 0)\
           .select('text_new','probability','label','predictedLabel')\
           .orderBy('probability', ascending = False)\
           .show(n=20, truncate = 45)


Test set accuracy = 0.7217675941080196
Test set F1 score = 0.7217675941080196
+---------------------------------------------+---------------------------------------------+------+--------------+
|                                     text_new|                                  probability| label|predictedLabel|
+---------------------------------------------+---------------------------------------------+------+--------------+
|{{Use dmy dates|date=April 2020}}
The [[Au...|[0.6479369580226959,0.1920083302837148,0.1...|  safe|          safe|
|{{short description|GRID is a productivity...|[0.6093483555596151,0.20658114203790565,0....|  safe|          safe|
|[[File:Merton London UK labelled ward map ...|[0.6055444277226629,0.21408550431700488,0....|  safe|          safe|
|{{For|the federal election|2020 United Sta...|[0.6035922367205283,0.21551591107325252,0....|  safe|          safe|
|{{Use dmy dates|date=May 2016}}
{{Use Brit...|[0.6014709814163648,0.2196349664987885,0.1...|unsafe|          

In [48]:
####################################################################
########### Naive Bayes by using Word2Vec features #############
####################################################################

from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(smoothing=1, modelType="multinomial", featuresCol = 'features', labelCol = 'labelIndex' )

# train model with train data 
model = nb.fit(train_pipe_w2v)

# make predictions
predictions = model.transform(test_pipe_w2v)
predictions = labelConverter.transform(predictions)

predictions.filter(predictions['prediction'] == 0)\
           .select('text_new','probability','label','predictedLabel')\
           .orderBy("probability", ascending=False)\
           .show(n = 10, truncate = 30)

Py4JJavaError: An error occurred while calling o35268.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 6 in stage 4741.0 failed 1 times, most recent failure: Lost task 6.0 in stage 4741.0 (TID 344694, localhost, executor driver): java.lang.IllegalArgumentException: requirement failed: Naive Bayes requires nonnegative feature values but found [-0.12187534458413023,-0.2043117031223297,-0.1748945412729762].
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.classification.NaiveBayes$.requireNonnegativeValues(NaiveBayes.scala:235)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$4.apply(NaiveBayes.scala:144)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$4.apply(NaiveBayes.scala:144)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$7.apply(NaiveBayes.scala:168)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$7.apply(NaiveBayes.scala:166)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$aggregateByKey$1$$anonfun$apply$6.apply(PairRDDFunctions.scala:172)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:189)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:188)
	at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:144)
	at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1.apply(NaiveBayes.scala:176)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1.apply(NaiveBayes.scala:129)
	at org.apache.spark.ml.util.Instrumentation$$anonfun$11.apply(Instrumentation.scala:185)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:185)
	at org.apache.spark.ml.classification.NaiveBayes.trainWithLabelCheck(NaiveBayes.scala:129)
	at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:118)
	at org.apache.spark.ml.classification.NaiveBayes.train(NaiveBayes.scala:78)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalArgumentException: requirement failed: Naive Bayes requires nonnegative feature values but found [-0.12187534458413023,-0.2043117031223297,-0.1748945412729762].
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.classification.NaiveBayes$.requireNonnegativeValues(NaiveBayes.scala:235)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$4.apply(NaiveBayes.scala:144)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$4.apply(NaiveBayes.scala:144)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$7.apply(NaiveBayes.scala:168)
	at org.apache.spark.ml.classification.NaiveBayes$$anonfun$trainWithLabelCheck$1$$anonfun$7.apply(NaiveBayes.scala:166)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$aggregateByKey$1$$anonfun$apply$6.apply(PairRDDFunctions.scala:172)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:189)
	at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:188)
	at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:144)
	at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:62)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

# F1 score
evaluator_BYF1= MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='labelIndex', metricName='f1')
F1 = evaluator_BYF1.evaluate(predictions)
print("Test set F1 score = " + str(F1))

In [142]:
####################################################################
########### Random Forrest by using Word2Vec features ##############
####################################################################
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="labelIndex", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

# Train model with Train Data
rfModel = rf.fit(train_pipe_w2v)

# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, [100, 200, 500]) 
             .addGrid(rf.maxDepth, [4, 10, 20])
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=rf, 
                    estimatorParamMaps=paramGrid, 
                    evaluator=evaluator, 
                    numFolds=5)

In [50]:
# Cross_validation train
cvmodel = cv.fit(train_pipe_w2v)  # be careful, takes very long time

In [51]:
# predict with test data
predictions = cvModel.transform(test_pipe_w2v)
# transform back from index to original coding
labelConverter = IndexToString(inputCol = "prediction", outputCol = 'PredictedLabel', labels = labels_stars)
predictions = labelConverter.transform(predictions)

In [52]:
# evaluate best model---accuracy
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol = 'prediction')
accuracy = evaluator_accuracy.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

# F1 score
evaluator_F1= MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='labelIndex', metricName='f1')
F1 = evaluator_F1.evaluate(predictions)
print("Test set F1 score = " + str(F1))

Test set accuracy = 0.7217675941080196
Test set F1 score = 0.7217675941080196


In [145]:
#### saving complete pipeline for online prediction #### RF model using W2V

pipeline_rf_final = Pipeline(stages = [regexTokenizer, stopwordsRemover, w2v, label_stringIdx, rf, labelConverter])
pipeline_rf_export = pipeline_rf_final.fit(train)

In [160]:
    # export pipeline
    pipeline_rf_export.save('pipeline_rf')

Py4JJavaError: An error occurred while calling o51797.save.
: java.io.IOException: Path pipeline_rf1 already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:702)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:179)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [149]:

####################################################################
#################### online prediction #############################
####################################################################


In [150]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [151]:
from threading import Thread

class StreamingThread(Thread):
    def __init__(self, ssc):
        Thread.__init__(self)
        self.ssc = ssc
    def run(self):
        ssc.start()
        ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [152]:
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

In [153]:
from difflib import unified_diff

def make_diff(old, new):
    return '\n'.join([ l for l in unified_diff(old.split('\n'), new.split('\n')) if l.startswith('+') or l.startswith('-') ])

In [154]:
globals()['models_loaded'] = False

def predict(df):
    if any([x in df.diff.lower() for x in ['bad', 'lol', 'joke']]):
        return 'vandal'
    else:
        return 'safe'

def process(time, rdd):
    if rdd.isEmpty():
        return
    
    print("========= %s =========" % str(time))
    
    # Convert to data frame
    df = spark.read.json(rdd)
    df.show()
    
    # Tip: making a diff will probably help a lot as a feature in any model:
    #diff = make_diff(df.first().text_old, df.first().text_new)
    #df_withdiff = df.withColumn("diff", lit(diff))
    #df_withdiff.select('diff').show()
    
    
    
    # making a diff between text_old and text_new
    diff = make_diff(df.first().text_old, df.first().text_new)
    df = df.withColumn("diff", lit(diff))
    
    drop_list = ['comment', 'name_user', 'title_page', 'url_page',"text_old", "text_new"]
    df = df.select([column for column in df.columns if column not in drop_list])

    # Load in the model if not yet loaded:
    if not globals()['models_loaded']:
        # load in your models here
        globals()['my_model'] = 'pipeline_rf' # Replace '***' with:    [...].load('my_logistic_regression')
        globals()['models_loaded'] = True
        
    # And then predict using the loaded model: 
    df_result = globals()['my_model'].transform(df)
    df_result.show()

In [155]:
ssc = StreamingContext(sc, 10)

In [156]:
lines = ssc.socketTextStream("seppe.net", 7778)
lines.foreachRDD(process)

In [158]:
ssc_t = StreamingThread(ssc)
ssc_t.start()

+--------------------+-----+----------------+--------------------+--------------------+--------------------+--------------------+
|             comment|label|       name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+-----+----------------+--------------------+--------------------+--------------------+--------------------+
|tweak template op...| safe|     Archon 2488|{{pp-semi-indef}}...|{{pp-semi-indef}}...|    Gilgit-Baltistan|//en.wikipedia.or...|
|→‎2019–present: S...| safe|TheRedundancy125|{{Use mdy dates|d...|{{Use mdy dates|d...|            Maroon 5|//en.wikipedia.or...|
|4 star rating fro...| safe|         Muso805|{{Infobox album
|...|{{Infobox album
|...|Thelonious in Action|//en.wikipedia.or...|
|Fixed a typo foun...| safe|     Ira Leviton|{{YYYY music|1899...|{{YYYY music|1899...|       1899 in music|//en.wikipedia.or...|
|Changed unsupport...| safe|       John B123|{{short descripti...|{{short descripti...|Tha

In [159]:
ssc_t.stop()

----- Stopping... this may take a few seconds -----
+--------------------+-----+---------------+--------------------+--------------------+--------------------+--------------------+
|             comment|label|      name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+-----+---------------+--------------------+--------------------+--------------------+--------------------+
|Tidy census wordi...| safe|Rich Farmbrough|{{For|the locatio...|{{For|the locatio...|Vallecito, Califo...|//en.wikipedia.or...|
|        →‎References| safe|     Mohsen1248|{{Infobox sports ...|{{Infobox sports ...|Chess at the 2007...|//en.wikipedia.or...|
|Tidy census wordi...| safe|Rich Farmbrough|{{Infobox settlem...|{{Infobox settlem...|Mountain Ranch, C...|//en.wikipedia.or...|
+--------------------+-----+---------------+--------------------+--------------------+--------------------+--------------------+

