In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
sc

In [3]:
# LOAD SOME RELEVANT LIBRARIES -- OTHERS WILL BE LOADED WHEN NEEDED

from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

In [4]:
# LOOP THROUGH ALL FILES

import os
rootdir = 'save/myoutput'

files_list = []

for subdir, dirs, files in os.walk(rootdir):
    for name in files:
        if "part" in name.lower() and not ".crc" in name.lower():
            files_list.append(os.path.join(subdir,name))

In [5]:
# PUT ALL FILES TO ONE JSON DATAFRAME

df = spark.read.json(sc.textFile(','.join(files_list)))
df.show()

+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             comment| label|           name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                    |vandal|2601:c7:c201:c160...|{{cleanup-tense|d...|{{cleanup-tense|d...|Planning of the S...|//en.wikipedia.or...|
|      (→‎Early life)|vandal|      203.150.182.52|{{short descripti...|{{short descripti...|      John Krasinski|//en.wikipedia.or...|
|         (→‎History)|vandal|2001:ee0:40e1:498...|{{Short descripti...|{{Short descripti...|            Nike Mag|//en.wikipedia.or...|
|→‎Preseason All-S...|  safe|       Debartolo2917|{{short descripti...|{{short descripti...|2019 Kentucky Wil...|//en.wikipedia.or...|
|→‎Churches:Fixing...|  safe|              BD2412|{{sho

In [6]:
# SHOW SCHEMA OF THE DATASET

df.printSchema()

root
 |-- comment: string (nullable = true)
 |-- label: string (nullable = true)
 |-- name_user: string (nullable = true)
 |-- text_new: string (nullable = true)
 |-- text_old: string (nullable = true)
 |-- title_page: string (nullable = true)
 |-- url_page: string (nullable = true)



In [7]:
# DISTRIBUTION OF LABELS IN THE DATASET

from pyspark.sql.functions import col
df.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+------+-----+
| label|count|
+------+-----+
|  safe| 1405|
|unsafe|  233|
|vandal|   27|
+------+-----+



In [8]:
# TO SAMPLE FROM DATAFRAME (USE SAMPLEBY TO SPECIFY SAMPLED FRACTIONS OF EACH CLASS)
#import pyspark.sql
#df_sample = df.sample(False, fraction = 0.1, seed = 100)
#df_sample.show()

In [8]:
# DIFFERENCE BETWEEN OLD AND NEW TEXT

import difflib
import pyspark.sql.functions as F
from pyspark.sql.types import *

def make_diff(old, new):
    diff = difflib.ndiff(old, new)
    delta = ''.join(x[2:] for x in diff if x.startswith('- ') or x.startswith('+'))
    return delta

#convert to a UDF Function and get difference between columns
udfmake_diff = F.udf(make_diff, StringType())
df_difference = df.withColumn("difference", lit(udfmake_diff("text_old", "text_new")))
df_difference.show()

+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             comment| label|           name_user|            text_new|            text_old|          title_page|            url_page|          difference|
+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                    |vandal|2601:c7:c201:c160...|{{cleanup-tense|d...|{{cleanup-tense|d...|Planning of the S...|//en.wikipedia.or...|aFollow me o tikt...|
|      (→‎Early life)|vandal|      203.150.182.52|{{short descripti...|{{short descripti...|      John Krasinski|//en.wikipedia.or...|       he is amazing|
|         (→‎History)|vandal|2001:ee0:40e1:498...|{{Short descripti...|{{Short descripti...|            Nike Mag|//en.wikipedia.or...|. and it's so stu...|
|→‎Preseason All-S...|  safe|       Debartolo2917|{{short descri

In [9]:
# CHANGE COLUMN NAME FROM 'LABEL' TO 'LABEL_STRING' -- LATER WE WILL CONVERT LABEL_STRING TO NUMERICAL VALUES AND GIVE THE COLUMN NAME 'LABEL'

df_wd = df_difference.withColumnRenamed('label', 'label_string')
#df_wd.show()

In [10]:
######################################################################################################
##
## NOTE !!!!!!
##
## CHOOSE TO RUN ONLY ONE OF THE FOLLOWING TWO CELLS
## DO NOT RUN THEM BOTH
##
##
#######################################################################################################

In [12]:
# EITHER RUN THIS CELL OR THE NEXT CELL (BUT NOT BOTH)
# THIS IS RUN ONLY TO TEST MODEL ACCURACY AND TUNE THE MODEL -- SKIP TO TRAIN MODEL ON THE WHOLE DATASET

# split dataset to train and test sets

(trainingData, testData) = df_wd.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))


print("Distribution of labels in train set:")
trainingData.groupBy("label_string") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

print("Distribution of labels in test set:")
testData.groupBy("label_string") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()


Training Dataset Count: 1175
Test Dataset Count: 490
Distribution of labels in train set:
+------------+-----+
|label_string|count|
+------------+-----+
|        safe|  995|
|      unsafe|  161|
|      vandal|   19|
+------------+-----+

Distribution of labels in test set:
+------------+-----+
|label_string|count|
+------------+-----+
|        safe|  410|
|      unsafe|   72|
|      vandal|    8|
+------------+-----+



In [75]:
# EITHER RUN THIS CELL OR THE CELL ABOVE (BUT NOT BOTH)
# THIS IS RUN TO TRAIN SELECTED MODEL ON THE ENTIRE DATASET

trainingData = df_wd

In [13]:
# IMPUTE NaN'S INSTEAD OF EMPTY STRINGS

#def processMissingCategory(s):
#    if s == "":
#        return "NaN"
#    else:
#        return s

#udfprocessMissingCategory = F.udf(processMissingCategory, StringType())
#df_nona = df_difference.withColumn('comment', lit(udfprocessMissingCategory('comment')))
#df_nona.show()

In [76]:
# TOKENIZING 'DIFFERENCE' COLUMN AND REMOVING CERTAIN STOP WORDS

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer

# tokenize 'difference' column
regexTokenizer = RegexTokenizer(inputCol="difference", outputCol="words", pattern="\\W")
regexTokenizer_comment = RegexTokenizer(inputCol="comment", outputCol="words_comment", pattern="\\W")

# remove stop words
stop_words = ["http","https", "a", "an", "the", "about", "above", "after", "again", "against", "all", "am", "and", "any", "are", "aren't", "as", "at", "be", "because",
              "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", 
              "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll",
              "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is",
              "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only",
              "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't",
              "so", "some", "such", "than", "that", "that's", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll",
              "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've",
              "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't",
              "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_words)
stopwordsRemover_comment = StopWordsRemover(inputCol="words_comment", outputCol="filtered_comment").setStopWords(stop_words)

In [77]:
# CONVERT 'LABEL_STRING' TO NUMERIC COLUMN 'LABEL'

from pyspark.ml.feature import OneHotEncoder, StringIndexer

# index categorical variable
label_stringIdx = StringIndexer(inputCol = "label_string", outputCol = "label")

In [None]:
######################################################################################################
##
## NOTE !!!!!!
##
## CHOOSE TO RUN ONLY ONE OF THE FOLLOWING TWO CELLS
## DO NOT RUN THEM BOTH
##
##
#######################################################################################################

In [64]:
# TF-IDF AND PIPELINE -- WITHOUT 'COMMENT'

from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml import Pipeline

# do TF-IDF embeding
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# define the pipeline
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])

In [78]:
# TF-IDF AND PIPELINE -- WITH 'COMMENT'

from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml import Pipeline

# do TF-IDF embeding
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features_diff")

hashingTF_comment = HashingTF(inputCol="filtered_comment", outputCol="rawFeatures_comment", numFeatures=10000)
idf_comment = IDF(inputCol="rawFeatures_comment", outputCol="features_comment")

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=["features_comment", "features_diff"], outputCol="features")

# define the pipeline
pipeline = Pipeline(stages=[regexTokenizer, regexTokenizer_comment, stopwordsRemover, stopwordsRemover_comment, hashingTF, hashingTF_comment, idf, idf_comment, 
                            assembler, label_stringIdx])

In [79]:
# FIT PIPELINE ON TRAIN DATASET

pipelineFit = pipeline.fit(trainingData)

In [80]:
# SAVE PIPELINE

#outpath = 'models/without_comment/tfidf'
#outpath = 'models/with_comment/tfidf'
outpath = 'models/best/tfidf'
pipelineFit.write().overwrite().save(outpath)

In [81]:
# USE PIPELINE ON TRAIN DATASET

dataset_tr = pipelineFit.transform(trainingData)

# without 'comment':
#dataset_tr.select("difference","words","filtered","rawFeatures","features", "label").show()

# with 'comment':
dataset_tr.select("difference","words","filtered","rawFeatures","features_diff", "label").show()
dataset_tr.select("comment","words_comment","filtered_comment","rawFeatures_comment","features_comment", "label").show()
dataset_tr.select("features_diff", "features_comment", "features", "label").show()

+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|          difference|               words|            filtered|         rawFeatures|       features_diff|label|
+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|aFollow me o tikt...|[afollow, me, o, ...|[afollow, o, tikt...|(10000,[613,763,2...|(10000,[613,763,2...|  2.0|
|       he is amazing|   [he, is, amazing]|           [amazing]|(10000,[9258],[1.0])|(10000,[9258],[5....|  2.0|
|. and it's so stu...|[and, it, s, so, ...|         [s, stupid]|(10000,[2197,9121...|(10000,[2197,9121...|  2.0|
|                 Jr.|                [jr]|                [jr]|(10000,[9896],[1.0])|(10000,[9896],[5....|  0.0|
|       Augustinians||      [augustinians]|      [augustinians]|(10000,[2223],[1.0])|(10000,[2223],[4....|  0.0|
|                   s|                 [s]|                 [s]|(10000,[2197],[1.0])|(10000,[219

In [19]:
# USE PIPELINE ON TEST DATASET (ONLY DO IF THE SPLIT BETWEEN TRAIN AND TEST SETS WAS PERFORMED)

dataset_test = pipelineFit.transform(testData)

# without 'comment':
#dataset_test.select("difference","words","filtered","rawFeatures","features", "label").show()

# with 'comment':
#dataset_test.select("difference","words","filtered","rawFeatures","features_diff", "label").show()
#dataset_test.select("comment","words_comment","filtered_comment","rawFeatures_comment","features_comment", "label").show()
dataset_test.select("features_diff", "features_comment", "features", "label").show()

+--------------------+--------------------+--------------------+-----+
|       features_diff|    features_comment|            features|label|
+--------------------+--------------------+--------------------+-----+
|(10000,[9896],[5....|(10000,[1317,3521...|(20000,[1317,3521...|  0.0|
|(10000,[4315,5066...|(10000,[2066],[5....|(20000,[2066,1431...|  0.0|
|(10000,[1,502,585...|(10000,[7669],[4....|(20000,[7669,1000...|  0.0|
|(10000,[918,1958,...|       (10000,[],[])|(20000,[10918,119...|  0.0|
|       (10000,[],[])|(10000,[1350],[6....|(20000,[1350],[6....|  0.0|
|(10000,[1655],[3....|(10000,[658,1219,...|(20000,[658,1219,...|  1.0|
|(10000,[524,2710,...|(10000,[1636,2709...|(20000,[1636,2709...|  1.0|
|(10000,[98,174,20...|(10000,[4058,6469...|(20000,[4058,6469...|  1.0|
|(10000,[839],[6.3...|       (10000,[],[])|(20000,[10839],[6...|  0.0|
|       (10000,[],[])|(10000,[3624,3773...|(20000,[3624,3773...|  0.0|
|(10000,[2223],[4....|(10000,[20,650,28...|(20000,[20,650,28...|  0.0|
|(1000

In [82]:
# WE WILL USE LOGISTIC REGRESSION FOR PREDICTING

from pyspark.ml.classification import LogisticRegression

In [None]:
######################################################################################################
##
## NOTE !!!!!!
##
## THE FOLLOWING PART IS FOR TESTING MODEL ACCURACY, MODEL TUNING AND DECIDING WHICH MODEL TO USE
## USE IT ONLY IF YOU SPLIT INITIAL DATASET TO TRAIN AND TEST SETS
## IN ORDER TO USE THE WHOLE DATASET AND TRAIN THE FINAL MODEL, SKIP UNTIL THE NEXT NOTICE
##
#######################################################################################################

In [37]:
# LOGISTIC REGRESSION WITHOUT WEIGHTS

# fit logistic regression
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=20)
lrModel = lr.fit(dataset_tr)


# predict train set
predict_train = lrModel.transform(dataset_tr)
predict_train.select("comment", "difference","label_string","probability","label","prediction").show()

+--------------------+--------------------+------------+--------------------+-----+----------+
|             comment|          difference|label_string|         probability|label|prediction|
+--------------------+--------------------+------------+--------------------+-----+----------+
|                    |aFollow me o tikt...|      vandal|[2.99336024484039...|  2.0|       2.0|
|      (→‎Early life)|       he is amazing|      vandal|[1.01905124206611...|  2.0|       2.0|
|         (→‎History)|. and it's so stu...|      vandal|[2.24233585579803...|  2.0|       2.0|
|→‎Churches:Fixing...|       Augustinians||        safe|[0.99999999999998...|  0.0|       0.0|
|→‎Churches holdin...|                   s|        safe|[0.99999999999999...|  0.0|       0.0|
|                    |                  39|      unsafe|[0.11500521988558...|  1.0|       1.0|
|- 5 categories us...|English-language ...|        safe|[1.0,7.9248811491...|  0.0|       0.0|
|→‎Bibliography:am...|| last = Coogan |...|       

In [38]:
# predict test set

predict_test = lrModel.transform(dataset_test)
predict_test.select("comment", "difference","label_string","probability","label","prediction").show()

+--------------------+--------------------+------------+--------------------+-----+----------+
|             comment|          difference|label_string|         probability|label|prediction|
+--------------------+--------------------+------------+--------------------+-----+----------+
|→‎Preseason All-S...|                 Jr.|        safe|[0.99999999999792...|  0.0|       0.0|
|            →‎Roster|      Jr. Jr.|link=y|        safe|[0.99999999989095...|  0.0|       0.0|
|             grammar|must s probably b...|        safe|[0.99999949923676...|  0.0|       0.0|
|                    |[[enmark|IFPI D]]...|        safe|[3.40668019160156...|  0.0|       1.0|
|          →‎Examples|                  

|        safe|[0.99347251932481...|  0.0|       0.0|
|(Reverted 1 edit ...|                 n a|      unsafe|[0.96886453107520...|  1.0|       0.0|
|(reference is ina...|>|ref1=<ref>https...|      unsafe|[0.99999999207570...|  1.0|       0.0|
|(→‎Other famous B...|
* [[Kemal Reis]]...|      u

In [39]:
# SAVE LOGISTIC REGRESSION WITHOUT WEIGHTS

#outpath = 'models/without_comment/logistic_regression'
#outpath = 'models/with_comment/logistic_regression'

#lrModel.write().overwrite().save(outpath)

In [None]:
# LOAD LOGISTIC REGRESSION WITHOUT WEIGHTS

#from pyspark.ml.classification import LogisticRegressionModel

#LogisticRegressionModel.load('models/without_comment/logistic_regression')
#LogisticRegressionModel.load('models/with_comment/logistic_regression')

In [40]:
# EVALUATE OVERALL MODEL ACCURACY FOR TRAIN AND TEST SETS

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol = "label", metricName="accuracy")

print("overall accuracy train set: %s" % (evaluator.evaluate(predict_train)))
print("overall accuracy test set: %s" % (evaluator.evaluate(predict_test)))

overall accuracy train set: 0.9948936170212765
overall accuracy test set: 0.8061224489795918


In [65]:
# EVALUATE MODEL ACCURACY FOR TRAIN SET BY LABEL

zeros = predict_train.where("label == 0").select("label", "prediction")
print("training accuracy for label 0 (safe): %s" % (evaluator.evaluate(zeros)))

ones = predict_train.where("label == 1").select("label", "prediction")
print("training accuracy for label 1 (unsafe): %s" % (evaluator.evaluate(ones)))

twos = predict_train.where("label == 2").select("label", "prediction")
print("training accuracy for label 2 (vandal): %s" % (evaluator.evaluate(twos)))

training accuracy for label 0 (safe): 0.9979899497487437
training accuracy for label 1 (unsafe): 0.9751552795031055
training accuracy for label 2 (vandal): 1.0


In [66]:
# EVALUATE MODEL ACCURACY FOR TEST SET BY LABEL

zeros_test = predict_test.where("label == 0").select("label", "prediction")
print("training accuracy for label 0 (safe): %s" % (evaluator.evaluate(zeros_test)))

ones_test = predict_test.where("label == 1").select("label", "prediction")
print("training accuracy for label 1 (unsafe): %s" % (evaluator.evaluate(ones_test)))

twos_test = predict_test.where("label == 2").select("label", "prediction")
print("training accuracy for label 2 (vandal): %s" % (evaluator.evaluate(twos_test)))

training accuracy for label 0 (safe): 0.9073170731707317
training accuracy for label 1 (unsafe): 0.3194444444444444
training accuracy for label 2 (vandal): 0.0


In [None]:
# STATISTICS ON TRAINING DATA

#trainingSummary = lrModel.summary

# for multiclass, we can inspect metrics on a per-label basis
#print("False positive rate by label:")
#for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
#    print("label %d: %s" % (i, rate))

#print("True positive rate by label:")
#for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
#    print("label %d: %s" % (i, rate))

#print("Precision by label:")
#for i, prec in enumerate(trainingSummary.precisionByLabel):
#    print("label %d: %s" % (i, prec))

#print("Recall by label:")
#for i, rec in enumerate(trainingSummary.recallByLabel):
#    print("label %d: %s" % (i, rec))

In [67]:
# LOGISTIC REGRESSION WITH WEIGHTS -- CALCULATING WEIGHTS

dataset_tr_size = float(dataset_tr.select("label").count())

numZeros = dataset_tr.select("label").where('label == 0').count()
perZeros = (float(numZeros)/float(dataset_tr_size))

numOnes = dataset_tr.select("label").where('label == 1').count()
perOnes = (float(numOnes)/float(dataset_tr_size))

perTwos = 1-perOnes-perZeros

print('The percentage of 0s is {}'.format(perZeros))
print('The percentage of 1s is {}'.format(perOnes))
print('The percentage of 2s is {}'.format(perTwos))

weight0 = 1/(perZeros)
weight1 = 1/(perOnes)
weight2 = 1/(perTwos)

print('The weight of class safe is {}'.format(weight0))
print('The weight of class unsafe is {}'.format(weight1))
print('The weight of class vandal is {}'.format(weight2))

The percentage of 0s is 0.8468085106382979
The percentage of 1s is 0.13702127659574467
The percentage of 2s is 0.016170212765957426
The weight of class safe is 1.1809045226130652
The weight of class unsafe is 7.2981366459627335
The weight of class vandal is 61.842105263157976


In [68]:
# LOGISTIC REGRESSION WITH WEIGHTS -- ADDING WEIGHT COLUMN

dataset_tr = dataset_tr.withColumn("classWeights", F.when(dataset_tr.label == 0, weight0).when(dataset_tr.label == 1, weight1).otherwise(weight2))
dataset_tr.select("label","classWeights").show()

+-----+------------------+
|label|      classWeights|
+-----+------------------+
|  2.0|61.842105263157976|
|  2.0|61.842105263157976|
|  2.0|61.842105263157976|
|  0.0|1.1809045226130652|
|  0.0|1.1809045226130652|
|  1.0|7.2981366459627335|
|  0.0|1.1809045226130652|
|  0.0|1.1809045226130652|
|  0.0|1.1809045226130652|
|  0.0|1.1809045226130652|
|  0.0|1.1809045226130652|
|  0.0|1.1809045226130652|
|  0.0|1.1809045226130652|
|  0.0|1.1809045226130652|
|  0.0|1.1809045226130652|
|  0.0|1.1809045226130652|
|  0.0|1.1809045226130652|
|  0.0|1.1809045226130652|
|  1.0|7.2981366459627335|
|  0.0|1.1809045226130652|
+-----+------------------+
only showing top 20 rows



In [69]:
# LOGISTIC REGRESSION WITH WEIGHTS -- TRAINING AND PREDICTING

lr_w = LogisticRegression(labelCol="label", featuresCol="features", weightCol="classWeights", maxIter=20)
lrModel_w = lr_w.fit(dataset_tr)

predict_train_w = lrModel_w.transform(dataset_tr)
predict_test_w = lrModel_w.transform(dataset_test)


In [70]:
predict_train_w.select("difference","label_string","probability","label","prediction").show()
predict_test_w.select("difference","label_string","probability","label","prediction").show()

+--------------------+------------+--------------------+-----+----------+
|          difference|label_string|         probability|label|prediction|
+--------------------+------------+--------------------+-----+----------+
|aFollow me o tikt...|      vandal|[5.63891087403971...|  2.0|       2.0|
|       he is amazing|      vandal|[4.43415628387741...|  2.0|       2.0|
|. and it's so stu...|      vandal|[1.80586963405488...|  2.0|       2.0|
|       Augustinians||        safe|[0.99999999999998...|  0.0|       0.0|
|                   s|        safe|[0.99999999999999...|  0.0|       0.0|
|                  39|      unsafe|[7.02211183997095...|  1.0|       1.0|
|English-language ...|        safe|[0.99999999988305...|  0.0|       0.0|
|| last = Coogan |...|        safe|[0.99999952591428...|  0.0|       0.0|
|                   t|        safe|[0.99999999997636...|  0.0|       0.0|
|            (2009)y'|        safe|[0.99999999999999...|  0.0|       0.0|
|0.27<ref>{{cite w...|        safe|[0.

In [71]:
# SAVE LOGISTIC REGRESSION WITH WEIGHTS

#outpath = 'models/without_comment/logistic_regression_with_weights'
#outpath = 'models/with_comment/logistic_regression_with_weights'

#lrModel_w.write().overwrite().save(outpath)

In [None]:
# LOAD LOGISTIC REGRESSION WITH WEIGHTS

#from pyspark.ml.classification import LogisticRegressionModel

#LogisticRegressionModel.load('models/without_comment/logistic_regression_with_weights')
#LogisticRegressionModel.load('models/with_comment/logistic_regression_with_weights')

In [72]:
# EVALUATE MODEL ACCURACY FOR TRAIN AND TEST SETS

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol = "label", metricName="accuracy")

print("overall accuracy train set: %s" % (evaluator.evaluate(predict_train_w)))
print("overall accuracy test set: %s" % (evaluator.evaluate(predict_test_w)))

overall accuracy train set: 0.9948936170212765
overall accuracy test set: 0.826530612244898


In [73]:
# EVALUATE MODEL ACCURACY FOR TRAIN SET BY LABEL

zeros_w = predict_train_w.where("label == 0").select("label", "prediction")
print("training accuracy for label 0 (safe): %s" % (evaluator.evaluate(zeros_w)))

ones_w = predict_train_w.where("label == 1").select("label", "prediction")
print("training accuracy for label 1 (unsafe): %s" % (evaluator.evaluate(ones_w)))

twos_w = predict_train_w.where("label == 2").select("label", "prediction")
print("training accuracy for label 2 (vandal): %s" % (evaluator.evaluate(twos_w)))

training accuracy for label 0 (safe): 0.9959798994974874
training accuracy for label 1 (unsafe): 0.9875776397515528
training accuracy for label 2 (vandal): 1.0


In [74]:
# EVALUATE MODEL ACCURACY FOR TEST SET BY LABEL

zeros_test_w = predict_test_w.where("label == 0").select("label", "prediction")
print("testing accuracy for label 0 (safe): %s" % (evaluator.evaluate(zeros_test_w)))

ones_test_w = predict_test_w.where("label == 1").select("label", "prediction")
print("testing accuracy for label 1 (unsafe): %s" % (evaluator.evaluate(ones_test_w)))

twos_test_w = predict_test_w.where("label == 2").select("label", "prediction")
print("testing accuracy for label 2 (vandal): %s" % (evaluator.evaluate(twos_test_w)))

testing accuracy for label 0 (safe): 0.9292682926829269
testing accuracy for label 1 (unsafe): 0.3333333333333333
testing accuracy for label 2 (vandal): 0.0


In [None]:
# K-FOLD CROSS-VALIDATION

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# model to cross-validate
lr_cv = LogisticRegression(maxIter=20)

# create ParamGrid for cross-validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr_cv.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr_cv.elasticNetParam, [0.0, 0.5, 1.0]) # Elastic Net Parameter
             .build())

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol = "label")

# create 5-fold CrossValidator
cv = CrossValidator(estimator=lr_cv, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

# fit the model
cvModel = cv.fit(dataset_tr)

# predict train and test datasets
predict_train_cv = cvModel.transform(dataset_tr)
predict_test_cv = cvModel.transform(dataset_test)

# show predictions
predict_train_cv.select("difference", "probability", "label", "prediction").show()
predict_test_cv.select("difference", "probability", "label", "prediction").show()

In [None]:
# GET THE PARAMETER VALUES OF THE BEST MODEL

best_model = cvModel.bestModel
print ('Best Param (regParam): ', best_model._java_obj.getRegParam())
print ('Best Param (elasticNetParam): ', best_model._java_obj.getElasticNetParam())

In [None]:
# SAVE CV MODEL WITHOUT WEIGHTS

#outpath = 'models/without_comment/logistic_regression_cv'
#outpath = 'models/with_comment/logistic_regression_cv'

#cvModel.write().overwrite().save(outpath)

In [None]:
# LOAD CV MODEL WITHOUT WEIGHTS

#CrossValidator.load('models/without_comment/logistic_regression_cv')
#CrossValidator.load('models/with_comment/logistic_regression_cv')

In [None]:
# EVALUATE OVERALL CV MODEL ACCURACY FOR TRAIN AND TEST SETS

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol = "label", metricName="accuracy")

print("overall accuracy train set: %s" % (evaluator.evaluate(predict_train_cv)))
print("overall accuracy test set: %s" % (evaluator.evaluate(predict_test_cv)))

In [None]:
# EVALUATE CV MODEL ACCURACY FOR TRAIN SET BY LABEL

zeros_cv = predict_train_cv.where("label == 0").select("label", "prediction")
print("training accuracy for label 0 (safe): %s" % (evaluator.evaluate(zeros_cv)))

ones_cv = predict_train_cv.where("label == 1").select("label", "prediction")
print("training accuracy for label 1 (unsafe): %s" % (evaluator.evaluate(ones_cv)))

twos_cv = predict_train_cv.where("label == 2").select("label", "prediction")
print("training accuracy for label 2 (vandal): %s" % (evaluator.evaluate(twos_cv)))

In [None]:
# EVALUATE CV MODEL ACCURACY FOR TEST SET BY LABEL

zeros_test_cv = predict_test_cv.where("label == 0").select("label", "prediction")
print("training accuracy for label 0 (safe): %s" % (evaluator.evaluate(zeros_test_cv)))

ones_test_cv = predict_test_cv.where("label == 1").select("label", "prediction")
print("training accuracy for label 1 (unsafe): %s" % (evaluator.evaluate(ones_test_cv)))

twos_test_cv = predict_test_cv.where("label == 2").select("label", "prediction")
print("training accuracy for label 2 (vandal): %s" % (evaluator.evaluate(twos_test_cv)))

In [None]:
######################################################################################################
##
## NOTE !!!!!!
##
## THE FOLLOWING PART IS FOR TRAINING THE BEST MODEL ON THE WHOLE DATASET
##
#######################################################################################################

In [None]:
# TRAIN THE BEST MODEL -- WEIGHTED LOGISTIC REGRESSION WITH COMBINED 'COMMENT' AND 'DIFFERENCE' COLUMNS AS FEATURES

In [83]:
# LOGISTIC REGRESSION WITH WEIGHTS -- CALCULATING WEIGHTS

dataset_tr_size = float(dataset_tr.select("label").count())

numZeros = dataset_tr.select("label").where('label == 0').count()
perZeros = (float(numZeros)/float(dataset_tr_size))

numOnes = dataset_tr.select("label").where('label == 1').count()
perOnes = (float(numOnes)/float(dataset_tr_size))

perTwos = 1-perOnes-perZeros

print('The percentage of 0s is {}'.format(perZeros))
print('The percentage of 1s is {}'.format(perOnes))
print('The percentage of 2s is {}'.format(perTwos))

weight0 = 1/(perZeros)
weight1 = 1/(perOnes)
weight2 = 1/(perTwos)

print('The weight of class safe is {}'.format(weight0))
print('The weight of class unsafe is {}'.format(weight1))
print('The weight of class vandal is {}'.format(weight2))

The percentage of 0s is 0.8438438438438438
The percentage of 1s is 0.13993993993993994
The percentage of 2s is 0.016216216216216273
The weight of class safe is 1.1850533807829182
The weight of class unsafe is 7.145922746781116
The weight of class vandal is 61.66666666666645


In [84]:
# LOGISTIC REGRESSION WITH WEIGHTS -- ADDING WEIGHT COLUMN

dataset_tr = dataset_tr.withColumn("classWeights", F.when(dataset_tr.label == 0, weight0).when(dataset_tr.label == 1, weight1).otherwise(weight2))
dataset_tr.select("label","classWeights").show()

+-----+------------------+
|label|      classWeights|
+-----+------------------+
|  2.0| 61.66666666666645|
|  2.0| 61.66666666666645|
|  2.0| 61.66666666666645|
|  0.0|1.1850533807829182|
|  0.0|1.1850533807829182|
|  0.0|1.1850533807829182|
|  0.0|1.1850533807829182|
|  0.0|1.1850533807829182|
|  1.0| 7.145922746781116|
|  0.0|1.1850533807829182|
|  0.0|1.1850533807829182|
|  0.0|1.1850533807829182|
|  0.0|1.1850533807829182|
|  0.0|1.1850533807829182|
|  0.0|1.1850533807829182|
|  0.0|1.1850533807829182|
|  0.0|1.1850533807829182|
|  0.0|1.1850533807829182|
|  0.0|1.1850533807829182|
|  1.0| 7.145922746781116|
+-----+------------------+
only showing top 20 rows



In [86]:
# LOGISTIC REGRESSION WITH WEIGHTS -- TRAINING AND PREDICTING

# train
lr_w = LogisticRegression(labelCol="label", featuresCol="features", weightCol="classWeights", maxIter=20)
lrModel_w = lr_w.fit(dataset_tr)

# predict
predict_train_w = lrModel_w.transform(dataset_tr)

In [87]:
# add predicted labels as strings (since now predictions are done only in terms of 0, 1 and 2)
predictions_string = predict_train_w.withColumn("prediction_string", F.when(predict_train_w.prediction == 0, "safe").when(predict_train_w.prediction == 1, "unsafe").otherwise("vandal"))
predictions_string.select("comment", "difference", "probability", "label", "label_string" ,"prediction", "prediction_string").show()

+--------------------+--------------------+--------------------+-----+------------+----------+-----------------+
|             comment|          difference|         probability|label|label_string|prediction|prediction_string|
+--------------------+--------------------+--------------------+-----+------------+----------+-----------------+
|                    |aFollow me o tikt...|[9.49730303155441...|  2.0|      vandal|       2.0|           vandal|
|      (→‎Early life)|       he is amazing|[6.55907802473438...|  2.0|      vandal|       2.0|           vandal|
|         (→‎History)|. and it's so stu...|[3.60613953939795...|  2.0|      vandal|       2.0|           vandal|
|→‎Preseason All-S...|                 Jr.|[1.0,2.0271336158...|  0.0|        safe|       0.0|             safe|
|→‎Churches:Fixing...|       Augustinians||[1.0,1.9652482166...|  0.0|        safe|       0.0|             safe|
|→‎Churches holdin...|                   s|[1.0,3.1346578602...|  0.0|        safe|       0.0|  

In [91]:
# SAVE THE BEST MODEL

outpath = 'models/best/logistic_regression_with_weights_and_comment'

lrModel_w.write().overwrite().save(outpath)

In [89]:
# EVALUATE MODEL ACCURACY

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol = "label", metricName="accuracy")

print("overall accuracy: %s" % (evaluator.evaluate(predict_train_w)))

overall accuracy: 0.9921921921921922


In [90]:
# EVALUATE MODEL ACCURACY BY LABEL

zeros_w = predict_train_w.where("label == 0").select("label", "prediction")
print("accuracy for label 0 (safe): %s" % (evaluator.evaluate(zeros_w)))

ones_w = predict_train_w.where("label == 1").select("label", "prediction")
print("accuracy for label 1 (unsafe): %s" % (evaluator.evaluate(ones_w)))

twos_w = predict_train_w.where("label == 2").select("label", "prediction")
print("accuracy for label 2 (vandal): %s" % (evaluator.evaluate(twos_w)))

accuracy for label 0 (safe): 0.994306049822064
accuracy for label 1 (unsafe): 0.9785407725321889
accuracy for label 2 (vandal): 1.0
