In [13]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [14]:
sc

In [15]:
# LOAD SOME RELEVANT LIBRARIES -- OTHERS WILL BE LOADED WHEN NEEDED

from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

In [16]:
# LOOP THROUGH ALL FILES

import os
rootdir = 'C:/Pr/AA_Big_Data/Assignment_3/spark/save_2k'

files_list = []

for subdir, dirs, files in os.walk(rootdir):
    for name in files:
        if "part" in name.lower() and not ".crc" in name.lower():
            files_list.append(os.path.join(subdir,name))

In [17]:
# PUT ALL FILES TO ONE JSON DATAFRAME

df = spark.read.json(sc.textFile(','.join(files_list)))
df.show()

+--------------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+
|             comment| label|        name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+
|→‎Preseason All-S...|  safe|    Debartolo2917|{{short descripti...|{{short descripti...|2019 Kentucky Wil...|//en.wikipedia.or...|
|→‎Churches:Fixing...|  safe|           BD2412|{{short descripti...|{{short descripti...|List of churches ...|//en.wikipedia.or...|
|→‎Churches holdin...|  safe|           BD2412|[[File:StPaulsCat...|[[File:StPaulsCat...|List of churches ...|//en.wikipedia.or...|
|            →‎Roster|  safe|    Debartolo2917|{{short descripti...|{{short descripti...|2018 Kentucky Wil...|//en.wikipedia.or...|
|             grammar|  safe|          Znagy88|{{Infobox settlem...|{{Infobo

In [18]:
# SHOW SCHEMA OF THE DATASET

df.printSchema()

root
 |-- comment: string (nullable = true)
 |-- label: string (nullable = true)
 |-- name_user: string (nullable = true)
 |-- text_new: string (nullable = true)
 |-- text_old: string (nullable = true)
 |-- title_page: string (nullable = true)
 |-- url_page: string (nullable = true)



In [19]:
# DISTRIBUTION OF LABELS IN THE DATASET

from pyspark.sql.functions import col
df.groupBy("label") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+------+-----+
| label|count|
+------+-----+
|  safe| 1405|
|unsafe|  233|
|vandal|   27|
+------+-----+



In [22]:
# TO SAMPLE FROM DATAFRAME (USE SAMPLEBY TO SPECIFY SAMPLED FRACTIONS OF EACH CLASS)
#import pyspark.sql
#df_sample = df.sample(False, fraction = 0.1, seed = 100)
#df_sample.show()

+--------------------+------+-------------+--------------------+--------------------+--------------------+--------------------+
|             comment| label|    name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+------+-------------+--------------------+--------------------+--------------------+--------------------+
|→‎Parodies and co...|  safe| InedibleHulk|{{short descripti...|{{short descripti...|Smells Like Teen ...|//en.wikipedia.or...|
|   →‎Literary career|  safe|     Coingeek|{{Infobox person
...|{{Infobox person
...|     Q. David Bowers|//en.wikipedia.or...|
|            (Added.)|unsafe|AndersonL7333|{{unreferenced|da...|{{unreferenced|da...|National Front (I...|//en.wikipedia.or...|
|→‎top:added short...|  safe|  Lepricavark|{{short descripti...|{{Infobox basebal...|          Amos Cross|//en.wikipedia.or...|
|                    |unsafe|12.235.76.177|{{For|other battl...|{{For|other battl...|Battle of San Jac..

In [20]:
# DIFFERENCE BETWEEN OLD AND NEW TEXT

import difflib
import pyspark.sql.functions as F
from pyspark.sql.types import *

def make_diff(old, new):
    diff = difflib.ndiff(old, new)
    delta = ''.join(x[2:] for x in diff if x.startswith('- ') or x.startswith('+'))
    return delta

#convert to a UDF Function and get difference between columns
udfmake_diff = F.udf(make_diff, StringType())
df_difference = df.withColumn("difference", lit(udfmake_diff("text_old", "text_new")))
df_difference.show()

+--------------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             comment| label|        name_user|            text_new|            text_old|          title_page|            url_page|          difference|
+--------------------+------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|→‎Preseason All-S...|  safe|    Debartolo2917|{{short descripti...|{{short descripti...|2019 Kentucky Wil...|//en.wikipedia.or...|                 Jr.|
|→‎Churches:Fixing...|  safe|           BD2412|{{short descripti...|{{short descripti...|List of churches ...|//en.wikipedia.or...|       Augustinians||
|→‎Churches holdin...|  safe|           BD2412|[[File:StPaulsCat...|[[File:StPaulsCat...|List of churches ...|//en.wikipedia.or...|                   s|
|            →‎Roster|  safe|    Debartolo2917|{{short descripti...|{{short descri

In [21]:
# CHANGE COLUMN NAME FROM 'LABEL' TO 'LABEL_STRING' -- LATER WE WILL CONVERT LABEL_STRING TO NUMERICAL VALUES AND GIVE THE COLUMN NAME 'LABEL'

df_wd = df_difference.withColumnRenamed('label', 'label_string')
#df_wd.show()

In [None]:
######################################################################################################
##
## NOTE !!!!!!
##
## CHOOSE TO RUN ONLY ONE OF THE FOLLOWING TWO CELLS
## DO NOT RUN THEM BOTH
##
##
#######################################################################################################

In [22]:
# EITHER RUN THIS CELL OR THE NEXT CELL (BUT NOT BOTH)
# THIS IS RUN ONLY TO TEST MODEL ACCURACY AND TUNE THE MODEL -- SKIP TO TRAIN MODEL ON THE WHOLE DATASET

# split dataset to train and test sets

(trainingData, testData) = df_wd.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))


print("Distribution of labels in train set:")
trainingData.groupBy("label_string") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

print("Distribution of labels in test set:")
testData.groupBy("label_string") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()


Training Dataset Count: 1175
Test Dataset Count: 490
Distribution of labels in train set:
+------------+-----+
|label_string|count|
+------------+-----+
|        safe|  996|
|      unsafe|  163|
|      vandal|   16|
+------------+-----+

Distribution of labels in test set:
+------------+-----+
|label_string|count|
+------------+-----+
|        safe|  409|
|      unsafe|   70|
|      vandal|   11|
+------------+-----+



In [25]:
# EITHER RUN THIS CELL OR THE CELL ABOVE (BUT NOT BOTH)
# THIS IS RUN TO TRAIN SELECTED MODEL ON THE ENTIRE DATASET

trainingData = df_wd

In [None]:
# IMPUTE NaN'S INSTEAD OF EMPTY STRINGS

#def processMissingCategory(s):
#    if s == "":
#        return "NaN"
#    else:
#        return s

#udfprocessMissingCategory = F.udf(processMissingCategory, StringType())
#df_nona = df_difference.withColumn('comment', lit(udfprocessMissingCategory('comment')))
#df_nona.show()

In [23]:
# TOKENIZING 'DIFFERENCE' COLUMN AND REMOVING CERTAIN STOP WORDS

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer

# tokenize 'difference' column
regexTokenizer = RegexTokenizer(inputCol="difference", outputCol="words", pattern="\\W")
regexTokenizer_comment = RegexTokenizer(inputCol="comment", outputCol="words_comment", pattern="\\W")

# remove stop words
stop_words = ["http","https", "a", "an", "the", "about", "above", "after", "again", "against", "all", "am", "and", "any", "are", "aren't", "as", "at", "be", "because",
              "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", 
              "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll",
              "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is",
              "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only",
              "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't",
              "so", "some", "such", "than", "that", "that's", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll",
              "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've",
              "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't",
              "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stop_words)
stopwordsRemover_comment = StopWordsRemover(inputCol="words_comment", outputCol="filtered_comment").setStopWords(stop_words)

In [24]:
# CONVERT 'LABEL_STRING' TO NUMERIC COLUMN 'LABEL'

from pyspark.ml.feature import OneHotEncoder, StringIndexer

# index categorical variable
label_stringIdx = StringIndexer(inputCol = "label_string", outputCol = "label")

In [None]:
######################################################################################################
##
## NOTE !!!!!!
##
## CHOOSE TO RUN ONLY ONE OF THE FOLLOWING TWO CELLS
## DO NOT RUN THEM BOTH
##
##
#######################################################################################################

In [25]:
# TF-IDF AND PIPELINE -- WITHOUT 'COMMENT'

from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml import Pipeline

# do TF-IDF embeding
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

# define the pipeline
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])

In [65]:
# TF-IDF AND PIPELINE -- WITH 'COMMENT'

from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml import Pipeline

# do TF-IDF embeding
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features_diff")

hashingTF_comment = HashingTF(inputCol="filtered_comment", outputCol="rawFeatures_comment", numFeatures=10000)
idf_comment = IDF(inputCol="rawFeatures_comment", outputCol="features_comment")

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=["features_comment", "features_diff"], outputCol="features")

# define the pipeline
pipeline = Pipeline(stages=[regexTokenizer, regexTokenizer_comment, stopwordsRemover, stopwordsRemover_comment, hashingTF, hashingTF_comment, idf, idf_comment, 
                            assembler, label_stringIdx])

In [26]:
# FIT PIPELINE ON TRAIN DATASET

pipelineFit = pipeline.fit(trainingData)

In [28]:
# SAVE PIPELINE

#outpath = 'C:/Pr/AA_Big_Data/Assignment_3/spark/models/without_comment/tfidf'
#outpath = 'C:/Users/Aistuxxe/Documents/spark/models/with_comment/tfidf'
#pipelineFit.write().overwrite().save(outpath)

In [27]:
# USE PIPELINE ON TRAIN DATASET

dataset_tr = pipelineFit.transform(trainingData)

# without 'comment':
#dataset_tr.select("difference","words","filtered","rawFeatures","features", "label").show()

# with 'comment':
#dataset_tr.select("difference","words","filtered","rawFeatures","features_diff", "label").show()
#dataset_tr.select("comment","words_comment","filtered_comment","rawFeatures_comment","features_comment", "label").show()
#dataset_tr.select("features_diff", "features_comment", "features", "label").show()

In [29]:
# USE PIPELINE ON TEST DATASET (ONLY DO IF THE SPLIT BETWEEN TRAIN AND TEST SETS WAS PERFORMED)

dataset_test = pipelineFit.transform(testData)

# without 'comment':
#dataset_test.select("difference","words","filtered","rawFeatures","features", "label").show()

# with 'comment':
#dataset_test.select("difference","words","filtered","rawFeatures","features_diff", "label").show()
#dataset_test.select("comment","words_comment","filtered_comment","rawFeatures_comment","features_comment", "label").show()
#dataset_test.select("features_diff", "features_comment", "features", "label").show()

In [30]:
# WE WILL USE LOGISTIC REGRESSION FOR PREDICTING

from pyspark.ml.classification import LogisticRegression

In [None]:
######################################################################################################
##
## NOTE !!!!!!
##
## THE FOLLOWING PART IS FOR TESTING MODEL ACCURACY, MODEL TUNING AND DECIDING WHICH MODEL TO USE
## USE IT ONLY IF YOU SPLIT INITIAL DATASET TO TRAIN AND TEST SETS
## IN ORDER TO USE THE WHOLE DATASET AND TRAIN THE FINAL MODEL, SKIP UNTIL THE NEXT NOTICE
##
#######################################################################################################

In [32]:
# LOGISTIC REGRESSION WITHOUT WEIGHTS

# fit logistic regression
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=20)
lrModel = lr.fit(dataset_tr)


# predict train and test sets
predict_train = lrModel.transform(dataset_tr)
predict_test = lrModel.transform(dataset_test)

#comment if not used
predict_train.select("comment", "difference","label_string","probability","label","prediction").show()
predict_test.select("comment", "difference","label_string","probability","label","prediction").show()

+--------------------+--------------------+------------+--------------------+-----+----------+
|             comment|          difference|label_string|         probability|label|prediction|
+--------------------+--------------------+------------+--------------------+-----+----------+
|→‎Preseason All-S...|                 Jr.|        safe|[0.98796644742399...|  0.0|       0.0|
|→‎Churches:Fixing...|       Augustinians||        safe|[0.99981918066159...|  0.0|       0.0|
|→‎Churches holdin...|                   s|        safe|[0.96516371365320...|  0.0|       0.0|
|             grammar|must s probably b...|        safe|[0.99999989444544...|  0.0|       0.0|
|                    |                  39|      unsafe|[0.81871781744510...|  1.0|       0.0|
|Fixed citation an...|                   t|        safe|[0.97667989166664...|  0.0|       0.0|
|→‎Appearances in ...|            (2009)y'|        safe|[0.96484976127255...|  0.0|       0.0|
|  →‎Season 17 (2020)|0.27<ref>{{cite w...|       

In [None]:
# SAVE LOGISTIC REGRESSION WITHOUT WEIGHTS

outpath = 'C:/Pr/AA_Big_Data/Assignment_3/spark/models/without_comment/logistic_regression'
#outpath = 'C:/Users/Aistuxxe/Documents/spark/models/with_comment/logistic_regression'

lrModel.write().overwrite().save(outpath)

In [None]:
# LOAD LOGISTIC REGRESSION WITHOUT WEIGHTS

#from pyspark.ml.classification import LogisticRegressionModel

#LogisticRegressionModel.load('C:/Users/Aistuxxe/Documents/spark/models/without_comment/logistic_regression')
#LogisticRegressionModel.load('C:/Users/Aistuxxe/Documents/spark/models/with_comment/logistic_regression')

In [33]:
# EVALUATE OVERALL MODEL ACCURACY FOR TRAIN AND TEST SETS

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol = "label", metricName="accuracy")

print("overall accuracy train set: %s" % (evaluator.evaluate(predict_train)))
print("overall accuracy test set: %s" % (evaluator.evaluate(predict_test)))

overall accuracy train set: 0.9846808510638297
overall accuracy test set: 0.8122448979591836


In [34]:
# EVALUATE MODEL ACCURACY FOR TRAIN SET BY LABEL

zeros = predict_train.where("label == 0").select("label", "prediction")
print("training accuracy for label 0 (safe): %s" % (evaluator.evaluate(zeros)))

ones = predict_train.where("label == 1").select("label", "prediction")
print("training accuracy for label 1 (unsafe): %s" % (evaluator.evaluate(ones)))

twos = predict_train.where("label == 2").select("label", "prediction")
print("training accuracy for label 2 (vandal): %s" % (evaluator.evaluate(twos)))

training accuracy for label 0 (safe): 0.9949799196787149
training accuracy for label 1 (unsafe): 0.9202453987730062
training accuracy for label 2 (vandal): 1.0


In [35]:
# EVALUATE MODEL ACCURACY FOR TEST SET BY LABEL

zeros_test = predict_test.where("label == 0").select("label", "prediction")
print("training accuracy for label 0 (safe): %s" % (evaluator.evaluate(zeros_test)))

ones_test = predict_test.where("label == 1").select("label", "prediction")
print("training accuracy for label 1 (unsafe): %s" % (evaluator.evaluate(ones_test)))

twos_test = predict_test.where("label == 2").select("label", "prediction")
print("training accuracy for label 2 (vandal): %s" % (evaluator.evaluate(twos_test)))

training accuracy for label 0 (safe): 0.9266503667481663
training accuracy for label 1 (unsafe): 0.2714285714285714
training accuracy for label 2 (vandal): 0.0


In [75]:
# STATISTICS ON TRAINING DATA

#trainingSummary = lrModel.summary

# for multiclass, we can inspect metrics on a per-label basis
#print("False positive rate by label:")
#for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
#    print("label %d: %s" % (i, rate))

#print("True positive rate by label:")
#for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
#    print("label %d: %s" % (i, rate))

#print("Precision by label:")
#for i, prec in enumerate(trainingSummary.precisionByLabel):
#    print("label %d: %s" % (i, prec))

#print("Recall by label:")
#for i, rec in enumerate(trainingSummary.recallByLabel):
#    print("label %d: %s" % (i, rec))


False positive rate by label:
label 0: 0.0
label 1: 0.0
label 2: 0.0
True positive rate by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
Precision by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
Recall by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0


In [110]:
# K-FOLD CROSS-VALIDATION

#from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

#lr_cv = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0.5)

# Create ParamGrid for Cross Validation
#paramGrid = (ParamGridBuilder()
#             .addGrid(lr_cv.regParam, [0.1, 0.3, 0.5]) # regularization parameter
#             .addGrid(lr_cv.elasticNetParam, [0.0, 0.5, 1.0]) # Elastic Net Parameter (Ridge = 0)
#             .build())

#evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol = "label", metricName="accuracy")

# Create 5-fold CrossValidator
#cv = CrossValidator(estimator=lr_cv, \
#                    estimatorParamMaps=paramGrid, \
#                    evaluator=evaluator, \
#                    numFolds=5)
#cvModel = cv.fit(dataset_tr)

#predict_train_cv = cvModel.transform(dataset_tr)
#predict_test_cv = cvModel.transform(dataset_test)

In [118]:
#best = cvModel.bestModel

In [120]:
#print ('Best Param (regParam): ', best._java_obj.getRegParam())

Best Param (regParam):  0.1


In [122]:
#print ('Best Param (elasticNetParam): ', best._java_obj.getElasticNetParam())

Best Param (elasticNetParam):  0.5


In [None]:
# SAVE CV MODEL WITHOUT WEIGHTS

#outpath = 'C:/Users/Aistuxxe/Documents/spark/models/without_comment/logistic_regression_cv'
#outpath = 'C:/Users/Aistuxxe/Documents/spark/models/with_comment/logistic_regression_cv'

#cvModel.write().overwrite().save(outpath)

In [None]:
# LOAD CV MODEL WITHOUT WEIGHTS

#CrossValidator.load('C:/Users/Aistuxxe/Documents/spark/models/without_comment/logistic_regression_cv')
#CrossValidator.load('C:/Users/Aistuxxe/Documents/spark/models/with_comment/logistic_regression_cv')

In [None]:
# EVALUATE OVERALL CV MODEL ACCURACY FOR TRAIN AND TEST SETS

#from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol = "label", metricName="accuracy")

#print("overall accuracy train set: %s" % (evaluator.evaluate(predict_train_cv)))
#print("overall accuracy test set: %s" % (evaluator.evaluate(predict_test_cv)))

In [None]:
# EVALUATE CV MODEL ACCURACY FOR TRAIN SET BY LABEL

#zeros_cv = predict_train_cv.where("label == 0").select("label", "prediction")
#print("training accuracy for label 0 (safe): %s" % (evaluator.evaluate(zeros_cv)))

#ones_cv = predict_train_cv.where("label == 1").select("label", "prediction")
#print("training accuracy for label 1 (unsafe): %s" % (evaluator.evaluate(ones_cv)))

#twos_cv = predict_train_cv.where("label == 2").select("label", "prediction")
#print("training accuracy for label 2 (vandal): %s" % (evaluator.evaluate(twos_cv)))

In [None]:
# EVALUATE CV MODEL ACCURACY FOR TEST SET BY LABEL

#zeros_test_cv = predict_test_cv.where("label == 0").select("label", "prediction")
#print("training accuracy for label 0 (safe): %s" % (evaluator.evaluate(zeros_test_cv)))

#ones_test_cv = predict_test_cv.where("label == 1").select("label", "prediction")
#print("training accuracy for label 1 (unsafe): %s" % (evaluator.evaluate(ones_test_cv)))

#twos_test_cv = predict_test_cv.where("label == 2").select("label", "prediction")
#print("training accuracy for label 2 (vandal): %s" % (evaluator.evaluate(twos_test_cv)))

In [36]:
# LOGISTIC REGRESSION WITH WEIGHTS -- CALCULATING WEIGHTS

dataset_tr_size = float(dataset_tr.select("label").count())

numZeros = dataset_tr.select("label").where('label == 0').count()
perZeros = (float(numZeros)/float(dataset_tr_size))

numOnes = dataset_tr.select("label").where('label == 1').count()
perOnes = (float(numOnes)/float(dataset_tr_size))

perTwos = 1-perOnes-perZeros

print('The percentage of 0s is {}'.format(perZeros))
print('The percentage of 1s is {}'.format(perOnes))
print('The percentage of 2s is {}'.format(perTwos))

weight0 = 1/(perZeros)
weight1 = 1/(perOnes)
weight2 = 1/(perTwos)

print('The weight of class safe is {}'.format(weight0))
print('The weight of class unsafe is {}'.format(weight1))
print('The weight of class vandal is {}'.format(weight2))

The percentage of 0s is 0.8476595744680852
The percentage of 1s is 0.13872340425531915
The percentage of 2s is 0.013617021276595698
The weight of class safe is 1.179718875502008
The weight of class unsafe is 7.208588957055214
The weight of class vandal is 73.43750000000026


In [None]:
# LOGISTIC REGRESSION WITH WEIGHTS -- CALCULATING WEIGHTS

#dataset_tr_size = float(dataset_tr.select("label").count())

#numZeros = dataset_tr.select("label").where('label == 0').count()
#perZeros = (float(numZeros)/float(dataset_tr_size))

#numOnes = dataset_tr.select("label").where('label == 1').count()
#perOnes = (float(numOnes)/float(dataset_tr_size))

#perTwos = 1-perOnes-perZeros

#print('The percentage of 0s is {}'.format(perZeros))
#print('The percentage of 1s is {}'.format(perOnes))
#print('The percentage of 2s is {}'.format(perTwos))

#weight0 = 1/(perZeros/perTwos)
#weight1 = 1/(perOnes/perTwos)
#weight2 = 1

#print('The weight of class safe is {}'.format(weight0))
#print('The weight of class unsafe is {}'.format(weight1))
#print('The weight of class vandal is {}'.format(weight2))

In [37]:
# LOGISTIC REGRESSION WITH WEIGHTS -- ADDING WEIGHT COLUMN

dataset_tr = dataset_tr.withColumn("classWeights", F.when(dataset_tr.label == 0, weight0).when(dataset_tr.label == 1, weight1).otherwise(weight2))
dataset_tr.select("label","classWeights").show()

+-----+-----------------+
|label|     classWeights|
+-----+-----------------+
|  0.0|1.179718875502008|
|  0.0|1.179718875502008|
|  0.0|1.179718875502008|
|  0.0|1.179718875502008|
|  1.0|7.208588957055214|
|  0.0|1.179718875502008|
|  0.0|1.179718875502008|
|  0.0|1.179718875502008|
|  0.0|1.179718875502008|
|  0.0|1.179718875502008|
|  0.0|1.179718875502008|
|  0.0|1.179718875502008|
|  1.0|7.208588957055214|
|  0.0|1.179718875502008|
|  0.0|1.179718875502008|
|  0.0|1.179718875502008|
|  0.0|1.179718875502008|
|  0.0|1.179718875502008|
|  0.0|1.179718875502008|
|  0.0|1.179718875502008|
+-----+-----------------+
only showing top 20 rows



In [38]:
# LOGISTIC REGRESSION WITH WEIGHTS -- TRAINING AND PREDICTING

lr_w = LogisticRegression(labelCol="label", featuresCol="features", weightCol="classWeights", maxIter=20)
lrModel_w = lr_w.fit(dataset_tr)

predict_train_w = lrModel_w.transform(dataset_tr)
predict_test_w = lrModel_w.transform(dataset_test)
#predict_train_w.select("difference","label_string","probability","label","prediction").show()
#predict_test_w.select("difference","label_string","probability","label","prediction").show()

In [45]:
# SAVE LOGISTIC REGRESSION WITH WEIGHTS

outpath = 'C:/Pr/AA_Big_Data/Assignment_3/spark/models/without_comment/logistic_regression_with_weights'
#outpath = 'C:/Users/Aistuxxe/Documents/spark/models/with_comment/logistic_regression_with_weights'

lrModel_w.write().overwrite().save(outpath)

In [None]:
# LOAD LOGISTIC REGRESSION WITH WEIGHTS

#from pyspark.ml.classification import LogisticRegressionModel

#LogisticRegressionModel.load('C:/Users/Aistuxxe/Documents/spark/models/without_comment/logistic_regression_with_weights')
#LogisticRegressionModel.load('C:/Users/Aistuxxe/Documents/spark/models/with_comment/logistic_regression_with_weights')

In [40]:
# EVALUATE MODEL ACCURACY FOR TRAIN AND TEST SETS

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol = "label", metricName="accuracy")

#print("overall accuracy train set: %s" % (evaluator.evaluate(predict_train_w)))
#print("overall accuracy test set: %s" % (evaluator.evaluate(predict_test_w)))

In [42]:
# EVALUATE MODEL ACCURACY FOR TRAIN SET BY LABEL

zeros_w = predict_train_w.where("label == 0").select("label", "prediction")
print("training accuracy for label 0 (safe): %s" % (evaluator.evaluate(zeros_w)))

ones_w = predict_train_w.where("label == 1").select("label", "prediction")
print("training accuracy for label 1 (unsafe): %s" % (evaluator.evaluate(ones_w)))

twos_w = predict_train_w.where("label == 2").select("label", "prediction")
print("training accuracy for label 2 (vandal): %s" % (evaluator.evaluate(twos_w)))

training accuracy for label 0 (safe): 0.9899598393574297
training accuracy for label 1 (unsafe): 0.9570552147239264
training accuracy for label 2 (vandal): 1.0


In [44]:
# EVALUATE MODEL ACCURACY FOR TEST SET BY LABEL

zeros_test_w = predict_test_w.where("label == 0").select("label", "prediction")
print("testing accuracy for label 0 (safe): %s" % (evaluator.evaluate(zeros_test_w)))

ones_test_w = predict_test_w.where("label == 1").select("label", "prediction")
print("testing accuracy for label 1 (unsafe): %s" % (evaluator.evaluate(ones_test_w)))

twos_test_w = predict_test_w.where("label == 2").select("label", "prediction")
print("testing accuracy for label 2 (vandal): %s" % (evaluator.evaluate(twos_test_w)))

testing accuracy for label 0 (safe): 0.9535452322738386
testing accuracy for label 1 (unsafe): 0.24285714285714285
testing accuracy for label 2 (vandal): 0.0


In [None]:
######################################################################################################
##
## NOTE !!!!!!
##
## THE FOLLOWING PART IS FOR TRAINING THE BEST MODEL ON THE WHOLE DATASET
##
#######################################################################################################

In [None]:
# train best model

In [None]:
predictions_string = predict_train.withColumn("prediction_string", F.when(predict_train.label == 0, "safe").when(predict_train.label == 1, "unsafe").otherwise("vandal"))
predictions_string.select("comment", "difference", "label","prediction", "label_string", "predicted_string").show()

In [None]:
# save best model