In [27]:
# Widen the notebook container to 95% of the browser window so wide Spark tables fit.
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [28]:
# Bare expression so the notebook displays `sc`.
# Presumably the SparkContext pre-created by the PySpark kernel -- it is not defined in this notebook section.
sc

In [29]:
# LOAD SOME RELEVANT LIBRARIES -- OTHERS WILL BE LOADED WHEN NEEDED

from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

In [30]:
# LOOP THROUGH ALL FILES

import os
# Root folder that holds the saved streaming output.
rootdir = 'C:/Pr/AA_Big_Data/Assignment_3/spark/save_30'

# Walk the whole tree and keep every file whose name contains "part",
# skipping the ".crc" files found alongside them.
files_list = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk(rootdir)
    for filename in filenames
    if "part" in filename.lower() and ".crc" not in filename.lower()
]

In [31]:
# PUT ALL FILES TO ONE JSON DATAFRAME

# sc.textFile accepts a comma-separated list of paths, so all part files
# are read as a single RDD of text lines and parsed as JSON records.
part_paths = ','.join(files_list)
json_lines_rdd = sc.textFile(part_paths)
df1 = spark.read.json(json_lines_rdd)
df1.show()

+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             comment| label|           name_user|            text_new|            text_old|          title_page|            url_page|
+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                prod|  safe|            Reywas92|{{Proposed deleti...|{{geoGroupTemplat...|List of cemeterie...|//en.wikipedia.or...|
|→‎New Zealand:upd...|  safe|            Flags220|{{short descripti...|{{short descripti...|Sequoiadendron gi...|//en.wikipedia.or...|
|→‎Landmarks:added...|  safe|         Engrksaim29|{{Use British Eng...|{{Use British Eng...|Barnala, Azad Kas...|//en.wikipedia.or...|
|                prod|  safe|            Reywas92|{{Proposed deleti...|{{geoGroupTemplat...|List of cemeterie...|//en.wikipedia.or...|
|small changes mad...|  safe|   Priti Rao Krishna|[[Fil

In [32]:
# SHOW SCHEMA OF THE DATASET
# (column names and types as inferred by spark.read.json when df1 was loaded)

df1.printSchema()

root
 |-- comment: string (nullable = true)
 |-- label: string (nullable = true)
 |-- name_user: string (nullable = true)
 |-- text_new: string (nullable = true)
 |-- text_old: string (nullable = true)
 |-- title_page: string (nullable = true)
 |-- url_page: string (nullable = true)



In [33]:
# DISTRIBUTION OF LABELS IN THE DATASET

from pyspark.sql.functions import col
# Count rows per label, then list the most frequent label first.
label_counts1 = df1.groupBy("label").count()
label_counts1.orderBy(col("count").desc()).show()

+------+-----+
| label|count|
+------+-----+
|  safe|   61|
|vandal|   11|
|unsafe|    7|
+------+-----+



In [34]:
# DIFFERENCE BETWEEN OLD AND NEW TEXT

import difflib
import pyspark.sql.functions as F
from pyspark.sql.types import *

def make_diff(old, new):
    """Return the characters removed from ``old`` and added in ``new``.

    Both arguments are passed to ``difflib.ndiff`` as plain strings, so they
    are compared element by element, i.e. character by character. Every
    element that ndiff marks as removed ('- ') or added ('+ ') is kept, with
    its two-character marker stripped; unchanged elements and '? ' hint
    lines are dropped.

    Args:
        old: Text before the edit. ``None`` is treated as empty text.
        new: Text after the edit. ``None`` is treated as empty text.

    Returns:
        A single string with the removed and added characters concatenated
        in the order ndiff reports them; empty when the texts are equal.
    """
    # The source columns are nullable and difflib cannot iterate over None,
    # so a missing side is diffed as empty text instead of raising TypeError.
    old = old or ''
    new = new or ''

    changed_markers = ('- ', '+ ')
    return ''.join(
        entry[2:]
        for entry in difflib.ndiff(old, new)
        if entry.startswith(changed_markers)
    )

# Wrap make_diff as a Spark UDF that returns a string, then add a 'difference'
# column computed row by row from the old and new text.
# The UDF call already yields a Column expression, so it is passed to
# withColumn directly rather than through lit().
udfmake_diff = F.udf(make_diff, StringType())
df_difference1 = df1.withColumn("difference", udfmake_diff("text_old", "text_new"))
df_difference1.show()

+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             comment| label|           name_user|            text_new|            text_old|          title_page|            url_page|          difference|
+--------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                prod|  safe|            Reywas92|{{Proposed deleti...|{{geoGroupTemplat...|List of cemeterie...|//en.wikipedia.or...|{{Proposed deleti...|
|→‎New Zealand:upd...|  safe|            Flags220|{{short descripti...|{{short descripti...|Sequoiadendron gi...|//en.wikipedia.or...|                2530|
|→‎Landmarks:added...|  safe|         Engrksaim29|{{Use British Eng...|{{Use British Eng...|Barnala, Azad Kas...|//en.wikipedia.or...|in bitown centre ...|
|                prod|  safe|            Reywas92|{{Proposed del

In [35]:
# Rename 'label' to 'label_string'. The string labels are converted to numeric
# values later in the notebook, and that numeric column takes over the name 'label'.

df_wd1 = df_difference1.withColumnRenamed('label', 'label_string')
#df_wd1.show()

In [36]:
# EITHER RUN THIS CELL OR THE NEXT CELL (BUT NOT BOTH)
# THIS IS RUN ONLY TO TEST MODEL ACCURACY AND TUNE THE MODEL -- SKIP TO TRAIN MODEL ON THE WHOLE DATASET

# split dataset to train and test sets

# 70/30 random split with a fixed seed so the split can be reproduced.
trainingData1, testData1 = df_wd1.randomSplit([0.7, 0.3], seed=100)

# Row count of each split.
for count_prefix, split_df in (
    ("Training Dataset Count: ", trainingData1),
    ("Test Dataset Count: ", testData1),
):
    print(count_prefix + str(split_df.count()))

# Label distribution of each split, most frequent label first.
for heading, split_df in (
    ("Distribution of labels in train set:", trainingData1),
    ("Distribution of labels in test set:", testData1),
):
    print(heading)
    split_df.groupBy("label_string").count().orderBy(col("count").desc()).show()


Training Dataset Count: 59
Test Dataset Count: 20
Distribution of labels in train set:
+------------+-----+
|label_string|count|
+------------+-----+
|        safe|   46|
|      vandal|    8|
|      unsafe|    5|
+------------+-----+

Distribution of labels in test set:
+------------+-----+
|label_string|count|
+------------+-----+
|        safe|   15|
|      vandal|    3|
|      unsafe|    2|
+------------+-----+



In [15]:
# TOKENIZING 'DIFFERENCE' COLUMN AND REMOVING CERTAIN STOP WORDS

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer

# Tokenizers for the 'difference' and 'comment' columns; both use the same
# non-word-character pattern.
non_word_pattern1 = "\\W"
regexTokenizer1 = RegexTokenizer(
    inputCol="difference",
    outputCol="words",
    pattern=non_word_pattern1,
)
regexTokenizer_comment1 = RegexTokenizer(
    inputCol="comment",
    outputCol="words_comment",
    pattern=non_word_pattern1,
)

# remove stop words
# Custom stop word list: URL scheme fragments ("http", "https") followed by common English words.
# NOTE(review): the tokenizers in this cell use the pattern "\\W", which presumably splits
# contractions at the apostrophe ("aren't" -> "aren", "t"). If so, the entries below that contain
# an apostrophe can never match a token -- confirm, and consider listing the fragments instead.
stop_words1 = ["http","https", "a", "an", "the", "about", "above", "after", "again", "against", "all", "am", "and", "any", "are", "aren't", "as", "at", "be", "because",
              "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", 
              "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll",
              "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is",
              "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only",
              "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't",
              "so", "some", "such", "than", "that", "that's", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll",
              "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've",
              "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't",
              "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves"] 
# Stop word filters for the tokenized 'difference' and 'comment' columns;
# both use the custom stop_words1 list.
stopwordsRemover1 = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=stop_words1,
)
stopwordsRemover_comment1 = StopWordsRemover(
    inputCol="words_comment",
    outputCol="filtered_comment",
    stopWords=stop_words1,
)

In [16]:
# CONVERT 'LABEL_STRING' TO NUMERIC COLUMN 'LABEL'

from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Indexer that maps each distinct 'label_string' value to a numeric 'label'.
label_stringIdx1 = StringIndexer(
    inputCol="label_string",
    outputCol="label",
)

In [17]:
# TF-IDF AND PIPELINE -- WITHOUT 'COMMENT'

from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml import Pipeline

# TF-IDF embedding: hash the filtered tokens into a 10000-slot term-frequency
# vector, then rescale it with inverse document frequency.
hashingTF1 = HashingTF(
    inputCol="filtered",
    outputCol="rawFeatures",
    numFeatures=10000,
)
idf1 = IDF(inputCol="rawFeatures", outputCol="features")

# Pipeline stages in execution order: tokenize -> drop stop words -> TF -> IDF -> index label.
pipeline_stages1 = [
    regexTokenizer1,
    stopwordsRemover1,
    hashingTF1,
    idf1,
    label_stringIdx1,
]
pipeline1 = Pipeline(stages=pipeline_stages1)

In [37]:
# FIT PIPELINE ON TRAIN DATASET
# Only the training split is used for fitting; the fitted pipeline is what later cells apply to the data.
pipelineFit1 = pipeline1.fit(trainingData1)

In [38]:
# USE PIPELINE ON TRAIN DATASET

dataset_tr1 = pipelineFit1.transform(trainingData1)

# Optional inspection, without 'comment':
#dataset_tr1.select("difference","words","filtered","rawFeatures","features", "label").show()

# Optional inspection, with 'comment':
# NOTE(review): features_diff, rawFeatures_comment and features_comment are not produced by pipeline1 as defined in this section -- presumably they belong to a pipeline variant that also uses 'comment'; confirm before uncommenting.
#dataset_tr1.select("difference","words","filtered","rawFeatures","features_diff", "label").show()
#dataset_tr1.select("comment","words_comment","filtered_comment","rawFeatures_comment","features_comment", "label").show()
#dataset_tr1.select("features_diff", "features_comment", "features", "label").show()

In [39]:
# USE PIPELINE ON TEST DATASET (ONLY DO IF THE SPLIT BETWEEN TRAIN AND TEST SETS WAS PERFORMED)
# The pipeline fitted on the training split is reused here; it is not refitted on the test split.

dataset_test1 = pipelineFit1.transform(testData1)

# Optional inspection, without 'comment':
#dataset_test1.select("difference","words","filtered","rawFeatures","features", "label").show()

# Optional inspection, with 'comment':
# NOTE(review): features_diff, rawFeatures_comment and features_comment are not produced by pipeline1 as defined in this section -- presumably they belong to a pipeline variant that also uses 'comment'; confirm before uncommenting.
#dataset_test1.select("difference","words","filtered","rawFeatures","features_diff", "label").show()
#dataset_test1.select("comment","words_comment","filtered_comment","rawFeatures_comment","features_comment", "label").show()
#dataset_test1.select("features_diff", "features_comment", "features", "label").show()

In [40]:
# WE WILL USE LOGISTIC REGRESSION FOR PREDICTING

from pyspark.ml.classification import LogisticRegression

In [41]:
# LOGISTIC REGRESSION WITHOUT WEIGHTS

# Logistic regression trained on the transformed training split
# (elasticNetParam=0 with regParam=0.3, 20 iterations at most).
lr1 = LogisticRegression(
    labelCol="label",
    featuresCol="features",
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)
lrModel1 = lr1.fit(dataset_tr1)

# Score both splits with the fitted model.
predict_train1 = lrModel1.transform(dataset_tr1)
predict_test1 = lrModel1.transform(dataset_test1)

# Preview of the predictions (train first, then test); comment out if not needed.
# NOTE(review): the recorded output shows the same probability vector on every displayed
# row -- worth confirming that the features actually influence the prediction.
preview_columns1 = ["comment", "difference", "label_string", "probability", "label", "prediction"]
for scored_df in (predict_train1, predict_test1):
    scored_df.select(*preview_columns1).show()

+--------------------+--------------------+------------+--------------------+-----+----------+
|             comment|          difference|label_string|         probability|label|prediction|
+--------------------+--------------------+------------+--------------------+-----+----------+
|                prod|{{Proposed deleti...|        safe|[0.78149239173343...|  0.0|       0.0|
|→‎New Zealand:upd...|                2530|        safe|[0.78149239173343...|  0.0|       0.0|
|→‎Landmarks:added...|in bitown centre ...|        safe|[0.78149239173343...|  0.0|       0.0|
|small changes mad...|played lead in TV...|        safe|[0.78149239173343...|  0.0|       0.0|
|Update place of b...|                  , |        safe|[0.78149239173343...|  0.0|       0.0|
|bypass redirect, ...| name="bag.admin....|        safe|[0.78149239173343...|  0.0|       0.0|
|                    |]]
* [[Second Bat...|        safe|[0.78149239173343...|  0.0|       0.0|
|     (Added content)|The Davidians wer...|      u