In [42]:
import threading

# Helper thread to avoid the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    def __init__(self, ssc):

        super().__init__()
        self.ssc = ssc
    def run(self):
        self.ssc.start()
        self.ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [43]:
sc

In [44]:
spark

# build model

In [47]:
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, HashingTF
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import expr
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import Word2Vec
from pyspark.ml import Pipeline
from pyspark.sql.functions import split, rand
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

file_location="reviews.csv"

In [8]:
# create a SparkSession 
spark = SparkSession.builder \
    .appName("assignment 3") \
    .config("spark.driver.memory", "2g") \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.cores", "2") \
    .getOrCreate()

In [11]:
# read csv

text_df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .option("multiline", "true") \
    .option("delimiter", "\t") \
    .csv(file_location)

text_df = text_df.select(col('review_text'), col('label'))
text_df.show()

+--------------------+-----+
|         review_text|label|
+--------------------+-----+
|i can confirm tha...|  1.0|
|Really good game,...|  1.0|
|Its not finished ...|  1.0|
|Hey. It's really ...|  1.0|
|          Fun so far|  1.0|
|I'd rather play W...|  0.0|
|I have been playi...|  1.0|
|Nice game! Loads ...|  1.0|
|All hail NA serve...|  1.0|
|enjoying it so fa...|  1.0|
|This game came as...|  1.0|
|         Great fun  |  1.0|
|Fantastic consept...|  1.0|
|You know the game...|  1.0|
|EDIT: My issues h...|  1.0|
|AMAZIIIIIIIIIIING...|  1.0|
|In its current st...|  0.0|
|Играю в Калибр го...|  1.0|
|Secret Word: Prou...|  1.0|
|Fated Word: Death...|  1.0|
+--------------------+-----+
only showing top 20 rows



In [12]:
# Remove duplicate rows
text_df = text_df.dropDuplicates()
# Remove rows with missing values
text_df = text_df.na.drop()

In [13]:
text_df.select("label").distinct().show()

+--------------------+
|               label|
+--------------------+
|                 1.0|
|                 0.0|
|It's been at leas...|
|[td]✔️ Exciting p...|
|- I literally onl...|
+--------------------+



In [22]:
# Remove rows with unexpected labels
text_df = text_df.filter((col("label") == 1.0) | (col("label") == 0.0))
print("Number of rows after filtering: ", text_df.count())
print("Number of rows after filtering with 1 labelled: ", text_df.filter(col("label") == 1.0).count())
print("Number of rows after filtering with 0 labelled: ", text_df.filter(col("label") == 0.0).count())

Number of rows after filtering:  36913
Number of rows after filtering with 1 labelled:  29406
Number of rows after filtering with 0 labelled:  7507


In [29]:
# make balanced dataset
text_df_0 = text_df.filter(col("label") == 0.0)  
text_df_1 = text_df.filter(col("label") == 1.0) 
sampled_text_df_1 = text_df_1.orderBy(rand()).limit(text_df_0.count())
balance_text_df = text_df_0.union(sampled_text_df_1)
print("1 labelled: ", balance_text_df.filter(col("label") == 1.0).count())
print("0 labelled: ", balance_text_df.filter(col("label") == 0.0).count())
                        

1 labelled:  7507
0 labelled:  7507


In [32]:
# Split data set
train_data, test_data = balance_text_df.randomSplit([0.8, 0.2], seed=7)

3012

In [33]:
# preprocess the data
tokenizer = Tokenizer(inputCol="review_text", outputCol="words")
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words", locale="en_US")
count_vectorizer = CountVectorizer(inputCol="filtered_words", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features")
string_indexer = StringIndexer(inputCol="label", outputCol="label_index")

# create model
lr = LogisticRegression(featuresCol="features", labelCol="label_index")

In [34]:
# define params grid
param_grid = ParamGridBuilder() \
   .addGrid(count_vectorizer.vocabSize, [1000, 5000]) \
   .addGrid(lr.regParam, [0.01, 0.1]) \
   .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
   .build()

# define the evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label_index", predictionCol="prediction")

In [35]:
# create pipeline
pipeline = Pipeline(stages=[tokenizer, stopwords_remover, count_vectorizer, idf, string_indexer, lr])

# define the cross-validator
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3)

# fit pipeline to the training data
cv_model = cv.fit(train_data)

# make predictions on the test data
predictions = cv_model.transform(test_data)

In [36]:
evaluator = MulticlassClassificationEvaluator(labelCol="label_index", predictionCol="prediction")

# Calculate accuracy
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})

# Calculate F1 score
f1_score = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

# Calculate recall
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})

# Calculate precision
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})

print("Accuracy: {:.4f}".format(accuracy))
print("F1 score: {:.4f}".format(f1_score))
print("Recall: {:.4f}".format(recall))
print("Precision: {:.4f}".format(precision))


Accuracy: 0.8234
F1 score: 0.8227
Recall: 0.8234
Precision: 0.8303


In [37]:
from pyspark.mllib.evaluation import MulticlassMetrics

# Convert the predictions and labels to an RDD
predictionAndLabels = predictions.select("prediction", "label_index").rdd.map(lambda r: (r[0], r[1]))

# Instantiate a MulticlassMetrics object
metrics = MulticlassMetrics(predictionAndLabels)

# Get the confusion matrix as a NumPy array
confusion_matrix = metrics.confusionMatrix().toArray()

# Print the confusion matrix
print("Confusion matrix:")
print(confusion_matrix)



Confusion matrix:
[[1324.  160.]
 [ 372. 1156.]]


In [38]:
# Get the best model from the cross-validation process
best_model = cv_model.bestModel

# Save the my_model
best_model.save("my_logistic_regression2")


In [45]:

from pyspark.ml import PipelineModel
globals()['models_loaded'] = False
globals()['my_model'] = None

global results
results = []

def process(time, rdd):
    if rdd.isEmpty():
        return
    
    print("========= %s =========" % str(time))
    
    # Convert to data frame
    df = spark.read.json(rdd)
    df = df.withColumn("label", col("label").cast("float"))
    df.show()
    
    # Load in the model if not yet loaded:
    if not globals()['models_loaded']:
        # load in your models here
        globals()['my_model'] = PipelineModel.load('my_logistic_regression2')
        globals()['models_loaded'] = True
        
    # And then predict using the loaded model (uncomment below):
    
    df_result = globals()['my_model'].transform(df)
    df_result.select('label', 'review_text', 'prediction','probability', 'label_index').show()
    
    collected_results = df_result.select('prediction', 'label_index').collect()
    results.extend(collected_results)
    
    # If we have collected 10 results, show the data and clear the results list
    display_results(results)


def display_results(results):
    result_df = spark.createDataFrame(results)
    predictionAndLabels = result_df.select("prediction", "label_index").rdd.map(lambda r: (r[0], r[1]))
    metrics = MulticlassMetrics(predictionAndLabels)
    confusion_matrix = metrics.confusionMatrix().toArray()
    print("Confusion matrix:")
    print(confusion_matrix)

In [48]:
ssc = StreamingContext(sc, 10)

In [49]:
lines = ssc.socketTextStream("seppe.net", 7778)
lines.foreachRDD(process)

In [50]:
ssc_t = StreamingThread(ssc)
ssc_t.start()

+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
| 605740|  1.0|138882971|                GOod|
|1928090|  0.0|138882691|For now, I don't ...|
| 824600|  1.0|138883414|amazing game. It'...|
+-------+-----+---------+--------------------+

+-----+--------------------+----------+--------------------+-----------+
|label|         review_text|prediction|         probability|label_index|
+-----+--------------------+----------+--------------------+-----------+
|  1.0|                GOod|       0.0|[0.63362784686620...|        0.0|
|  0.0|For now, I don't ...|       1.0|[0.05068748730527...|        1.0|
|  1.0|amazing game. It'...|       0.0|[0.70612767136726...|        0.0|
+-----+--------------------+----------+--------------------+-----------+

Confusion matrix:
[[2. 0.]
 [0. 1.]]
+------+-----+---------+--------------------+
|app_id|label|review_id|         review_text|
+------+-----+---------+-----

+-----+--------------------+----------+--------------------+-----------+
|label|         review_text|prediction|         probability|label_index|
+-----+--------------------+----------+--------------------+-----------+
|  1.0|A short and sweet...|       0.0|[0.78246162934709...|        0.0|
|  1.0|               10/10|       0.0|[0.78905650341143...|        0.0|
|  1.0|                   p|       0.0|[0.57940247423050...|        0.0|
|  0.0|I've tried 3 runs...|       1.0|[0.14929982587313...|        1.0|
|  0.0|I think this game...|       1.0|[0.05165264794829...|        1.0|
+-----+--------------------+----------+--------------------+-----------+

Confusion matrix:
[[16.  0.]
 [ 1.  9.]]
+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|1062810|  1.0|138883590|      unique and fun|
|1062810|  1.0|138882916|This is a fantast...|
|1062810|  1.0|138882797|Gameplay loop is ...|
|1744610|  1.0|1388

+-----+--------------------+----------+--------------------+-----------+
|label|         review_text|prediction|         probability|label_index|
+-----+--------------------+----------+--------------------+-----------+
|  1.0|Amazing. One of t...|       0.0|[0.91364494735496...|        0.0|
|  1.0|The first level o...|       0.0|[0.50113669327129...|        0.0|
+-----+--------------------+----------+--------------------+-----------+

Confusion matrix:
[[38.  0.]
 [ 1. 13.]]
+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|2005010|  1.0|138883888|The retro styled ...|
|2005010|  1.0|138883885|SUFFER NOT THE HE...|
|2005010|  1.0|138883866|             Pew Pew|
|2005010|  1.0|138883854|First you get the...|
+-------+-----+---------+--------------------+

+-----+--------------------+----------+--------------------+-----------+
|label|         review_text|prediction|         probability|label_inde

Confusion matrix:
[[58.  2.]
 [ 2. 13.]]
+------+-----+---------+----------------+
|app_id|label|review_id|     review_text|
+------+-----+---------+----------------+
|673750|  1.0|138885237|goofy bunny game|
+------+-----+---------+----------------+

+-----+----------------+----------+--------------------+-----------+
|label|     review_text|prediction|         probability|label_index|
+-----+----------------+----------+--------------------+-----------+
|  1.0|goofy bunny game|       0.0|[0.58107589207678...|        0.0|
+-----+----------------+----------+--------------------+-----------+

Confusion matrix:
[[59.  2.]
 [ 2. 13.]]
+------+-----+---------+--------------------+
|app_id|label|review_id|         review_text|
+------+-----+---------+--------------------+
|705040|  0.0|138885812|There's no amount...|
|705040|  1.0|138884678|Very solid game a...|
+------+-----+---------+--------------------+

+-----+--------------------+----------+--------------------+-----------+
|label|    

+-----+--------------------+----------+--------------------+-----------+
|label|         review_text|prediction|         probability|label_index|
+-----+--------------------+----------+--------------------+-----------+
|  1.0|The feeling of cl...|       1.0|[0.43403893509722...|        0.0|
|  1.0|Even the tutorial...|       0.0|[0.84858423152974...|        0.0|
|  1.0|Really good to pl...|       0.0|[0.72004039353363...|        0.0|
+-----+--------------------+----------+--------------------+-----------+

Confusion matrix:
[[77.  5.]
 [ 2. 14.]]
+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|1304930|  1.0|138886138|Really good for E...|
|1304930|  1.0|138886072|if you like being...|
|1304930|  1.0|138886060|                ...,|
|1304930|  1.0|138886044|- found syringe \...|
+-------+-----+---------+--------------------+

+-----+--------------------+----------+--------------------+----------

In [51]:
ssc_t.stop()

----- Stopping... this may take a few seconds -----
