In [1]:
import threading

# Helper thread to avoid the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    def __init__(self, ssc):

        super().__init__()
        self.ssc = ssc
    def run(self):
        self.ssc.start()
        self.ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [2]:
sc

In [3]:
spark

# build model

In [4]:
from pyspark.streaming import StreamingContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, HashingTF
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import expr
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import Word2Vec
from pyspark.ml import Pipeline
from pyspark.sql.functions import split, rand
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

file_location="reviews.csv"

In [5]:
# create a SparkSession 
spark = SparkSession.builder \
    .appName("assignment 3") \
    .config("spark.driver.memory", "2g") \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.cores", "2") \
    .getOrCreate()

In [12]:
# read csv

text_df = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .option("multiline", "true") \
    .option("delimiter", "\t") \
    .csv(file_location)

text_df = text_df.select(col('review_text'), col('label'))
text_df.show()

+--------------------+-----+
|         review_text|label|
+--------------------+-----+
|i can confirm tha...|  1.0|
|Really good game,...|  1.0|
|Its not finished ...|  1.0|
|Hey. It's really ...|  1.0|
|          Fun so far|  1.0|
|I'd rather play W...|  0.0|
|I have been playi...|  1.0|
|Nice game! Loads ...|  1.0|
|All hail NA serve...|  1.0|
|enjoying it so fa...|  1.0|
|This game came as...|  1.0|
|         Great fun  |  1.0|
|Fantastic consept...|  1.0|
|You know the game...|  1.0|
|EDIT: My issues h...|  1.0|
|AMAZIIIIIIIIIIING...|  1.0|
|In its current st...|  0.0|
|Играю в Калибр го...|  1.0|
|Secret Word: Prou...|  1.0|
|Fated Word: Death...|  1.0|
+--------------------+-----+
only showing top 20 rows



In [13]:
# Remove duplicate rows
text_df = text_df.dropDuplicates()
# Remove rows with missing values
text_df = text_df.na.drop()

In [14]:
text_df.select("label").distinct().show()

+--------------------+
|               label|
+--------------------+
|                 1.0|
|                 0.0|
|It's been at leas...|
|[td]✔️ Exciting p...|
|- I literally onl...|
+--------------------+



In [15]:
# Remove rows with unexpected labels
text_df = text_df.filter((col("label") == 1.0) | (col("label") == 0.0))
print("Number of rows after filtering: ", text_df.count())
print("Number of rows after filtering with 1 labelled: ", text_df.filter(col("label") == 1.0).count())
print("Number of rows after filtering with 0 labelled: ", text_df.filter(col("label") == 0.0).count())

Number of rows after filtering:  36913
Number of rows after filtering with 1 labelled:  29406
Number of rows after filtering with 0 labelled:  7507


In [16]:
# make balanced dataset
text_df_0 = text_df.filter(col("label") == 0.0)  
text_df_1 = text_df.filter(col("label") == 1.0) 
sampled_text_df_1 = text_df_1.orderBy(rand()).limit(text_df_0.count())
balance_text_df = text_df_0.union(sampled_text_df_1)
print("1 labelled: ", balance_text_df.filter(col("label") == 1.0).count())
print("0 labelled: ", balance_text_df.filter(col("label") == 0.0).count())
                        

1 labelled:  7507
0 labelled:  7507


In [17]:
# Split data set
train_data, test_data = balance_text_df.randomSplit([0.8, 0.2], seed=7)

In [18]:
# preprocess the data
tokenizer = Tokenizer(inputCol="review_text", outputCol="words")
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words", locale="en_US")
count_vectorizer = CountVectorizer(inputCol="filtered_words", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features")
string_indexer = StringIndexer(inputCol="label", outputCol="label_index")

# create model
lr = LogisticRegression(featuresCol="features", labelCol="label_index")

In [19]:
# define params grid
param_grid = ParamGridBuilder() \
   .addGrid(count_vectorizer.vocabSize, [1000, 5000]) \
   .addGrid(lr.regParam, [0.01, 0.1]) \
   .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
   .build()

# define the evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label_index", predictionCol="prediction")

In [35]:
# create pipeline
pipeline = Pipeline(stages=[tokenizer, stopwords_remover, count_vectorizer, idf, string_indexer, lr])

# define the cross-validator
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=3)

# fit pipeline to the training data
cv_model = cv.fit(train_data)

# make predictions on the test data
predictions = cv_model.transform(test_data)

In [36]:
evaluator = MulticlassClassificationEvaluator(labelCol="label_index", predictionCol="prediction")

# Calculate accuracy
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})

# Calculate F1 score
f1_score = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

# Calculate recall
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})

# Calculate precision
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})

print("Accuracy: {:.4f}".format(accuracy))
print("F1 score: {:.4f}".format(f1_score))
print("Recall: {:.4f}".format(recall))
print("Precision: {:.4f}".format(precision))


Accuracy: 0.8234
F1 score: 0.8227
Recall: 0.8234
Precision: 0.8303


In [37]:
from pyspark.mllib.evaluation import MulticlassMetrics

# Convert the predictions and labels to an RDD
predictionAndLabels = predictions.select("prediction", "label_index").rdd.map(lambda r: (r[0], r[1]))

# Instantiate a MulticlassMetrics object
metrics = MulticlassMetrics(predictionAndLabels)

# Get the confusion matrix as a NumPy array
confusion_matrix = metrics.confusionMatrix().toArray()

# Print the confusion matrix
print("Confusion matrix:")
print(confusion_matrix)



Confusion matrix:
[[1324.  160.]
 [ 372. 1156.]]


In [38]:
# Get the best model from the cross-validation process
best_model = cv_model.bestModel

# Save the my_model
best_model.save("my_logistic_regression2")


In [5]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [6]:

from pyspark.ml import PipelineModel
globals()['models_loaded'] = False
globals()['my_model'] = None

global results
results = []

def process(time, rdd):
    if rdd.isEmpty():
        return
    
    print("========= %s =========" % str(time))
    
    # Convert to data frame
    df = spark.read.json(rdd)
    df = df.withColumn("label", col("label").cast("float"))
    df.show()
    
    # Load in the model if not yet loaded:
    if not globals()['models_loaded']:
        # load in your models here
        globals()['my_model'] = PipelineModel.load('my_logistic_regression')
        globals()['models_loaded'] = True
        
    # And then predict using the loaded model (uncomment below):
    
    df_result = globals()['my_model'].transform(df)
    df_result.select('label', 'review_text', 'prediction','probability', 'label_index').show()
    
    collected_results = df_result.select('prediction', 'label_index').collect()
    results.extend(collected_results)
    
    # If we have collected 10 results, show the data and clear the results list
    display_results(results)


def display_results(results):
    result_df = spark.createDataFrame(results)
    predictionAndLabels = result_df.select("prediction", "label_index").rdd.map(lambda r: (r[0], r[1]))
    metrics = MulticlassMetrics(predictionAndLabels)
    confusion_matrix = metrics.confusionMatrix().toArray()
    print("Confusion matrix:")
    print(confusion_matrix)

In [7]:
ssc = StreamingContext(sc, 10)

In [8]:
lines = ssc.socketTextStream("seppe.net", 7778)
lines.foreachRDD(process)

In [9]:
ssc_t = StreamingThread(ssc)
ssc_t.start()

+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|1938800|  1.0|139011455|What was good:\n-...|
|1938800|  0.0|139010594|An utter waste of...|
|2146070|  1.0|139010683|[quote][b]Check o...|
|2406990|  0.0|139010130|lidl version of S...|
+-------+-----+---------+--------------------+

+-----+--------------------+----------+--------------------+-----------+
|label|         review_text|prediction|         probability|label_index|
+-----+--------------------+----------+--------------------+-----------+
|  1.0|What was good:\n-...|       0.0|[0.78293282055781...|        0.0|
|  0.0|An utter waste of...|       1.0|[0.01587355745218...|        1.0|
|  1.0|[quote][b]Check o...|       0.0|[0.65220338000149...|        0.0|
|  0.0|lidl version of S...|       0.0|[0.88321833922714...|        1.0|
+-----+--------------------+----------+--------------------+-----------+





Confusion matrix:
[[2. 0.]
 [1. 1.]]
+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
| 669330|  1.0|139012214|If you like strat...|
|2369390|  1.0|139013132|This fun game som...|
|2369390|  1.0|139012787| I far-cri evryti...|
+-------+-----+---------+--------------------+

+-----+--------------------+----------+--------------------+-----------+
|label|         review_text|prediction|         probability|label_index|
+-----+--------------------+----------+--------------------+-----------+
|  1.0|If you like strat...|       0.0|[0.94703637375330...|        0.0|
|  1.0|This fun game som...|       0.0|[0.84246787984123...|        0.0|
|  1.0| I far-cri evryti...|       0.0|[0.79337675488982...|        0.0|
+-----+--------------------+----------+--------------------+-----------+

Confusion matrix:
[[5. 0.]
 [1. 1.]]
+-------+-----+---------+--------------------+
| app_id|label|review_id|         rev

+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
| 754890|  1.0|139012421|Cyan does it agai...|
|1899060|  1.0|139011485|One of my good ol...|
|1590160|  1.0|139013830|I'm seeing mixed ...|
+-------+-----+---------+--------------------+

+-----+--------------------+----------+--------------------+-----------+
|label|         review_text|prediction|         probability|label_index|
+-----+--------------------+----------+--------------------+-----------+
|  1.0|Cyan does it agai...|       0.0|[0.99996213443663...|        0.0|
|  1.0|One of my good ol...|       0.0|[0.99573950173204...|        0.0|
|  1.0|I'm seeing mixed ...|       0.0|[0.91877308390759...|        0.0|
+-----+--------------------+----------+--------------------+-----------+

Confusion matrix:
[[30.  0.]
 [ 1.  2.]]
+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+--------

Confusion matrix:
[[46.  1.]
 [ 3.  4.]]
+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|1592100|  1.0|139012766|One of the funnie...|
+-------+-----+---------+--------------------+

+-----+--------------------+----------+--------------------+-----------+
|label|         review_text|prediction|         probability|label_index|
+-----+--------------------+----------+--------------------+-----------+
|  1.0|One of the funnie...|       0.0|[0.93785617146693...|        0.0|
+-----+--------------------+----------+--------------------+-----------+

Confusion matrix:
[[47.  1.]
 [ 3.  4.]]
+-------+-----+---------+-----------+
| app_id|label|review_id|review_text|
+-------+-----+---------+-----------+
|1265780|  1.0|139013467| great game|
+-------+-----+---------+-----------+

+-----+-----------+----------+--------------------+-----------+
|label|review_text|prediction|         probability|label_inde

Confusion matrix:
[[62.  1.]
 [ 4.  8.]]
+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|1938800|  0.0|139013239|Not even 20 minut...|
|1938800|  1.0|139013146|It's 5 minutes lo...|
|1938800|  1.0|139013083|I have medium to ...|
+-------+-----+---------+--------------------+

+-----+--------------------+----------+--------------------+-----------+
|label|         review_text|prediction|         probability|label_index|
+-----+--------------------+----------+--------------------+-----------+
|  0.0|Not even 20 minut...|       1.0|[0.00186441881486...|        1.0|
|  1.0|It's 5 minutes lo...|       0.0|[0.98296327457679...|        0.0|
|  1.0|I have medium to ...|       0.0|[0.94380865944628...|        0.0|
+-----+--------------------+----------+--------------------+-----------+

Confusion matrix:
[[64.  1.]
 [ 4.  9.]]
+-------+-----+---------+--------------------+
| app_id|label|review_id|    

+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|1468810|  1.0|127374443|I have no clue wh...|
|1468810|  1.0|127368967|If you love Chine...|
|1468810|  1.0|127358493|                Nice|
+-------+-----+---------+--------------------+

+-----+--------------------+----------+--------------------+-----------+
|label|         review_text|prediction|         probability|label_index|
+-----+--------------------+----------+--------------------+-----------+
|  1.0|I have no clue wh...|       0.0|[0.54761514905206...|        0.0|
|  1.0|If you love Chine...|       0.0|[0.80483150571909...|        0.0|
|  1.0|                Nice|       0.0|[0.91228258299323...|        0.0|
+-----+--------------------+----------+--------------------+-----------+

Confusion matrix:
[[81.  2.]
 [10. 11.]]
+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+--------

Confusion matrix:
[[92.  3.]
 [20. 12.]]
+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|1468810|  0.0|114657619|Suspected of seri...|
+-------+-----+---------+--------------------+

+-----+--------------------+----------+--------------------+-----------+
|label|         review_text|prediction|         probability|label_index|
+-----+--------------------+----------+--------------------+-----------+
|  0.0|Suspected of seri...|       1.0|[0.21029705122662...|        1.0|
+-----+--------------------+----------+--------------------+-----------+

Confusion matrix:
[[92.  3.]
 [20. 13.]]
+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|1468810|  1.0|111122279|A great game, it'...|
+-------+-----+---------+--------------------+

+-----+--------------------+----------+--------------------+-----------+
|label|

Confusion matrix:
[[111.   3.]
 [ 21.  13.]]
+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|1468810|  1.0| 99488313|Really love this ...|
+-------+-----+---------+--------------------+

+-----+--------------------+----------+--------------------+-----------+
|label|         review_text|prediction|         probability|label_index|
+-----+--------------------+----------+--------------------+-----------+
|  1.0|Really love this ...|       0.0|[0.99165776033389...|        0.0|
+-----+--------------------+----------+--------------------+-----------+

Confusion matrix:
[[112.   3.]
 [ 21.  13.]]
+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|1468810|  1.0| 99486999|This game is a MA...|
|1468810|  1.0| 99486365|1) Finally made i...|
|1468810|  0.0| 99482491|It's really fucki...|
+-------+-----+---------+-

Confusion matrix:
[[128.   3.]
 [ 25.  14.]]
+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|1468810|  1.0| 87933996|Only very few tim...|
|1468810|  1.0| 87929546|Awesome graphic a...|
|1468810|  1.0| 87926102|           Good game|
+-------+-----+---------+--------------------+

+-----+--------------------+----------+--------------------+-----------+
|label|         review_text|prediction|         probability|label_index|
+-----+--------------------+----------+--------------------+-----------+
|  1.0|Only very few tim...|       0.0|[0.94928422336487...|        0.0|
|  1.0|Awesome graphic a...|       0.0|[0.96259243551428...|        0.0|
|  1.0|           Good game|       0.0|[0.91739395568935...|        0.0|
+-----+--------------------+----------+--------------------+-----------+

Confusion matrix:
[[131.   3.]
 [ 25.  14.]]
+-------+-----+---------+--------------------------------------+
| a

In [10]:
ssc_t.stop()

----- Stopping... this may take a few seconds -----
