In [1]:
import threading

# Helper thread to avoid the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    def __init__(self, ssc):
        super().__init__()
        self.ssc = ssc
    def run(self):
        self.ssc.start()
        self.ssc.awaitTermination()
    def stop(self):
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [2]:
sc

In [3]:
spark

In [4]:
import random
from pyspark.streaming import StreamingContext
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

In [40]:
import pandas as pd

In [90]:
df=pd.read_csv("/Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv")
df.shape

(16319, 5)

In [91]:
df.groupby('label').count()

Unnamed: 0_level_0,Unnamed: 0,review_id,app_id,review_text
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,2786,2786,2786,2786
1.0,13532,13532,13532,13514


# build model

In [172]:
from pyspark.ml.feature import Word2Vec
from pyspark.ml import Pipeline
from pyspark.sql.functions import split
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

schema = StructType([
    StructField("review_text", StringType(), True),
    StructField("label", DoubleType(), True),
    # add more columns here as needed
])

file_location="/Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv"
#text_df = spark.read.text(file_location)

In [203]:
text_df = spark.read.option("header", "true").option("inferSchema", 
                                                     "true").option("multiLine", "true").csv(file_location)

In [194]:
# Get the number of rows and columns in the DataFrame
num_rows = text_df.count()
num_cols = len(text_df.columns)

# Print the shape of the DataFrame
print("Shape of the DataFrame: (%d, %d)" % (num_rows, num_cols))

Shape of the DataFrame: (19941, 5)


In [195]:
text_df.show(5)

23/05/02 13:59:18 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, app_id, review_text, label
 Schema: _c0, review_id, app_id, review_text, label
Expected: _c0 but found: 
CSV file: file:///Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv
+---+-----------+-------+--------------------+-----+
|_c0|  review_id| app_id|         review_text|label|
+---+-----------+-------+--------------------+-----+
|  0|136524320.0|2008820|i can confirm tha...|  1.0|
|  1|136524365.0|2008820|Really good game,...|  1.0|
|  2|116734222.0|1729900|Its not finished ...|  1.0|
|  3|116989907.0|1729900|Hey. It's really ...|  1.0|
|  4|136568878.0| 307950|          Fun so far|  1.0|
+---+-----------+-------+--------------------+-----+
only showing top 5 rows



In [217]:
# Remove duplicate rows
text_df = text_df.dropDuplicates()
# Remove rows with missing values
text_df = text_df.na.drop()
# Remove rows with unexpected values
text_df = text_df.filter(text_df["label"]>=0)

text_df = text_df.filter(text_df["label"]<=1)

# Drop multiple columns
df = text_df.drop("_c0")

In [218]:
df.show(5)

23/05/02 14:09:41 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, app_id, review_text, label
 Schema: _c0, review_id, app_id, review_text, label
Expected: _c0 but found: 
CSV file: file:///Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv
+-----------+-------+--------------------+-----+
|  review_id| app_id|         review_text|label|
+-----------+-------+--------------------+-----+
|136578721.0|1811990|Amazing art style...|  1.0|
|119483735.0|1527950|            dog shit|  0.0|
|136601345.0|1091920|Very cute click g...|  1.0|
|136613998.0|1527950|Similar to Battle...|  1.0|
|136672689.0|1811990|I love the game s...|  0.0|
+-----------+-------+--------------------+-----+
only showing top 5 rows



In [198]:
# Get the number of rows and columns in the DataFrame
num_rows = df.count()
num_cols = len(df.columns)

# Print the shape of the DataFrame
print("Shape of the DataFrame: (%d, %d)" % (num_rows, num_cols))

23/05/02 13:59:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, app_id, review_text, label
 Schema: _c0, review_id, app_id, review_text, label
Expected: _c0 but found: 
CSV file: file:///Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv
Shape of the DataFrame: (15281, 2)


In [199]:
tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens")
df = tokenizer.transform(df)

word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol="tokens", outputCol="embeddings")
model = word2Vec.fit(df)
transformed_df = model.transform(df)

23/05/02 13:59:31 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, app_id, review_text, label
 Schema: _c0, review_id, app_id, review_text, label
Expected: _c0 but found: 
CSV file: file:///Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv


                                                                                

In [200]:
transformed_df.show(5)

23/05/02 13:59:58 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, app_id, review_text, label
 Schema: _c0, review_id, app_id, review_text, label
Expected: _c0 but found: 
CSV file: file:///Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv
+--------------------+-----+--------------------+--------------------+
|         review_text|label|              tokens|          embeddings|
+--------------------+-----+--------------------+--------------------+
|Amazing art style...|  1.0|[amazing, art, st...|[0.02256262019121...|
|            dog shit|  0.0|         [dog, shit]|[-0.0909201018512...|
|Very cute click g...|  1.0|[very, cute, clic...|[0.06723331050681...|
|Similar to Battle...|  1.0|[similar, to, bat...|[0.03641914182797...|
|I love the game s...|  0.0|[i, love, the, ga...|[-0.0270881544825...|
+--------------------+-----+--------------------+--------------------+
only showing top 5 rows



In [204]:
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import when

# had to convert string to numeric
label_indexer = StringIndexer(inputCol="label", outputCol="label_index", stringOrderType="frequencyDesc")
transformed_df = label_indexer.fit(transformed_df).transform(transformed_df)
transformed_df = transformed_df.withColumn("label", when(transformed_df["label"] == 1, 0).otherwise(1))
transformed_df = transformed_df.drop("label")
transformed_df = transformed_df.withColumnRenamed("label_index", "label")

23/05/02 14:01:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, app_id, review_text, label
 Schema: _c0, review_id, app_id, review_text, label
Expected: _c0 but found: 
CSV file: file:///Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv


In [209]:
unique_values = transformed_df.select("label").distinct().collect()

# Print the unique values
for row in unique_values:
    print(row[0])

23/05/02 14:05:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, app_id, review_text, label
 Schema: _c0, review_id, app_id, review_text, label
Expected: _c0 but found: 
CSV file: file:///Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv


[Stage 744:>                                                        (0 + 1) / 1]

0.0
1.0


                                                                                

In [210]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import DenseMatrix

train_data, test_data = transformed_df.randomSplit([0.8, 0.2], seed=123)
# Create a random forest classifier and fit it to the training data
rf = RandomForestClassifier(featuresCol="embeddings", labelCol="label", numTrees=10)
rf_model = rf.fit(train_data)

# Make predictions on the test data
predictions = rf_model.transform(test_data)

23/05/02 14:05:57 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, app_id, review_text, label
 Schema: _c0, review_id, app_id, review_text, label
Expected: _c0 but found: 
CSV file: file:///Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv
23/05/02 14:05:57 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, app_id, review_text, label
 Schema: _c0, review_id, app_id, review_text, label
Expected: _c0 but found: 
CSV file: file:///Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv


                                                                                

23/05/02 14:06:03 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, app_id, review_text, label
 Schema: _c0, review_id, app_id, review_text, label
Expected: _c0 but found: 
CSV file: file:///Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv
23/05/02 14:06:04 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, app_id, review_text, label
 Schema: _c0, review_id, app_id, review_text, label
Expected: _c0 but found: 
CSV file: file:///Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv


In [211]:
# Compute the class distribution of the train data
train_class_distribution = train_data.groupBy("label").count().orderBy("label").collect()
print("Train class distribution:")
for row in train_class_distribution:
    print("Class %d: %d" % (row["label"], row["count"]))

# Compute the class distribution of the test data
test_class_distribution = test_data.groupBy("label").count().orderBy("label").collect()
print("Test class distribution:")
for row in test_class_distribution:
    print("Class %d: %d" % (row["label"], row["count"]))

23/05/02 14:06:11 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, app_id, review_text, label
 Schema: _c0, review_id, app_id, review_text, label
Expected: _c0 but found: 
CSV file: file:///Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv


[Stage 778:>                                                        (0 + 2) / 2]                                                                                

Train class distribution:
Class 0: 10209
Class 1: 1978
23/05/02 14:06:13 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, app_id, review_text, label
 Schema: _c0, review_id, app_id, review_text, label
Expected: _c0 but found: 
CSV file: file:///Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv
Test class distribution:
Class 0: 2579
Class 1: 515


In [213]:
# Compute the accuracy and F1 score of the predictions
accuracy = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy").evaluate(predictions)
f1_score = MulticlassClassificationEvaluator(labelCol="label", metricName="f1").evaluate(predictions)

# Print the accuracy and F1 score
print("Accuracy:", accuracy)
print("F1 score:", f1_score)

23/05/02 14:06:25 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, app_id, review_text, label
 Schema: _c0, review_id, app_id, review_text, label
Expected: _c0 but found: 
CSV file: file:///Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv


[Stage 804:>                                                        (0 + 2) / 2]                                                                                

23/05/02 14:06:26 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, app_id, review_text, label
 Schema: _c0, review_id, app_id, review_text, label
Expected: _c0 but found: 
CSV file: file:///Users/wentingjiang/Desktop/Projects/advanced_analytics_kul/assignment3/reviews.csv
Accuracy: 0.8335488041370395
F1 score: 0.7578785002183763


[Stage 808:>                                                        (0 + 2) / 2]                                                                                

In [5]:
globals()['models_loaded'] = False
globals()['my_model'] = None

# Toy predict function that returns a random probability. Normally you'd use your loaded globals()['my_model'] here
def predict(df):
    return random.random()

predict_udf = udf(predict, StringType())

def process(time, rdd):
    if rdd.isEmpty():
        return
    
    print("========= %s =========" % str(time))
    
    # Convert to data frame
    df = spark.read.json(rdd)
    df.show()
    
    # Utilize our predict function
    df_withpreds = df.withColumn("pred", predict_udf(
        struct([df[x] for x in df.columns])
    ))
    df_withpreds.show()
    
    # Normally, you wouldn't use a UDF (User Defined Function) Python function to predict as we did here (you can)
    # but an MLlib model you've built and saved with Spark
    # In this case, you need to prevent loading your model in every call to "process" as follows:
    
    # Load in the model if not yet loaded:
    if not globals()['models_loaded']:
        # load in your models here
        globals()['my_model'] = '***' # Replace '***' with:    [...].load('my_logistic_regression')
        globals()['models_loaded'] = True
        
    # And then predict using the loaded model (uncomment below):
    
    # df_result = globals()['my_model'].transform(df)
    # df_result.show()

In [6]:
ssc = StreamingContext(sc, 10)

In [7]:
lines = ssc.socketTextStream("seppe.net", 7778)
lines.foreachRDD(process)

In [8]:
ssc_t = StreamingThread(ssc)
ssc_t.start()

[Stage 0:>                                                          (0 + 1) / 1]

23/05/02 12:33:41 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/05/02 12:33:41 WARN BlockManager: Block input-0-1683023620800 replicated to only 0 peer(s) instead of 1 peers
23/05/02 12:33:46 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/05/02 12:33:46 WARN BlockManager: Block input-0-1683023626000 replicated to only 0 peer(s) instead of 1 peers
23/05/02 12:33:49 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/05/02 12:33:49 WARN BlockManager: Block input-0-1683023629000 replicated to only 0 peer(s) instead of 1 peers
23/05/02 12:33:50 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/05/02 12:33:50 WARN BlockManager: Block input-0-1683023630000 replicated to only 0 peer(s) instead of 1 peers


[Stage 0:>                  (0 + 1) / 1][Stage 1:>                  (0 + 1) / 1]                                                                                



                                                                                

+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|1493750|    1|137664363|       its nice game|
|1321440|    1|137664455|Nostalgic feel wh...|
|1321440|    1|137664392|Very very good Po...|
+-------+-----+---------+--------------------+



                                                                                

+-------+-----+---------+--------------------+------------------+
| app_id|label|review_id|         review_text|              pred|
+-------+-----+---------+--------------------+------------------+
|1493750|    1|137664363|       its nice game|0.6565145595554269|
|1321440|    1|137664455|Nostalgic feel wh...|0.5485765583870403|
|1321440|    1|137664392|Very very good Po...|0.7773911340974549|
+-------+-----+---------+--------------------+------------------+

23/05/02 12:33:55 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/05/02 12:33:55 WARN BlockManager: Block input-0-1683023635000 replicated to only 0 peer(s) instead of 1 peers


[Stage 0:>                                                          (0 + 1) / 1]

23/05/02 12:33:56 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/05/02 12:33:56 WARN BlockManager: Block input-0-1683023635800 replicated to only 0 peer(s) instead of 1 peers
23/05/02 12:33:59 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/05/02 12:33:59 WARN BlockManager: Block input-0-1683023639000 replicated to only 0 peer(s) instead of 1 peers


                                                                                

23/05/02 12:34:00 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/05/02 12:34:00 WARN BlockManager: Block input-0-1683023640000 replicated to only 0 peer(s) instead of 1 peers
+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|1321440|    1|137664016|At first, I'd ass...|
|1321440|    1|137663856|Might be one of i...|
|1321440|    1|137663302|This game is Awes...|
|2358520|    1|137663676|[b][i]A spiritual...|
+-------+-----+---------+--------------------+

+-------+-----+---------+--------------------+-------------------+
| app_id|label|review_id|         review_text|               pred|
+-------+-----+---------+--------------------+-------------------+
|1321440|    1|137664016|At first, I'd ass...|  0.745487900932301|
|1321440|    1|137663856|Might be one of i...|0.04498138095119997|
|1321440|    1|137663302|This game is Awes...|  0.577938675852273|
|2358520|    1

[Stage 0:>                                                          (0 + 1) / 1]

23/05/02 12:34:01 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/05/02 12:34:01 WARN BlockManager: Block input-0-1683023641000 replicated to only 0 peer(s) instead of 1 peers
23/05/02 12:34:02 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/05/02 12:34:02 WARN BlockManager: Block input-0-1683023642000 replicated to only 0 peer(s) instead of 1 peers


                                                                                

+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
|1869590|    1|137664702|           tomboy gf|
|1869590|    1|137664488|the game itself i...|
|1869590|    1|137663723|This game is so a...|
+-------+-----+---------+--------------------+

+-------+-----+---------+--------------------+-------------------+
| app_id|label|review_id|         review_text|               pred|
+-------+-----+---------+--------------------+-------------------+
|1869590|    1|137664702|           tomboy gf|0.41284213761536503|
|1869590|    1|137664488|the game itself i...| 0.9063430352795419|
|1869590|    1|137663723|This game is so a...|0.21920045015905576|
+-------+-----+---------+--------------------+-------------------+



[Stage 0:>                                                          (0 + 1) / 1]

23/05/02 12:34:28 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/05/02 12:34:28 WARN BlockManager: Block input-0-1683023668200 replicated to only 0 peer(s) instead of 1 peers


                                                                                

+-------+-----+---------+------------------+
| app_id|label|review_id|       review_text|
+-------+-----+---------+------------------+
|1607240|    1|137663740|soks for president|
+-------+-----+---------+------------------+

+-------+-----+---------+------------------+------------------+
| app_id|label|review_id|       review_text|              pred|
+-------+-----+---------+------------------+------------------+
|1607240|    1|137663740|soks for president|0.9786878975368976|
+-------+-----+---------+------------------+------------------+



[Stage 0:>                                                          (0 + 1) / 1]

23/05/02 12:34:31 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/05/02 12:34:31 WARN BlockManager: Block input-0-1683023671000 replicated to only 0 peer(s) instead of 1 peers
23/05/02 12:34:35 WARN RandomBlockReplicationPolicy: Expecting 1 replicas with only 0 peer/s.
23/05/02 12:34:35 WARN BlockManager: Block input-0-1683023675000 replicated to only 0 peer(s) instead of 1 peers


In [10]:
ssc_t.stop()

----- Stopping... this may take a few seconds -----
23/05/02 12:43:28 WARN StreamingContext: StreamingContext has already been stopped
