In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import wget
from pyspark.ml.feature import Bucketizer,RegexTokenizer,StopWordsRemover,CountVectorizer,IDF
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline,PipelineModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
#Spark Session creation configured to interact with Kfka and MongoDB
spark = SparkSession.builder.appName("pyspark-notebook").\
config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0,org.apache.spark:spark-avro_2.12:3.0.0,org.mongodb.spark:mongo-spark-connector_2.12:3.0.0").\
config("spark.mongodb.input.uri","mongodb://docker_mongo_1:27017/twitter_db.tweets").\
config("spark.mongodb.output.uri","mongodb://docker_mongo_1:27017/twitter_db.tweets").\
getOrCreate()

:: loading settings :: url = jar:file:/opt/homebrew/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/daoanhtuan/.ivy2/cache
The jars for the packages stored in: /Users/daoanhtuan/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.spark#spark-avro_2.12 added as a dependency
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-96986f73-2a21-438e-abef-c0fc2fb9ec77;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.0.0 in central
	found org.apache.kafka#kafka-clients;2.4.1 in central
	found com.github.luben#zstd-jni;1.4.4-3 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.7.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
	found org.apache.spark#spark-avro_2.12;3.0.0 in central
	fo

22/10/05 22:53:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
#spark.read.json("reviews_Sports_and_Outdoors_5.json.gz").show(35)

In [4]:
#Download dataset if not exists and read it as spark dataframe
try:
    df0 = spark.read.json("reviews_Sports_and_Outdoors_5.json.gz")
except Exception as e:
    url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz"
    wget.download(url)
    df0 = spark.read.json("reviews_Sports_and_Outdoors_5.json.gz")

df = df0.withColumn("text",concat(col("summary"), lit(" "),col("reviewText")))\
 .drop("helpful")\
 .drop("reviewerID")\
 .drop("reviewerName")\
 .drop("reviewTime")
df.count()

                                                                                

296337

In [5]:
df.describe("overall").show()


[Stage 4:>                                                          (0 + 1) / 1]

+-------+------------------+
|summary|           overall|
+-------+------------------+
|  count|            296337|
|   mean| 4.393450699710128|
| stddev|0.9869053992908551|
|    min|               1.0|
|    max|               5.0|
+-------+------------------+




                                                                                

In [6]:
#Bucketize data and create labels 0 if overall rating is in (1.0,2.0), otherwise 1
df1 = df.filter("overall !=3")

splits = [-float("inf"), 4.0, float("inf")]

bucketizer = Bucketizer(splits=splits, inputCol="overall", outputCol="label")

df2= bucketizer.transform(df1)

df2.groupBy("overall","label").count().show()


[Stage 7:>                                                          (0 + 1) / 1]

+-------+-----+------+
|overall|label| count|
+-------+-----+------+
|    2.0|  0.0| 10204|
|    5.0|  1.0|188208|
|    1.0|  0.0|  9045|
|    4.0|  1.0| 64809|
+-------+-----+------+




                                                                                

In [7]:
#take sample to create train and test dataset
fractions = {1.0 : .1, 0.0 : 1.0}
df3 = df2.stat.sampleBy("label", fractions, 36)
df3.groupBy("label").count().show()


[Stage 10:>                                                         (0 + 1) / 1]

+-----+-----+
|label|count|
+-----+-----+
|  0.0|19249|
|  1.0|25224|
+-----+-----+




                                                                                

In [8]:
#Split data as 80-20% Train and Test dataset
splitSeed = 5043
trainingData, testData = df3.randomSplit([0.8, 0.2], splitSeed)

In [9]:
#Tokenize 
tokenizer = RegexTokenizer(inputCol="text",outputCol="reviewTokensUf",pattern="\\s+|[,.()\"]")

remover = StopWordsRemover(stopWords=StopWordsRemover.loadDefaultStopWords("english"),inputCol="reviewTokensUf",outputCol="reviewTokens")

22/10/05 22:53:32 WARN StopWordsRemover: Default locale set was [en_FR]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


In [10]:
#converts word documents to vectors of token counts
cv = CountVectorizer(inputCol="reviewTokens",outputCol="cv",vocabSize=296337)

In [11]:
#IDF model
idf = IDF(inputCol="cv",outputCol="features")

In [12]:
lr = LogisticRegression(maxIter=100,regParam=0.02,elasticNetParam=0.3)

In [13]:
#Creates a pipeline
steps =  [tokenizer, remover, cv, idf,lr]
pipeline = Pipeline(stages=steps)

In [14]:
model = pipeline.fit(trainingData)

                                                                                

22/10/05 22:53:52 WARN DAGScheduler: Broadcasting large task binary with size 1997.2 KiB


                                                                                

22/10/05 22:53:57 WARN DAGScheduler: Broadcasting large task binary with size 1998.7 KiB



[Stage 19:>                                                         (0 + 1) / 1]

22/10/05 22:54:02 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/10/05 22:54:02 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/10/05 22:54:02 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/10/05 22:54:02 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS



                                                                                

22/10/05 22:54:02 WARN DAGScheduler: Broadcasting large task binary with size 1998.7 KiB
22/10/05 22:54:02 WARN DAGScheduler: Broadcasting large task binary with size 1998.7 KiB
22/10/05 22:54:02 WARN DAGScheduler: Broadcasting large task binary with size 1998.7 KiB
22/10/05 22:54:03 WARN DAGScheduler: Broadcasting large task binary with size 1998.7 KiB
22/10/05 22:54:03 WARN DAGScheduler: Broadcasting large task binary with size 1998.7 KiB
22/10/05 22:54:03 WARN DAGScheduler: Broadcasting large task binary with size 1998.7 KiB
22/10/05 22:54:03 WARN DAGScheduler: Broadcasting large task binary with size 1998.7 KiB
22/10/05 22:54:03 WARN DAGScheduler: Broadcasting large task binary with size 1998.7 KiB
22/10/05 22:54:04 WARN DAGScheduler: Broadcasting large task binary with size 1998.7 KiB
22/10/05 22:54:04 WARN DAGScheduler: Broadcasting large task binary with size 1998.7 KiB
22/10/05 22:54:04 WARN DAGScheduler: Broadcasting large task binary with size 1998.7 KiB
22/10/05 22:54:04 WAR

In [15]:
#collecting all metrics
vocabulary = model.stages[2].vocabulary
weights = model.stages[-1].coefficients.toArray()
weights = [float(weight) for weight in weights]

In [16]:
schema = StructType([StructField('word', StringType()),
                     StructField('weight', FloatType())
                     ])
cdf = spark.createDataFrame(zip(vocabulary, weights), schema)

In [17]:
cdf.orderBy(desc("weight")).show(10)


[Stage 63:>                                                         (0 + 8) / 8]

+---------+----------+
|     word|    weight|
+---------+----------+
|    great|0.58763325|
|   thoses|0.32552686|
|  perfect| 0.3234427|
|     easy|0.26150194|
|   highly|0.25427526|
|     love|0.23298806|
|excellent|0.22146559|
|     nice|0.21586911|
|     good|0.20863196|
|    works|0.20269915|
+---------+----------+
only showing top 10 rows




                                                                                

In [18]:
cdf.orderBy("weight").show(10)

+-------------+-----------+
|         word|     weight|
+-------------+-----------+
|     returned| -0.3884238|
|         poor|   -0.33077|
|      useless| -0.3029925|
|        waste|-0.27847156|
|        broke| -0.2696688|
|         junk| -0.2493959|
|       return|-0.24829988|
|disappointing|-0.22996198|
|    returning| -0.2170597|
| disappointed| -0.2141453|
+-------------+-----------+
only showing top 10 rows



In [19]:
predictions = model.transform(testData)

In [20]:
evaluator = BinaryClassificationEvaluator()  
areaUnderROC = evaluator.evaluate(predictions)

22/10/05 22:55:32 WARN DAGScheduler: Broadcasting large task binary with size 2015.4 KiB


                                                                                

In [21]:
predictions.show()

22/10/05 22:55:36 WARN DAGScheduler: Broadcasting large task binary with size 2035.8 KiB



[Stage 74:>                                                         (0 + 1) / 1]

+----------+-------+--------------------+--------------------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|      asin|overall|          reviewText|             summary|unixReviewTime|                text|label|      reviewTokensUf|        reviewTokens|                  cv|            features|       rawPrediction|         probability|prediction|
+----------+-------+--------------------+--------------------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|7245456313|    1.0|I wish I would ha...|Defective - Be Ca...|    1354492800|Defective - Be Ca...|  0.0|[defective, -, be...|[defective, -, ca...|(71899,[0,11,15,1...|(71899,[0,11,15,1...|[1.99664552031553...|[0.88044442899395...|       0.0|
|7245456313|    5.0|I bought thi


                                                                                

In [22]:
#model evaluation
lp = predictions.select("label", "prediction")
counttotal = predictions.count()
correct = lp.filter(col("label") == col("prediction")).count()
wrong = lp.filter(~(col("label") == col("prediction"))).count()
ratioWrong = float(wrong) / float(counttotal)
lp = predictions.select(  "prediction","label")
counttotal = float(predictions.count())
correct = lp.filter(col("label") == col("prediction")).count()
wrong = lp.filter("label != prediction").count()
ratioWrong=wrong/counttotal
ratioCorrect=correct/counttotal
trueneg =( lp.filter(col("label") == 0.0).filter(col("label") == col("prediction")).count()) /counttotal
truepos = (lp.filter(col("label") == 1.0).filter(col("label") == col("prediction")).count())/counttotal
falseneg = (lp.filter(col("label") == 0.0).filter(~(col("label") == col("prediction"))).count())/counttotal
falsepos = (lp.filter(col("label") == 1.0).filter(~(col("label") == col("prediction"))).count())/counttotal

precision= truepos / (truepos + falsepos)
recall= truepos / (truepos + falseneg)
#fmeasure= 2  precision  recall / (precision + recall)
accuracy=(truepos + trueneg) / (truepos + trueneg + falsepos + falseneg)

                                                                                

22/10/05 22:55:55 WARN DAGScheduler: Broadcasting large task binary with size 2012.2 KiB


                                                                                

22/10/05 22:55:58 WARN DAGScheduler: Broadcasting large task binary with size 2012.4 KiB


                                                                                

22/10/05 22:56:03 WARN DAGScheduler: Broadcasting large task binary with size 2012.2 KiB


                                                                                

22/10/05 22:56:06 WARN DAGScheduler: Broadcasting large task binary with size 2012.4 KiB


                                                                                

22/10/05 22:56:08 WARN DAGScheduler: Broadcasting large task binary with size 2012.5 KiB


                                                                                

22/10/05 22:56:11 WARN DAGScheduler: Broadcasting large task binary with size 2012.5 KiB


                                                                                

22/10/05 22:56:13 WARN DAGScheduler: Broadcasting large task binary with size 2012.6 KiB


                                                                                

22/10/05 22:56:15 WARN DAGScheduler: Broadcasting large task binary with size 2012.6 KiB


                                                                                

In [23]:
print('counttotal   :', counttotal     )
print('correct      :', correct        )
print('wrong        :', wrong          )
print('ratioWrong   :', ratioWrong     )
print('ratioCorrect :', ratioCorrect   )
print('truen        :', trueneg          )
print('truep        :', truepos          )
print('falsen       :', falseneg         )
print('falsep       :', falsepos         )
print('precision    :', precision      )
print('recall       :', recall         )
#print('fmeasure     :', fmeasure       )
print('accuracy     :', accuracy       )

counttotal   : 9003.0
correct      : 7776
wrong        : 1227
ratioWrong   : 0.13628790403198934
ratioCorrect : 0.8637120959680107
truen        : 0.3361101854937243
truep        : 0.5276019104742864
falsen       : 0.08863712095968011
falsep       : 0.04765078307230923
precision    : 0.9171654759606103
recall       : 0.8561643835616438
accuracy     : 0.8637120959680107


In [24]:
predictions.filter(col("prediction") == 0.0)\
.select("summary","reviewTokens","overall","prediction")\
.orderBy(desc("rawPrediction")).show(5)

22/10/05 22:56:18 WARN DAGScheduler: Broadcasting large task binary with size 2026.1 KiB



[Stage 105:>                                                        (0 + 1) / 1]

+--------------------+--------------------+-------+----------+
|             summary|        reviewTokens|overall|prediction|
+--------------------+--------------------+-------+----------+
|Buyer Beware - Yo...|[buyer, beware, -...|    2.0|       0.0|
|Awful Phone and T...|[awful, phone, te...|    1.0|       0.0|
|DO NOT BUY HERE I...|[buy, need, custo...|    1.0|       0.0|
|                JUNK|[junk, well, rece...|    1.0|       0.0|
|Poor 3-9x40 Hamme...|[poor, 3-9x40, ha...|    1.0|       0.0|
+--------------------+--------------------+-------+----------+
only showing top 5 rows




                                                                                

In [25]:
predictions.filter(col("prediction")== 1.0)\
.select("summary","reviewTokens","overall","prediction")\
.orderBy("rawPrediction").show(5)

22/10/05 22:56:21 WARN DAGScheduler: Broadcasting large task binary with size 2026.0 KiB



[Stage 106:>                                                        (0 + 1) / 1]

+--------------------+--------------------+-------+----------+
|             summary|        reviewTokens|overall|prediction|
+--------------------+--------------------+-------+----------+
|My DROID Story an...|[droid, story, co...|    5.0|       1.0|
| great trucker phone|[great, trucker, ...|    5.0|       1.0|
|    Favorite EDC Bag|[favorite, edc, b...|    4.0|       1.0|
|One of My Favorit...|[one, favorites!!...|    4.0|       1.0|
|Best Hopper I've ...|[best, hopper, us...|    4.0|       1.0|
+--------------------+--------------------+-------+----------+
only showing top 5 rows




                                                                                

In [26]:
dir = "sentiment/"
model.write().overwrite().save(dir)

22/10/05 22:56:27 WARN TaskSetManager: Stage 111 contains a task of very large size (1382 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/10/05 22:56:29 WARN TaskSetManager: Stage 115 contains a task of very large size (1151 KiB). The maximum recommended task size is 1000 KiB.



[Stage 121:>                                                        (0 + 1) / 1]

                                                                                

In [27]:
dir = "sentiment/"
model = PipelineModel.load(dir)

22/10/05 22:58:07 WARN StopWordsRemover: Default locale set was [en_FR]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.
22/10/05 22:58:09 WARN StopWordsRemover: Default locale set was [en_FR]; however, it was not found in available locales in JVM, falling back to en_US locale. Set param `locale` in order to respect another locale.


In [None]:
df = spark.read.format("mongo").load()
df.printSchema()

In [None]:
df = spark.read.format("mongo").load().select("timestamp_ms","text")
splits = [-float("inf"), 0, float("inf")]
#bucketizer = Bucketizer(inputCol="timestamp_ms",outputCol="sentiment",splits=splits)

#df5= bucketizer.transform(df)
predictions = model.transform(df)
predictions.select('text','prediction').show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------+----------+
|text                                                                                                                         |prediction|
+-----------------------------------------------------------------------------------------------------------------------------+----------+
| done &lt;3
I Ten lub Jungwoo &lt;33                                                                                         |1.0       |
| Well done you 👍👍👍                                                                                                        |1.0       |
| ElbeDay 25th April 1945                                                                                                     |1.0       |
|We letting them titties tittie today 🤪 What’s a bra? Oooooh okay!                                                           |1.0       |
|  "Self-care isn't always bath