In [None]:
import os

import wget
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml.feature import Bucketizer,RegexTokenizer,StopWordsRemover,CountVectorizer,IDF
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline,PipelineModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Build (or reuse) the Spark session. The extra packages pull in the Kafka source,
# Avro support and the MongoDB connector; the two URIs point the connector's default
# read and write target at the twitter_db.tweets collection.
spark = (
    SparkSession.builder
    .appName("pyspark-notebook")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0,org.apache.spark:spark-avro_2.12:3.0.0,org.mongodb.spark:mongo-spark-connector_2.12:3.0.0",
    )
    .config("spark.mongodb.input.uri", "mongodb://docker_mongo_1:27017/twitter_db.tweets")
    .config("spark.mongodb.output.uri", "mongodb://docker_mongo_1:27017/twitter_db.tweets")
    .getOrCreate()
)

In [None]:
#spark.read.json("reviews_Sports_and_Outdoors_5.json.gz").show(35)

In [None]:
# Download the review dataset only when it is not already on disk, then load it
# as a Spark DataFrame.
dataset_file = "reviews_Sports_and_Outdoors_5.json.gz"
url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz"

# Test for the file explicitly instead of catching every exception from the read:
# a blanket `except Exception` would also hide unrelated Spark failures behind a
# (possibly pointless) re-download. `out=` pins the saved file name so the read
# below always targets the file that was just fetched.
if not os.path.exists(dataset_file):
    wget.download(url, out=dataset_file)
df0 = spark.read.json(dataset_file)

# Build a single "text" column (summary + space + review body) for the tokenizer
# and drop the columns that are not used by the model.
df = (
    df0
    .withColumn("text", concat(col("summary"), lit(" "), col("reviewText")))
    .drop("helpful", "reviewerID", "reviewerName", "reviewTime")
)
df.count()

296337

In [None]:
# Summary statistics (count, mean, stddev, min, max) of the star-rating column.
df.describe("overall").show()

+-------+------------------+
|summary|           overall|
+-------+------------------+
|  count|            296337|
|   mean| 4.393450699710128|
| stddev|0.9869053992908551|
|    min|               1.0|
|    max|               5.0|
+-------+------------------+



In [None]:
# Turn the star rating into a binary label.
# Neutral 3-star reviews are removed first; the remaining ratings are bucketed at 4.0,
# so ratings below 4.0 (1 and 2 stars) get label 0.0 and ratings of 4.0 or more get 1.0.
df1 = df.filter("overall !=3")

splits = [float("-inf"), 4.0, float("inf")]
bucketizer = Bucketizer(inputCol="overall", outputCol="label", splits=splits)
df2 = bucketizer.transform(df1)

# Sanity check: how many reviews of each rating ended up under each label.
rating_label_counts = df2.groupBy("overall", "label").count()
rating_label_counts.show()

+-------+-----+------+
|overall|label| count|
+-------+-----+------+
|    2.0|  0.0| 10204|
|    5.0|  1.0|188208|
|    1.0|  0.0|  9045|
|    4.0|  1.0| 64809|
+-------+-----+------+



In [None]:
# Rebalance the classes before splitting: keep every negative review (label 0.0)
# but only 10% of the much more numerous positive ones (label 1.0). Seed fixed at 36.
fractions = {1.0: 0.1, 0.0: 1.0}
df3 = df2.sampleBy("label", fractions=fractions, seed=36)

label_counts = df3.groupBy("label").count()
label_counts.show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|19249|
|  1.0|25224|
+-----+-----+



In [None]:
# 80/20 random split into training and test sets, seeded for repeatability.
splitSeed = 5043
split_weights = [0.8, 0.2]
trainingData, testData = df3.randomSplit(split_weights, seed=splitSeed)

In [None]:
# Tokenizer stage: cuts the "text" column on whitespace runs and on the characters , . ( ) "
# NOTE(review): this relies on RegexTokenizer treating the pattern as a delimiter
# (its default mode) — confirm if the defaults ever change.
tokenizer = RegexTokenizer(
    pattern="\\s+|[,.()\"]",
    inputCol="text",
    outputCol="reviewTokensUf",
)

# Stop-word stage: filters the unfiltered tokens through Spark's built-in English list.
english_stop_words = StopWordsRemover.loadDefaultStopWords("english")
remover = StopWordsRemover(
    inputCol="reviewTokensUf",
    outputCol="reviewTokens",
    stopWords=english_stop_words,
)

In [None]:
# Converts each review's token list ("reviewTokens") into a vector of token counts ("cv").
# vocabSize caps the vocabulary size; 296337 equals the row count reported for the full
# review set earlier in the notebook.
# NOTE(review): presumably chosen only as a "large enough" cap — confirm.
cv = CountVectorizer(inputCol="reviewTokens",outputCol="cv",vocabSize=296337)

In [None]:
# IDF stage: reweights the raw count vectors in "cv" by inverse document frequency
# and writes the result to "features".
idf = IDF(inputCol="cv",outputCol="features")

In [None]:
# Logistic-regression classifier: at most 100 iterations, regularisation strength 0.02,
# elastic-net mixing 0.3. No column names are passed, so the estimator's defaults apply.
# NOTE(review): presumably "features" / "label", matching the IDF output and the
# Bucketizer label column — confirm.
lr = LogisticRegression(maxIter=100,regParam=0.02,elasticNetParam=0.3)

In [None]:
# Assemble the stages into one pipeline: tokenize -> remove stop words -> count -> IDF -> classify.
# The stage order matters: later cells read the fitted CountVectorizer at index 2 and the
# fitted classifier at index -1.
steps =  [tokenizer, remover, cv, idf,lr]
pipeline = Pipeline(stages=steps)

In [None]:
# Fit every pipeline stage on the training split; `model` is the fitted pipeline used below.
model = pipeline.fit(trainingData)

In [None]:
# Pull the learned vocabulary and the per-word classifier coefficients out of the
# fitted pipeline. Index 2 is the CountVectorizer stage, the last stage is the
# logistic-regression model (see the pipeline's stage order).
count_vectorizer_model = model.stages[2]
classifier_model = model.stages[-1]

vocabulary = count_vectorizer_model.vocabulary
# Convert the numpy values to plain Python floats so Spark can build a DataFrame from them.
weights = [float(coefficient) for coefficient in classifier_model.coefficients.toArray()]

In [None]:
# Two-column (word, weight) DataFrame pairing each vocabulary term with its coefficient.
schema = (
    StructType()
    .add(StructField('word', StringType()))
    .add(StructField('weight', FloatType()))
)
word_weight_pairs = zip(vocabulary, weights)
cdf = spark.createDataFrame(word_weight_pairs, schema)

In [None]:
# Ten words with the largest (most positive) coefficients.
cdf.orderBy(desc("weight")).show(10)

+---------+----------+
|     word|    weight|
+---------+----------+
|    great| 0.5876225|
|   thoses|  0.325535|
|  perfect|0.32343474|
|     easy| 0.2615016|
|   highly|0.25427502|
|     love|0.23299988|
|excellent|0.22146676|
|     nice|0.21586789|
|     good|0.20862874|
|    works|0.20269535|
+---------+----------+
only showing top 10 rows



In [None]:
# Ten words with the smallest (most negative) coefficients — ascending sort.
cdf.orderBy("weight").show(10)

+-------------+-----------+
|         word|     weight|
+-------------+-----------+
|     returned|-0.38842562|
|         poor|-0.33077022|
|      useless|-0.30299458|
|        waste|-0.27846226|
|        broke|-0.26966578|
|         junk| -0.2493974|
|       return|-0.24831308|
|disappointing|-0.22999014|
|    returning|-0.21706156|
| disappointed|-0.21414408|
+-------------+-----------+
only showing top 10 rows



In [None]:
# Score the held-out test split with the fitted pipeline; the result carries the
# intermediate stage columns plus rawPrediction, probability and prediction.
predictions = model.transform(testData)

In [None]:
# Evaluate the scored test set with the evaluator's default settings.
# NOTE(review): the default metric is presumably area under ROC, as the variable name
# suggests — confirm. The value is stored but never displayed in this cell.
evaluator = BinaryClassificationEvaluator()  
areaUnderROC = evaluator.evaluate(predictions)

In [None]:
# Preview of the scored test rows, including every intermediate pipeline column.
predictions.show()

+----------+-------+--------------------+--------------------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|      asin|overall|          reviewText|             summary|unixReviewTime|                text|label|      reviewTokensUf|        reviewTokens|                  cv|            features|       rawPrediction|         probability|prediction|
+----------+-------+--------------------+--------------------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|7245456313|    1.0|I wish I would ha...|Defective - Be Ca...|    1354492800|Defective - Be Ca...|  0.0|[defective, -, be...|[defective, -, ca...|(71899,[0,11,15,1...|(71899,[0,11,15,1...|[1.99668098749145...|[0.88044816229074...|       0.0|
|7245456313|    5.0|I bought thi

In [None]:
#model evaluation
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Build the whole confusion matrix with a single aggregation instead of running a
# separate Spark count job per cell. Keys are (label, prediction) pairs; both columns
# are doubles, so the keys are the Python floats 0.0 / 1.0.
lp = predictions.select("prediction", "label")
confusion = {
    (row["label"], row["prediction"]): row["count"]
    for row in lp.groupBy("label", "prediction").count().collect()
}

# Kept as a float so the ratios below are true divisions and the report prints as before.
counttotal = float(sum(confusion.values()))
correct = sum(n for (actual, predicted), n in confusion.items() if actual == predicted)
wrong = sum(n for (actual, predicted), n in confusion.items() if actual != predicted)
ratioWrong = _safe_ratio(wrong, counttotal)
ratioCorrect = _safe_ratio(correct, counttotal)

# Confusion-matrix cells as fractions of the test set (positive class = label 1.0).
trueneg = _safe_ratio(confusion.get((0.0, 0.0), 0), counttotal)
truepos = _safe_ratio(confusion.get((1.0, 1.0), 0), counttotal)
# False positive: actually negative (label 0.0) but predicted positive.
falsepos = _safe_ratio(confusion.get((0.0, 1.0), 0), counttotal)
# False negative: actually positive (label 1.0) but predicted negative.
# (These two were previously assigned the other way round, which also swapped
# precision and recall.)
falseneg = _safe_ratio(confusion.get((1.0, 0.0), 0), counttotal)

precision = _safe_ratio(truepos, truepos + falsepos)
recall = _safe_ratio(truepos, truepos + falseneg)
fmeasure = _safe_ratio(2 * precision * recall, precision + recall)
accuracy = _safe_ratio(truepos + trueneg, truepos + trueneg + falsepos + falseneg)

In [None]:
# Print the evaluation figures one per line, as an aligned "name : value" report.
# fmeasure is intentionally not reported here.
metric_report = (
    ('counttotal   :', counttotal),
    ('correct      :', correct),
    ('wrong        :', wrong),
    ('ratioWrong   :', ratioWrong),
    ('ratioCorrect :', ratioCorrect),
    ('truen        :', trueneg),
    ('truep        :', truepos),
    ('falsen       :', falseneg),
    ('falsep       :', falsepos),
    ('precision    :', precision),
    ('recall       :', recall),
    ('accuracy     :', accuracy),
)
for caption, metric_value in metric_report:
    print(caption, metric_value)

counttotal   : 9003.0
correct      : 7776
wrong        : 1227
ratioWrong   : 0.13628790403198934
ratioCorrect : 0.8637120959680107
truen        : 0.3361101854937243
truep        : 0.5276019104742864
falsen       : 0.08863712095968011
falsep       : 0.04765078307230923
precision    : 0.9171654759606103
recall       : 0.8561643835616438
accuracy     : 0.8637120959680107


In [None]:
# Five reviews predicted negative (0.0), sorted by rawPrediction in descending order.
predicted_negative = predictions.filter(col("prediction") == 0.0)
(
    predicted_negative
    .select("summary", "reviewTokens", "overall", "prediction")
    .orderBy(col("rawPrediction").desc())
    .show(5)
)

+--------------------+--------------------+-------+----------+
|             summary|        reviewTokens|overall|prediction|
+--------------------+--------------------+-------+----------+
|Buyer Beware - Yo...|[buyer, beware, -...|    2.0|       0.0|
|Awful Phone and T...|[awful, phone, te...|    1.0|       0.0|
|DO NOT BUY HERE I...|[buy, need, custo...|    1.0|       0.0|
|                JUNK|[junk, well, rece...|    1.0|       0.0|
|Poor 3-9x40 Hamme...|[poor, 3-9x40, ha...|    1.0|       0.0|
+--------------------+--------------------+-------+----------+
only showing top 5 rows



In [None]:
# Five reviews predicted positive (1.0), sorted by rawPrediction in ascending order.
predicted_positive = predictions.filter(col("prediction") == 1.0)
(
    predicted_positive
    .select("summary", "reviewTokens", "overall", "prediction")
    .orderBy(col("rawPrediction").asc())
    .show(5)
)

+--------------------+--------------------+-------+----------+
|             summary|        reviewTokens|overall|prediction|
+--------------------+--------------------+-------+----------+
|My DROID Story an...|[droid, story, co...|    5.0|       1.0|
| great trucker phone|[great, trucker, ...|    5.0|       1.0|
|    Favorite EDC Bag|[favorite, edc, b...|    4.0|       1.0|
|One of My Favorit...|[one, favorites!!...|    4.0|       1.0|
|Best Hopper I've ...|[best, hopper, us...|    4.0|       1.0|
+--------------------+--------------------+-------+----------+
only showing top 5 rows



In [None]:
# Persist the fitted pipeline, replacing any previously saved copy.
# Named `model_dir` rather than `dir` so the Python builtin dir() is not shadowed.
model_dir = "sentiment/"
model.write().overwrite().save(model_dir)

In [None]:
# Reload the persisted pipeline from disk.
# Named `model_dir` rather than `dir` so the Python builtin dir() is not shadowed.
model_dir = "sentiment/"
model = PipelineModel.load(model_dir)

In [None]:
# Read the collection configured in spark.mongodb.input.uri and inspect its schema.
# Note: this rebinds `df`, which previously held the Amazon reviews DataFrame.
df = spark.read.format("mongo").load()
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- prediction: double (nullable = true)
 |-- text: string (nullable = true)
 |-- timestamp_ms: string (nullable = true)



In [None]:
# Score the stored tweets with the reloaded sentiment pipeline.
# Only timestamp_ms and text are kept: the collection already has a `prediction`
# field, which is left out of the frame passed to the model.
tweets = spark.read.format("mongo").load()
df = tweets.select("timestamp_ms", "text")

# Only referenced by the disabled Bucketizer step below; kept for when it is re-enabled.
splits = [float("-inf"), 0, float("inf")]
#bucketizer = Bucketizer(inputCol="timestamp_ms",outputCol="sentiment",splits=splits)
#df5= bucketizer.transform(df)

predictions = model.transform(df)
tweet_sentiment = predictions.select('text', 'prediction')
tweet_sentiment.show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------+----------+
|text                                                                                                                         |prediction|
+-----------------------------------------------------------------------------------------------------------------------------+----------+
| done &lt;3
I Ten lub Jungwoo &lt;33                                                                                         |1.0       |
| Well done you 👍👍👍                                                                                                        |1.0       |
| ElbeDay 25th April 1945                                                                                                     |1.0       |
|We letting them titties tittie today 🤪 What’s a bra? Oooooh okay!                                                           |1.0       |
|  "Self-care isn't always bath