In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import wget
from pyspark.ml.feature import Bucketizer,RegexTokenizer,StopWordsRemover,CountVectorizer,IDF
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline,PipelineModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
#Spark Session creation configured to interact with Kfka and MongoDB
spark = SparkSession.builder.appName("pyspark-notebook").\
config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0,org.apache.spark:spark-avro_2.12:3.0.0,org.mongodb.spark:mongo-spark-connector_2.12:3.0.0").\
config("spark.mongodb.input.uri","mongodb://docker_mongo_1:27017/twitter_db.tweets").\
config("spark.mongodb.output.uri","mongodb://docker_mongo_1:27017/twitter_db.tweets").\
getOrCreate()

Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/lib/python3.7/dist-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.spark#spark-avro_2.12 added as a dependency
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f5d01655-b343-4dac-9196-8ebec5593e79;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.0.0 in central
	found org.apache.kafka#kafka-clients;2.4.1 in central
	found com.github.luben#zstd-jni;1.4.4-3 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.7.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in

In [3]:
#spark.read.json("reviews_Sports_and_Outdoors_5.json.gz").show(35)

In [4]:
#Download dataset if not exists and read it as spark dataframe
try:
    df0 = spark.read.json("reviews_Sports_and_Outdoors_5.json.gz")
except Exception as e:
    url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz"
    wget.download(url)
    df0 = spark.read.json("reviews_Sports_and_Outdoors_5.json.gz")

df = df0.withColumn("text",concat(col("summary"), lit(" "),col("reviewText")))\
 .drop("helpful")\
 .drop("reviewerID")\
 .drop("reviewerName")\
 .drop("reviewTime")
df.count()

                                                                                

296337

In [5]:
df.describe("overall").show()

                                                                                

+-------+------------------+
|summary|           overall|
+-------+------------------+
|  count|            296337|
|   mean| 4.393450699710128|
| stddev|0.9869053992908551|
|    min|               1.0|
|    max|               5.0|
+-------+------------------+



In [6]:
#Bucketize data and create labels 0 if overall rating is in (1.0,2.0), otherwise 1
df1 = df.filter("overall !=3")

splits = [-float("inf"), 4.0, float("inf")]

bucketizer = Bucketizer(splits=splits, inputCol="overall", outputCol="label")

df2= bucketizer.transform(df1)

df2.groupBy("overall","label").count().show()



+-------+-----+------+
|overall|label| count|
+-------+-----+------+
|    2.0|  0.0| 10204|
|    5.0|  1.0|188208|
|    1.0|  0.0|  9045|
|    4.0|  1.0| 64809|
+-------+-----+------+



                                                                                

In [7]:
#take sample to create train and test dataset
fractions = {1.0 : .1, 0.0 : 1.0}
df3 = df2.stat.sampleBy("label", fractions, 36)
df3.groupBy("label").count().show()



+-----+-----+
|label|count|
+-----+-----+
|  0.0|19249|
|  1.0|25224|
+-----+-----+



                                                                                

In [8]:
#Split data as 80-20% Train and Test dataset
splitSeed = 5043
trainingData, testData = df3.randomSplit([0.8, 0.2], splitSeed)

In [9]:
#Tokenize 
tokenizer = RegexTokenizer(inputCol="text",outputCol="reviewTokensUf",pattern="\\s+|[,.()\"]")

remover = StopWordsRemover(stopWords=StopWordsRemover.loadDefaultStopWords("english"),inputCol="reviewTokensUf",outputCol="reviewTokens")

In [10]:
#converts word documents to vectors of token counts
cv = CountVectorizer(inputCol="reviewTokens",outputCol="cv",vocabSize=296337)

In [11]:
#IDF model
idf = IDF(inputCol="cv",outputCol="features")

In [12]:
lr = LogisticRegression(maxIter=100,regParam=0.02,elasticNetParam=0.3)

In [13]:
#Creates a pipeline
steps =  [tokenizer, remover, cv, idf,lr]
pipeline = Pipeline(stages=steps)

In [14]:
model = pipeline.fit(trainingData)

24/11/19 12:26:16 WARN DAGScheduler: Broadcasting large task binary with size 1969.4 KiB
24/11/19 12:26:26 WARN DAGScheduler: Broadcasting large task binary with size 1968.6 KiB
24/11/19 12:26:26 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/11/19 12:26:26 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/11/19 12:26:27 WARN DAGScheduler: Broadcasting large task binary with size 1968.6 KiB
24/11/19 12:26:28 WARN DAGScheduler: Broadcasting large task binary with size 1968.6 KiB
24/11/19 12:26:28 WARN DAGScheduler: Broadcasting large task binary with size 1968.6 KiB
24/11/19 12:26:29 WARN DAGScheduler: Broadcasting large task binary with size 1968.6 KiB
24/11/19 12:26:29 WARN DAGScheduler: Broadcasting large task binary with size 1968.6 KiB
24/11/19 12:26:30 WARN DAGScheduler: Broadcasting large task binary with size 1968.6 KiB
24/11/19 12:26:30 WARN DAGScheduler: Broadcasting large task binary with size

In [15]:
#collecting all metrics
vocabulary = model.stages[2].vocabulary
weights = model.stages[-1].coefficients.toArray()
weights = [float(weight) for weight in weights]

In [16]:
schema = StructType([StructField('word', StringType()),
                     StructField('weight', FloatType())
                     ])
cdf = spark.createDataFrame(zip(vocabulary, weights), schema)

In [17]:
cdf.orderBy(desc("weight")).show(10)

[Stage 130:>                                                        (0 + 2) / 2]

+---------+----------+
|     word|    weight|
+---------+----------+
|    great| 0.5876225|
|   thoses|  0.325535|
|  perfect|0.32343474|
|     easy| 0.2615016|
|   highly|0.25427502|
|     love|0.23299988|
|excellent|0.22146676|
|     nice|0.21586789|
|     good|0.20862874|
|    works|0.20269535|
+---------+----------+
only showing top 10 rows



                                                                                

In [18]:
cdf.orderBy("weight").show(10)

+-------------+-----------+
|         word|     weight|
+-------------+-----------+
|     returned|-0.38842562|
|         poor|-0.33077022|
|      useless|-0.30299458|
|        waste|-0.27846226|
|        broke|-0.26966578|
|         junk| -0.2493974|
|       return|-0.24831308|
|disappointing|-0.22999014|
|    returning|-0.21706156|
| disappointed|-0.21414408|
+-------------+-----------+
only showing top 10 rows



In [19]:
predictions = model.transform(testData)

In [20]:
evaluator = BinaryClassificationEvaluator()  
areaUnderROC = evaluator.evaluate(predictions)

24/11/19 12:27:11 WARN DAGScheduler: Broadcasting large task binary with size 1986.0 KiB
                                                                                

In [21]:
predictions.show()

24/11/19 12:27:21 WARN DAGScheduler: Broadcasting large task binary with size 2003.5 KiB
[Stage 141:>                                                        (0 + 1) / 1]

+----------+-------+--------------------+--------------------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|      asin|overall|          reviewText|             summary|unixReviewTime|                text|label|      reviewTokensUf|        reviewTokens|                  cv|            features|       rawPrediction|         probability|prediction|
+----------+-------+--------------------+--------------------+--------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|7245456313|    1.0|I wish I would ha...|Defective - Be Ca...|    1354492800|Defective - Be Ca...|  0.0|[defective, -, be...|[defective, -, ca...|(71899,[0,11,15,1...|(71899,[0,11,15,1...|[1.99668098749145...|[0.88044816229074...|       0.0|
|7245456313|    5.0|I bought thi

                                                                                

In [22]:
#model evaluation
lp = predictions.select("label", "prediction")
counttotal = predictions.count()
correct = lp.filter(col("label") == col("prediction")).count()
wrong = lp.filter(~(col("label") == col("prediction"))).count()
ratioWrong = float(wrong) / float(counttotal)
lp = predictions.select(  "prediction","label")
counttotal = float(predictions.count())
correct = lp.filter(col("label") == col("prediction")).count()
wrong = lp.filter("label != prediction").count()
ratioWrong=wrong/counttotal
ratioCorrect=correct/counttotal
trueneg =( lp.filter(col("label") == 0.0).filter(col("label") == col("prediction")).count()) /counttotal
truepos = (lp.filter(col("label") == 1.0).filter(col("label") == col("prediction")).count())/counttotal
falseneg = (lp.filter(col("label") == 0.0).filter(~(col("label") == col("prediction"))).count())/counttotal
falsepos = (lp.filter(col("label") == 1.0).filter(~(col("label") == col("prediction"))).count())/counttotal

precision= truepos / (truepos + falsepos)
recall= truepos / (truepos + falseneg)
#fmeasure= 2  precision  recall / (precision + recall)
accuracy=(truepos + trueneg) / (truepos + trueneg + falsepos + falseneg)

24/11/19 12:27:31 WARN DAGScheduler: Broadcasting large task binary with size 1983.3 KiB
24/11/19 12:27:37 WARN DAGScheduler: Broadcasting large task binary with size 1983.3 KiB
24/11/19 12:27:46 WARN DAGScheduler: Broadcasting large task binary with size 1983.3 KiB
24/11/19 12:27:51 WARN DAGScheduler: Broadcasting large task binary with size 1983.3 KiB
24/11/19 12:27:57 WARN DAGScheduler: Broadcasting large task binary with size 1983.5 KiB
24/11/19 12:28:02 WARN DAGScheduler: Broadcasting large task binary with size 1983.5 KiB
24/11/19 12:28:07 WARN DAGScheduler: Broadcasting large task binary with size 1983.5 KiB
24/11/19 12:28:11 WARN DAGScheduler: Broadcasting large task binary with size 1983.5 KiB
                                                                                

In [23]:
print('counttotal   :', counttotal     )
print('correct      :', correct        )
print('wrong        :', wrong          )
print('ratioWrong   :', ratioWrong     )
print('ratioCorrect :', ratioCorrect   )
print('truen        :', trueneg          )
print('truep        :', truepos          )
print('falsen       :', falseneg         )
print('falsep       :', falsepos         )
print('precision    :', precision      )
print('recall       :', recall         )
#print('fmeasure     :', fmeasure       )
print('accuracy     :', accuracy       )

counttotal   : 9003.0
correct      : 7776
wrong        : 1227
ratioWrong   : 0.13628790403198934
ratioCorrect : 0.8637120959680107
truen        : 0.3361101854937243
truep        : 0.5276019104742864
falsen       : 0.08863712095968011
falsep       : 0.04765078307230923
precision    : 0.9171654759606103
recall       : 0.8561643835616438
accuracy     : 0.8637120959680107


In [24]:
predictions.filter(col("prediction") == 0.0)\
.select("summary","reviewTokens","overall","prediction")\
.orderBy(desc("rawPrediction")).show(5)

24/11/19 12:28:17 WARN DAGScheduler: Broadcasting large task binary with size 1996.3 KiB
[Stage 162:>                                                        (0 + 1) / 1]

+--------------------+--------------------+-------+----------+
|             summary|        reviewTokens|overall|prediction|
+--------------------+--------------------+-------+----------+
|Buyer Beware - Yo...|[buyer, beware, -...|    2.0|       0.0|
|Awful Phone and T...|[awful, phone, te...|    1.0|       0.0|
|DO NOT BUY HERE I...|[buy, need, custo...|    1.0|       0.0|
|                JUNK|[junk, well, rece...|    1.0|       0.0|
|Poor 3-9x40 Hamme...|[poor, 3-9x40, ha...|    1.0|       0.0|
+--------------------+--------------------+-------+----------+
only showing top 5 rows



                                                                                

In [25]:
predictions.filter(col("prediction")== 1.0)\
.select("summary","reviewTokens","overall","prediction")\
.orderBy("rawPrediction").show(5)

24/11/19 12:28:24 WARN DAGScheduler: Broadcasting large task binary with size 1996.2 KiB
[Stage 163:>                                                        (0 + 1) / 1]

+--------------------+--------------------+-------+----------+
|             summary|        reviewTokens|overall|prediction|
+--------------------+--------------------+-------+----------+
|My DROID Story an...|[droid, story, co...|    5.0|       1.0|
| great trucker phone|[great, trucker, ...|    5.0|       1.0|
|    Favorite EDC Bag|[favorite, edc, b...|    4.0|       1.0|
|One of My Favorit...|[one, favorites!!...|    4.0|       1.0|
|Best Hopper I've ...|[best, hopper, us...|    4.0|       1.0|
+--------------------+--------------------+-------+----------+
only showing top 5 rows



                                                                                

In [26]:
dir = "sentiment/"
model.write().overwrite().save(dir)

24/11/19 12:28:32 WARN TaskSetManager: Stage 168 contains a task of very large size (1385 KiB). The maximum recommended task size is 1000 KiB.
24/11/19 12:28:34 WARN TaskSetManager: Stage 171 contains a task of very large size (1153 KiB). The maximum recommended task size is 1000 KiB.


In [27]:
dir = "sentiment/"
model = PipelineModel.load(dir)

In [29]:
# Chuyển đổi dữ liệu từ Spark DataFrame sang Pandas
train_df = trainingData.select("text", "label").toPandas()
test_df = testData.select("text", "label").toPandas()

# Tokenizer và Padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Cấu hình tokenizer
max_vocab_size = 10000  # Số lượng từ tối đa
max_seq_length = 100    # Chiều dài chuỗi tối đa
tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(train_df['text'])

# Biến đổi văn bản sang chuỗi số
X_train = tokenizer.texts_to_sequences(train_df['text'])
X_test = tokenizer.texts_to_sequences(test_df['text'])

# Padding để có độ dài chuỗi bằng nhau
X_train = pad_sequences(X_train, maxlen=max_seq_length)
X_test = pad_sequences(X_test, maxlen=max_seq_length)

# Nhãn
y_train = train_df['label'].values
y_test = test_df['label'].values

# Xây dựng mô hình LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential([
    Embedding(input_dim=max_vocab_size, output_dim=128, input_length=max_seq_length),
    LSTM(64, return_sequences=False),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # Phân loại nhị phân
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Huấn luyện mô hình
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=32)

# Đánh giá mô hình
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")


2024-11-19 12:32:05.814470: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-19 12:32:06.792969: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-19 12:32:06.976863: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-11-19 12:32:06.976969: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1280000   
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.3467799127101898
Test Accuracy: 0.8903698921203613


In [None]:
# Dự đoán kết quả trên tập test
predictions = model.predict(X_test)

# Chuyển đổi xác suất thành nhãn dự đoán (0 hoặc 1)
predicted_labels = (predictions > 0.5).astype("int32")

# In một số dự đoán mẫu
import pandas as pd

# Gộp văn bản, nhãn thực tế và dự đoán vào một DataFrame
results = pd.DataFrame({
    'Text': test_df['text'].values[:10],           # Lấy 10 văn bản đầu tiên
    'Actual Label': y_test[:10],
    'Predicted Label': predicted_labels[:10].flatten(),
    'Prediction Probability': predictions[:10].flatten()
})

print(results)



                                                Text  Actual Label  \
0  Defective - Be Careful! I wish I would have ta...           0.0   
1  Great product, awesome warranty, amazing custo...           1.0   
2  GREAT product for the money I used to be a per...           1.0   
3  Love Love Love the bands My arms are burning a...           1.0   
4  Toddlers love this thing Use this at pre schoo...           1.0   
5  Very Cheaply made product. As I write this rev...           0.0   
6  Really good gift - more for fun than for "prac...           1.0   
7  Always by my side I was given this tool some 1...           1.0   
8  Victorinox Multi-Tool The victor inbox Multi-T...           1.0   
9  Cheap product Cheap product!  Within two days ...           0.0   

   Predicted Label  Prediction Probability  
0                0                0.001845  
1                1                0.883534  
2                1                0.999785  
3                1                0.998045  
4   

In [None]:
dir = "sentiment/"
model.write().overwrite().save(dir)

In [None]:
dir = "sentiment/"
model = PipelineModel.load(dir)

In [28]:
df = spark.read.format("mongo").load()
df.printSchema()

Py4JJavaError: An error occurred while calling o803.load.
: com.mongodb.MongoTimeoutException: Timed out after 30000 ms while waiting to connect. Client view of cluster state is {type=UNKNOWN, servers=[{address=docker_mongo_1:27017, type=UNKNOWN, state=CONNECTING, exception={com.mongodb.MongoSocketException: docker_mongo_1}, caused by {java.net.UnknownHostException: docker_mongo_1}}]
	at com.mongodb.internal.connection.BaseCluster.getDescription(BaseCluster.java:177)
	at com.mongodb.internal.connection.SingleServerCluster.getDescription(SingleServerCluster.java:41)
	at com.mongodb.client.internal.MongoClientDelegate.getConnectedClusterDescription(MongoClientDelegate.java:147)
	at com.mongodb.client.internal.MongoClientDelegate.createClientSession(MongoClientDelegate.java:98)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.getClientSession(MongoClientDelegate.java:278)
	at com.mongodb.client.internal.MongoClientDelegate$DelegateOperationExecutor.execute(MongoClientDelegate.java:182)
	at com.mongodb.client.internal.MongoDatabaseImpl.executeCommand(MongoDatabaseImpl.java:194)
	at com.mongodb.client.internal.MongoDatabaseImpl.runCommand(MongoDatabaseImpl.java:163)
	at com.mongodb.client.internal.MongoDatabaseImpl.runCommand(MongoDatabaseImpl.java:158)
	at com.mongodb.spark.MongoConnector.$anonfun$hasSampleAggregateOperator$1(MongoConnector.scala:234)
	at com.mongodb.spark.MongoConnector.$anonfun$withDatabaseDo$1(MongoConnector.scala:171)
	at com.mongodb.spark.MongoConnector.withMongoClientDo(MongoConnector.scala:154)
	at com.mongodb.spark.MongoConnector.withDatabaseDo(MongoConnector.scala:171)
	at com.mongodb.spark.MongoConnector.hasSampleAggregateOperator(MongoConnector.scala:234)
	at com.mongodb.spark.rdd.MongoRDD.hasSampleAggregateOperator$lzycompute(MongoRDD.scala:221)
	at com.mongodb.spark.rdd.MongoRDD.hasSampleAggregateOperator(MongoRDD.scala:221)
	at com.mongodb.spark.sql.MongoInferSchema$.apply(MongoInferSchema.scala:68)
	at com.mongodb.spark.sql.DefaultSource.constructRelation(DefaultSource.scala:97)
	at com.mongodb.spark.sql.DefaultSource.createRelation(DefaultSource.scala:50)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:339)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:279)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:268)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:268)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:203)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
df = spark.read.format("mongo").load().select("timestamp","text")
splits = [-float("inf"), 0, float("inf")]
#bucketizer = Bucketizer(inputCol="timestamp_ms",outputCol="sentiment",splits=splits)

#df5= bucketizer.transform(df)
predictions = model.transform(df)
predictions.select('text','prediction').show(truncate=False)