In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import SQLContext

In [3]:
sc = SparkContext()

In [4]:
spark = SparkSession(sc)

### Read the dataset

In [5]:
import pandas as pd
from datetime import datetime
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [6]:
df = pd.read_csv('Data_pre.csv', index_col=[0])
df.head()

Unnamed: 0,comment,comment_pre,target
0,chưa ktra nên chưa biết đc đứa nào cũng đc như...,kiểm,2
1,Lời đầu tiên cho phép được xin vì sự sơ suất c...,cho_phép sơ_suất hát rõ_ràng,2
2,1m6 50kg size M khá vừa vặn nhưng hok có cơ nê...,vừa_vặn đỏ đỏ cổ đẹp nhức đẹp ổn,2
3,"Lần đầu mua shop, vải cũg khá ok. hợp vs mùa h...",đầu_shop hợp,2
4,"áo đẹp form đẹp, mọi thứ đều ok, giao hàng nha...",đẹp form đẹp hàng thích,2


In [7]:
# Drop NaN values
df = df.dropna().reset_index(drop=True)

In [8]:
df_schema = StructType([
                        StructField("comment", StringType(), True),
                        StructField("comment_pre", StringType(), True),
                        StructField("target", StringType(), True),
                       ])
data = spark.createDataFrame(df, schema=df_schema)

In [9]:
data.show(10)

+--------------------+--------------------+------+
|             comment|         comment_pre|target|
+--------------------+--------------------+------+
|chưa ktra nên chư...|                kiểm|     2|
|Lời đầu tiên cho ...|cho_phép sơ_suất ...|     2|
|1m6 50kg size M k...|vừa_vặn đỏ đỏ cổ ...|     2|
|Lần đầu mua shop,...|        đầu_shop hợp|     2|
|áo đẹp form đẹp, ...|đẹp form đẹp hàng...|     2|
|Áo rất là đẹp nha...|đẹp chất_lượng dà...|     2|
|một mét sáu mươi ...|sáu_mươi_lăm thân...|     2|
|Ko sao em không b...|            quên học|     2|
|Không có gì để ch...|             chê hợp|     2|
|Nhận hàng gói rất...|gọn_gàng đẹp ôm t...|     2|
+--------------------+--------------------+------+
only showing top 10 rows



In [10]:
data.printSchema()

root
 |-- comment: string (nullable = true)
 |-- comment_pre: string (nullable = true)
 |-- target: string (nullable = true)



In [11]:
data.count()

445354

In [12]:
from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.classification import LogisticRegression

In [13]:
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="comment_pre", outputCol="words")

In [14]:
countVectors = CountVectorizer(inputCol="words", outputCol="features")

In [15]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

In [16]:
label_stringIdx = StringIndexer(inputCol = "target", outputCol = "label")

pipeline = Pipeline(stages=[regexTokenizer, countVectors, label_stringIdx])

In [17]:
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)

+--------------------+--------------------+------+--------------------+--------------------+-----+
|             comment|         comment_pre|target|               words|            features|label|
+--------------------+--------------------+------+--------------------+--------------------+-----+
|chưa ktra nên chư...|                kiểm|     2|              [kiểm]| (28925,[122],[1.0])|  0.0|
|Lời đầu tiên cho ...|cho_phép sơ_suất ...|     2|[cho_phép, sơ_suấ...|(28925,[633,666,2...|  0.0|
|1m6 50kg size M k...|vừa_vặn đỏ đỏ cổ ...|     2|[vừa_vặn, đỏ, đỏ,...|(28925,[0,2,91,13...|  0.0|
|Lần đầu mua shop,...|        đầu_shop hợp|     2|     [đầu_shop, hợp]|(28925,[44,6983],...|  0.0|
|áo đẹp form đẹp, ...|đẹp form đẹp hàng...|     2|[đẹp, form, đẹp, ...|(28925,[0,4,21,33...|  0.0|
+--------------------+--------------------+------+--------------------+--------------------+-----+
only showing top 5 rows



### Train test split

In [18]:
(trainingData, testData) = dataset.randomSplit([0.8, 0.2])
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 356316
Test Dataset Count: 89038


### Build model with Logistic Regression

In [19]:
start = datetime.now()

In [20]:
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 0).select("comment_pre","probability","label","prediction")\
.orderBy("probability", ascending=False).show(n = 10, truncate = 30)

+------------------------------+------------------------------+-----+----------+
|                   comment_pre|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|đẹp đẹp đẹp đẹp đẹp đẹp đẹp...|[0.9998667731479504,4.79142...|  0.0|       0.0|
|đẹp thoáng tốt đẹp thoáng t...|[0.9988660702181836,4.65874...|  0.0|       0.0|
|ổn ổn ổn ổn ổn ổn ổn ổn ổn ...|[0.998860177352765,2.521460...|  0.0|       0.0|
|hàng đẹp sop đẹp sop đẹp so...|[0.9987007645022109,6.05129...|  0.0|       0.0|
|đẹp ưng_ý đẹp ưng_ý hài_lòn...|[0.9986842838113984,5.48669...|  0.0|       0.0|
|ổn ổn ổn ổn ổn ổn ổn ổn ổn ...|[0.9979086658624381,5.29852...|  0.0|       0.0|
|mềm cực mềm cực mềm cực mềm...|[0.9978745133656095,0.00122...|  0.0|       0.0|
|đẹp mát hơi đẹp mát hơi đẹp...|[0.9977432884412953,5.87926...|  0.0|       0.0|
|đẹp không_mỏng đáng_giá ủng...|[0.9969626709755939,0.00136...|  0.0|       0.0|
|ổn ổn ổn ổn ổn ổn ổn ổn ổn 

In [21]:
end = datetime.now()
time_taken = end - start
print('Time: ', time_taken) 

Time:  0:00:36.750560


In [22]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
lrAccuracy = evaluator.evaluate(predictions)
print(lrAccuracy)

0.8182369192669691


In [23]:
trainingSummary = lrModel.summary

In [24]:
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("\nAccuracy: %s\n\nFPR: %s\n\nTPR: %s\n\nF-measure: %s\n\nPrecision: %s\n\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))


Accuracy: 0.8729498535008251

FPR: 0.6995033871190769

TPR: 0.8729498535008251

F-measure: 0.8314279021884564

Precision: 0.8708411729252752

Recall: 0.8729498535008251


- Logistic Regression has accuracy of 81%

### Build model with Naive Bayes

In [25]:
start = datetime.now()

In [26]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
nbModel = nb.fit(trainingData)
predictions = nbModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("comment_pre","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+-----+----------+
|                   comment_pre|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|đẹp yêu yêu yêu yêu yêu yêu...|[1.0,3.4961041335706486E-17...|  0.0|       0.0|
|thề đẹp dã í săn mã rẻ đẹp ...|[1.0,2.0141526284388915E-18...|  0.0|       0.0|
|đẹp yêu yêu yêu yêu yêu yêu...|[1.0,9.182706520196624E-19,...|  0.0|       0.0|
|tuyệt_vời tuyệt_vời tuyệt_v...|[1.0,5.7850294481287825E-19...|  0.0|       0.0|
|chất_lượng đẹp yêu yêu yêu ...|[1.0,3.622900912350774E-19,...|  0.0|       0.0|
|yêu yêu thích yêu yêu thích...|[1.0,7.002599871615441E-20,...|  0.0|       0.0|
|ưng xịn xò ưng ủng_hộ xịn x...|[1.0,5.830394750651891E-20,...|  0.0|       0.0|
|yêu yêu yêu yêu yêu yêu yêu...|[1.0,1.07916399417859E-20,7...|  0.0|       0.0|
|yêu yêu yêu yêu yêu yêu yêu...|[1.0,1.07916399417859E-20,7...|  0.0|       0.0|
|ủng_hộ yêu yêu yêu yêu yêu 

In [27]:
end = datetime.now()
time_taken = end - start
print('Time: ', time_taken) 

Time:  0:00:24.352937


In [28]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
nbAccuracy = evaluator.evaluate(predictions)
print(nbAccuracy)

0.8628350573584076


- Naive Bayes has accuracy of 86%

### Build model with Random Forest

In [29]:
start = datetime.now()

In [30]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)
# Train model with Training Data
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("comment_pre","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+-----+----------+
|                   comment_pre|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|đẹp đẹp cẩn_thận dày_dặn th...|[0.8700988519150176,0.07335...|  0.0|       0.0|
|dày_dặn thích thích thích t...|[0.8685859119206152,0.07372...|  0.0|       0.0|
|cực_kì đẹp rẻ mềm dãn tốt v...|[0.8683324624890859,0.07402...|  0.0|       0.0|
|đẹp hài_lòng thoải_mái giãn...|[0.8675324094698945,0.07484...|  0.0|       0.0|
|ưng đàn_ông xịn mịn yêu mềm...|[0.8675013407901839,0.07472...|  0.0|       0.0|
|yêu ạaa đẹp mát rẻ không_đợ...|[0.866811636660231,0.075148...|  0.0|       0.0|
|xịn xò chất_lượng tốt dày_d...|[0.866626111225374,0.075237...|  0.0|       0.0|
|chất_lượng đồng_tiền ưng má...|[0.8660811186442698,0.07546...|  0.0|       0.0|
|đẹp chất_lượng tốt dãn tập ...|[0.8660783595326348,0.07521...|  0.0|       0.0|
|tốt vô_cùng mềm mịn_mát thí

In [31]:
end = datetime.now()
time_taken = end - start
print('Time: ', time_taken) 

Time:  0:04:56.503171


In [32]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
rfAccuracy = evaluator.evaluate(predictions)
print(rfAccuracy) 

0.7783746062243122


- Random Forest has accuracy of 77%

### Build model with Decision Tree

In [33]:
start = datetime.now()

In [34]:
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt.fit(trainingData)
predictions = dtModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("comment_pre","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+----------------------------+------------------------------+-----+----------+
|                 comment_pre|                   probability|label|prediction|
+----------------------------+------------------------------+-----+----------+
|                  chất_lượng|[0.8902851210738514,0.06363...|  0.0|       0.0|
|    chất_lượng tuyệt_vời đẹp|[0.8902851210738514,0.06363...|  0.0|       0.0|
|                  chất_lượng|[0.8902851210738514,0.06363...|  0.0|       0.0|
|tốt tiện nhiệt_tình gọn_gàng|[0.8902851210738514,0.06363...|  0.0|       0.0|
|        chất_lượng tuyệt_vời|[0.8902851210738514,0.06363...|  0.0|       0.0|
|                  chất_lượng|[0.8902851210738514,0.06363...|  0.0|       0.0|
|               tuyệt_vời tốt|[0.8902851210738514,0.06363...|  0.0|       0.0|
|                  chất_lượng|[0.8902851210738514,0.06363...|  0.0|       0.0|
|                         đẹp|[0.8902851210738514,0.06363...|  0.0|       0.0|
|                  chất_lượng|[0.8902851210738514,0.

In [35]:
end = datetime.now()
time_taken = end - start
print('Time: ', time_taken) 

Time:  0:04:19.847811


In [36]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
dtAccuracy = evaluator.evaluate(predictions)
print(dtAccuracy) 

0.8560316851569771


- Decision Tree has accuracy of 85%

### Oversampling

In [37]:
data_a = data.filter(data['target'] == 0)
data_b = data.filter(data['target'] == 1)
data_c = data.filter(data['target'] == 2)


a_count = data_a.count()
b_count = data_b.count()
c_count = data_c.count() 
ratio_a_c = c_count / a_count
ratio_b_c = c_count / b_count

data_a_overampled = data_a.sample(withReplacement=True, fraction=ratio_a_c)
data_b_overampled = data_b.sample(withReplacement=True, fraction=ratio_b_c)
data_new = data_c.unionAll(data_a_overampled).unionAll(data_b_overampled)

In [38]:
data_new.groupBy('target').count().show()

+------+------+
|target| count|
+------+------+
|     2|378476|
|     0|377973|
|     1|378410|
+------+------+



In [39]:
pipelineFit_os = pipeline.fit(data_new)
dataset_os = pipelineFit_os.transform(data_new)
dataset_os.show(5)

+--------------------+--------------------+------+--------------------+--------------------+-----+
|             comment|         comment_pre|target|               words|            features|label|
+--------------------+--------------------+------+--------------------+--------------------+-----+
|chưa ktra nên chư...|                kiểm|     2|              [kiểm]|  (28925,[77],[1.0])|  0.0|
|Lời đầu tiên cho ...|cho_phép sơ_suất ...|     2|[cho_phép, sơ_suấ...|(28925,[419,710,1...|  0.0|
|1m6 50kg size M k...|vừa_vặn đỏ đỏ cổ ...|     2|[vừa_vặn, đỏ, đỏ,...|(28925,[0,3,97,13...|  0.0|
|Lần đầu mua shop,...|        đầu_shop hợp|     2|     [đầu_shop, hợp]|(28925,[65,10267]...|  0.0|
|áo đẹp form đẹp, ...|đẹp form đẹp hàng...|     2|[đẹp, form, đẹp, ...|(28925,[0,7,33,36...|  0.0|
+--------------------+--------------------+------+--------------------+--------------------+-----+
only showing top 5 rows



In [40]:
# Train test split
(trainingData, testData) = dataset_os.randomSplit([0.8, 0.2])
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 907809
Test Dataset Count: 227050


### Logistic Regression with oversampled data

In [41]:
start = datetime.now()

In [42]:
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 0).select("comment_pre","probability","label","prediction")\
.orderBy("probability", ascending=False).show(n = 10, truncate = 30)

+------------------------------+------------------------------+-----+----------+
|                   comment_pre|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|đẹp hàng đẹp hàng đẹp hàng ...|[0.9999999999999996,5.46253...|  0.0|       0.0|
|đẹp đẹp đẹp đẹp đẹp đẹp đẹp...|[0.9999999999999996,5.37355...|  0.0|       0.0|
|đẹp đẹp tốt nhiềucười ơn nh...|[0.9999999969980284,2.01424...|  0.0|       0.0|
|đẹp đẹp đẹp đẹp đẹp đẹp đẹp...|[0.9999986272499455,1.21294...|  0.0|       0.0|
|tuyệt_vời tuyệt_vời tuyệt_v...|[0.9999646646899828,1.52591...|  0.0|       0.0|
|đẹp ghé đẹp ghé đẹp ghé đẹp...|[0.9997007057486806,7.39566...|  0.0|       0.0|
|tuyệt_vời rẻ đẹp phục_vụ tố...|[0.9995586269667164,2.57717...|  0.0|       0.0|
|ổn ổn ổn ổn ổn ổn ổn ổn ổn ...|[0.9994085227822389,5.90857...|  0.0|       0.0|
|mát đẹp chuẩn gầy gầy trông...|[0.999095363304112,6.842082...|  0.0|       0.0|
|đẹp mềm_mại xịn sò ổn ủng_h

In [43]:
end = datetime.now()
time_taken = end - start
print('Time: ', time_taken) 

Time:  0:01:32.910064


In [44]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
lrAccuracy = evaluator.evaluate(predictions)
print(lrAccuracy)

0.7710234660608233


- Logistic Regression with oversampled data has accuracy of 77%

### Naive Bayes with oversampled data

In [45]:
start = datetime.now()

In [46]:
nb = NaiveBayes(smoothing=1)
nbModel = nb.fit(trainingData)
predictions = nbModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("comment_pre","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+-----+----------+
|                   comment_pre|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|ưng thích thích thích thích...|[1.0,1.0569712671654745E-16...|  0.0|       0.0|
|lửng ổn thích thích thích t...|[1.0,4.109820975467683E-17,...|  0.0|       0.0|
|chất ổn ủng_hộ cười_cười_cư...|[1.0,2.669231097798073E-17,...|  0.0|       0.0|
|thích thích thích thích thí...|[1.0,1.772752178508203E-17,...|  0.0|       0.0|
|ổn_đáng săn thích thích thí...|[1.0,1.6492714167417618E-17...|  0.0|       0.0|
|okokokok thích thích thích ...|[1.0,1.1068827058355335E-17...|  0.0|       0.0|
|thích thích thích thích thí...|[1.0,9.896738163256082E-18,...|  0.0|       0.0|
|thích thích thích thích thí...|[1.0,9.896738163256082E-18,...|  0.0|       0.0|
|thân_thiện không_thích hơi ...|[1.0,5.746620029882585E-18,...|  0.0|       0.0|
|tuyệt_vời tuyệt_vời tuyệt_v

In [47]:
end = datetime.now()
time_taken = end - start
print('Time: ', time_taken) 

Time:  0:00:53.080708


In [48]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
nbAccuracy = evaluator.evaluate(predictions)
print(nbAccuracy)

0.7680845818182123


- Naive Bayes with oversampled data has accuracy of 76%

### Decision Tree with oversampled data

In [49]:
start = datetime.now()

In [50]:
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt.fit(trainingData)
predictions = dtModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("comment_pre","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+-----+----------+
|                   comment_pre|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|                       đẹp tốt|[0.9569180163099344,0.02939...|  0.0|       0.0|
|                       đẹp tốt|[0.9569180163099344,0.02939...|  0.0|       0.0|
|                       đẹp tốt|[0.9569180163099344,0.02939...|  0.0|       0.0|
|  chất_lượng tuyệt_vời đẹp tốt|[0.9569180163099344,0.02939...|  0.0|       0.0|
|                       đẹp tốt|[0.9569180163099344,0.02939...|  0.0|       0.0|
|                       đẹp tốt|[0.9569180163099344,0.02939...|  0.0|       0.0|
|                       đẹp tốt|[0.9569180163099344,0.02939...|  0.0|       0.0|
|                       đẹp tốt|[0.9569180163099344,0.02939...|  0.0|       0.0|
|chất_lượng tuyệt_vời đẹp đẹ...|[0.9569180163099344,0.02939...|  0.0|       0.0|
|                       đẹp 

In [51]:
end = datetime.now()
time_taken = end - start
print('Time: ', time_taken) 

Time:  0:10:54.880693


In [52]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
dtAccuracy = evaluator.evaluate(predictions)
print(dtAccuracy) 

0.5744736053797374


- Decision Tree with oversampled data has accuracy of 57%

### To conclude: 
- Models with oversampled data have low accuracy (using LDS9's ML). 
- Naive Bayes with original data has the highest accuracy among LDS9 models but still lower than Decision Tree with oversampled data (89% accuracy) and Logistic Regression with original data (90% accuracy) using LDS6's ML models 
- We'll choose Decision Tree from LDS6 because Logistic Regression from original data has very low recall and f1-score.