In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [3]:
SparkContext.setSystemProperty('spark.executor.memory', '16g') 
sc = SparkContext(master='local', appName='NLP_Hotel_Review')

In [4]:
spark = SparkSession(sc)

In [5]:
import pandas as pd
df= pd.read_excel(r'Dataset/tripadvisor_dataset.xlsx', index_col= 0)

In [6]:
df.head()

Unnamed: 0,hotel_name,new_text,sentiment
0,Hotel des Arts Saigon Mgallery,tuyệt_vời trải_nghiệm tuyệt_vời ghé thân_thiện...,1
1,Hotel des Arts Saigon Mgallery,đồng_tiền dịch_vụ phong_cách tận_tâm hơi thích...,1
2,Hotel des Arts Saigon Mgallery,ấn_tượng chú_ý mgalery lướt đắm chìm bình_yên ...,1
3,Hotel des Arts Saigon Mgallery,decor thích ngắm bo tròn thư_thái lắm thượng b...,1
4,Hotel des Arts Saigon Mgallery,ấm cúngks không_lớn lắm trang_trí sang_trọng n...,1


In [7]:
df.count()

hotel_name    78968
new_text      78949
sentiment     78968
dtype: int64

In [8]:
df.isna().any(axis=None)

True

=> Have NaN Values

In [9]:
df = df.dropna()

In [10]:
df.isna().any(axis=None)

False

In [11]:
df = df.drop_duplicates()

In [12]:
df.count()

hotel_name    78947
new_text      78947
sentiment     78947
dtype: int64

In [13]:
df['new_text'] = df['new_text'].astype(str)

In [14]:
data = spark.createDataFrame(df)

In [15]:
data.show(5)

+--------------------+--------------------+---------+
|          hotel_name|            new_text|sentiment|
+--------------------+--------------------+---------+
|Hotel des Arts Sa...|tuyệt_vời trải_ng...|        1|
|Hotel des Arts Sa...|đồng_tiền dịch_vụ...|        1|
|Hotel des Arts Sa...|ấn_tượng chú_ý mg...|        1|
|Hotel des Arts Sa...|decor thích ngắm ...|        1|
|Hotel des Arts Sa...|ấm cúngks không_l...|        1|
+--------------------+--------------------+---------+
only showing top 5 rows



In [16]:
data.printSchema()

root
 |-- hotel_name: string (nullable = true)
 |-- new_text: string (nullable = true)
 |-- sentiment: long (nullable = true)



In [17]:
data.head(3)

[Row(hotel_name='Hotel des Arts Saigon Mgallery', new_text='tuyệt_vời trải_nghiệm tuyệt_vời ghé thân_thiện tươi xanh luông thích trang_phục đẹp lắm lắm đoán thành_phố đẹp bận thèm ngắm zì lắc tinh_tế trắng chủ_đạo cảm_giác dễ_chịu bơi bơi ngắm thành_phố ngắm tuyệt_vời vui_chơi bận quên bơi ngắm bơi club thư_giãn dễ_chịu thích thích quá_trời luông đa_dạng cá_hồi bảo không_gian bầy đẹp cổ_kính bầy trí chi_là', sentiment=1),
 Row(hotel_name='Hotel des Arts Saigon Mgallery', new_text='đồng_tiền dịch_vụ phong_cách tận_tâm hơi thích_hợp chống trải premium tiếc cám_ơn đọc', sentiment=1),
 Row(hotel_name='Hotel des Arts Saigon Mgallery', new_text='ấn_tượng chú_ý mgalery lướt đắm chìm bình_yên thoải_mái đẳng_cấp thân_thiện nở cười tận_tình giúp_đỡ nhẹ kéo tươi_cười auto upgrade m nghỉ đẹp kèm sốc êm họa ấn_tượng thăm ấn_tượng màn_hình chào_mừng kèm cảm_giác hihi thân đón hóa_ra thư_giãn ưa_thích cực toàn đẹp thành gọi_là ngất_ngây dừng type nh', sentiment=1)]

In [18]:
data.count()

78947

In [19]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import isnan, when, count, col, udf

In [20]:
data.drop_duplicates().count()

78947

In [21]:
data = data.drop_duplicates()

In [22]:
data.count()

78947

In [23]:
data = data.withColumn('sentiment', data['sentiment'].cast(DoubleType()))

In [24]:
data.show(5, truncate=True)

+--------------------+--------------------+---------+
|          hotel_name|            new_text|sentiment|
+--------------------+--------------------+---------+
|Hotel des Arts Sa...|tuyệt fong pháp h...|      1.0|
|Fusion Suites Saigon|tốt đẹpgiá tốt gầ...|      1.0|
|Vinpearl Luxury L...|nghỉ tận_tình phụ...|      1.0|
|Liberty Central S...|thôi_nôi gia_đình...|      1.0|
|Silverland Jolie ...|xuất sắcở quen đi...|      1.0|
+--------------------+--------------------+---------+
only showing top 5 rows



In [25]:
from pyspark.sql.functions import length
data = data.withColumn('length',length(data['new_text']))

In [26]:
data.show(5)

+--------------------+--------------------+---------+------+
|          hotel_name|            new_text|sentiment|length|
+--------------------+--------------------+---------+------+
|Hotel des Arts Sa...|tuyệt fong pháp h...|      1.0|   122|
|Fusion Suites Saigon|tốt đẹpgiá tốt gầ...|      1.0|    58|
|Vinpearl Luxury L...|nghỉ tận_tình phụ...|      1.0|   103|
|Liberty Central S...|thôi_nôi gia_đình...|      1.0|   195|
|Silverland Jolie ...|xuất sắcở quen đi...|      1.0|   157|
+--------------------+--------------------+---------+------+
only showing top 5 rows



In [27]:
# Pretty Clear Difference
data.groupby('sentiment').mean().show()

+---------+--------------+------------------+
|sentiment|avg(sentiment)|       avg(length)|
+---------+--------------+------------------+
|      0.0|           0.0| 122.7907918036934|
|      1.0|           1.0|100.16907643691867|
+---------+--------------+------------------+



In [28]:
data.groupBy('sentiment').count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
|      0.0|11859|
|      1.0|67088|
+---------+-----+



### Undersampling

In [29]:
#kiêm tra sự khác biệt dữ liệu giữa các nhóm
df_1 = data.filter(col('sentiment') == 1.0)
df_2 = data.filter(col('sentiment') == 0.0)
ratio = int(df_1.count()/df_2.count())
print("ratio 1/2: {}".format(ratio))

ratio 1/2: 5


In [30]:
from pyspark.sql.functions import col, explode, array, lit

# resample df_1
undersampled_df_1 = df_1.sample(False, 1/ratio)

combined_df = undersampled_df_1.unionAll(df_2)
combined_df.show(10)

+--------------------+--------------------+---------+------+
|          hotel_name|            new_text|sentiment|length|
+--------------------+--------------------+---------+------+
|Khách sạn New W...|biến new world th...|      1.0|   109|
|Khách sạn Happy Land|thân_thiện hữu th...|      1.0|   111|
|OYO 226 Tao Đàn H...|tham_quan di_chuy...|      1.0|    82|
|         CuCu Hostel|sạch_sẽ nhiệt_tìn...|      1.0|    42|
|The Oriental Jade...|tốt dễ_dàng tốt k...|      1.0|   122|
|Khách sạn Mỹ Kinh|tuyệt_vời ởvị_trí...|      1.0|   133|
|Hanoian Central H...|tuyệt_vời thân_th...|      1.0|    73|
|Splendid Boutique...|hoàn giúp lập tổ_...|      1.0|    78|
|Hanoi Golden Char...|sạch tuyệt_vời ng...|      1.0|    84|
|Hanoi Holiday Cen...|tuyệt_vời trị đầu...|      1.0|    61|
+--------------------+--------------------+---------+------+
only showing top 10 rows



In [31]:
combined_df.groupBy('sentiment').count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
|      1.0|13388|
|      0.0|11859|
+---------+-----+



### Feature Transformation


In [32]:
from pyspark.ml.feature import Tokenizer,StopWordsRemover
from pyspark.ml.feature import CountVectorizer, IDF, StringIndexer

tokenizer = Tokenizer(inputCol='new_text', outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens',outputCol='c_vec')
idf = IDF(inputCol="c_vec", outputCol="tf_idf")

In [33]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

clean_up = VectorAssembler(inputCols=['tf_idf','tf_idf','length'],
                           outputCol='features')

### Pipeline

In [34]:
from pyspark.ml import Pipeline
data_prep_pipe = Pipeline(stages=[tokenizer, stopremove, count_vec, idf,clean_up])

In [35]:
cleaner = data_prep_pipe.fit(combined_df)

In [36]:
clean_data = cleaner.transform(combined_df)

In [37]:
clean_data = clean_data.select(['sentiment','features'])

In [38]:
clean_data.show(10)

+---------+--------------------+
|sentiment|            features|
+---------+--------------------+
|      1.0|(37595,[12,16,79,...|
|      1.0|(37595,[0,4,5,6,1...|
|      1.0|(37595,[3,21,57,8...|
|      1.0|(37595,[3,4,14,12...|
|      1.0|(37595,[0,3,7,8,1...|
|      1.0|(37595,[0,2,4,5,1...|
|      1.0|(37595,[2,5,8,32,...|
|      1.0|(37595,[0,5,10,22...|
|      1.0|(37595,[2,4,11,18...|
|      1.0|(37595,[0,2,22,88...|
+---------+--------------------+
only showing top 10 rows



In [39]:
(training,testing) = clean_data.randomSplit([0.8,0.2])

In [40]:
training.groupBy('sentiment').count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
|      1.0|10727|
|      0.0| 9515|
+---------+-----+



In [41]:
testing.groupBy('sentiment').count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
|      1.0| 2661|
|      0.0| 2344|
+---------+-----+



### Naive Bayes Model

In [42]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(labelCol='sentiment')

In [43]:
import datetime
t0 = datetime.datetime.now()
print(t0)

2023-05-11 23:54:23.746126


In [44]:
predictor = nb.fit(training)

In [45]:
t1 = datetime.datetime.now()
print(t1-t0)

0:00:08.779030


In [46]:
test_result = predictor.transform(testing)

In [47]:
test_result.show(10)

+---------+--------------------+--------------------+--------------------+----------+
|sentiment|            features|       rawPrediction|         probability|prediction|
+---------+--------------------+--------------------+--------------------+----------+
|      1.0|(37595,[0,1,2,3,4...|[-602.47863951544...|[4.87245675459802...|       1.0|
|      1.0|(37595,[0,1,2,3,4...|[-1980.0526716707...|[8.90975473288179...|       1.0|
|      1.0|(37595,[0,1,2,3,4...|[-2918.6152815812...|[8.11620048638001...|       1.0|
|      1.0|(37595,[0,1,2,3,4...|[-2193.5794212720...|[1.24211028334136...|       1.0|
|      1.0|(37595,[0,1,2,3,4...|[-1151.3714763356...|[1.71685214813082...|       1.0|
|      1.0|(37595,[0,1,2,3,4...|[-466.79291762013...|[1.51549829766437...|       1.0|
|      1.0|(37595,[0,1,2,3,4...|[-4008.7205507964...|[1.20860496569747...|       1.0|
|      1.0|(37595,[0,1,2,3,4...|[-335.02951593561...|[2.07153220063934...|       1.0|
|      1.0|(37595,[0,1,2,3,4...|[-1309.1322438702...|[

In [48]:
test_result.groupBy('sentiment','prediction').count().show()

+---------+----------+-----+
|sentiment|prediction|count|
+---------+----------+-----+
|      1.0|       1.0| 2386|
|      1.0|       0.0|  275|
|      0.0|       1.0|  312|
|      0.0|       0.0| 2032|
+---------+----------+-----+



In [49]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator(labelCol='sentiment')
acc = acc_eval.evaluate(test_result)
print("Accuracy of model at predicting was: {}".format(acc))

Accuracy of model at predicting was: 0.8826556503415159


In [50]:
y_true = test_result.select(['sentiment']).collect()
y_pred = test_result.select(['prediction']).collect()

In [51]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score
confusion_matrix(y_true, y_pred)

array([[2032,  312],
       [ 275, 2386]], dtype=int64)

In [52]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.88      0.87      0.87      2344
         1.0       0.88      0.90      0.89      2661

    accuracy                           0.88      5005
   macro avg       0.88      0.88      0.88      5005
weighted avg       0.88      0.88      0.88      5005



### Use LogisticRegression

In [53]:
from pyspark.ml.classification import LogisticRegression
logistic = LogisticRegression(labelCol='sentiment', maxIter=20, regParam=0.3, elasticNetParam=0)

In [54]:
t0 = datetime.datetime.now()
print(t0)

2023-05-11 23:55:01.465082


In [55]:
predictor_logistic = logistic.fit(training)

In [56]:
t1 = datetime.datetime.now()
print(t1-t0)

0:00:21.562706


In [57]:
test_result_logistic = predictor_logistic.transform(testing)

In [58]:
test_result_logistic.groupBy('sentiment','prediction').count().show()

+---------+----------+-----+
|sentiment|prediction|count|
+---------+----------+-----+
|      1.0|       1.0| 2462|
|      1.0|       0.0|  199|
|      0.0|       1.0|  360|
|      0.0|       0.0| 1984|
+---------+----------+-----+



In [59]:
acc_logistic = acc_eval.evaluate(test_result_logistic)
print("Accuracy of model at predicting was: {}".format(acc_logistic))

Accuracy of model at predicting was: 0.8879654040463034


In [60]:
y_true_logistic = test_result_logistic.select(['sentiment']).collect()
y_pred_logistic = test_result_logistic.select(['prediction']).collect()

In [61]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score
confusion_matrix(y_true_logistic, y_pred_logistic)

array([[1984,  360],
       [ 199, 2462]], dtype=int64)

In [62]:
print(classification_report(y_true_logistic, y_pred_logistic))

              precision    recall  f1-score   support

         0.0       0.91      0.85      0.88      2344
         1.0       0.87      0.93      0.90      2661

    accuracy                           0.89      5005
   macro avg       0.89      0.89      0.89      5005
weighted avg       0.89      0.89      0.89      5005



### Random Forest

In [63]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="sentiment", featuresCol="features",
                            numTrees = 500, maxDepth = 5, maxBins = 64)

In [64]:
import datetime
t0 = datetime.datetime.now()
print(t0)

2023-05-11 23:55:42.567544


In [65]:
predictor_rf = rf.fit(training)

In [66]:
t1 = datetime.datetime.now()
print(t1-t0)

0:03:04.107057


In [67]:
test_result_rf = predictor_rf.transform(testing)

In [68]:
test_result_rf.groupBy('sentiment','prediction').count().show()

+---------+----------+-----+
|sentiment|prediction|count|
+---------+----------+-----+
|      1.0|       1.0| 2645|
|      1.0|       0.0|   16|
|      0.0|       1.0| 1193|
|      0.0|       0.0| 1151|
+---------+----------+-----+



In [69]:
acc_rf = acc_eval.evaluate(test_result_rf)
print("Accuracy of model at predicting was: {}".format(acc_rf))

Accuracy of model at predicting was: 0.7398261781441474


In [70]:
y_true_rf = test_result_rf.select(['sentiment']).collect()
y_pred_rf = test_result_rf.select(['prediction']).collect()

In [71]:
confusion_matrix(y_true_rf, y_pred_rf)

array([[1151, 1193],
       [  16, 2645]], dtype=int64)

In [72]:
print(classification_report(y_true_rf, y_pred_rf))

              precision    recall  f1-score   support

         0.0       0.99      0.49      0.66      2344
         1.0       0.69      0.99      0.81      2661

    accuracy                           0.76      5005
   macro avg       0.84      0.74      0.73      5005
weighted avg       0.83      0.76      0.74      5005



#### Conclusion

Of the three models above, the Logistic Regression Model has the highest accuracy, the highest F1-score and the training time is not too long

=> Choose LOGISTIC REGRESSION MODEL

The results are more reliable even though the accuracy is a bit lower