In [1]:
import findspark
findspark.init()

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import *

In [4]:
sc = SparkContext(master = 'local', appName = 'project_1')
spark = SparkSession(sc)

In [5]:
spark = SparkSession.builder.appName('shopee').getOrCreate()

In [6]:
data = spark.read.csv('thoi_trang_nam.csv', inferSchema = True, header = True)

In [7]:
data.show(5)

+---+----------+--------------+------------+------------------+------+--------------------+
|_c0|product_id|      category|sub_category|              user|rating|             comment|
+---+----------+--------------+------------+------------------+------+--------------------+
|  0|         0|Thời Trang Nam|    Áo Ba Lỗ|      karmakyun2nd|     5|                kiểm|
|  1|         0|Thời Trang Nam|    Áo Ba Lỗ|  tranquangvinh_vv|     5|cho_phép sơ_suất ...|
|  2|         0|Thời Trang Nam|    Áo Ba Lỗ|nguyenquoctoan2005|     5|vừa_vặn nâu dày đ...|
|  3|         0|Thời Trang Nam|    Áo Ba Lỗ|    nguyenthuyhavi|     5|        đầu_shop hợp|
|  4|         0|Thời Trang Nam|    Áo Ba Lỗ|      luonganh5595|     5|đẹp form đẹp hàng...|
+---+----------+--------------+------------+------------------+------+--------------------+
only showing top 5 rows



### Clean and prepare data

In [8]:
data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- sub_category: string (nullable = true)
 |-- user: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- comment: string (nullable = true)



In [9]:
comment_null_data = data.filter(data.comment.isNull())

In [10]:
comment_null_data.count()

81965

In [11]:
data = data.filter(data.comment.isNotNull())

In [12]:
data.count()

455985

In [13]:
data = data.withColumn('class', when(data.rating >= 4, 'like')
                               .when(data.rating <= 2, 'not_like')
                               .otherwise('neutral'))

In [14]:
data = data.drop('_c0', 'product_id', 'category', 'user')
data.show(5)

+------------+------+--------------------+-----+
|sub_category|rating|             comment|class|
+------------+------+--------------------+-----+
|    Áo Ba Lỗ|     5|                kiểm| like|
|    Áo Ba Lỗ|     5|cho_phép sơ_suất ...| like|
|    Áo Ba Lỗ|     5|vừa_vặn nâu dày đ...| like|
|    Áo Ba Lỗ|     5|        đầu_shop hợp| like|
|    Áo Ba Lỗ|     5|đẹp form đẹp hàng...| like|
+------------+------+--------------------+-----+
only showing top 5 rows



In [15]:
data.show(5)

+------------+------+--------------------+-----+
|sub_category|rating|             comment|class|
+------------+------+--------------------+-----+
|    Áo Ba Lỗ|     5|                kiểm| like|
|    Áo Ba Lỗ|     5|cho_phép sơ_suất ...| like|
|    Áo Ba Lỗ|     5|vừa_vặn nâu dày đ...| like|
|    Áo Ba Lỗ|     5|        đầu_shop hợp| like|
|    Áo Ba Lỗ|     5|đẹp form đẹp hàng...| like|
+------------+------+--------------------+-----+
only showing top 5 rows



In [16]:
data = data.withColumn('length', length(data['comment']))

In [17]:
data.show(10)

+------------+------+--------------------+-----+------+
|sub_category|rating|             comment|class|length|
+------------+------+--------------------+-----+------+
|    Áo Ba Lỗ|     5|                kiểm| like|     4|
|    Áo Ba Lỗ|     5|cho_phép sơ_suất ...| like|    44|
|    Áo Ba Lỗ|     5|vừa_vặn nâu dày đ...| like|    50|
|    Áo Ba Lỗ|     5|        đầu_shop hợp| like|    12|
|    Áo Ba Lỗ|     5|đẹp form đẹp hàng...| like|    23|
|    Áo Ba Lỗ|     5|đẹp chất_lượng dà...| like|    22|
|    Áo Ba Lỗ|     5|  sáu_mươi_lăm luônn| like|    18|
|    Áo Ba Lỗ|     5|            quên học| like|     8|
|    Áo Ba Lỗ|     5|             chê hợp| like|     7|
|    Áo Ba Lỗ|     5|gọn_gàng đẹp ôm kâng| like|    20|
+------------+------+--------------------+-----+------+
only showing top 10 rows



In [18]:
# Pretty Clear Difference
data.groupby('class').mean().show()

+--------+------------------+------------------+
|   class|       avg(rating)|       avg(length)|
+--------+------------------+------------------+
|not_like|1.3522262123000426| 18.58215514427061|
| neutral|               3.0|15.654778101716877|
|    like| 4.862014461683805|20.912028217413194|
+--------+------------------+------------------+



In [19]:
data.groupby('class').count().show()

+--------+------+
|   class| count|
+--------+------+
|not_like| 39821|
| neutral| 30870|
|    like|385294|
+--------+------+



### Feature Transformations

In [20]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml.feature import CountVectorizer, IDF, StringIndexer
tokenizer = Tokenizer(inputCol = 'comment', outputCol = 'token_text')
stopremove = StopWordsRemover(inputCol = 'token_text', outputCol = 'stop_tokens')
count_vec = CountVectorizer(inputCol = 'stop_tokens', outputCol = 'c_vec')
idf = IDF(inputCol = 'c_vec', outputCol = 'tf_idf')
class_to_num = StringIndexer(inputCol = 'class', outputCol = 'label')

In [21]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [22]:
clean_up = VectorAssembler(inputCols = ['tf_idf', 'length'],
                          outputCol = 'features')

## Pipeline

In [23]:
from pyspark.ml import Pipeline

In [24]:
data_prep_pipe = Pipeline(stages = [class_to_num, 
                                   tokenizer,
                                   stopremove,
                                   count_vec,
                                   idf,
                                   clean_up])

In [25]:
cleaner = data_prep_pipe.fit(data)

In [26]:
clean_data = cleaner.transform(data)

#### Training and Testing data

In [27]:
clean_data = clean_data.select(['label', 'features'])

In [28]:
clean_data.show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(29328,[148,29327...|
|  0.0|(29328,[618,1043,...|
|  0.0|(29328,[0,7,12,25...|
|  0.0|(29328,[61,995,29...|
|  0.0|(29328,[0,8,33,50...|
|  0.0|(29328,[0,17,46,2...|
|  0.0|(29328,[1760,2373...|
|  0.0|(29328,[185,192,2...|
|  0.0|(29328,[61,81,293...|
|  0.0|(29328,[0,71,541,...|
+-----+--------------------+
only showing top 10 rows



In [29]:
(training, testing) = clean_data.randomSplit([0.7, 0.3])

## NaiveBayes

In [30]:
from pyspark.ml.classification import NaiveBayes

In [31]:
# Use defauls
nb = NaiveBayes()

In [32]:
nbModel = nb.fit(training)

### Đánh giá kết quả

In [33]:
# Function for evaluating classification model
from pyspark.ml.evaluation import  MulticlassClassificationEvaluator

accuracy= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'accuracy')
f1= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'f1')

def classification_evaluator(data_result):
    data_result.crosstab(col1 = 'prediction', col2= 'label').show()
    print('accuracy:' ,accuracy.evaluate(data_result))
    print('f1:' ,f1.evaluate(data_result))

#### Đánh giá model dựa trên tập train

In [34]:
# Predict on training data set
nb_train_model= nbModel.transform(training)

In [35]:
classification_evaluator(nb_train_model)

+----------------+------+-----+-----+
|prediction_label|   0.0|  1.0|  2.0|
+----------------+------+-----+-----+
|             2.0| 24470| 4712|14380|
|             1.0|  9683|20628| 3411|
|             0.0|235626| 2616| 4015|
+----------------+------+-----+-----+

accuracy: 0.8469460882953987
f1: 0.8655681354901851


In [36]:
y_true = nb_train_model.select(['label']).collect()
y_pred = nb_train_model.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.87      0.92    269779
         1.0       0.61      0.74      0.67     27956
         2.0       0.33      0.66      0.44     21806

    accuracy                           0.85    319541
   macro avg       0.64      0.76      0.68    319541
weighted avg       0.90      0.85      0.87    319541



#### Đánh giá model dựa trên tập test

In [37]:
# Predict on test data set
nb_test_model= nbModel.transform(testing)

In [38]:
classification_evaluator(nb_test_model)

+----------------+-----+----+----+
|prediction_label|  0.0| 1.0| 2.0|
+----------------+-----+----+----+
|             2.0|13450|2494|5412|
|             1.0| 4426|8117|1752|
|             0.0|97639|1254|1900|
+----------------+-----+----+----+

accuracy: 0.8147518395825394
f1: 0.8419021517752977


In [39]:
y_true = nb_test_model.select(['label']).collect()
y_pred = nb_test_model.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.85      0.90    115515
         1.0       0.57      0.68      0.62     11865
         2.0       0.25      0.60      0.36      9064

    accuracy                           0.81    136444
   macro avg       0.60      0.71      0.63    136444
weighted avg       0.89      0.81      0.84    136444



## Logistic Regression

### Apply model

In [40]:
from pyspark.ml.classification import LogisticRegression

In [41]:
logistic = LogisticRegression(featuresCol='features',
                        labelCol='label',
                        predictionCol='prediction')

In [42]:
logisticModel = logistic.fit(training)

### Đánh giá kết quả

In [43]:
# Function for evaluating classification model
from pyspark.ml.evaluation import  MulticlassClassificationEvaluator

accuracy= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'accuracy')
f1= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'f1')

def classification_evaluator(data_result):
    data_result.crosstab(col1 = 'prediction', col2= 'label').show()
    print('accuracy:' ,accuracy.evaluate(data_result))
    print('f1:' ,f1.evaluate(data_result))

#### Đánh giá model dựa trên tập train

In [44]:
# Predict on training data set
logistic_train_model= logisticModel.transform(training)

In [45]:
classification_evaluator(logistic_train_model)

+----------------+------+-----+-----+
|prediction_label|   0.0|  1.0|  2.0|
+----------------+------+-----+-----+
|             2.0|  2888|  791| 8140|
|             1.0|  3048|19080| 2651|
|             0.0|263843| 8085|11015|
+----------------+------+-----+-----+

accuracy: 0.9108784162282775
f1: 0.9023764563845696


In [46]:
y_true = logistic_train_model.select(['label']).collect()
y_pred = logistic_train_model.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.93      0.98      0.95    269779
         1.0       0.77      0.68      0.72     27956
         2.0       0.69      0.37      0.48     21806

    accuracy                           0.91    319541
   macro avg       0.80      0.68      0.72    319541
weighted avg       0.90      0.91      0.90    319541



#### Đánh giá model dựa trên tập test

In [47]:
# Predict on test data set
logistic_test_model= logisticModel.transform(testing)

In [48]:
classification_evaluator(logistic_test_model)

+----------------+------+----+----+
|prediction_label|   0.0| 1.0| 2.0|
+----------------+------+----+----+
|             2.0|  1592| 526|2733|
|             1.0|  1746|7231|1252|
|             0.0|112177|4108|5079|
+----------------+------+----+----+

accuracy: 0.8951731113130662
f1: 0.8848614124761008


In [49]:
y_true = logistic_test_model.select(['label']).collect()
y_pred = logistic_test_model.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.92      0.97      0.95    115515
         1.0       0.71      0.61      0.65     11865
         2.0       0.56      0.30      0.39      9064

    accuracy                           0.90    136444
   macro avg       0.73      0.63      0.66    136444
weighted avg       0.88      0.90      0.88    136444



## Random Forest

In [50]:
from pyspark.ml.classification import RandomForestClassifier

In [51]:
rfc = RandomForestClassifier(featuresCol = 'features',
                            labelCol = 'label',
                            predictionCol = 'prediction')

In [52]:
# Fit the model to the data and call thi rfc_model
rfc_model = rfc.fit(training)

In [53]:
# Find the number of trees and the relative importance of features
print('Number of trees:', rfc_model.getNumTrees)
print('Relative importance of features:', rfc_model.featureImportances)

Number of trees: 20
Relative importance of features: (29328,[0,1,5,6,8,11,18,20,21,23,26,27,35,37,45,54,55,56,57,59,65,72,73,75,76,78,79,81,88,89,93,95,99,103,114,115,119,121,123,126,130,133,136,137,139,158,159,161,166,178,180,182,186,199,209,211,213,217,221,226,228,236,237,238,247,248,259,278,297,310,315,323,343,350,366,367,376,388,406,409,418,422,427,440,448,461,476,479,488,508,513,516,522,528,583,590,596,606,619,634,675,718,720,729,738,813,861,911,916,1019,1035,1077,1080,1206,1226,1294,1352,2062,2285,2359,2370,2582,2995,3340,3524,3796,3815,4238,5215,29327],[0.04774055246652657,1.565701482101951e-06,0.008785277082481422,0.0015072061953582347,0.03840283227496514,0.04718006597924762,0.00046278362696665554,9.197369886901566e-05,0.00029141820468749644,4.3997232483775736e-07,0.022896456216727597,0.016190405369902006,2.9799870316510524e-05,0.04863081924512774,0.0251680836896133,0.0004396924312008289,0.0006193904130989565,0.02642418903759521,1.0610758127643103e-05,0.0001717579764806153,0.04

### Đánh giá kết quả

In [54]:
# Function for evaluating classification model
from pyspark.ml.evaluation import  MulticlassClassificationEvaluator

accuracy= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'accuracy')
f1= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'f1')

def classification_evaluator(data_result):
    data_result.crosstab(col1 = 'prediction', col2= 'label').show()
    print('accuracy:' ,accuracy.evaluate(data_result))
    print('f1:' ,f1.evaluate(data_result))

#### Đánh giá model dựa trên tập train

In [55]:
# Predict on training data set
rf_train_result= rfc_model.transform(training)

In [56]:
classification_evaluator(rf_train_result)

+----------------+------+-----+-----+
|prediction_label|   0.0|  1.0|  2.0|
+----------------+------+-----+-----+
|             0.0|269779|27956|21806|
+----------------+------+-----+-----+

accuracy: 0.844270375319599
f1: 0.7729804438449267


In [57]:
y_true = rf_train_result.select(['label']).collect()
y_pred = rf_train_result.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.84      1.00      0.92    269779
         1.0       0.00      0.00      0.00     27956
         2.0       0.00      0.00      0.00     21806

    accuracy                           0.84    319541
   macro avg       0.28      0.33      0.31    319541
weighted avg       0.71      0.84      0.77    319541



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Đánh giá model dựa trên tập test

In [58]:
# Predict on test data set
rf_test_result= rfc_model.transform(testing)

In [59]:
classification_evaluator(rf_test_result)

+----------------+------+-----+----+
|prediction_label|   0.0|  1.0| 2.0|
+----------------+------+-----+----+
|             0.0|115515|11865|9064|
+----------------+------+-----+----+

accuracy: 0.8466110638796869
f1: 0.7762872296211847


In [61]:
y_true = rf_test_result.select(['label']).collect()
y_pred = rf_test_result.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.85      1.00      0.92    115515
         1.0       0.00      0.00      0.00     11865
         2.0       0.00      0.00      0.00      9064

    accuracy                           0.85    136444
   macro avg       0.28      0.33      0.31    136444
weighted avg       0.72      0.85      0.78    136444



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Resample data

In [62]:
like_df = training.filter(col('label') == 0)
neutral_df = training.filter(col('label') == 1)
not_like_df = training.filter(col('label') == 2)
ratio_1 = int(like_df.count()/neutral_df.count())
ratio_2 = int(like_df.count()/not_like_df.count())
print('ratio like/neutral: {}'.format(ratio_1))
print('ratio like/not_like: {}'.format(ratio_2))

ratio like/neutral: 9
ratio like/not_like: 12


In [63]:
# Resample neutral
a1 = range(ratio_1)
# duplicate the minority rows
oversampled_neutral_df = neutral_df.withColumn('dummy',explode(array([lit(x) for x in a1]))).drop('dummy')
# combine both oversampled minority rows  and previous majority rows
combined_df = like_df.unionAll(oversampled_neutral_df)
combined_df.show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
+-----+--------------------+
only showing top 10 rows



In [64]:
combined_df.groupBy('label').count().show()

+-----+------+
|label| count|
+-----+------+
|  0.0|269779|
|  1.0|251604|
+-----+------+



In [65]:
# Resample not_like
a2 = range(ratio_2)
# Duplicate the minority rows
oversampled_notlike_df = not_like_df.withColumn('dummy',explode(array([lit(x) for x in a2]))).drop('dummy')
# combine both oversampled minority rows  and previous majority rows
combined_df = combined_df.unionAll(oversampled_notlike_df)
combined_df.show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
|  0.0|(29328,[0,1,2,3,4...|
+-----+--------------------+
only showing top 10 rows



In [66]:
combined_df.groupBy('label').count().show()

+-----+------+
|label| count|
+-----+------+
|  0.0|269779|
|  1.0|251604|
|  2.0|261672|
+-----+------+



## NaiveBayes

In [67]:
from pyspark.ml.classification import NaiveBayes

In [68]:
# Use defauls
nb = NaiveBayes()

In [69]:
nbModel_2 = nb.fit(combined_df)

### Đánh giá kết quả

In [70]:
# Function for evaluating classification model
from pyspark.ml.evaluation import  MulticlassClassificationEvaluator

accuracy= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'accuracy')
f1= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'f1')

def classification_evaluator(data_result):
    data_result.crosstab(col1 = 'prediction', col2= 'label').show()
    print('accuracy:' ,accuracy.evaluate(data_result))
    print('f1:' ,f1.evaluate(data_result))

#### Đánh giá model dựa trên tập train

In [71]:
# Predict on training data set
nb_train_model= nbModel_2.transform(training)

In [72]:
classification_evaluator(nb_train_model)

+----------------+------+-----+-----+
|prediction_label|   0.0|  1.0|  2.0|
+----------------+------+-----+-----+
|             2.0| 36981| 6551|16748|
|             1.0|  9868|19988| 2987|
|             0.0|222930| 1417| 2071|
+----------------+------+-----+-----+

accuracy: 0.812621854472509
f1: 0.8439938347899083


In [73]:
y_true = nb_train_model.select(['label']).collect()
y_pred = nb_train_model.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.98      0.83      0.90    269779
         1.0       0.61      0.71      0.66     27956
         2.0       0.28      0.77      0.41     21806

    accuracy                           0.81    319541
   macro avg       0.62      0.77      0.65    319541
weighted avg       0.90      0.81      0.84    319541



#### Đánh giá model dựa trên tập test

In [74]:
# Predict on test data set
nb_test_model= nbModel_2.transform(testing)

In [75]:
classification_evaluator(nb_test_model)

+----------------+-----+----+----+
|prediction_label|  0.0| 1.0| 2.0|
+----------------+-----+----+----+
|             2.0|16876|3169|6237|
|             1.0| 4431|7795|1625|
|             0.0|94208| 901|1202|
+----------------+-----+----+----+

accuracy: 0.7932924862947436
f1: 0.8292092457230863


In [76]:
y_true = nb_test_model.select(['label']).collect()
y_pred = nb_test_model.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.98      0.82      0.89    115515
         1.0       0.56      0.66      0.61     11865
         2.0       0.24      0.69      0.35      9064

    accuracy                           0.79    136444
   macro avg       0.59      0.72      0.62    136444
weighted avg       0.89      0.79      0.83    136444



## Logistic Regression

### Apply model

In [77]:
from pyspark.ml.classification import LogisticRegression

In [78]:
logistic = LogisticRegression(featuresCol='features',
                        labelCol='label',
                        predictionCol='prediction')

In [79]:
logisticModel_2 = logistic.fit(combined_df)

### Đánh giá kết quả

In [80]:
# Function for evaluating classification model
from pyspark.ml.evaluation import  MulticlassClassificationEvaluator

accuracy= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'accuracy')
f1= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'f1')

def classification_evaluator(data_result):
    data_result.crosstab(col1 = 'prediction', col2= 'label').show()
    print('accuracy:' ,accuracy.evaluate(data_result))
    print('f1:' ,f1.evaluate(data_result))

#### Đánh giá model dựa trên tập train

In [81]:
# Predict on training data set
logistic_train_model= logisticModel_2.transform(training)

In [82]:
classification_evaluator(logistic_train_model)

+----------------+------+-----+-----+
|prediction_label|   0.0|  1.0|  2.0|
+----------------+------+-----+-----+
|             2.0| 28975| 5254|16084|
|             1.0|  8156|21587| 3441|
|             0.0|232648| 1115| 2281|
+----------------+------+-----+-----+

accuracy: 0.8459602993043146
f1: 0.8688448292985071


In [83]:
y_true = logistic_train_model.select(['label']).collect()
y_pred = logistic_train_model.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.99      0.86      0.92    269779
         1.0       0.65      0.77      0.71     27956
         2.0       0.32      0.74      0.45     21806

    accuracy                           0.85    319541
   macro avg       0.65      0.79      0.69    319541
weighted avg       0.91      0.85      0.87    319541



#### Đánh giá model dựa trên tập test

In [84]:
# Predict on test data set
logistic_test_model= logisticModel_2.transform(testing)

In [85]:
classification_evaluator(logistic_test_model)

+----------------+-----+----+----+
|prediction_label|  0.0| 1.0| 2.0|
+----------------+-----+----+----+
|             2.0|13250|2673|5785|
|             1.0| 3967|8206|1792|
|             0.0|98298| 986|1487|
+----------------+-----+----+----+

accuracy: 0.8229676643897863
f1: 0.8497677114428563


In [86]:
y_true = logistic_test_model.select(['label']).collect()
y_pred = logistic_test_model.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.98      0.85      0.91    115515
         1.0       0.59      0.69      0.64     11865
         2.0       0.27      0.64      0.38      9064

    accuracy                           0.82    136444
   macro avg       0.61      0.73      0.64    136444
weighted avg       0.89      0.82      0.85    136444



## Random Forest

In [87]:
from pyspark.ml.classification import RandomForestClassifier

In [88]:
rfc = RandomForestClassifier(featuresCol = 'features',
                            labelCol = 'label',
                            predictionCol = 'prediction')

In [89]:
# Fit the model to the data and call thi rfc_model
rfc_model_2 = rfc.fit(combined_df)

In [90]:
# Find the number of trees and the relative importance of features
print('Number of trees:', rfc_model.getNumTrees)
print('Relative importance of features:', rfc_model.featureImportances)

Number of trees: 20
Relative importance of features: (29328,[0,1,5,6,8,11,18,20,21,23,26,27,35,37,45,54,55,56,57,59,65,72,73,75,76,78,79,81,88,89,93,95,99,103,114,115,119,121,123,126,130,133,136,137,139,158,159,161,166,178,180,182,186,199,209,211,213,217,221,226,228,236,237,238,247,248,259,278,297,310,315,323,343,350,366,367,376,388,406,409,418,422,427,440,448,461,476,479,488,508,513,516,522,528,583,590,596,606,619,634,675,718,720,729,738,813,861,911,916,1019,1035,1077,1080,1206,1226,1294,1352,2062,2285,2359,2370,2582,2995,3340,3524,3796,3815,4238,5215,29327],[0.04774055246652657,1.565701482101951e-06,0.008785277082481422,0.0015072061953582347,0.03840283227496514,0.04718006597924762,0.00046278362696665554,9.197369886901566e-05,0.00029141820468749644,4.3997232483775736e-07,0.022896456216727597,0.016190405369902006,2.9799870316510524e-05,0.04863081924512774,0.0251680836896133,0.0004396924312008289,0.0006193904130989565,0.02642418903759521,1.0610758127643103e-05,0.0001717579764806153,0.04

### Đánh giá kết quả

In [91]:
# Function for evaluating classification model
from pyspark.ml.evaluation import  MulticlassClassificationEvaluator

accuracy= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'accuracy')
f1= MulticlassClassificationEvaluator(labelCol= 'label', predictionCol= 'prediction', metricName= 'f1')

def classification_evaluator(data_result):
    data_result.crosstab(col1 = 'prediction', col2= 'label').show()
    print('accuracy:' ,accuracy.evaluate(data_result))
    print('f1:' ,f1.evaluate(data_result))

#### Đánh giá model dựa trên tập train

In [92]:
# Predict on training data set
rf_train_result= rfc_model_2.transform(training)

In [93]:
classification_evaluator(rf_train_result)

+----------------+------+-----+-----+
|prediction_label|   0.0|  1.0|  2.0|
+----------------+------+-----+-----+
|             2.0| 35397| 8278| 9144|
|             1.0|  3670| 6663| 1310|
|             0.0|230712|13015|11352|
+----------------+------+-----+-----+

accuracy: 0.7714784644224059
f1: 0.7883978374982236


In [94]:
y_true = rf_train_result.select(['label']).collect()
y_pred = rf_train_result.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.90      0.86      0.88    269779
         1.0       0.57      0.24      0.34     27956
         2.0       0.17      0.42      0.25     21806

    accuracy                           0.77    319541
   macro avg       0.55      0.50      0.49    319541
weighted avg       0.83      0.77      0.79    319541



#### Đánh giá model dựa trên tập test

In [95]:
# Predict on test data set
rf_test_result= rfc_model_2.transform(testing)

In [96]:
classification_evaluator(rf_test_result)

+----------------+-----+----+----+
|prediction_label|  0.0| 1.0| 2.0|
+----------------+-----+----+----+
|             2.0|15035|3603|3721|
|             1.0| 1553|2699| 532|
|             0.0|98927|5563|4811|
+----------------+-----+----+----+

accuracy: 0.7720896485004837
f1: 0.7890046267281347


In [97]:
y_true = rf_test_result.select(['label']).collect()
y_pred = rf_test_result.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.86      0.88    115515
         1.0       0.56      0.23      0.32     11865
         2.0       0.17      0.41      0.24      9064

    accuracy                           0.77    136444
   macro avg       0.55      0.50      0.48    136444
weighted avg       0.83      0.77      0.79    136444



## Kết luận:
- Resample data không mang lại kết quả tốt hơn
- Model mang lại kết quả tốt nhất là Logistic Regression (acc = 89%) => lựa chọn model này