In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import length
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.ml.feature import CountVectorizer, IDF, StringIndexer
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier, LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml import Pipeline
from pyspark.sql.functions import lit
from bs4 import BeautifulSoup
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol 
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [3]:
spark = SparkSession.builder.appName('Rating').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/23 13:04:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Load Data

In [4]:
df1 = spark.read.csv("shopee_final.csv",header=True
                      ,inferSchema=True)

In [5]:
#df1 = df1.withColumn('class',lit('fake'))

In [6]:
df1.show(2)

23/03/23 13:04:40 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , processed_text, class
 Schema: _c0, processed_text, class
Expected: _c0 but found: 
CSV file: file:///Users/tony.ng/Documents/DS-ML/final_project/project1/shopee_final.csv
+---+--------------------+-------+
|_c0|      processed_text|  class|
+---+--------------------+-------+
|  0|miếng dán hơi dầy...|neutral|
|  1|miếng dán tồi bóc...|neutral|
+---+--------------------+-------+
only showing top 2 rows



In [7]:
df1.count()

616517

In [8]:
df1.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- processed_text: string (nullable = true)
 |-- class: string (nullable = true)



In [9]:
df1.groupby('class').count().show()

+--------+------+
|   class| count|
+--------+------+
| neutral|112092|
|not like|186399|
|    like|318026|
+--------+------+



[Stage 6:=====>                                                   (1 + 10) / 11]                                                                                

### Precprocess

In [10]:
df1 = df1.withColumn('length', length(df1['processed_text']))

In [11]:
df1.show(5)

23/03/23 13:04:42 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , processed_text, class
 Schema: _c0, processed_text, class
Expected: _c0 but found: 
CSV file: file:///Users/tony.ng/Documents/DS-ML/final_project/project1/shopee_final.csv
+---+--------------------+--------+------+
|_c0|      processed_text|   class|length|
+---+--------------------+--------+------+
|  0|miếng dán hơi dầy...| neutral|    28|
|  1|miếng dán tồi bóc...| neutral|    53|
|  2|cường_lực trắng m...|not like|    23|
|  3|hàng cảm_quan đầu...|not like|    63|
|  4|chất_lượng cường_...|not like|    92|
+---+--------------------+--------+------+
only showing top 5 rows



In [12]:
df1.groupBy('class').mean().show()

23/03/23 13:04:42 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , processed_text, class
 Schema: _c0, processed_text, class
Expected: _c0 but found: 
CSV file: file:///Users/tony.ng/Documents/DS-ML/final_project/project1/shopee_final.csv
+--------+------------------+----------------+
|   class|          avg(_c0)|     avg(length)|
+--------+------------------+----------------+
| neutral|300455.96634014917|37.0210264839508|
|not like| 295933.3021260844|  41.23207932699|
|    like| 318231.5773678882|40.7444218407763|
+--------+------------------+----------------+



In [13]:
data = df1.select('processed_text','class','length')

In [14]:
data.show()

+--------------------+--------+------+
|      processed_text|   class|length|
+--------------------+--------+------+
|miếng dán hơi dầy...| neutral|    28|
|miếng dán tồi bóc...| neutral|    53|
|cường_lực trắng m...|not like|    23|
|hàng cảm_quan đầu...|not like|    63|
|chất_lượng cường_...|not like|    92|
|kính chất_lượng k...|not like|    33|
|kính cường_lực dở...|not like|   118|
|vỡ bắt_làm hoàn t...|not like|    29|
|kính bụi dính kín...|not like|    33|
|đóng hàng cường_l...| neutral|    22|
|hàng cường_lực xi...|not like|    61|
|không_vừa chất_lư...|not like|    24|
|                  bé|not like|     2|
|         đo thử chán|not like|    11|
|   hàng miếng bảo_vệ|not like|    17|
|sản_phẩm miết chặ...|not like|    76|
|chính_xác sản_phẩ...|not like|   138|
|     hàng hãng lưu_ý|not like|    15|
|chất_lượng sản_ph...|not like|   112|
|cường_lực mô_tả b...|not like|    77|
+--------------------+--------+------+
only showing top 20 rows



In [15]:
null_data = data.filter(data['processed_text'].isNull())

In [16]:
null_data.count()

20896

In [17]:
data = data.filter(data['processed_text'].isNotNull())

### Feature & Transform

In [18]:
class BsTextExtractor(Transformer, HasInputCol, HasOutputCol):
    
    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        super(BsTextExtractor, self).__init__() 
        kwargs = self._input_kwargs 
        self.setParams(**kwargs)
        
    @keyword_only
    def setParams(self, inputCol=None, outputCol=None): 
        kwargs = self._input_kwargs
        return self._set(**kwargs)
    def _transform(self, dataset):
        def f(s):
            cleaned_text = BeautifulSoup(s).text 
            return cleaned_text
        t = StringType()
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(f, t)(in_col))

In [19]:
text_extractor = BsTextExtractor(inputCol="processed_text", outputCol="cleaned_text")
tokenizer = Tokenizer(inputCol='cleaned_text', outputCol='token_text')
stopremove= StopWordsRemover(inputCol='token_text', outputCol='stop_tokens')
count_vec = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')
ham_spam_to_num = StringIndexer(inputCol='class',outputCol='label', handleInvalid='keep')

In [20]:
clean_up = VectorAssembler(inputCols =['tf_idf','length'],
                           outputCol='features')

In [21]:
data_prep_pipe = Pipeline(stages=[ham_spam_to_num,
                                  text_extractor,
                                  tokenizer,
                                  stopremove,
                                  count_vec,
                                  idf,
                                  clean_up])

In [22]:
cleaner = data_prep_pipe.fit(data)

                                                                                

In [23]:
clean_data = cleaner.transform(data)

In [24]:
clean_data = clean_data.select('label','features')

In [25]:
clean_data.show()

23/03/23 13:05:21 WARN DAGScheduler: Broadcasting large task binary with size 1628.6 KiB


[Stage 26:>                                                         (0 + 1) / 1]

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  2.0|(55015,[13,75,182...|
|  2.0|(55015,[75,125,18...|
|  1.0|(55015,[11,38,54,...|
|  1.0|(55015,[1,2,23,50...|
|  1.0|(55015,[2,20,44,4...|
|  1.0|(55015,[2,23,216,...|
|  1.0|(55015,[10,21,32,...|
|  1.0|(55015,[5,139,197...|
|  1.0|(55015,[75,112,21...|
|  2.0|(55015,[1,24,37,2...|
|  1.0|(55015,[0,1,93,19...|
|  1.0|(55015,[2,23,150,...|
|  1.0|(55015,[37,55014]...|
|  1.0|(55015,[57,100,45...|
|  1.0|(55015,[1,182,789...|
|  1.0|(55015,[0,14,108,...|
|  1.0|(55015,[0,1,13,40...|
|  1.0|(55015,[1,215,332...|
|  1.0|(55015,[0,1,2,8,2...|
|  1.0|(55015,[5,16,40,5...|
+-----+--------------------+
only showing top 20 rows



                                                                                

In [26]:
clean_data.groupBy('label').count().show()

+-----+------+
|label| count|
+-----+------+
|  0.0|307718|
|  1.0|179611|
|  2.0|108292|
+-----+------+



In [27]:
(train, test) = clean_data.randomSplit([0.7,0.3])

## Buil Model

### Naive Bayes

In [28]:
nb = NaiveBayes()

In [29]:
prediction = nb.fit(train)

23/03/23 13:05:23 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB




23/03/23 13:05:43 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB


                                                                                

In [30]:
test_results = prediction.transform(test)

In [31]:
test_results.show()

23/03/23 13:05:44 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB


[Stage 33:>                                                         (0 + 1) / 1]

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(55015,[0,1,2,3,5...|[-163.22332905690...|[1.0,8.0507389199...|       0.0|
|  0.0|(55015,[0,1,2,3,5...|[-130.15032816998...|[1.0,7.2096319786...|       0.0|
|  0.0|(55015,[0,1,2,3,5...|[-130.15032816998...|[1.0,7.2096319786...|       0.0|
|  0.0|(55015,[0,1,2,3,5...|[-130.15032816998...|[1.0,7.2096319786...|       0.0|
|  0.0|(55015,[0,1,2,3,5...|[-130.15032816998...|[1.0,7.2096319786...|       0.0|
|  0.0|(55015,[0,1,2,3,5...|[-130.15032816998...|[1.0,7.2096319786...|       0.0|
|  0.0|(55015,[0,1,2,3,5...|[-130.15032816998...|[1.0,7.2096319786...|       0.0|
|  0.0|(55015,[0,1,2,3,5...|[-130.15032816998...|[1.0,7.2096319786...|       0.0|
|  0.0|(55015,[0,1,2,3,5...|[-446.81883077448...|[1.0,3.4550345843...|       0.0|
|  0.0|(55015,[0

                                                                                

In [32]:
test_results.groupBy('label', 'prediction').count().show()

23/03/23 13:05:54 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB




23/03/23 13:06:13 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0| 8534|
|  1.0|       1.0|34394|
|  0.0|       1.0| 4562|
|  1.0|       0.0| 4442|
|  2.0|       2.0|15507|
|  2.0|       1.0| 8363|
|  1.0|       2.0|15165|
|  0.0|       0.0|74597|
|  0.0|       2.0|13377|
+-----+----------+-----+



                                                                                

### Report

In [33]:
acc_eva = MulticlassClassificationEvaluator()
acc = acc_eva.evaluate(test_results)
print('Accuracy of model: {}'.format(acc))

23/03/23 13:06:13 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB




Accuracy of model: 0.7067126322912657




### Nha xet:
- Kết quả phân loại chưa thật sự được tôt khi chỉ có 70% độ chính xac

### Logistic Regression

In [34]:
lg = LogisticRegression(maxIter=10, regParam=0.3)

In [35]:
pre_lg = lg.fit(train)

23/03/23 13:06:32 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB




23/03/23 13:06:50 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB


                                                                                

23/03/23 13:06:51 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB


[Stage 41:>                                                       (0 + 11) / 11]

23/03/23 13:06:53 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/03/23 13:06:53 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS




23/03/23 13:07:10 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/03/23 13:07:10 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/03/23 13:07:10 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


                                                                                

23/03/23 13:07:10 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/03/23 13:07:11 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/03/23 13:07:11 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/03/23 13:07:11 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/03/23 13:07:12 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/03/23 13:07:12 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/03/23 13:07:12 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/03/23 13:07:13 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/03/23 13:07:13 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB


[Stage 51:>                                                       (0 + 11) / 11]

23/03/23 13:07:14 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB


                                                                                

23/03/23 13:07:15 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/03/23 13:07:15 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/03/23 13:07:16 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB




23/03/23 13:07:16 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/03/23 13:07:17 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/03/23 13:07:17 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/03/23 13:07:18 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB




23/03/23 13:07:18 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB


                                                                                

23/03/23 13:07:19 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
23/03/23 13:07:19 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB


In [36]:
result_lg = pre_lg.transform(test)

In [37]:
result_lg.groupBy('label', 'prediction').count().show()

23/03/23 13:07:20 WARN DAGScheduler: Broadcasting large task binary with size 6.2 MiB




23/03/23 13:07:39 WARN DAGScheduler: Broadcasting large task binary with size 6.2 MiB
+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|16834|
|  1.0|       1.0|39110|
|  0.0|       1.0| 4519|
|  1.0|       0.0|13067|
|  2.0|       2.0| 4004|
|  2.0|       1.0|11566|
|  1.0|       2.0| 1824|
|  0.0|       0.0|86038|
|  0.0|       2.0| 1979|
+-----+----------+-----+



                                                                                

### Report

In [38]:
acc_eva_lg = MulticlassClassificationEvaluator()
acc_lg = acc_eva_lg.evaluate(result_lg)
print('Accuracy of model: {}'.format(acc_lg))

23/03/23 13:07:40 WARN DAGScheduler: Broadcasting large task binary with size 6.2 MiB




Accuracy of model: 0.6790793257049555


                                                                                

#### Nhận xét:
- using pyspark can lift up the computing process
- Naive Bayes Algorithm also given a better results than normal Machine Learning 