# CHAPTER 11 - EXERCISE 2

In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [2]:
sc= SparkContext(master= 'local', appName= 'Chapter 11 - Exercise 2')
ss= SparkSession(sc)

## Nhập dữ liệu

In [3]:
path= '/Users/vovanthuong/Desktop/9 - Big Data in Machine Learning/Data/LDS9_Data_Day_9_Day_10/Musical_Instruments_5.json'
data= ss.read.json(path)

In [4]:
data.printSchema()

root
 |-- asin: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [5]:
data.show(3)

+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin| helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|1384719342|  [0, 0]|    5.0|Not much to write...|02 28, 2014|A2IBPI20UZIR0U|cassandra tu "Yea...|                good|    1393545600|
|1384719342|[13, 14]|    5.0|The product does ...|03 16, 2013|A14VAT5EAX3D9S|                Jake|                Jake|    1363392000|
|1384719342|  [1, 1]|    5.0|The primary job o...|08 28, 2013|A195EZSQDW3E21|Rick Bennette "Ri...|It Does The Job Well|    1377648000|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
only showing top 3 rows



## Mô tả dữ liệu

In [6]:
data.groupBy('overall').count().show()

+-------+-----+
|overall|count|
+-------+-----+
|    1.0|  217|
|    4.0| 2084|
|    3.0|  772|
|    2.0|  250|
|    5.0| 6938|
+-------+-----+



In [9]:
from pyspark.sql.functions import length
data.withColumn('length', length('reviewText'))\
.select('overall', 'length')\
.groupBy('overall').mean().show()

+-------+------------+-----------------+
|overall|avg(overall)|      avg(length)|
+-------+------------+-----------------+
|    1.0|         1.0|539.0829493087558|
|    4.0|         4.0|540.3258157389636|
|    3.0|         3.0|579.2111398963731|
|    2.0|         2.0|          614.032|
|    5.0|         5.0|452.9315364658403|
+-------+------------+-----------------+



## Tạo tập train, test

In [10]:
train, test= data.randomSplit([0.8, 0.2])

## Xử lý dữ liệu

In [101]:
from pyspark.ml.feature import SQLTransformer, RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.feature import IDF, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline

In [92]:
# 1
select_columns= SQLTransformer(statement= 'SELECT overall, reviewText FROM __THIS__')
# 2
sql_query_label= '''
SELECT *, 
        CASE WHEN (overall <= 2) THEN 'Not_like' 
             WHEN (overall >= 4)  THEN 'Like' 
             ELSE 'Neutral' 
        END as label
FROM __THIS__'''
create_label_output= SQLTransformer(statement= sql_query_label)
# 3
regex_tokenizer= RegexTokenizer(inputCol= 'reviewText', outputCol= 'words',
                                pattern= '\\W', toLowercase= True)
# 4
locale = sc._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag("en-US"))
stop_words_remover= StopWordsRemover(inputCol= 'words', outputCol= 'words_filtered')
# 5 TF
count_vectorizer= CountVectorizer(inputCol= 'words_filtered', outputCol= 'tf')
# 6 TF-IDF
tf_idf= IDF(inputCol= 'tf', outputCol= 'tf_idf')
# 7
str_indexer= StringIndexer(inputCol= 'label', outputCol= 'label_idx')


In [93]:
process= Pipeline(stages= [select_columns, # 1
                           create_label_output, # 2
                           regex_tokenizer, # 3
                           stop_words_remover, # 4
                           count_vectorizer, #5
                           tf_idf, # 6
                           str_indexer]) # 7
process_model= process.fit(train)

## Xây dựng mô hình phân loại

In [94]:
train_cleaned= process_model.transform(train)

### NaiveBayes

In [95]:
from pyspark.ml.classification import NaiveBayes
naive_bayes= NaiveBayes(featuresCol= 'tf_idf', labelCol= 'label_idx', predictionCol= 'prediction_idx')

# relabel
label_lis= process_model.stages[6].labels
condition_lis= []
for index, label in zip(range(len(label_lis)), label_lis):
    condition_lis.append("WHEN (prediction_idx == {index}) THEN '{label}'".format(index= index, label= label)) 

sql_query_relabel= '''
SELECT *, CASE {condition}  END as prediction_label
FROM __THIS__
'''.format(condition= ' '.join(condition_lis))

create_label_prediction= SQLTransformer(statement= sql_query_relabel)

naive_bayes_pipe= Pipeline(stages= [process, # 1
                                    naive_bayes, # 2
                                    create_label_prediction]) # 3

naive_bayes_model= naive_bayes_pipe.fit(train)

#### Đánh giá tập train

In [96]:
from pyspark.ml.evaluation import  MulticlassClassificationEvaluator

accuracy= MulticlassClassificationEvaluator(labelCol= 'label_idx',
                                                  predictionCol= 'prediction_idx',
                                                  metricName= 'accuracy')
f1= MulticlassClassificationEvaluator(labelCol= 'label_idx',
                                                  predictionCol= 'prediction_idx',
                                                  metricName= 'f1')

precision= MulticlassClassificationEvaluator(labelCol= 'label_idx',
                                                  predictionCol= 'prediction_idx',
                                                  metricName= 'weightedPrecision')

recall= MulticlassClassificationEvaluator(labelCol= 'label_idx',
                                                  predictionCol= 'prediction_idx',
                                                  metricName= 'weightedRecall')
def evaluator(data_result):
    data_result.crosstab(col1= 'prediction_label', col2= 'label').show()
    print('accuracy:' ,accuracy.evaluate(data_result))
    print('f1:' ,f1.evaluate(data_result))
    print('precision:' ,precision.evaluate(data_result))
    print('recall:' ,recall.evaluate(data_result))

In [97]:
train_naive_bayes_result= naive_bayes_model.transform(train)
evaluator(train_naive_bayes_result)

+----------------------+----+-------+--------+
|prediction_label_label|Like|Neutral|Not_like|
+----------------------+----+-------+--------+
|              Not_like| 110|      5|     364|
|                  Like|6809|     75|      20|
|               Neutral| 258|    552|       7|
+----------------------+----+-------+--------+

accuracy: 0.9420731707317073
f1: 0.9450874556874358
precision: 0.9515093842235908
recall: 0.9420731707317074


#### Đánh giá tập test

In [98]:
test_naive_bayes_result= naive_bayes_model.transform(test)
evaluator(test_naive_bayes_result)

+----------------------+----+-------+--------+
|prediction_label_label|Like|Neutral|Not_like|
+----------------------+----+-------+--------+
|              Not_like|  44|     19|      18|
|                  Like|1679|     99|      42|
|               Neutral| 122|     22|      16|
+----------------------+----+-------+--------+

accuracy: 0.834061135371179
f1: 0.8386283445303847
precision: 0.8433780085890712
recall: 0.8340611353711791
