# CHAPTER 11 - EXERCISE 1: HAM VS SPAM

In [2]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [3]:
sc= SparkContext(master= 'local', appName= 'Chapter 11 - Exercise 1: Ham vs Spam')
ss= SparkSession(sc)

## Nhập dữ liệu

In [28]:
path= '/Users/vovanthuong/Desktop/9 - Big Data in Machine Learning/Data/LDS9_Data_Day_9_Day_10/smsspamcollection/SMSSpamCollection'
data= ss.read.csv(path, inferSchema= True, sep= '\t').toDF('label', 'text')

In [31]:
data.printSchema()

root
 |-- label: string (nullable = true)
 |-- text: string (nullable = true)



In [30]:
data.show(3)

+-----+--------------------+
|label|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
+-----+--------------------+
only showing top 3 rows



## Mô tả dữ liệu

In [34]:
data.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  ham| 4827|
| spam|  747|
+-----+-----+



In [37]:
from pyspark.sql.functions import length
data.withColumn('length', length('text')).groupBy('label').mean().show()

+-----+-----------------+
|label|      avg(length)|
+-----+-----------------+
|  ham|71.45431945307645|
| spam|138.6706827309237|
+-----+-----------------+



## Tạo tập train và test

In [32]:
train, test= data.randomSplit([0.8, 0.2])

## Xử lý dữ liệu

In [55]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from pyspark.ml import Pipeline
# 1
regex_tokenizer= RegexTokenizer(inputCol= 'text', outputCol= 'words',
                                pattern= '\\W', toLowercase= True)
# 2
locale = sc._jvm.java.util.Locale
locale.setDefault(locale.forLanguageTag("en-US"))
stop_words_remover= StopWordsRemover(inputCol= 'words', outputCol= 'words_filtered')

# 3: TF
count_vectorizer= CountVectorizer(inputCol= 'words_filtered', outputCol= 'tf')

# 4
tf_idf= IDF(inputCol= 'tf', outputCol= 'tf_idf')

# 5
str_indexer= StringIndexer(inputCol= 'label', outputCol= 'label_idx')

In [161]:
process= Pipeline(stages=[regex_tokenizer, # 1
                          stop_words_remover, # 2
                          count_vectorizer, # 3
                            tf_idf, # 4
                            str_indexer]) # 5
process_model= process.fit(train)

## Xây dựng mô hình

In [162]:
train_cleaned= process_model.transform(train)

In [163]:
from pyspark.ml.classification import NaiveBayes
naive_bayes= NaiveBayes(featuresCol= 'tf_idf', labelCol= 'label_idx')

naive_bayes_model= naive_bayes.fit(train_cleaned)

## Đánh giá mô hình

In [164]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

area_under_ROC= BinaryClassificationEvaluator(labelCol= 'label_idx', 
                                                  metricName= 'areaUnderROC')

area_under_PR= BinaryClassificationEvaluator(labelCol= 'label_idx',
                                                  metricName= 'areaUnderPR')

accuracy= MulticlassClassificationEvaluator(labelCol= 'label_idx',
                                                  predictionCol= 'prediction',
                                                  metricName= 'accuracy')
f1= MulticlassClassificationEvaluator(labelCol= 'label_idx',
                                                  predictionCol= 'prediction',
                                                  metricName= 'f1')

precision= MulticlassClassificationEvaluator(labelCol= 'label_idx',
                                                  predictionCol= 'prediction',
                                                  metricName= 'weightedPrecision')

recall= MulticlassClassificationEvaluator(labelCol= 'label_idx',
                                                  predictionCol= 'prediction',
                                                  metricName= 'weightedRecall')

In [165]:
def evaluator(data_result):
    data_result.crosstab(col1= 'prediction', col2= 'label_idx').show()
    print('areaUnderROC:' ,area_under_ROC.evaluate(data_result))
    print('areaUnderPR:' ,area_under_PR.evaluate(data_result))
    print('accuracy:' ,accuracy.evaluate(data_result))
    print('f1:' ,f1.evaluate(data_result))
    print('precision:' ,precision.evaluate(data_result))
    print('recall:' ,recall.evaluate(data_result))

### Trên tập train

In [166]:
train_result= naive_bayes_model.transform(train_cleaned)

In [167]:
evaluator(train_result)

+--------------------+----+---+
|prediction_label_idx| 0.0|1.0|
+--------------------+----+---+
|                 1.0|  29|607|
|                 0.0|3855|  3|
+--------------------+----+---+

areaUnderROC: 0.19158337694788202
areaUnderPR: 0.08126654772202971
accuracy: 0.9928793947485536
f1: 0.9929417301943152
precision: 0.993138699791233
recall: 0.9928793947485536


### Trên tập test

In [169]:
test_cleaned= process_model.transform(test)
test_result= naive_bayes_model.transform(test_cleaned)

In [170]:
evaluator(test_result)

+--------------------+---+---+
|prediction_label_idx|0.0|1.0|
+--------------------+---+---+
|                 1.0| 17|132|
|                 0.0|926|  5|
+--------------------+---+---+

areaUnderROC: 0.20756863868226128
areaUnderPR: 0.07641802632099842
accuracy: 0.9796296296296296
f1: 0.9799917600878113
precision: 0.9808376673013096
recall: 0.9796296296296296
