# CHAPTER 8 - EXCERCISE 2

In [1]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [2]:
sc= SparkContext(master= 'local', appName= 'Chapter 8 - Exercise 2')
ss= SparkSession(sc)

## Nhập dữ liệu

In [3]:
path= '/Users/vovanthuong/Desktop/9 - Big Data in Machine Learning/Data/Chapter6/dog_food.csv'
df= ss.read.csv(path, header= True, inferSchema= True)

In [5]:
df.count()

490

In [4]:
df.show(3)

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
+---+---+----+---+-------+
only showing top 3 rows



In [6]:
df.groupBy('Spoiled').count().show()

+-------+-----+
|Spoiled|count|
+-------+-----+
|    0.0|  350|
|    1.0|  140|
+-------+-----+



## Kiểm tra dữ liệu

In [7]:
df.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



In [8]:
from pyspark.sql.functions import count, when, isnan, isnull, col
nan_data= df.select([count(when(isnan(c), c)).alias(c + '_nan') for c in df.columns]).toPandas().T
nan_data

Unnamed: 0,0
A_nan,0
B_nan,0
C_nan,0
D_nan,0
Spoiled_nan,0


In [9]:
null_data= df.select([count(when(isnull(c), c)).alias(c + '_null') for c in df.columns]).toPandas().T
null_data

Unnamed: 0,0
A_null,0
B_null,0
C_null,0
D_null,0
Spoiled_null,0


## Tách tập train và test

In [10]:
train, test= df.randomSplit([0.8, 0.2])

## Tạo model

In [13]:
from pyspark.ml.feature import VectorAssembler
input_cols= ['A', 'B', 'C', 'D']
vector_assembler= VectorAssembler(inputCols= input_cols, outputCol= 'features')
train_cleaned= vector_assembler.transform(train)

In [14]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier

In [25]:
# DecisionTreeClassifier
dt_classifier= DecisionTreeClassifier(featuresCol= 'features', labelCol= 'Spoiled')
dt_classifier_model= dt_classifier.fit(train_cleaned)

# RandomForestClassifier
rf_classifier= RandomForestClassifier(featuresCol= 'features', labelCol= 'Spoiled', numTrees= 30)
rf_classifier_model= rf_classifier.fit(train_cleaned)

# GBTClassifier
gbt_classifier= GBTClassifier(featuresCol= 'features',labelCol= 'Spoiled')
gbt_classifier_model= gbt_classifier.fit(train_cleaned)

## Đánh giá model

In [28]:
def model_evaluator(data):
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
    mutil_classification_evaluator= MulticlassClassificationEvaluator(labelCol= 'Spoiled')
    accuracy= mutil_classification_evaluator.setMetricName('accuracy')
    f1= mutil_classification_evaluator.setMetricName('f1')
    bi_classification_evaluator= BinaryClassificationEvaluator(labelCol= 'Spoiled')
    auc= bi_classification_evaluator.setMetricName('areaUnderROC')
    for model in [dt_classifier_model, rf_classifier_model, gbt_classifier_model]:
        print(model)
        data_result= model.transform(data)
        data_result.groupBy('Spoiled', 'prediction').count().show()
        print('Accuracy: ', accuracy.evaluate(data_result))
        print('F1: ', f1.evaluate(data_result))
        print('AUC: ', auc.evaluate(data_result))
        print()

### Trên tập train

In [29]:
model_evaluator(train_cleaned)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_8ecd95221e96) of depth 5 with 17 nodes
+-------+----------+-----+
|Spoiled|prediction|count|
+-------+----------+-----+
|    1.0|       1.0|  109|
|    0.0|       1.0|    1|
|    1.0|       0.0|    4|
|    0.0|       0.0|  288|
+-------+----------+-----+

Accuracy:  0.9875106380372476
F1:  0.9875106380372476
AUC:  0.9804636065774566

RandomForestClassificationModel (uid=RandomForestClassifier_be54e7a9b10a) with 30 trees
+-------+----------+-----+
|Spoiled|prediction|count|
+-------+----------+-----+
|    1.0|       1.0|  109|
|    0.0|       1.0|    1|
|    1.0|       0.0|    4|
|    0.0|       0.0|  288|
+-------+----------+-----+

Accuracy:  0.9875106380372476
F1:  0.9875106380372476
AUC:  0.999372263220749

GBTClassificationModel (uid=GBTClassifier_d2a03f3d691d) with 20 trees
+-------+----------+-----+
|Spoiled|prediction|count|
+-------+----------+-----+
|    1.0|       1.0|  113|
|    0.0|       1.0|    1|
|    0.0|      

### Trên tập test

In [30]:
test_cleaned= vector_assembler.transform(test)

In [31]:
model_evaluator(test_cleaned)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_8ecd95221e96) of depth 5 with 17 nodes
+-------+----------+-----+
|Spoiled|prediction|count|
+-------+----------+-----+
|    1.0|       1.0|   26|
|    0.0|       1.0|    1|
|    1.0|       0.0|    1|
|    0.0|       0.0|   60|
+-------+----------+-----+

Accuracy:  0.9772727272727273
F1:  0.9772727272727273
AUC:  0.9984820886460231

RandomForestClassificationModel (uid=RandomForestClassifier_be54e7a9b10a) with 30 trees
+-------+----------+-----+
|Spoiled|prediction|count|
+-------+----------+-----+
|    1.0|       1.0|   26|
|    1.0|       0.0|    1|
|    0.0|       0.0|   61|
+-------+----------+-----+

Accuracy:  0.9885753531634802
F1:  0.9885753531634802
AUC:  0.9969641772920461

GBTClassificationModel (uid=GBTClassifier_d2a03f3d691d) with 20 trees
+-------+----------+-----+
|Spoiled|prediction|count|
+-------+----------+-----+
|    1.0|       1.0|   26|
|    0.0|       1.0|    1|
|    1.0|       0.0|    1|
|    0.0|     

In [35]:
vector_assembler.getInputCols()

['A', 'B', 'C', 'D']

In [32]:
rf_classifier_model.featureImportances

SparseVector(4, {0: 0.0252, 1: 0.0313, 2: 0.9129, 3: 0.0306})