# DECISION TREE

Xây dựng Decision Tree model để dự đoán flight delay từ các biến 'mon', 'dom', 'dow', 'carrier', 'org', 'km' (= mile * 1.60934), 'depart', 'duration'.

In [1]:
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the local Spark session for this notebook.
# getOrCreate() makes the cell safe to re-run: constructing SparkContext directly
# raises an error when a context is already active in the process.
ss = (
    SparkSession.builder
    .master('local')
    .appName('Chapter 7 - Demo Decision Tree')
    .getOrCreate()
)
# Keep the `sc` handle available, as in the original cell.
sc = ss.sparkContext

In [3]:
# Read the flights CSV: the first row is the header and column types are inferred.
path = '/Users/vovanthuong/Desktop/9 - Big Data in Machine Learning/Data/Chapter6/flights.csv'
df = ss.read.options(header=True, inferSchema=True).csv(path)

In [4]:
# Print the inferred column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: string (nullable = true)



In [6]:
# Preview the first three rows of the raw data.
df.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



## Xác định biến số outcome

In [7]:
# Tạo biến output là biến phân loại với định nghĩa delay nếu thời gian delay >= 15. Loại bỏ các dòng bị missing ở output
# Build the binary target: 1 when delay >= 15, otherwise 0.
# Rows where the target is missing are dropped.
df = (
    df.withColumn('delay_label', (df['delay'] >= 15).cast('integer'))
      .dropna(subset=['delay_label'])
)

In [None]:
# NOTE(review): this cell has no execution count (In [None]) and the 'laber' column does
# not appear in the df.show() output that follows, so it looks like it was never run.
# The column name is presumably a typo for 'label', and the threshold (5) differs from the
# 15 used for 'delay_label' — confirm whether this cell should be removed.
df= df.withColumn('laber', (df.delay >= 5).cast('integer'))

In [10]:
# Preview the first three rows after the target column has been added.
df.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+-----------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|delay_label|
+---+---+---+-------+------+---+----+------+--------+-----+-----------+
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|          1|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|          0|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|          0|
+---+---+---+-------+------+---+----+------+--------+-----+-----------+
only showing top 3 rows



## Tạo dữ liệu train và test

In [8]:
# 80/20 train/test split. The seed is fixed so the split, and therefore every
# metric computed further down, can be reproduced when the notebook is re-run.
train, test = df.randomSplit([0.8, 0.2], seed=42)

## Tạo Pipeline xử lý dữ liệu

In [29]:
from pyspark.ml.feature import SQLTransformer

# Columns passed through unchanged; 'mile' is replaced by a derived, rounded 'km' column.
col_lst = ['mon', 'dom', 'dow', 'carrier', 'org', 'depart', 'duration', 'delay_label']
select_and_create_km = SQLTransformer(
    statement=f"SELECT {','.join(col_lst)}, round(mile * 1.60934) as km FROM __THIS__"
)

In [30]:
from pyspark.ml.feature import StringIndexer

# One indexer per string column; each writes its numeric index to '<column>_idx'.
carrier_str_indexer, org_str_indexer = (
    StringIndexer(inputCol=name, outputCol=f'{name}_idx')
    for name in ('carrier', 'org')
)

In [31]:
from pyspark.ml.feature import VectorAssembler

# Combine the numeric and indexed columns into a single 'features' vector column.
vec_assembler = VectorAssembler(
    inputCols=[
        'mon',
        'dom',
        'dow',
        'carrier_idx',
        'org_idx',
        'depart',
        'duration',
        'km',
    ],
    outputCol='features',
)

In [32]:
from pyspark.ml import Pipeline

# Preprocessing stages, applied in order: column selection + km, string indexing, assembling.
pipeline = Pipeline(
    stages=[
        select_and_create_km,
        carrier_str_indexer,
        org_str_indexer,
        vec_assembler,
    ]
)

## Xây dựng mô hình Decision Tree

In [33]:
# Fit the preprocessing pipeline on the training split only,
# then replace `train` with its transformed version.
process = pipeline.fit(train)
train = process.transform(train)

In [38]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree classifier over the assembled feature vector, predicting 'delay_label'.
decision_tree = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='delay_label',
)
decision_tree_model = decision_tree.fit(train)

## Đánh giá mô hình

In [65]:
from pyspark.ml.evaluation import (
    BinaryClassificationEvaluator,
    MulticlassClassificationEvaluator,
)

# Two evaluators sharing the same label column; the metric is chosen at evaluation time.
bi_classification_evaluator = BinaryClassificationEvaluator(labelCol='delay_label')
mutil_classification_evaluator = MulticlassClassificationEvaluator(labelCol='delay_label')

### Đánh giá trên tập train

In [52]:
# Apply the fitted decision tree model to the (already preprocessed) training data.
train_result= decision_tree_model.transform(train)

In [64]:
# Area under the ROC curve on the training predictions.
bi_classification_evaluator.setMetricName('areaUnderROC')
bi_classification_evaluator.evaluate(train_result)

0.63510363891115

In [63]:
# Accuracy on the training predictions.
mutil_classification_evaluator.setMetricName('accuracy')
mutil_classification_evaluator.evaluate(train_result)

0.6353715848445424

### Đánh giá trên tập test

In [51]:
# Run the test split through the preprocessing pipeline that was fitted on the training split.
# `test` is overwritten with the transformed DataFrame.
test= process.transform(test)

In [66]:
# Apply the fitted decision tree model to the preprocessed test data.
test_result= decision_tree_model.transform(test)

In [67]:
# Area under the ROC curve on the test predictions.
bi_classification_evaluator.setMetricName('areaUnderROC')
bi_classification_evaluator.evaluate(test_result)

0.6366087149648738

In [68]:
# Accuracy on the test predictions.
mutil_classification_evaluator.setMetricName('accuracy')
mutil_classification_evaluator.evaluate(test_result)

0.6405598888770168