# RANDOM FOREST

Xây dựng Random Forest model để dự đoán flight delay từ 'mon', 'dom', 'dow', 'carrier', 'org', 'km' = mile * 1.60934, 'depart', 'duration'. (Lưu ý: pipeline bên dưới hiện chưa tạo và chưa sử dụng cột 'km'.)

In [1]:
# findspark.init() is called before pyspark is imported; keep this order.
# NOTE(review): presumably findspark locates the local Spark installation and
# makes pyspark importable — confirm against the findspark documentation.
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) a local Spark session. getOrCreate() makes this cell safe to
# re-run: constructing SparkContext directly a second time fails with
# "Cannot run multiple SparkContexts at once".
ss = (
    SparkSession.builder
    .master('local')
    .appName('Chapter 7 - Demo Random Forest')
    .getOrCreate()
)
# Keep `sc` defined for any code that works with the context directly.
sc = ss.sparkContext

In [11]:
# Location of the flights CSV file on the local machine.
path = '/Users/vovanthuong/Desktop/9 - Big Data in Machine Learning/Data/Chapter6/flights.csv'
# The first row holds the column names; column types are inferred from the data.
df = ss.read.csv(
    path,
    header=True,
    inferSchema=True,
)

In [4]:
# Inspect the inferred column names and types of the loaded data.
df.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: string (nullable = true)



In [5]:
# Preview the first three rows of the raw data.
df.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351|   NA|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 3 rows



## Xác định biến số outcome

In [12]:
# Build the categorical target: a flight counts as delayed (label 1) when its
# `delay` value is >= 15, otherwise 0. Rows whose label is missing (null) are
# removed so that only labelled rows remain.
df = (
    df
    .withColumn('delay_label', (df['delay'] >= 15).cast('integer'))
    .dropna(subset=['delay_label'])
)
df.show(3)

+---+---+---+-------+------+---+----+------+--------+-----+-----------+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|delay_label|
+---+---+---+-------+------+---+----+------+--------+-----+-----------+
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|          1|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|          0|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|          0|
+---+---+---+-------+------+---+----+------+--------+-----+-----------+
only showing top 3 rows



## Tạo tập train và test

In [8]:
# 80/20 train/test split. A fixed seed keeps the split reproducible between
# runs, so the metrics computed further down can be regenerated.
train, test = df.randomSplit([0.8, 0.2], seed=42)

## Tạo Pipeline xử lý dữ liệu

In [18]:
from pyspark.ml.feature import SQLTransformer, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

In [13]:
# Input columns kept for modelling (the label column is appended in the query).
col_lst = [
    'mon',
    'dom',
    'dow',
    'carrier',
    'org',
    'depart',
    'duration',
]
# First pipeline stage: project the DataFrame down to the inputs plus the label.
# __THIS__ is the placeholder SQLTransformer uses for the incoming DataFrame.
select_columns = SQLTransformer(
    statement='SELECT {cols_input}, {output} FROM __THIS__'.format(
        cols_input=','.join(col_lst),
        output='delay_label',
    )
)

In [16]:
# The two string-typed columns are converted to index columns
# ('carrier_idx', 'org_idx') so they can be placed in the feature vector.
carrier_string_indexer = StringIndexer(
    inputCol='carrier',
    outputCol='carrier_idx',
)
org_string_indexer = StringIndexer(
    inputCol='org',
    outputCol='org_idx',
)

In [17]:
# Combine the numeric columns and the indexed categorical columns into the
# single 'features' vector column consumed by the classifier.
vector_assembler = VectorAssembler(
    inputCols=[
        'mon',
        'dom',
        'dow',
        'carrier_idx',
        'org_idx',
        'depart',
        'duration',
    ],
    outputCol='features',
)

In [19]:
# Pre-processing pipeline: column selection -> string indexing -> vector assembly.
pipe_line = Pipeline(
    stages=[
        select_columns,
        carrier_string_indexer,
        org_string_indexer,
        vector_assembler,
    ]
)

In [20]:
# Fit the pre-processing stages on the training split only; the fitted model is
# reused below to transform both the train and the test data.
pipe_line_model= pipe_line.fit(train)

## Xây dựng mô hình Random Forest

In [23]:
# Apply the fitted pre-processing pipeline to the training split.
train_cleaned= pipe_line_model.transform(train)

In [31]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest on the assembled 'features' vector, predicting 'delay_label'.
# An explicit seed makes the trained forest (and the metrics below)
# reproducible between runs; all other hyper-parameters keep their defaults.
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol='delay_label',
    seed=42,
)
rf_model = rf.fit(train_cleaned)

## Đánh giá mô hình

In [29]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Two evaluators over the same label column: the binary one is used below for
# areaUnderROC, the multiclass one for accuracy.
bi_classification_evaluator = BinaryClassificationEvaluator(
    labelCol='delay_label',
)
mutil_classification_evaluator = MulticlassClassificationEvaluator(
    labelCol='delay_label',
)

### Với tập train

In [32]:
# Score the training data with the fitted random forest.
train_result= rf_model.transform(train_cleaned)

In [47]:
# Confusion-matrix style counts: one row per (actual label, predicted label) pair.
train_result.groupBy('delay_label', 'prediction').count().show()

+-----------+----------+-----+
|delay_label|prediction|count|
+-----------+----------+-----+
|          1|       0.0| 6508|
|          0|       0.0|11231|
|          1|       1.0|12773|
|          0|       1.0| 7130|
+-----------+----------+-----+



In [37]:
# Accuracy of the model on the training data.
mutil_classification_evaluator.setMetricName('accuracy').evaluate(train_result)

0.637691939854418

In [35]:
# Area under the ROC curve on the training data.
bi_classification_evaluator.setMetricName('areaUnderROC').evaluate(train_result)

0.6890839734532368

### Với tập test

In [39]:
# Transform the test split with the pipeline that was fitted on the training split.
test_cleaned= pipe_line_model.transform(test)

In [42]:
# Score the held-out test data with the fitted random forest.
test_result= rf_model.transform(test_cleaned)

In [48]:
# Confusion-matrix style counts for the test data: (actual label, predicted label).
test_result.groupBy('delay_label', 'prediction').count().show()

+-----------+----------+-----+
|delay_label|prediction|count|
+-----------+----------+-----+
|          1|       0.0| 1619|
|          0|       0.0| 2855|
|          1|       1.0| 3143|
|          0|       1.0| 1763|
+-----------+----------+-----+



In [43]:
# Accuracy of the model on the test data.
mutil_classification_evaluator.setMetricName('accuracy').evaluate(test_result)

0.6394456289978678

In [44]:
# Area under the ROC curve on the test data.
bi_classification_evaluator.setMetricName('areaUnderROC').evaluate(test_result)

0.6904382473199338