In [45]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql import functions as F
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from sklearn.metrics import confusion_matrix

### Load data

In [2]:
# Start (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("Model Training")
    .getOrCreate()
)

In [3]:
# Explicit schema for the flights CSV; the field order matches the column order
# of the loaded frame.
# Every field is declared nullable: the loaded frame reports all columns as
# nullable anyway (see the printSchema output below), and rows with missing
# values are removed explicitly in the cleaning step rather than by the schema.
flight_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ('YEAR', IntegerType()),
        ('MONTH', IntegerType()),
        ('DAY', IntegerType()),
        ('DAY_OF_WEEK', IntegerType()),
        ('AIRLINE', StringType()),
        ('FLIGHT_NUMBER', IntegerType()),
        ('TAIL_NUMBER', StringType()),
        ('ORIGIN_AIRPORT', StringType()),
        ('DESTINATION_AIRPORT', StringType()),
        ('SCHEDULED_DEPARTURE', IntegerType()),
        ('DEPARTURE_TIME', IntegerType()),
        ('DEPARTURE_DELAY', IntegerType()),
        ('TAXI_OUT', IntegerType()),
        ('WHEELS_OFF', IntegerType()),
        ('SCHEDULED_TIME', IntegerType()),
        ('ELAPSED_TIME', IntegerType()),
        ('AIR_TIME', IntegerType()),
        ('DISTANCE', IntegerType()),
        ('WHEELS_ON', IntegerType()),
        ('TAXI_IN', IntegerType()),
        ('SCHEDULED_ARRIVAL', IntegerType()),
        ('ARRIVAL_TIME', IntegerType()),
        ('ARRIVAL_DELAY', IntegerType()),
        ('DIVERTED', IntegerType()),
        ('CANCELLED', IntegerType()),
        ('CANCELLATION_REASON', StringType()),
        ('AIR_SYSTEM_DELAY', IntegerType()),
        ('SECURITY_DELAY', IntegerType()),
        ('AIRLINE_DELAY', IntegerType()),
        ('LATE_AIRCRAFT_DELAY', IntegerType()),
        ('WEATHER_DELAY', IntegerType()),
    ]
])

In [4]:
# Location of the training CSV files.
path = 'flights_csv/train'

# Read the CSV data with the explicit schema; the first line of each file is
# treated as a header row.
flight_df = spark.read.csv(path, schema=flight_schema, header=True)

In [5]:
# Print the column names, types and nullability of the loaded frame.
flight_df.printSchema()

root
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- FLIGHT_NUMBER: integer (nullable = true)
 |-- TAIL_NUMBER: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- SCHEDULED_DEPARTURE: integer (nullable = true)
 |-- DEPARTURE_TIME: integer (nullable = true)
 |-- DEPARTURE_DELAY: integer (nullable = true)
 |-- TAXI_OUT: integer (nullable = true)
 |-- WHEELS_OFF: integer (nullable = true)
 |-- SCHEDULED_TIME: integer (nullable = true)
 |-- ELAPSED_TIME: integer (nullable = true)
 |-- AIR_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- WHEELS_ON: integer (nullable = true)
 |-- TAXI_IN: integer (nullable = true)
 |-- SCHEDULED_ARRIVAL: integer (nullable = true)
 |-- ARRIVAL_TIME: integer (nullable = true)
 |-- ARRIVAL_DELAY: integer (null

In [6]:
# Number of rows in the raw training frame (before any cleaning).
flight_df.count()

435991

In [7]:
# Preview the first five raw rows.
flight_df.show(5)

+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+
|YEAR|MONTH|DAY|DAY_OF_WEEK|AIRLINE|FLIGHT_NUMBER|TAIL_NUMBER|ORIGIN_AIRPORT|DESTINATION_AIRPORT|SCHEDULED_DEPARTURE|DEPARTURE_TIME|DEPARTURE_DELAY|TAXI_OUT|WHEELS_OFF|SCHEDULED_TIME|ELAPSED_TIME|AIR_TIME|DISTANCE|WHEELS_ON|TAXI_IN|SCHEDULED_ARRIVAL|ARRIVAL_TIME|ARRIVAL_DELAY|DIVERTED|CANCELLED|CANCELLATION_REASON|AIR_SYSTEM_DELAY|SECURITY_DELAY|AIRLINE_DELAY|LATE_AIRCRAFT_DELAY|WEATHER_DELAY|
+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+-

### Cleaning data

In [8]:
# Columns judged not useful for the models; they are dropped before cleaning.
removed_columns = [
    'CANCELLATION_REASON',
    'AIR_SYSTEM_DELAY',
    'SECURITY_DELAY',
    'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY',
    'WEATHER_DELAY',
]

In [9]:
def eliminate_columns(removed_columns, df):
    """Return ``df`` without the columns named in ``removed_columns``.

    Args:
        removed_columns: iterable of column names to drop.
        df: object exposing ``drop(*names)`` (a Spark DataFrame in this notebook).

    Returns:
        The result of ``df.drop`` called with those names.
    """
    trimmed = df.drop(*removed_columns)
    return trimmed

In [10]:
# Drop the columns listed in removed_columns from the loaded frame.
flightsRawDf = eliminate_columns(removed_columns=removed_columns, df=flight_df)

In [11]:
# Drop every row that has a missing value in any remaining column.
# DataFrame.na.drop(...) and DataFrame.dropna(...) are the same operation, so a
# single call is sufficient.
flightsDf = flightsRawDf.dropna(how="any")

### Training

In [12]:
def return_label(data, threshold=5):
    """Turn a delay value into a binary delayed / not-delayed label.

    Args:
        data: delay value to classify; may be ``None`` for a missing value.
        threshold: largest delay still counted as "not delayed" (default 5).

    Returns:
        1 if ``data`` is strictly greater than ``threshold``, 0 if it is less
        than or equal to it, and ``None`` when ``data`` is missing or cannot be
        ordered against the threshold (e.g. NaN).
    """
    # A missing delay has no label; comparing None with a number would raise.
    if data is None:
        return None

    # delayed
    if data > threshold:
        return 1

    # on time or early
    if data <= threshold:
        return 0

    # neither comparison held (e.g. NaN): no label
    return None

In [13]:
# Wrap return_label as a Spark UDF that produces an integer column.
return_label_udf = udf(f=return_label, returnType=IntegerType())

In [14]:
# Add one binary label column per delay type, computed from the raw delay columns.
flightsDf = (
    flightsDf
    .withColumn('binaryDeptDelay', return_label_udf(col('DEPARTURE_DELAY')))
    .withColumn('binaryArrDelay', return_label_udf(col('ARRIVAL_DELAY')))
)

In [18]:
# Split the labelled flights: 80 % for training, 20 % held out for testing.
# A fixed seed is passed to the split.
train, test = flightsDf.randomSplit(weights=[0.8, 0.2], seed=77)

In [19]:
# Numeric columns considered for the models, followed by the raw delay columns
# and the two binary label columns.
nums_cols = [
    # calendar / flight identity
    'MONTH', 'DAY', 'DAY_OF_WEEK', 'FLIGHT_NUMBER',
    # departure
    'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF',
    # en route
    'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
    # arrival
    'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY',
    # labels
    'binaryDeptDelay', 'binaryArrDelay',
]

In [20]:
# Training rows restricted to the columns in nums_cols.
# NOTE(review): my_data is not referenced by any later cell visible here —
# confirm it is still needed.
my_data = train.select(nums_cols)

In [21]:
# String-typed columns; these are indexed and one-hot encoded before modelling.
str_cols = [
    'AIRLINE',
    'TAIL_NUMBER',
    'ORIGIN_AIRPORT',
    'DESTINATION_AIRPORT',
]

In [22]:
# Output column names for the indexer: "<column>_index" for each string column.
str_cols_names_index = ['{}_index'.format(name) for name in str_cols]

In [76]:
# Indexer that maps each string column to a numeric index column.
# handleInvalid="skip": per the StringIndexer documentation, rows holding a
# value the fitted indexer has not seen are filtered out at transform time.
df_indexer = StringIndexer(
    inputCols=str_cols,
    outputCols=str_cols_names_index,
    handleInvalid="skip",
)

In [78]:
# Output column names for the one-hot encoder: "<column>_vec" for each string column.
output_cols_ohe = ['{}_vec'.format(name) for name in str_cols]

In [79]:
# One-hot encoder: consumes the "*_index" columns and emits the "*_vec" columns.
df_encoder = OneHotEncoder(
    inputCols=str_cols_names_index,
    outputCols=output_cols_ohe,
)

In [80]:
# One-hot vector columns fed to the assembler. Taken from the encoder's output
# names (as a copy) so the two lists cannot drift apart.
vec_cols = list(output_cols_ohe)

In [81]:
# Four classifiers in total: a decision tree and a gradient-boosted tree for
# each of the two binary targets. All of them read the assembled 'features' column.

# decision trees, depth limited to 3
dt_bin_dept = DecisionTreeClassifier(
    labelCol='binaryDeptDelay',
    featuresCol='features',
    maxDepth=3,
)
dt_bin_arr = DecisionTreeClassifier(
    labelCol='binaryArrDelay',
    featuresCol='features',
    maxDepth=3,
)

# gradient-boosted trees, 3 boosting iterations
gbt_bin_dept = GBTClassifier(
    labelCol="binaryDeptDelay",
    featuresCol="features",
    maxIter=3,
)
gbt_bin_arr = GBTClassifier(
    labelCol="binaryArrDelay",
    featuresCol="features",
    maxIter=3,
)

In [82]:
# Feature columns = numeric columns + one-hot vector columns, minus the two
# binary labels and the raw delay columns those labels are computed from.
# NOTE(review): columns such as ARRIVAL_TIME and ELAPSED_TIME remain features;
# if they are only known once the flight has happened they leak the label —
# confirm this is intended.
non_feature_cols = ('binaryDeptDelay', 'binaryArrDelay', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY')
input_cols = [name for name in nums_cols + vec_cols if name not in non_feature_cols]

In [83]:
# Assemble all feature columns into the single "features" vector the classifiers read.
assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol="features",
)

In [84]:
# One pipeline per model. Each runs the same preparation stages (index the
# string columns, one-hot encode them, assemble the feature vector) and then
# the classifier.
feature_stages = [df_indexer, df_encoder, assembler]

pipeline1 = Pipeline(stages=feature_stages + [dt_bin_arr])    # decision tree, arrival
pipeline2 = Pipeline(stages=feature_stages + [dt_bin_dept])   # decision tree, departure
pipeline3 = Pipeline(stages=feature_stages + [gbt_bin_arr])   # gradient boost, arrival
pipeline4 = Pipeline(stages=feature_stages + [gbt_bin_dept])  # gradient boost, departure

In [85]:
# model 1: fit the decision-tree pipeline for the arrival-delay label on the training split
dt_bin_arr_model = pipeline1.fit(train)

In [87]:
# Persist the fitted decision-tree / arrival pipeline. overwrite() lets the
# notebook be re-run without failing because the directory already exists.
dt_bin_arr_model.write().overwrite().save('models/decisiontree_arrival')

In [88]:
# model 2: fit the decision-tree pipeline for the departure-delay label on the training split
dt_bin_dept_model = pipeline2.fit(train)

In [89]:
# Persist the fitted decision-tree / departure pipeline (the departure model,
# not the arrival one). overwrite() keeps the cell re-runnable.
dt_bin_dept_model.write().overwrite().save('models/decisiontree_departure')

In [90]:
# model 3: fit the gradient-boosted-tree pipeline for the arrival-delay label on the training split
gbt_bin_arr_model = pipeline3.fit(train)

In [91]:
# Persist the fitted gradient-boost / arrival pipeline (not the decision tree).
# overwrite() keeps the cell re-runnable.
gbt_bin_arr_model.write().overwrite().save('models/gradientboost_arrival')

In [92]:
# model 4: fit the gradient-boosted-tree pipeline for the departure-delay label on the training split
gbt_bin_dept_model = pipeline4.fit(train)

In [93]:
# Persist the fitted gradient-boost / departure pipeline (not the decision tree).
# overwrite() keeps the cell re-runnable.
gbt_bin_dept_model.write().overwrite().save('models/gradientboost_departure')

### Evaluation

In [94]:
# Score the held-out test split with each of the four fitted pipelines.
dt_arr_pred, dt_dept_pred, gbt_arr_pred, gbt_dept_pred = (
    fitted.transform(test)
    for fitted in (
        dt_bin_arr_model,
        dt_bin_dept_model,
        gbt_bin_arr_model,
        gbt_bin_dept_model,
    )
)

In [64]:
# Cast each prediction frame's label column from integer to double.
# NOTE(review): presumably done for the evaluation step below — confirm the
# cast is still required.
dt_arr_pred = dt_arr_pred.withColumn(
    "binaryArrDelay", F.col("binaryArrDelay").cast("double"))
dt_dept_pred = dt_dept_pred.withColumn(
    "binaryDeptDelay", F.col("binaryDeptDelay").cast("double"))
gbt_arr_pred = gbt_arr_pred.withColumn(
    "binaryArrDelay", F.col("binaryArrDelay").cast("double"))
gbt_dept_pred = gbt_dept_pred.withColumn(
    "binaryDeptDelay", F.col("binaryDeptDelay").cast("double"))

In [96]:
# Decision tree / arrival: preview ten test rows with the true label, predicted class and class probabilities.
dt_arr_pred.select(F.col("binaryArrDelay").alias("trueLabel"), "prediction", "probability").show(10)

+---------+----------+--------------------+
|trueLabel|prediction|         probability|
+---------+----------+--------------------+
|        0|       0.0|[0.83846775707005...|
|        0|       0.0|[0.83846775707005...|
|        1|       0.0|[0.70365735671538...|
|        1|       0.0|[0.83846775707005...|
|        1|       1.0|[0.40971299495083...|
|        0|       0.0|[0.83846775707005...|
|        0|       0.0|[0.83846775707005...|
|        0|       0.0|[0.83846775707005...|
|        0|       0.0|[0.70365735671538...|
|        0|       0.0|[0.70365735671538...|
+---------+----------+--------------------+
only showing top 10 rows



In [97]:
# Decision tree / departure: preview ten test rows with the true label, predicted class and class probabilities.
dt_dept_pred.select(F.col("binaryDeptDelay").alias("trueLabel"), "prediction", "probability").show(10)

+---------+----------+--------------------+
|trueLabel|prediction|         probability|
+---------+----------+--------------------+
|        0|       0.0|[0.83512153563536...|
|        0|       0.0|[0.83512153563536...|
|        1|       0.0|[0.67235782804722...|
|        0|       0.0|[0.83512153563536...|
|        0|       0.0|[0.67235782804722...|
|        0|       0.0|[0.83512153563536...|
|        0|       0.0|[0.83512153563536...|
|        0|       0.0|[0.83512153563536...|
|        0|       0.0|[0.67235782804722...|
|        0|       0.0|[0.67235782804722...|
+---------+----------+--------------------+
only showing top 10 rows



In [98]:
# Gradient boost / arrival: preview ten test rows with the true label, predicted class and class probabilities.
gbt_arr_pred.select(F.col("binaryArrDelay").alias("trueLabel"), "prediction", "probability").show(10)

+---------+----------+--------------------+
|trueLabel|prediction|         probability|
+---------+----------+--------------------+
|        0|       0.0|[0.72423757155220...|
|        0|       0.0|[0.76001959300861...|
|        1|       0.0|[0.60371639118539...|
|        1|       0.0|[0.72458858750353...|
|        1|       1.0|[0.31908507202742...|
|        0|       0.0|[0.83573443796068...|
|        0|       0.0|[0.83821880516585...|
|        0|       0.0|[0.83196720464952...|
|        0|       0.0|[0.71514993486589...|
|        0|       0.0|[0.63233097084936...|
+---------+----------+--------------------+
only showing top 10 rows



In [99]:
# Gradient boost / departure: preview ten test rows with the true label, predicted class and class probabilities.
gbt_dept_pred.select(F.col("binaryDeptDelay").alias("trueLabel"), "prediction", "probability").show(10)

+---------+----------+--------------------+
|trueLabel|prediction|         probability|
+---------+----------+--------------------+
|        0|       0.0|[0.76967127519336...|
|        0|       0.0|[0.84947408842368...|
|        1|       0.0|[0.66792923443400...|
|        0|       0.0|[0.66792923443400...|
|        0|       0.0|[0.65219257866957...|
|        0|       0.0|[0.87986448171688...|
|        0|       0.0|[0.84947408842368...|
|        0|       0.0|[0.76967127519336...|
|        0|       0.0|[0.73483413143308...|
|        0|       0.0|[0.66792923443400...|
+---------+----------+--------------------+
only showing top 10 rows



In [100]:
# AUC on the test split for each model. The four lines are printed in this order:
#   1. decision tree  / arrival delay
#   2. decision tree  / departure delay
#   3. gradient boost / arrival delay
#   4. gradient boost / departure delay
evaluator_auc_1 = BinaryClassificationEvaluator(labelCol='binaryArrDelay', rawPredictionCol='rawPrediction')
evaluator_auc_2 = BinaryClassificationEvaluator(labelCol='binaryDeptDelay', rawPredictionCol='rawPrediction')
evaluator_auc_3 = BinaryClassificationEvaluator(labelCol='binaryArrDelay', rawPredictionCol='rawPrediction')
evaluator_auc_4 = BinaryClassificationEvaluator(labelCol='binaryDeptDelay', rawPredictionCol='rawPrediction')

for evaluator, predictions in [
    (evaluator_auc_1, dt_arr_pred),
    (evaluator_auc_2, dt_dept_pred),
    (evaluator_auc_3, gbt_arr_pred),
    (evaluator_auc_4, gbt_dept_pred),
]:
    auc = evaluator.evaluate(predictions)
    print("Area under the curve (AUC) on test data = %g" % auc)

Area under the curve (AUC) on test data = 0.469191
Area under the curve (AUC) on test data = 0.548493
Area under the curve (AUC) on test data = 0.768001
Area under the curve (AUC) on test data = 0.77125


In [101]:
# Confusion matrices on the test split, one per model. Each call passes the
# true labels first and the predicted classes second.
def _labels_and_predictions(pred_df, label_col):
    """Fetch true labels and predictions in one pass so they stay row-aligned.

    Args:
        pred_df: prediction frame containing ``label_col`` and "prediction".
        label_col: name of the true-label column.

    Returns:
        A ``(y_true, y_pred)`` pair of plain Python lists of equal length.
    """
    # One collect() for both columns: a single Spark job, and each label is
    # paired with the prediction from the same row.
    rows = pred_df.select(label_col, "prediction").collect()
    y_true = [row[0] for row in rows]
    y_pred = [row[1] for row in rows]
    return y_true, y_pred


cm1 = confusion_matrix(*_labels_and_predictions(dt_arr_pred, "binaryArrDelay"))
print("Confusion Matrix 1:")
print(cm1)

cm2 = confusion_matrix(*_labels_and_predictions(dt_dept_pred, "binaryDeptDelay"))
print("Confusion Matrix 2:")
print(cm2)

cm3 = confusion_matrix(*_labels_and_predictions(gbt_arr_pred, "binaryArrDelay"))
print("Confusion Matrix 3:")
print(cm3)

cm4 = confusion_matrix(*_labels_and_predictions(gbt_dept_pred, "binaryDeptDelay"))
print("Confusion Matrix 4:")
print(cm4)

Confusion Matrix 1:
[[59915  2149]
 [18394  5352]]
Confusion Matrix 2:
[[61755   915]
 [20755  2385]]
Confusion Matrix 3:
[[60137  1927]
 [16748  6998]]
Confusion Matrix 4:
[[61912   758]
 [19293  3847]]
