In [79]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql import functions as F
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn.metrics import confusion_matrix

### Load data

In [3]:
# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Model Training")
    .getOrCreate()
)

In [28]:
# Explicit schema for the flights CSV, as (column name, Spark type) pairs in
# file order. Every field is declared with nullable=False.
# NOTE(review): the printSchema output further down reports nullable = true,
# so the flag does not appear to be enforced on load — confirm.
_FLIGHT_COLUMNS = [
    ('YEAR', IntegerType),
    ('MONTH', IntegerType),
    ('DAY', IntegerType),
    ('DAY_OF_WEEK', IntegerType),
    ('AIRLINE', StringType),
    ('FLIGHT_NUMBER', IntegerType),
    ('TAIL_NUMBER', StringType),
    ('ORIGIN_AIRPORT', StringType),
    ('DESTINATION_AIRPORT', StringType),
    ('SCHEDULED_DEPARTURE', IntegerType),
    ('DEPARTURE_TIME', IntegerType),
    ('DEPARTURE_DELAY', IntegerType),
    ('TAXI_OUT', IntegerType),
    ('WHEELS_OFF', IntegerType),
    ('SCHEDULED_TIME', IntegerType),
    ('ELAPSED_TIME', IntegerType),
    ('AIR_TIME', IntegerType),
    ('DISTANCE', IntegerType),
    ('WHEELS_ON', IntegerType),
    ('TAXI_IN', IntegerType),
    ('SCHEDULED_ARRIVAL', IntegerType),
    ('ARRIVAL_TIME', IntegerType),
    ('ARRIVAL_DELAY', IntegerType),
    ('DIVERTED', IntegerType),
    ('CANCELLED', IntegerType),
    ('CANCELLATION_REASON', StringType),
    ('AIR_SYSTEM_DELAY', IntegerType),
    ('SECURITY_DELAY', IntegerType),
    ('AIRLINE_DELAY', IntegerType),
    ('LATE_AIRCRAFT_DELAY', IntegerType),
    ('WEATHER_DELAY', IntegerType),
]

flight_schema = StructType(
    [StructField(name, spark_type(), False) for name, spark_type in _FLIGHT_COLUMNS]
)

In [29]:
path = 'flights_csv/train'

# Configure a CSV reader that treats the first line as a header and applies
# the explicit schema, then load everything under `path`.
flight_reader = (
    spark.read
    .format('csv')
    .option('header', True)
    .schema(flight_schema)
)
flight_df = flight_reader.load(path)

In [30]:
# Print the schema Spark attached to the loaded frame (names, types, nullability).
flight_df.printSchema()

root
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- DAY_OF_WEEK: integer (nullable = true)
 |-- AIRLINE: string (nullable = true)
 |-- FLIGHT_NUMBER: integer (nullable = true)
 |-- TAIL_NUMBER: string (nullable = true)
 |-- ORIGIN_AIRPORT: string (nullable = true)
 |-- DESTINATION_AIRPORT: string (nullable = true)
 |-- SCHEDULED_DEPARTURE: integer (nullable = true)
 |-- DEPARTURE_TIME: integer (nullable = true)
 |-- DEPARTURE_DELAY: integer (nullable = true)
 |-- TAXI_OUT: integer (nullable = true)
 |-- WHEELS_OFF: integer (nullable = true)
 |-- SCHEDULED_TIME: integer (nullable = true)
 |-- ELAPSED_TIME: integer (nullable = true)
 |-- AIR_TIME: integer (nullable = true)
 |-- DISTANCE: integer (nullable = true)
 |-- WHEELS_ON: integer (nullable = true)
 |-- TAXI_IN: integer (nullable = true)
 |-- SCHEDULED_ARRIVAL: integer (nullable = true)
 |-- ARRIVAL_TIME: integer (nullable = true)
 |-- ARRIVAL_DELAY: integer (null

In [32]:
# Row count of the loaded frame, before any cleaning.
flight_df.count()

435991

In [33]:
# Preview the first five rows of the raw frame.
flight_df.show(n=5)

+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+
|YEAR|MONTH|DAY|DAY_OF_WEEK|AIRLINE|FLIGHT_NUMBER|TAIL_NUMBER|ORIGIN_AIRPORT|DESTINATION_AIRPORT|SCHEDULED_DEPARTURE|DEPARTURE_TIME|DEPARTURE_DELAY|TAXI_OUT|WHEELS_OFF|SCHEDULED_TIME|ELAPSED_TIME|AIR_TIME|DISTANCE|WHEELS_ON|TAXI_IN|SCHEDULED_ARRIVAL|ARRIVAL_TIME|ARRIVAL_DELAY|DIVERTED|CANCELLED|CANCELLATION_REASON|AIR_SYSTEM_DELAY|SECURITY_DELAY|AIRLINE_DELAY|LATE_AIRCRAFT_DELAY|WEATHER_DELAY|
+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+-

### Cleaning data

In [34]:
# Columns judged not useful for the models; they are dropped before training.
removed_columns = [
    'CANCELLATION_REASON',
    'AIR_SYSTEM_DELAY',
    'SECURITY_DELAY',
    'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY',
    'WEATHER_DELAY',
]

In [35]:
# helper that strips unwanted columns from a dataframe
def eliminate_columns(removed_columns, df):
    """Return `df` without the columns named in `removed_columns`.

    Parameters
    ----------
    removed_columns : iterable of str
        Column names to drop; they are unpacked into a single `df.drop(...)` call.
    df : DataFrame-like
        Any object exposing `drop(*names)`.

    Returns
    -------
    Whatever `df.drop(*removed_columns)` returns.
    """
    return df.drop(*removed_columns)

In [36]:
# Drop the unwanted columns from the loaded frame.
flightsRawDf = eliminate_columns(removed_columns=removed_columns, df=flight_df)

In [37]:
# Keep only complete rows: drop every row with a null/NaN in any remaining column.
# DataFrame.dropna and DataFrame.na.drop are aliases of each other, so a single
# pass is sufficient — the second call in the earlier version was a no-op.
flightsDf = flightsRawDf.dropna(how="any")

### Training

In [38]:
# label each flight as either not delayed (0) or delayed (1)
def return_label(data):
    """Convert a delay value into a binary label.

    Parameters
    ----------
    data : int, float or None
        Delay value (presumably minutes — confirm against the data source).

    Returns
    -------
    int or None
        1 when `data` is greater than 5, 0 when it is less than or equal to 5,
        and None when `data` is None or cannot be ordered against 5 (NaN).
    """
    # A null delay has no label; comparing None with an int would raise
    # TypeError on Python 3 and fail the whole UDF task.
    if data is None:
        return None

    # more than 5 counts as delayed
    if data > 5:
        return 1

    # 5 or less (including early flights) counts as not delayed
    if data <= 5:
        return 0

    # NaN compares False both ways, so it ends up unlabeled
    return None

In [39]:
# Wrap the labelling function as a Spark UDF that yields integers.
return_label_udf = udf(return_label, returnType=IntegerType())

In [40]:
# Derive one binary label column per delay column using the labelling UDF.
flightsDf = flightsDf.withColumn('binaryDeptDelay', return_label_udf('DEPARTURE_DELAY'))
flightsDf = flightsDf.withColumn('binaryArrDelay', return_label_udf('ARRIVAL_DELAY'))

In [41]:
# Numeric columns carried into modelling. The raw delay columns and the two
# binary labels are listed here too; they are removed again when the feature
# list is assembled further down.
nums_cols = [
    # calendar / flight identity
    'MONTH', 'DAY', 'DAY_OF_WEEK', 'FLIGHT_NUMBER',
    # departure
    'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY',
    'TAXI_OUT', 'WHEELS_OFF',
    # in-flight
    'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
    # arrival
    'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME', 'ARRIVAL_DELAY',
    # labels
    'binaryDeptDelay', 'binaryArrDelay',
]

In [42]:
# Projection of the numeric/label columns only.
# NOTE(review): `my_data` is not referenced by any later cell in view — confirm it is still needed.
my_data = flightsDf.select(nums_cols)

In [43]:
# String-typed columns that must be indexed and one-hot encoded before modelling.
str_cols = [
    'AIRLINE',
    'TAIL_NUMBER',
    'ORIGIN_AIRPORT',
    'DESTINATION_AIRPORT',
]

In [44]:
# Output column names for the string indexer: "<column>_index".
str_cols_names_index = [f'{column}_index' for column in str_cols]

In [45]:
# Indexer mapping each string column to its "<column>_index" counterpart.
df_indexer = StringIndexer(
    inputCols=str_cols,
    outputCols=str_cols_names_index,
)

In [46]:
# Output column names for the one-hot encoder: "<column>_vec".
output_cols_ohe = [f'{column}_vec' for column in str_cols]

In [47]:
# One-hot encoder turning each "<column>_index" into a "<column>_vec" vector column.
df_encoder = OneHotEncoder(
    inputCols=str_cols_names_index,
    outputCols=output_cols_ohe,
)

In [48]:
# Vector columns fed to the assembler. Derived from the encoder's output names
# instead of being re-typed by hand, so the two lists cannot drift apart if
# `str_cols` changes. A copy is taken so later list operations do not touch
# `output_cols_ohe`.
vec_cols = list(output_cols_ohe)

In [49]:
# Four classifiers in total: {decision tree, gradient-boosted trees} x
# {departure label, arrival label}. All read the assembled 'features' column.

# decision trees, depth capped at 3
dt_bin_dept = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='binaryDeptDelay',
    maxDepth=3,
)
dt_bin_arr = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='binaryArrDelay',
    maxDepth=3,
)

# gradient-boosted trees, 3 boosting iterations
gbt_bin_dept = GBTClassifier(
    featuresCol="features",
    labelCol="binaryDeptDelay",
    maxIter=3,
)
gbt_bin_arr = GBTClassifier(
    featuresCol="features",
    labelCol="binaryArrDelay",
    maxIter=3,
)

In [50]:
# Feature columns = numeric columns + encoded vector columns, minus the two
# labels and the raw delay values the labels are computed from.
# NOTE(review): several remaining columns (e.g. DEPARTURE_TIME, ARRIVAL_TIME,
# ELAPSED_TIME) look like they are only known after the flight — confirm they
# are legitimate predictors for the intended use.
excluded_from_features = {
    'binaryDeptDelay',
    'binaryArrDelay',
    'DEPARTURE_DELAY',
    'ARRIVAL_DELAY',
}
input_cols = [
    column for column in nums_cols + vec_cols
    if column not in excluded_from_features
]

In [51]:
# Assemble every feature column into the single "features" vector the classifiers read.
assembler = VectorAssembler(
    inputCols=input_cols,
    outputCol="features",
)

In [52]:
# Chain the indexer, the one-hot encoder and the assembler into one pipeline.
feature_stages = [df_indexer, df_encoder, assembler]
pipeline = Pipeline(stages=feature_stages)

# Fit the pipeline on the flights dataframe and apply it to the same data.
# Despite its name, `model` holds the transformed DataFrame, not a fitted model.
fitted_pipeline = pipeline.fit(flightsDf)
model = fitted_pipeline.transform(flightsDf)

In [53]:
# Random 80 % / 20 % train/test split with a fixed seed for repeatability.
split_weights = [0.8, 0.2]
train, test = model.randomSplit(split_weights, seed=77)

In [54]:
# Model 1: decision tree trained on the arrival-delay label.
dt_bin_arr_model = dt_bin_arr.fit(dataset=train)

In [55]:
# Persist the decision-tree arrival model. A plain save() fails when the target
# path already exists, which breaks Restart & Run All on a second run, so the
# writer is told to overwrite.
dt_bin_arr_model.write().overwrite().save('models/decisiontree_arrival')

In [56]:
# Model 2: decision tree trained on the departure-delay label.
dt_bin_dept_model = dt_bin_dept.fit(dataset=train)

In [57]:
# Persist the decision-tree DEPARTURE model. The earlier version saved
# dt_bin_arr_model (the arrival model) under this path by mistake. overwrite()
# replaces any wrongly-saved artifact already on disk and keeps the cell re-runnable.
dt_bin_dept_model.write().overwrite().save('models/decisiontree_departure')

In [58]:
# Model 3: gradient-boosted trees trained on the arrival-delay label.
gbt_bin_arr_model = gbt_bin_arr.fit(dataset=train)

In [60]:
# Persist the gradient-boosted ARRIVAL model. The earlier version saved the
# decision-tree arrival model under this path by mistake. overwrite() replaces
# any wrongly-saved artifact already on disk and keeps the cell re-runnable.
gbt_bin_arr_model.write().overwrite().save('models/gradientboost_arrival')

In [59]:
# Model 4: gradient-boosted trees trained on the departure-delay label.
gbt_bin_dept_model = gbt_bin_dept.fit(dataset=train)

In [62]:
# Persist the gradient-boosted DEPARTURE model. The earlier version saved the
# decision-tree arrival model under this path by mistake. overwrite() replaces
# any wrongly-saved artifact already on disk and keeps the cell re-runnable.
gbt_bin_dept_model.write().overwrite().save('models/gradientboost_departure')

### Evaluation

In [63]:
# Score the held-out test split with each of the four fitted models.
# decision trees
dt_arr_pred = dt_bin_arr_model.transform(dataset=test)
dt_dept_pred = dt_bin_dept_model.transform(dataset=test)
# gradient-boosted trees
gbt_arr_pred = gbt_bin_arr_model.transform(dataset=test)
gbt_dept_pred = gbt_bin_dept_model.transform(dataset=test)

In [70]:
# Preview true label vs. prediction for the decision-tree arrival model.
dt_arr_preview = dt_arr_pred.select(
    F.col("binaryArrDelay").alias("trueLabel"),
    "prediction",
    "probability",
)
dt_arr_preview.show(10)

+---------+----------+--------------------+
|trueLabel|prediction|         probability|
+---------+----------+--------------------+
|        0|       0.0|[0.83779369814689...|
|        0|       0.0|[0.83779369814689...|
|        1|       0.0|[0.70371813680542...|
|        1|       0.0|[0.83779369814689...|
|        1|       1.0|[0.40971299495083...|
|        0|       0.0|[0.83779369814689...|
|        0|       0.0|[0.83779369814689...|
|        0|       0.0|[0.83779369814689...|
|        0|       0.0|[0.70371813680542...|
|        0|       0.0|[0.70371813680542...|
+---------+----------+--------------------+
only showing top 10 rows



In [71]:
# Preview true label vs. prediction for the decision-tree departure model.
dt_dept_preview = dt_dept_pred.select(
    F.col("binaryDeptDelay").alias("trueLabel"),
    "prediction",
    "probability",
)
dt_dept_preview.show(10)

+---------+----------+--------------------+
|trueLabel|prediction|         probability|
+---------+----------+--------------------+
|        0|       0.0|[0.83592163381301...|
|        0|       0.0|[0.83592163381301...|
|        1|       0.0|[0.67202622367909...|
|        0|       0.0|[0.83592163381301...|
|        0|       0.0|[0.67202622367909...|
|        0|       0.0|[0.83592163381301...|
|        0|       0.0|[0.83592163381301...|
|        0|       0.0|[0.83592163381301...|
|        0|       0.0|[0.67202622367909...|
|        0|       0.0|[0.67202622367909...|
+---------+----------+--------------------+
only showing top 10 rows



In [72]:
# Preview true label vs. prediction for the gradient-boosted arrival model.
gbt_arr_preview = gbt_arr_pred.select(
    F.col("binaryArrDelay").alias("trueLabel"),
    "prediction",
    "probability",
)
gbt_arr_preview.show(10)

+---------+----------+--------------------+
|trueLabel|prediction|         probability|
+---------+----------+--------------------+
|        0|       0.0|[0.72582457909041...|
|        0|       0.0|[0.75925551885328...|
|        1|       0.0|[0.60415692399430...|
|        1|       0.0|[0.82195780690077...|
|        1|       1.0|[0.31866326616415...|
|        0|       0.0|[0.83405848844321...|
|        0|       0.0|[0.83517412976125...|
|        0|       0.0|[0.83046588977356...|
|        0|       0.0|[0.71551082877338...|
|        0|       0.0|[0.63456306500374...|
+---------+----------+--------------------+
only showing top 10 rows



In [73]:
# Preview true label vs. prediction for the gradient-boosted departure model.
gbt_dept_preview = gbt_dept_pred.select(
    F.col("binaryDeptDelay").alias("trueLabel"),
    "prediction",
    "probability",
)
gbt_dept_preview.show(10)

+---------+----------+--------------------+
|trueLabel|prediction|         probability|
+---------+----------+--------------------+
|        0|       0.0|[0.76948765764606...|
|        0|       0.0|[0.85119280675811...|
|        1|       0.0|[0.66155148258362...|
|        0|       0.0|[0.66155148258362...|
|        0|       0.0|[0.69374319039860...|
|        0|       0.0|[0.88339584263577...|
|        0|       0.0|[0.85119280675811...|
|        0|       0.0|[0.77348262313087...|
|        0|       0.0|[0.73102190894230...|
|        0|       0.0|[0.66155148258362...|
+---------+----------+--------------------+
only showing top 10 rows



In [78]:
# accuracy
def _accuracy_evaluator(label_col):
    """Build an evaluator that scores the 'prediction' column against `label_col` by accuracy."""
    return MulticlassClassificationEvaluator(
        labelCol=label_col,
        predictionCol="prediction",
        metricName="accuracy",
    )


# one evaluator per (model, label) pair, each a separate instance
evaluator1 = _accuracy_evaluator("binaryArrDelay")
evaluator2 = _accuracy_evaluator("binaryDeptDelay")
evaluator3 = _accuracy_evaluator("binaryArrDelay")
evaluator4 = _accuracy_evaluator("binaryDeptDelay")

accuracy1 = evaluator1.evaluate(dt_arr_pred)
accuracy2 = evaluator2.evaluate(dt_dept_pred)
accuracy3 = evaluator3.evaluate(gbt_arr_pred)
accuracy4 = evaluator4.evaluate(gbt_dept_pred)

print(f"Test Accuracy - Decision Tree - Arrival Delay: {accuracy1:.2f}")
print(f"Test Accuracy - Decision Tree - Departure Delay: {accuracy2:.2f}")
print(f"Test Accuracy - Gradient Boost Tree - Arrival Delay: {accuracy3:.2f}")
print(f"Test Accuracy - Gradient Boost Tree - Departure Delay: {accuracy4:.2f}")

Test Accuracy - Decision Tree - Arrival Delay: 0.76
Test Accuracy - Decision Tree - Departure Delay: 0.75
Test Accuracy - Gradient Boost Tree - Arrival Delay: 0.78
Test Accuracy - Gradient Boost Tree - Departure Delay: 0.78


In [81]:
# confusion matrix
def _collect_confusion_matrix(pred_df, label_col):
    """Compute a 2x2 confusion matrix for one prediction DataFrame.

    Parameters
    ----------
    pred_df : DataFrame
        Output of a fitted classifier's transform(); must contain `label_col`
        and a "prediction" column.
    label_col : str
        Name of the true-label column.

    Returns
    -------
    The array produced by sklearn's confusion_matrix, rows = true label,
    columns = predicted label, in label order [0, 1].
    """
    # Fetch labels and predictions in ONE action so each pair comes from the
    # same row. Two separate collect() calls run two independent Spark jobs,
    # whose row order is not guaranteed to match, and cost twice as much.
    rows = pred_df.select(label_col, "prediction").collect()

    # Unwrap the Row objects into plain ints (predictions arrive as 0.0 / 1.0).
    y_true = [int(row[0]) for row in rows]
    y_pred = [int(row[1]) for row in rows]

    # Pin the label order so the matrix stays 2x2 even if a class is absent.
    return confusion_matrix(y_true, y_pred, labels=[0, 1])


cm1 = _collect_confusion_matrix(dt_arr_pred, "binaryArrDelay")
print("Confusion Matrix 1:")
print(cm1)

cm2 = _collect_confusion_matrix(dt_dept_pred, "binaryDeptDelay")
print("Confusion Matrix 2:")
print(cm2)

cm3 = _collect_confusion_matrix(gbt_arr_pred, "binaryArrDelay")
print("Confusion Matrix 3:")
print(cm3)

cm4 = _collect_confusion_matrix(gbt_dept_pred, "binaryDeptDelay")
print("Confusion Matrix 4:")
print(cm4)

Confusion Matrix 1:
[[60108  1970]
 [18553  5204]]
Confusion Matrix 2:
[[61724   961]
 [20796  2354]]
Confusion Matrix 3:
[[60293  1785]
 [16954  6803]]
Confusion Matrix 4:
[[61743   942]
 [17976  5174]]
