In [71]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import plotly.offline as py


In [72]:
import plotly.express as px

In [73]:
spark = SparkSession.builder.appName("flights").getOrCreate()
import json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType, TimestampType
with open("../../util/schema.json","r") as f:
    schema = StructType.fromJson(json.load(f))

In [74]:
df = spark.read.csv("../../data.nosync/cleaned/cleaned_flights.csv",schema=schema, header=True)



In [75]:
# create the label column for the delays
df = df.withColumn("label", when(df["ArrDelay"] > 0, 1).otherwise(0))

In [76]:
features = [
 'Quarter',
 'Month',
 'DayofMonth',
 'DayOfWeek',
 'Reporting_Airline',
 'Origin',
 'Dest',
 #'DepDelay', # not used in our case
 'CRSDepTime',
 'CRSArrTime',
 'CRSElapsedTime',
 'AirTime',
 'Distance',
 'ORIGIN_STATE',
 'DEST_STATE',
 'label'
 ]


df = df.select(features)

In [77]:
# sampling
df = df.sample(False,0.1,seed=42)

In [6]:
df.count()

                                                                                

6246739

In [84]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier,RandomForestClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorIndexer, StringIndexer,\
                                OneHotEncoder,VectorAssembler
from pyspark.ml.feature import OneHotEncoder



In [9]:
df.dtypes

[('Quarter', 'int'),
 ('Month', 'int'),
 ('DayofMonth', 'int'),
 ('DayOfWeek', 'int'),
 ('Reporting_Airline', 'string'),
 ('Origin', 'string'),
 ('Dest', 'string'),
 ('CRSDepTime', 'int'),
 ('CRSArrTime', 'int'),
 ('CRSElapsedTime', 'double'),
 ('AirTime', 'double'),
 ('Distance', 'double'),
 ('ORIGIN_STATE', 'string'),
 ('DEST_STATE', 'string'),
 ('label', 'int')]

In [79]:
# indexing + ohe for categorical
categoricalCols = [field for (field, dataType) in df.dtypes if dataType == "string"]

indexOutputCols = [x + "Index" for x in categoricalCols]
oheOutputCols = [x + "OHE" for x in categoricalCols]


stringIndexer = StringIndexer(inputCols=categoricalCols,
                                outputCols=indexOutputCols,
                                handleInvalid="skip")

oheEncoder = OneHotEncoder(inputCols=indexOutputCols,
                            outputCols=oheOutputCols)


numericCols = [field for (field, dataType) in df.dtypes
                    if ((dataType == "double" or dataType == "int" ) & (field != "label"))]
                    

assemblerInputs = oheOutputCols + numericCols


vecAssembler = VectorAssembler(inputCols=assemblerInputs,
                        outputCol="features")


In [80]:
# pipeline

pipeline = Pipeline(stages=[stringIndexer, oheEncoder, vecAssembler])


pipelineModel = pipeline.fit(df)


df_proc = pipelineModel.transform(df)

                                                                                

In [81]:
df_proc = df_proc.select("features","label")

In [14]:
df_proc.show(5)

23/01/05 11:27:22 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+--------------------+-----+
|            features|label|
+--------------------+-----+
|(761,[8,15,389,64...|    0|
|(761,[8,20,340,64...|    0|
|(761,[8,33,339,64...|    1|
|(761,[8,75,330,67...|    1|
|(761,[8,94,330,66...|    0|
+--------------------+-----+
only showing top 5 rows



Find unbalanced data on label column

In [20]:
ones = df_proc.filter(df_proc.label == 1).count()
ones

                                                                                

2501251

In [17]:
zeros = df_proc.filter(df_proc.label == 0).count()
zeros

                                                                                

3745488

In [25]:
# the dataset in unbalanced
fig = px.bar(x=["ones","zeros"],y=[ones,zeros])
fig.update_layout(height=400, width=400)
py.iplot(fig)


In [27]:
sample_rate = ones/zeros
sample_rate

0.6678037681605175

In [28]:
# train test split
train, test = df_proc.randomSplit([0.8, 0.2], seed=42)


In [29]:
# balancing dataset
train_1 = train.filter(train.label == 1)

train_0 = train.filter(train.label == 0).sample(False, sample_rate, seed=42)

train = train_1.union(train_0)

In [30]:
train = train.orderBy(rand())

Verify wheter the dataset is balanced or no

In [31]:
train.filter(train.label == 1).count()

                                                                                

2001882

In [32]:
train.filter(train.label == 0).count()

                                                                                

2002297

The dataset is now balanced for the training part

In [None]:
# grid search with 5 fold CV
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42)
paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [2,5,6]) \
    .addGrid(rf.maxBins, [30,60]) \
    .addGrid(rf.numTrees, [30, 60]) \
    .build()


crossval = CrossValidator(estimator=rf,
                            estimatorParamMaps=paramGrid,
                            evaluator=MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy"),
                            numFolds=5)


cvModel = crossval.fit(train)
bestModel = cvModel.bestModel
bestModel.extractParamMap()


The model obtained from the search has: maxDepth = 6, maxBins = 60, numTrees = 60

In [33]:
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=60,maxDepth=6,maxBins=60)

In [None]:
model = rf.fit(train)

In [35]:
# evaluation
predictions = model.transform(train)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

print("Accuracy = %g " % accuracy)



Test Error = 0.41831 
Accuracy = 0.58169 


                                                                                

In [36]:
model.save("../../models/random_forest_model")

                                                                                

Our model's performance on this task is not as high as we would like, but this is expected given the limited information we had to work with (specifically, we do not have access to departure delay data). Despite this, our results are consistent with those obtained by other researchers working on similar tasks. We believe that by using the full dataset and expanding our grid search, we may be able to improve the model's performance. We hope to continue refining our techniques and exploring new data sources in order to achieve better results in the future.

In [85]:
# load the model from disk, by doing this we can avoid the training part
model = RandomForestClassificationModel.load("../../models/random_forest_model")


                                                                                

In [86]:
importance = model.featureImportances
feature_names = pipelineModel.stages[-1].getInputCols()

In [95]:
importance_list = importance.toArray().tolist()

In [113]:
# used to reconstruct importance for the one hot encoded features
dict = {
    "Reporting_AirlineOHE":15,
    "OriginOHE":313,
    "DestOHE":313,
    "ORIGIN_STATEOHE":54,
    "DEST_STATEOHE":54,
    "Quarter":1,
    "Month":1,
    "DayofMonth":1,
    "DayOfWeek":1,
    "CRSDepTime":1,
    "CRSArrTime":1,
    "CRSElapsedTime":1,
    "AirTime":1,
    "Distance":1
}

In [120]:
i = 3
agg_feature_importance = {}
for f in feature_names:
    h = 0
    for j in range(dict[f]):
        h += importance_list[i]
        i += 1
    agg_feature_importance[f] = h


In [121]:
# sort by most important feature
agg_feature_importance = {k: v for k, v in sorted(agg_feature_importance.items(), key=lambda item: item[1], reverse=True)}

In [123]:
fig = px.bar(x=list(agg_feature_importance.keys()),y=list(agg_feature_importance.values()))
fig.update_layout(title_text="Feature Importance")
fig.update_xaxes(title_text="Features")
fig.update_yaxes(title_text="Importance")
fig.show()