In [1]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import plotly.offline as py


In [18]:
import plotly.express as px

In [2]:
spark = SparkSession.builder.appName("flights").getOrCreate()
import json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType, TimestampType
with open("../../util/schema.json","r") as f:
    schema = StructType.fromJson(json.load(f))

23/01/05 11:23:15 WARN Utils: Your hostname, MacBook-Air-di-Teodoro.local resolves to a loopback address: 127.0.0.1; using 192.168.240.184 instead (on interface en0)
23/01/05 11:23:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/05 11:23:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df = spark.read.csv("../../data.nosync/cleaned/cleaned_flights.csv",schema=schema, header=True)

In [4]:
# create a new column 'label' that is 1 if the flight is delayed and 0 if it is not
df = df.withColumn("label", when(df["ArrDelay"] > 0, 1).otherwise(0))

In [5]:
features = [
 'Quarter',
 'Month',
 'DayofMonth',
 'DayOfWeek',
 'Reporting_Airline',
 'Origin',
 'Dest',
 #'DepDelay', # not used in our case
 'CRSDepTime',
 'CRSArrTime',
 'CRSElapsedTime',
 'AirTime',
 'Distance',
 'ORIGIN_STATE',
 'DEST_STATE',
 'label'
 ]


# mantain only the features in features list
df = df.select(features)

In [7]:
# reduce the dataset to 10% of the original size for computational reasons
df = df.sample(False,0.1,seed=42)

In [6]:
# get the number of rows in the dataframe
df.count()

                                                                                

6246739

In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorIndexer, StringIndexer,\
                                OneHotEncoder,VectorAssembler
from pyspark.ml.feature import OneHotEncoder

In [9]:
df.dtypes

[('Quarter', 'int'),
 ('Month', 'int'),
 ('DayofMonth', 'int'),
 ('DayOfWeek', 'int'),
 ('Reporting_Airline', 'string'),
 ('Origin', 'string'),
 ('Dest', 'string'),
 ('CRSDepTime', 'int'),
 ('CRSArrTime', 'int'),
 ('CRSElapsedTime', 'double'),
 ('AirTime', 'double'),
 ('Distance', 'double'),
 ('ORIGIN_STATE', 'string'),
 ('DEST_STATE', 'string'),
 ('label', 'int')]

In [11]:
# filter the categorical columns
categoricalCols = [field for (field, dataType) in df.dtypes if dataType == "string"]

# define column name for the index and the one hot encoded columns
indexOutputCols = [x + "Index" for x in categoricalCols]
oheOutputCols = [x + "OHE" for x in categoricalCols]

# use the StringIndexer to index the categorical columns
stringIndexer = StringIndexer(inputCols=categoricalCols,
                                outputCols=indexOutputCols,
                                handleInvalid="skip")
# one hot encode the indexed columns
oheEncoder = OneHotEncoder(inputCols=indexOutputCols,
                            outputCols=oheOutputCols)

# get the numeric columns
numericCols = [field for (field, dataType) in df.dtypes
                    if ((dataType == "double" or dataType == "int" ) & (field != "label"))]
                    
# assemble the one hot encoded columns and the numeric columns
assemblerInputs = oheOutputCols + numericCols

# assemble the columns into a vector
vecAssembler = VectorAssembler(inputCols=assemblerInputs,
                        outputCol="features")


In [12]:
# create the pipeline
pipeline = Pipeline(stages=[stringIndexer, oheEncoder, vecAssembler])

# fit the pipeline to the data
pipelineModel = pipeline.fit(df)

# transform the data
df_proc = pipelineModel.transform(df)

                                                                                

In [13]:
# select the features and label columns from the 
# preprocessed data
df_proc = df_proc.select("features","label")

In [14]:
df_proc.show(5)

23/01/05 11:27:22 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+--------------------+-----+
|            features|label|
+--------------------+-----+
|(761,[8,15,389,64...|    0|
|(761,[8,20,340,64...|    0|
|(761,[8,33,339,64...|    1|
|(761,[8,75,330,67...|    1|
|(761,[8,94,330,66...|    0|
+--------------------+-----+
only showing top 5 rows



In [20]:
# count the number of rows with label 1
ones = df_proc.filter(df_proc.label == 1).count()
ones

                                                                                

2501251

In [17]:
# count the number of rows with label 0
zeros = df_proc.filter(df_proc.label == 0).count()
zeros

                                                                                

3745488

In [25]:
# the dataset in unbalanced
fig = px.bar(x=["ones","zeros"],y=[ones,zeros])
fig.update_layout(height=400, width=400)
py.iplot(fig)


In [27]:
sample_rate = ones/zeros
sample_rate

0.6678037681605175

In [28]:
# split the data into train and test
train, test = df_proc.randomSplit([0.8, 0.2], seed=42)


In [29]:
train_1 = train.filter(train.label == 1)
# sample the rows with label 0 to make the dataset balanced
train_0 = train.filter(train.label == 0).sample(False, sample_rate, seed=42)

# merge the two datasets
train = train_1.union(train_0)

In [30]:
# shuffle the dataset
train = train.orderBy(rand())

In [31]:
train.filter(train.label == 1).count()

                                                                                

2001882

In [32]:
train.filter(train.label == 0).count()

                                                                                

2002297

The dataset is now balanced for the training part

In [None]:
# use a grid search to find the best hyperparameters
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42)
paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [2,5,6]) \
    .addGrid(rf.maxBins, [30,60]) \
    .addGrid(rf.numTrees, [30, 60]) \
    .build()

# use the grid with 5-fold cross validation
crossval = CrossValidator(estimator=rf,
                            estimatorParamMaps=paramGrid,
                            evaluator=MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy"),
                            numFolds=5)

# fit the model
cvModel = crossval.fit(train)

# get the best model
bestModel = cvModel.bestModel

# get the best hyperparameters
bestModel.extractParamMap()


The model obtained from the search has: maxDepth = 6, maxBins = 60, numTrees = 60

In [33]:
# create the model
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=60,maxDepth=6,maxBins=60)

In [None]:
# train the model
model = rf.fit(train)

In [35]:
# evaluate the model
predictions = model.transform(train)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

# print the accuracy
print("Accuracy = %g " % accuracy)



Test Error = 0.41831 
Accuracy = 0.58169 


                                                                                

In [36]:
# save the model on disk
model.save("../../models/random_forest_model")

                                                                                

Our model's performance on this task is not as high as we would like, but this is expected given the limited information we had to work with (specifically, we do not have access to departure delay data). Despite this, our results are consistent with those obtained by other researchers working on similar tasks. We believe that by using the full dataset and expanding our grid search, we may be able to improve the model's performance. We hope to continue refining our techniques and exploring new data sources in order to achieve better results in the future.