In [None]:
import plotly.express as px
import plotly.graph_objs as go
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import plotly.offline as py


In [None]:
import json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType, TimestampType

# Start the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("flights")
    .getOrCreate()
)

# Build the DataFrame schema from its JSON description on disk.
with open("../../util/schema.json", "r") as schema_file:
    schema_json = json.load(schema_file)
schema = StructType.fromJson(schema_json)

In [None]:
# Load the cleaned flights CSV; the first line is a header and the column
# types come from the schema loaded from schema.json.
df = spark.read.csv(
    path="../../data.nosync/cleaned/cleaned_flights.csv",
    schema=schema,
    header=True,
)

In [None]:
# Binary target: 'label' is 1 when ArrDelay > 0 (flight arrived late), else 0.
# NOTE(review): a null ArrDelay does not satisfy the condition and therefore
# ends up as label 0 — confirm the cleaned data contains no such rows.
is_delayed = col("ArrDelay") > 0
df = df.withColumn("label", when(is_delayed, 1).otherwise(0))

In [None]:
# Display the column names available after the 'label' column was added.
df.columns

In [None]:
# Columns kept for modelling. The order is significant: it becomes the column
# order of the DataFrame after the select below.
features = [
    # calendar
    "Quarter",
    "Month",
    "DayofMonth",
    "DayOfWeek",
    # carrier and route
    "Reporting_Airline",
    "Origin",
    "Dest",
    # "DepDelay" is commented out, i.e. not part of the feature set
    # schedule and flight characteristics
    "CRSDepTime",
    "CRSArrTime",
    "CRSElapsedTime",
    # NOTE(review): AirTime looks like a post-flight measurement — confirm it
    # is legitimately available when predicting a delay.
    "AirTime",
    "Distance",
    # states
    "ORIGIN_STATE",
    "DEST_STATE",
    # target
    "label",
]

# maintain only the columns named in the features list
df = df.select(*features)

In [None]:
# Work on a 7% random sample (without replacement); the seed keeps it repeatable.
df = df.sample(withReplacement=False, fraction=0.07, seed=42)

In [None]:
# Preview the first 10 rows of the sampled data.
df.show(n=10)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.feature import OneHotEncoder

In [None]:
# Display the (column name, type name) pairs; the feature-preparation cell
# below uses these type names to separate string columns from numeric ones.
df.dtypes

In [None]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Every string-typed column is treated as a categorical feature.
categoricalCols = [name for name, dtype in df.dtypes if dtype == "string"]

# Each categorical column gets an "<name>Index" and an "<name>OHE" companion.
indexOutputCols = [f"{name}Index" for name in categoricalCols]
oheOutputCols = [f"{name}OHE" for name in categoricalCols]

# Stage 1: category strings -> numeric indices (configured with handleInvalid="skip").
stringIndexer = StringIndexer(
    inputCols=categoricalCols,
    outputCols=indexOutputCols,
    handleInvalid="skip",
)

# Stage 2: numeric indices -> one-hot encoded vectors.
oheEncoder = OneHotEncoder(
    inputCols=indexOutputCols,
    outputCols=oheOutputCols,
)

# Numeric inputs: only columns typed "double" or "int", never the target itself.
numericCols = [
    name
    for name, dtype in df.dtypes
    if dtype in ("double", "int") and name != "label"
]

# Stage 3: concatenate the encoded categoricals and the numerics into "features".
assemblerInputs = oheOutputCols + numericCols
vecAssembler = VectorAssembler(
    inputCols=assemblerInputs,
    outputCol="features",
)


In [None]:
# Chain the three preprocessing stages: index -> one-hot encode -> assemble.
preprocessing_stages = [stringIndexer, oheEncoder, vecAssembler]
pipeline = Pipeline(stages=preprocessing_stages)

# Fit the stages on the data.
# NOTE(review): the fit uses the whole of df, i.e. before the train/test split
# performed further down — confirm this is acceptable for the evaluation.
pipelineModel = pipeline.fit(df)

# Apply the fitted stages to obtain the "features" vector column.
df_proc = pipelineModel.transform(df)

In [None]:
# Keep only the assembled feature vector and the target column.
df_proc = df_proc.select(["features", "label"])

In [None]:
# Preview the first 5 processed rows.
df_proc.show(n=5)


In [None]:
# How many processed rows carry label 1?
df_proc.where(col("label") == 1).count()

In [None]:
# How many processed rows carry label 0?
df_proc.where(col("label") == 0).count()

In [None]:
# Fraction of label-0 rows to keep so that both classes end up roughly the
# same size. It is derived from the current data rather than from hard-coded
# counts, so it stays correct when the input file, the sampling fraction or
# the seed changes.
label_counts = {
    row["label"]: row["count"]
    for row in df_proc.groupBy("label").count().collect()
}
n_delayed = label_counts.get(1, 0)
n_on_time = label_counts.get(0, 0)

# Sampling without replacement cannot use a fraction above 1.0, and an empty
# label-0 class must not cause a division by zero. A conditional is used
# instead of min() because the wildcard import from pyspark.sql.functions
# replaces the builtin min with a column function.
if n_on_time > 0 and n_delayed < n_on_time:
    sample_rate = n_delayed / n_on_time
else:
    sample_rate = 1.0
sample_rate

In [None]:
# Hold out 20% of the processed rows for testing; the seed keeps the split repeatable.
train, test = df_proc.randomSplit(weights=[0.8, 0.2], seed=42)


In [None]:

# Rebalance the training split: every label-1 row is kept, while label-0 rows
# are randomly thinned (without replacement) at sample_rate.
# NOTE: this cell reassigns `train`, so running it a second time thins the
# label-0 rows again.
train_1 = train.where(col("label") == 1)
train_0 = train.where(col("label") == 0).sample(
    withReplacement=False,
    fraction=sample_rate,
    seed=42,
)

# merge the two datasets
train = train_1.union(train_0)

In [None]:
# Shuffle the rebalanced training rows so the two classes are interleaved
# rather than stacked by the union. The seed makes the random sort key
# repeatable across runs, in line with the seeded sampling and splitting
# used elsewhere in this notebook.
train = train.orderBy(rand(seed=42))

In [None]:
# How many rebalanced training rows carry label 1?
train.where(col("label") == 1).count()

In [None]:
# How many rebalanced training rows carry label 0?
train.where(col("label") == 0).count()

In [None]:
# use a grid search to find the best hyperparameters
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=42)

# 3 depths x 2 bin counts x 2 forest sizes = 12 candidate settings
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.maxDepth, [2, 4, 6])
    .addGrid(rf.maxBins, [20, 60])
    .addGrid(rf.numTrees, [20, 60])
    .build()
)

# use the grid with 5-fold cross validation, scoring each candidate by accuracy;
# the seed fixes the fold assignment so the search is repeatable
crossval = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy"),
    numFolds=5,
    seed=42,
)

# fit the model (one fit per candidate setting and fold)
cvModel = crossval.fit(train)

# get the best model
bestModel = cvModel.bestModel

# get the best hyperparameters
bestModel.extractParamMap()



In [None]:
# create the model
# The seed matches the classifier used for the grid search, so that training
# and the reported metric are repeatable across runs.
# NOTE(review): numTrees=30 and maxDepth=10 lie outside the grid searched
# above (depth 2/4/6, trees 20/60) — confirm these values are intentional.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=30,
    maxDepth=10,
    seed=42,
)


In [None]:
# Fit the random forest on the rebalanced training split.
model = rf.fit(dataset=train)

In [None]:
# Score the held-out test split. Predicting on the training rows would report
# training accuracy under the "Test Error" heading and leave the test split
# produced by randomSplit unused.
predictions = model.transform(test)

# accuracy = share of rows whose predicted class equals the true label
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

# print the accuracy
print("Accuracy = %g " % accuracy)



Test Error = 0.419311 
Accuracy = 0.580689 

In [None]:
# Put the predicted class next to the true label for the first 10 rows.
predictions.select(["prediction", "label"]).show(n=10)