In [1]:
import pandas as pd
from zipfile import ZipFile 
pd.set_option("display.max_columns",1000)

import findspark
import pyspark
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
findspark.init()
#findspark.init('C:\Spark\spark-3.1.1-bin-hadoop2.7')
findspark.find()

'C:\\Spark\\spark-3.1.1-bin-hadoop2.7'

## Create Spark df
In order to work with spark within python, we need to create a Spark Session. Here we call the app ' Project1 ' and we assign this to a python object named 'spark'

In [2]:
spark = SparkSession.builder.appName('Project1').getOrCreate()#.master("local") #?

In [3]:
spark.catalog.listTables()

[]

### CSV to python df
We pull in the flights.csv first as a pandas dataframe in python. Within python, we can do some basic cleaning and manipulation for the tasks ahead.

*important*: 

Here, in preparation for task 3, we create a column named 'DEP_HR' which is a string containing just the hour that the flight departed. We will use this as our y variable in our regression model

In [43]:
df = pd.read_csv('flights.csv.zip')
df = df.fillna(0)
df['DEP_TIME'] = df['DEP_TIME'].astype('int')
df['DEP_HR'] = df['DEP_TIME'].astype('str').str[:-2] 
df['ARR_TIME'] = df['ARR_TIME'].astype('int')
df = df.fillna(0)
df = df.astype('str')
df

Unnamed: 0,FL_DATE,TAIL_NUM,CARRIER,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,DEP_TIME,DEP_DELAY,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 19,DEP_HR
0,2019-01-01,N8974C,9E,AVL,"Asheville, NC",ATL,"Atlanta, GA",1658,-7.0,1758,-22.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16
1,2019-01-01,N922XJ,9E,JFK,"New York, NY",RDU,"Raleigh/Durham, NC",1122,-8.0,1255,-29.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11
2,2019-01-01,N326PQ,9E,CLE,"Cleveland, OH",DTW,"Detroit, MI",1334,-7.0,1417,-31.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13
3,2019-01-01,N135EV,9E,BHM,"Birmingham, AL",ATL,"Atlanta, GA",1059,-1.0,1255,-8.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
4,2019-01-01,N914XJ,9E,GTF,"Great Falls, MT",MSP,"Minneapolis, MN",1057,-3.0,1418,-17.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2019-01-01,N257NN,MQ,STL,"St. Louis, MO",ORD,"Chicago, IL",1220,14.0,1327,-7.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12
996,2019-01-01,N855AE,MQ,LGA,"New York, NY",CMH,"Columbus, OH",1048,-12.0,1233,-34.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
997,2019-01-01,N688AE,MQ,ORD,"Chicago, IL",COU,"Columbia, MO",2317,52.0,104,80.0,0.0,0,0.0,0.0,0.0,28.0,0.0,52.0,0.0,23
998,2019-01-01,N262NN,MQ,MSN,"Madison, WI",ORD,"Chicago, IL",0,0.0,0,0.0,1.0,B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


### Define Schema
for ease of loading into Spark, we make everything a string initially

In [44]:
from pyspark.sql.types import StructType, IntegerType, DateType
from pyspark.sql.types import *

schema = StructType([
    StructField("FL_DATE", StringType()),#, DateType()),
    StructField("TAIL_NUM", StringType()),
    StructField("CARRIER", StringType()),
    StructField("ORIGIN", StringType()),
    StructField("ORIGIN_CITY_NAME", StringType()),
    StructField("DEST", StringType()),
    StructField("DEST_CITY_NAME", StringType()),
    StructField("DEP_TIME", StringType()),
    StructField("DEP_DELAY", StringType()),#, DoubleType()),
    StructField("ARR_TIME", StringType()),
    StructField("ARR_DELAY", StringType()),#, DoubleType()),
    StructField("CANCELLED", StringType()),
    StructField("CANCELLATION_CODE", StringType()),
    StructField("DIVERTED", StringType()),
    StructField("CARRIER_DELAY", StringType()),
    StructField("WEATHER_DELAY", StringType()),
    StructField("NAS_DELAY", StringType()),
    StructField("SECURITY_DELAY", StringType()),
    StructField("LATE_AIRCRAFT_DELAY", StringType()),
    StructField("Unnamed: 19", StringType()),#, IntegerType())
    StructField("DEP_HR", StringType())
])

### Create Spark DF from pandas df
Now we create a Spark dataframe from our pandas dataframe in python. We print the schema just to review it and verify that everything worked correctly

In [45]:
#Create PySpark DataFrame from Pandas
flts=spark.createDataFrame(df,schema=schema) 
flts.printSchema()
flts.show()

root
 |-- FL_DATE: string (nullable = true)
 |-- TAIL_NUM: string (nullable = true)
 |-- CARRIER: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_CITY_NAME: string (nullable = true)
 |-- DEP_TIME: string (nullable = true)
 |-- DEP_DELAY: string (nullable = true)
 |-- ARR_TIME: string (nullable = true)
 |-- ARR_DELAY: string (nullable = true)
 |-- CANCELLED: string (nullable = true)
 |-- CANCELLATION_CODE: string (nullable = true)
 |-- DIVERTED: string (nullable = true)
 |-- CARRIER_DELAY: string (nullable = true)
 |-- WEATHER_DELAY: string (nullable = true)
 |-- NAS_DELAY: string (nullable = true)
 |-- SECURITY_DELAY: string (nullable = true)
 |-- LATE_AIRCRAFT_DELAY: string (nullable = true)
 |-- Unnamed: 19: string (nullable = true)
 |-- DEP_HR: string (nullable = true)

+----------+--------+-------+------+--------------------+----+------------------+--------+---------+-------

### Change Delays from String to Doubles
Finally, now that we have a Spark dataframe - we know that some variables need to be transformed into numerical values. So we convert the strings DEP_DELAY, ARR_DELAY and DEP_HR into Doubles.

In [7]:
from pyspark.sql.types import DateType
flts = flts.withColumn("DEP_DELAY", flts['DEP_DELAY'].cast(DoubleType())).withColumn('ARR_DELAY', flts['ARR_DELAY'].cast(DoubleType())).withColumn("DEP_HR", flts['DEP_HR'].cast(DoubleType()))
flts.show()
print(flts.schema)

+----------+--------+-------+------+--------------------+----+------------------+--------+---------+--------+---------+---------+-----------------+--------+-------------+-------------+---------+--------------+-------------------+-----------+------+
|   FL_DATE|TAIL_NUM|CARRIER|ORIGIN|    ORIGIN_CITY_NAME|DEST|    DEST_CITY_NAME|DEP_TIME|DEP_DELAY|ARR_TIME|ARR_DELAY|CANCELLED|CANCELLATION_CODE|DIVERTED|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|Unnamed: 19|DEP_HR|
+----------+--------+-------+------+--------------------+----+------------------+--------+---------+--------+---------+---------+-----------------+--------+-------------+-------------+---------+--------------+-------------------+-----------+------+
|2019-01-01|  N8974C|     9E|   AVL|       Asheville, NC| ATL|       Atlanta, GA|    1658|     -7.0|    1758|    -22.0|      0.0|                0|     0.0|          0.0|          0.0|      0.0|           0.0|                0.0|        0.0|  16.0|
|201

as the final step for the data preparation, we create a temporary SQL table named "flights". We will use this in the tasks to more easily manipulate the data in a "SQL-esque" environment

In [8]:
flts.createOrReplaceTempView("flights")

# Task 3 [45 points]
As a final task, your supervisor assigned to you to investigate if it is possible to train a linear
regression model that could predict the departure delay a flight may have by using, 
as input:
- its origin (column “ORIGIN”)
- its airways (column “CARRIER”)
- its departure time (column “DEP_TIME”)

Again you should use Python and DataFrames, this time with **MLlib**

You should pay attention to transform the string-based input features using the proper representation
format, and you should explain your choices.

Special attention should be given to the “DEP_TIME” column: 
your supervisor told you that you should only consider the corresponding hour of the day 
- (which, of course, should be transformed in a one-hot representation).

For your training and testing workflow you should first remove outliers (see Task 2) 
and then split your dataset into two parts:
- one that will be used for training reasons and it will contain 70% of the entries
- a second one containing the remaining entries andwhich will be used for the assessment of the model. 

No need to implement a more sophisticated evaluation process (e.g., based on k-fold) is required in this phase. 

Your code should:
- (a) prepare the feature vectors
- (b) prepare the training and testing datasets
- (c) train the model
- (d) print on the screen the first 10 predictions, i.e., pairs of feature vectors (in compact format) and predicted outputs, on the screen 
- (e) evaluate the accuracy of the model and display the corresponding metric on the screen. 

Your deliverables are the following:
- A Python file (named “task3.py”) containing the code of the preprocessing, training, and evaluation phase of your machine learning workflow.
- A report (named “task3.pdf”) explaining the basic intuition of your code and your design decisions and assumptions.
- A screenshot (named “task3.png”) showing the output of your machine learning workflow (e.g., showing the results in the console). 

## Removing outliers
here we find both the top and bottom 1 percentiles and use those values to refine the dataset we will use in our ML regression model

In [9]:
query = spark.sql("""
    SELECT
    ORIGIN
    , CARRIER
    , DEP_HR
    , DEP_DELAY
    FROM flights 
    order by DEP_DELAY desc;
    """)
perc1 = query.stat.approxQuantile("DEP_DELAY",[0.01],0.0)
perc1 = int(perc1[0])
perc99 = query.stat.approxQuantile("DEP_DELAY",[0.99],0.0)
perc99 = int(perc99[0])
print(perc1,' , ',perc99)

-15  ,  110


In [10]:
query = spark.sql("""
    SELECT
    ORIGIN
    , CARRIER
    , DEP_HR
    , DEP_DELAY
    FROM flights
    where DEP_DELAY between {} and {}
    order by DEP_DELAY desc;
    """.format(perc1,perc99))
query.show()

+------+-------+------+---------+
|ORIGIN|CARRIER|DEP_HR|DEP_DELAY|
+------+-------+------+---------+
|   TUS|     AA|  21.0|    110.0|
|   DFW|     AA|  23.0|    101.0|
|   GTF|     G4|  20.0|    101.0|
|   BLI|     G4|  18.0|    100.0|
|   CLT|     AA|  11.0|    100.0|
|   MCI|     AA|   7.0|    100.0|
|   PHX|     AA|  11.0|     96.0|
|   DFW|     AA|  14.0|     95.0|
|   AZA|     G4|  17.0|     94.0|
|   PHL|     AA|  null|     93.0|
|   CLT|     AA|  13.0|     83.0|
|   PHX|     AA|  22.0|     82.0|
|   TPA|     AA|  15.0|     81.0|
|   ORD|     AA|  21.0|     80.0|
|   CLT|     AA|  15.0|     79.0|
|   DFW|     AA|  11.0|     77.0|
|   RSW|     AA|  18.0|     71.0|
|   MIA|     AA|  18.0|     70.0|
|   MSN|     MQ|  19.0|     68.0|
|   ELP|     MQ|  17.0|     67.0|
+------+-------+------+---------+
only showing top 20 rows



## Prepare Feature Vectors
In the dataset, if we find any null values, we want to remove them. Another option would be to place a 0 here, but in this scenario that would lead to the creation of false data and would confuse our model.

Below, we can use the DEP_HR variable itself as it's numeric, but we need to One Hot Encode both ORIGIN and CARRIER as they are strings

In [11]:
prep = query.na.drop()
#prep.select("DEP_HR").distinct().orderBy("DEP_HR").show()

### Get Indexes

In [12]:
from pyspark.ml.feature import StringIndexer

ORGindexer = StringIndexer()\
.setInputCol("ORIGIN")\
.setOutputCol("ORIGIN_indx")

CARindexer = StringIndexer()\
.setInputCol("CARRIER")\
.setOutputCol("CARRIER_indx")

### One Hot Encode

In [13]:
from pyspark.ml.feature import OneHotEncoder

ORGencoder = OneHotEncoder(dropLast=False)\
.setInputCols(["ORIGIN_indx"])\
.setOutputCols(["ORIGIN_encd"])

CARencoder = OneHotEncoder(dropLast=False)\
.setInputCols(["CARRIER_indx"])\
.setOutputCols(["CARRIER_encd"])

### Vectorize
we need to place all of these One Hot Encoded options into a single vector so the ML model can easily read it

In [14]:
from pyspark.ml.feature import VectorAssembler
vector_assembler = VectorAssembler()\
.setInputCols(["ORIGIN_encd","CARRIER_encd","DEP_HR"])\
.setOutputCol("features")

### Initialize Pipeline
we tell Spark in which order we need it to make these transformations

In [15]:
from pyspark.ml import Pipeline
transformation_pipeline = Pipeline()\
.setStages([ORGindexer, CARindexer, ORGencoder, CARencoder, vector_assembler])
fitted_pipeline = transformation_pipeline.fit(prep)

### Regression dataset
here we have all the data we need and more

In [20]:
regr = fitted_pipeline.transform(prep)
regr.cache()
regr.show(5)

+------+-------+------+---------+-----------+------------+----------------+-------------+--------------------+
|ORIGIN|CARRIER|DEP_HR|DEP_DELAY|ORIGIN_indx|CARRIER_indx|     ORIGIN_encd| CARRIER_encd|            features|
+------+-------+------+---------+-----------+------------+----------------+-------------+--------------------+
|   TUS|     AA|  21.0|    110.0|       60.0|         0.0|(144,[60],[1.0])|(5,[0],[1.0])|(150,[60,144,149]...|
|   GTF|     G4|  20.0|    101.0|       71.0|         3.0|(144,[71],[1.0])|(5,[3],[1.0])|(150,[71,147,149]...|
|   DFW|     AA|  23.0|    101.0|        0.0|         0.0| (144,[0],[1.0])|(5,[0],[1.0])|(150,[0,144,149],...|
|   BLI|     G4|  18.0|    100.0|       55.0|         3.0|(144,[55],[1.0])|(5,[3],[1.0])|(150,[55,147,149]...|
|   CLT|     AA|  11.0|    100.0|        2.0|         0.0| (144,[2],[1.0])|(5,[0],[1.0])|(150,[2,144,149],...|
+------+-------+------+---------+-----------+------------+----------------+-------------+--------------------+
o

### SQL Temp table
again, we will place this data into a temporary SQL table so we can manipulate the dataset more easily

In [21]:
regr.createOrReplaceTempView("regr")

## Finalizing the dataset
before we begin the machine learning process, we need a dataset which has only the vectorized (and encoded) features and the variable we want to predict which we name here 'label'

In [30]:
rquery = spark.sql("""
    SELECT
    features
    , DEP_DELAY as label
    FROM regr
    group by features
    , DEP_DELAY;
    """)
rquery.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(150,[60,144,149]...|110.0|
|(150,[71,147,149]...|101.0|
|(150,[0,144,149],...|101.0|
|(150,[55,147,149]...|100.0|
|(150,[2,144,149],...|100.0|
|(150,[36,144,149]...|100.0|
|(150,[1,144,149],...| 96.0|
|(150,[0,144,149],...| 95.0|
|(150,[39,147,149]...| 94.0|
|(150,[2,144,149],...| 83.0|
|(150,[1,144,149],...| 82.0|
|(150,[20,144,149]...| 81.0|
|(150,[3,144,149],...| 80.0|
|(150,[2,144,149],...| 79.0|
|(150,[0,144,149],...| 77.0|
|(150,[18,144,149]...| 71.0|
|(150,[5,144,149],...| 70.0|
|(150,[126,148,149...| 68.0|
|(150,[67,148,149]...| 67.0|
|(150,[2,144,149],...| 66.0|
+--------------------+-----+
only showing top 20 rows



## Train Test Split
We will use a 70% 30% split as specified in the instructions. We will use a method called randomSplit and assign each of the chunks to variables named train and test respectively

In [31]:
split = rquery.randomSplit([0.7, 0.3])
train = split[0]
train.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(150,[2,144,149],...|100.0|
|(150,[55,147,149]...|100.0|
|(150,[60,144,149]...|110.0|
|(150,[71,147,149]...|101.0|
|(150,[0,144,149],...| 95.0|
|(150,[1,144,149],...| 96.0|
|(150,[2,144,149],...| 83.0|
|(150,[39,147,149]...| 94.0|
|(150,[0,144,149],...| 77.0|
|(150,[1,144,149],...| 82.0|
|(150,[20,144,149]...| 81.0|
|(150,[2,144,149],...| 66.0|
|(150,[18,144,149]...| 71.0|
|(150,[67,148,149]...| 67.0|
|(150,[126,148,149...| 68.0|
|(150,[3,144,149],...| 65.0|
|(150,[4,144,149],...| 60.0|
|(150,[16,144,149]...| 58.0|
|(150,[93,144,149]...| 58.0|
|(150,[96,147,149]...| 59.0|
+--------------------+-----+
only showing top 20 rows



In [32]:
test = split[1]
test.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(150,[0,144,149],...|101.0|
|(150,[36,144,149]...|100.0|
|(150,[2,144,149],...| 79.0|
|(150,[3,144,149],...| 80.0|
|(150,[5,144,149],...| 70.0|
|(150,[10,146,149]...| 59.0|
|(150,[0,144,149],...| 57.0|
|(150,[3,148,149],...| 52.0|
|(150,[22,144,149]...| 50.0|
|(150,[24,147,149]...| 49.0|
|(150,[58,144,149]...| 49.0|
|(150,[3,144,149],...| 48.0|
|(150,[5,144,149],...| 47.0|
|(150,[77,147,149]...| 47.0|
|(150,[1,144,149],...| 44.0|
|(150,[39,147,149]...| 45.0|
|(150,[40,144,149]...| 44.0|
|(150,[0,144,149],...| 42.0|
|(150,[39,147,149]...| 42.0|
|(150,[1,144,149],...| 35.0|
+--------------------+-----+
only showing top 20 rows



## Train the model
We will use a linear regression model and fit the dataset which we selected above

In [33]:
from pyspark.ml.regression import LinearRegression

# Define LinearRegression algorithm
lr = LinearRegression()

# Fit the model
model = lr.fit(rquery, {lr.regParam:0.0})

## Make predictions
we show a Spark dataframe we created containing both the original features and labels as well as the predictions our model made. We print the first 10 lines in the console

In [38]:
predictions = model.transform(rquery)
predictions.show(10)

+--------------------+-----+------------------+
|            features|label|        prediction|
+--------------------+-----+------------------+
|(150,[60,144,149]...|110.0| 41.74907470931341|
|(150,[71,147,149]...|101.0| 58.05154187284671|
|(150,[0,144,149],...|101.0|15.721209114835599|
|(150,[55,147,149]...|100.0| 32.15716083133794|
|(150,[2,144,149],...|100.0| 4.510098821539601|
|(150,[36,144,149]...|100.0|23.745331688256393|
|(150,[1,144,149],...| 96.0|10.802946022944319|
|(150,[0,144,149],...| 95.0| 9.972133713762286|
|(150,[39,147,149]...| 94.0| 37.06083788203532|
|(150,[2,144,149],...| 83.0| 5.787671132889225|
+--------------------+-----+------------------+
only showing top 10 rows



## Evalutate the model
Here we use both RMSE and MAE to evaluate our model. According to these metrics, our model doesn't look so good. We can see in the short sample above that the predictions seem wildly off. Now that all the pieces are in place, it will be easy to continue to tweak and refine our model until we meet a goal which we must set for the standard quality of our model.

In [39]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(metricName="rmse")
RMSE = evaluator.evaluate(predictions)
print("Model: Root Mean Squared Error = " + str(RMSE))

Model: Root Mean Squared Error = 15.734990564851342


In [25]:
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(metricName="mae")
MAE = evaluator.evaluate(predictions)
print("Model: Mean Absolute Error = " + str(MAE))

Model: Mean Absolute Error = 9.788315035869392
