In [1]:
import pandas as pd
from zipfile import ZipFile 

import findspark

# findspark.init() has to run BEFORE pyspark is imported: it locates the Spark
# installation (via SPARK_HOME or common install paths) and makes its Python
# bindings importable. Importing pyspark first only works when pyspark happens
# to be installed in the active environment already.
# If Spark is not found automatically, pass the install path explicitly,
# e.g. findspark.init(r'C:\Spark\spark-3.1.1-bin-hadoop2.7')
findspark.init()

import pyspark
# NOTE: the star import brings in Spark column functions such as sum, min, max
# and round, which shadow the Python builtins of the same name in this notebook.
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

pd.set_option("display.max_columns",1000)

# Display which Spark installation was picked up.
findspark.find()

'C:\\Spark\\spark-3.1.1-bin-hadoop2.7'

## Create Spark df
To work with Spark from Python, we first need to create a Spark session. Here we name the app 'Project1' and assign the session to a Python object named `spark`.

In [2]:
# Start (or reuse) the Spark session for this notebook under the app name
# 'Project1'. No master is set explicitly here.
spark = (
    SparkSession.builder
    .appName('Project1')
    .getOrCreate()
)

In [3]:
# List the tables/views registered in the session catalog; the recorded output
# is an empty list because nothing has been registered at this point.
spark.catalog.listTables()

[]

### CSV to python df
We pull in the flights.csv first as a pandas dataframe in python. Within python, we can do some basic cleaning and manipulation for the tasks ahead.

*important*: 

Here, in preparation for task 3, we create a column named 'DEP_HR', a string containing just the hour in which the flight departed. We will use this as our y variable in our regression model.

In [43]:
# Load the raw flights extract into pandas.
df = pd.read_csv('flights.csv.zip')

# Remember which rows had no departure time before the blanket fill below
# turns them into 0, so their departure hour can stay blank.
dep_time_missing = df['DEP_TIME'].isna()

# Replace every missing value with 0 so the time columns can be cast to int.
df = df.fillna(0)
df['DEP_TIME'] = df['DEP_TIME'].astype('int')
df['ARR_TIME'] = df['ARR_TIME'].astype('int')

# DEP_TIME is an hhmm integer (e.g. 1658), so the hour is DEP_TIME // 100.
# Slicing the last two characters off the string form returned '' for any
# departure between 00:01 and 00:59 (e.g. 45 -> ''), which silently became a
# null hour once cast to a double in Spark.
df['DEP_HR'] = (df['DEP_TIME'] // 100).astype('str')
# Rows with no recorded departure time keep a blank hour, as before.
df.loc[dep_time_missing, 'DEP_HR'] = ''

# Everything is handed to Spark as strings; numeric columns are cast later.
df = df.astype('str')
df

Unnamed: 0,FL_DATE,TAIL_NUM,CARRIER,ORIGIN,ORIGIN_CITY_NAME,DEST,DEST_CITY_NAME,DEP_TIME,DEP_DELAY,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 19,DEP_HR
0,2019-01-01,N8974C,9E,AVL,"Asheville, NC",ATL,"Atlanta, GA",1658,-7.0,1758,-22.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16
1,2019-01-01,N922XJ,9E,JFK,"New York, NY",RDU,"Raleigh/Durham, NC",1122,-8.0,1255,-29.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11
2,2019-01-01,N326PQ,9E,CLE,"Cleveland, OH",DTW,"Detroit, MI",1334,-7.0,1417,-31.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13
3,2019-01-01,N135EV,9E,BHM,"Birmingham, AL",ATL,"Atlanta, GA",1059,-1.0,1255,-8.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
4,2019-01-01,N914XJ,9E,GTF,"Great Falls, MT",MSP,"Minneapolis, MN",1057,-3.0,1418,-17.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2019-01-01,N257NN,MQ,STL,"St. Louis, MO",ORD,"Chicago, IL",1220,14.0,1327,-7.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12
996,2019-01-01,N855AE,MQ,LGA,"New York, NY",CMH,"Columbus, OH",1048,-12.0,1233,-34.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
997,2019-01-01,N688AE,MQ,ORD,"Chicago, IL",COU,"Columbia, MO",2317,52.0,104,80.0,0.0,0,0.0,0.0,0.0,28.0,0.0,52.0,0.0,23
998,2019-01-01,N262NN,MQ,MSN,"Madison, WI",ORD,"Chicago, IL",0,0.0,0,0.0,1.0,B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


### Define Schema
For ease of loading into Spark, we initially make every column a string.

In [44]:
from pyspark.sql.types import StructType, IntegerType, DateType
from pyspark.sql.types import *

# Column names in the same order as the pandas frame. Every column is declared
# as a nullable string for loading; FL_DATE, DEP_DELAY and ARR_DELAY are
# candidates for typed columns (date / double) later on.
flight_columns = [
    "FL_DATE",
    "TAIL_NUM",
    "CARRIER",
    "ORIGIN",
    "ORIGIN_CITY_NAME",
    "DEST",
    "DEST_CITY_NAME",
    "DEP_TIME",
    "DEP_DELAY",
    "ARR_TIME",
    "ARR_DELAY",
    "CANCELLED",
    "CANCELLATION_CODE",
    "DIVERTED",
    "CARRIER_DELAY",
    "WEATHER_DELAY",
    "NAS_DELAY",
    "SECURITY_DELAY",
    "LATE_AIRCRAFT_DELAY",
    "Unnamed: 19",
    "DEP_HR",
]

schema = StructType(
    [StructField(column_name, StringType()) for column_name in flight_columns]
)

### Create Spark DF from pandas df
Now we create a Spark dataframe from our pandas dataframe in python. We print the schema just to review it and verify that everything worked correctly

In [45]:
# Build the Spark DataFrame from the pandas frame using the all-string schema,
# then print the schema and a sample of rows to confirm the load worked.
flts = spark.createDataFrame(data=df, schema=schema)
flts.printSchema()
flts.show()

root
 |-- FL_DATE: string (nullable = true)
 |-- TAIL_NUM: string (nullable = true)
 |-- CARRIER: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_CITY_NAME: string (nullable = true)
 |-- DEP_TIME: string (nullable = true)
 |-- DEP_DELAY: string (nullable = true)
 |-- ARR_TIME: string (nullable = true)
 |-- ARR_DELAY: string (nullable = true)
 |-- CANCELLED: string (nullable = true)
 |-- CANCELLATION_CODE: string (nullable = true)
 |-- DIVERTED: string (nullable = true)
 |-- CARRIER_DELAY: string (nullable = true)
 |-- WEATHER_DELAY: string (nullable = true)
 |-- NAS_DELAY: string (nullable = true)
 |-- SECURITY_DELAY: string (nullable = true)
 |-- LATE_AIRCRAFT_DELAY: string (nullable = true)
 |-- Unnamed: 19: string (nullable = true)
 |-- DEP_HR: string (nullable = true)

+----------+--------+-------+------+--------------------+----+------------------+--------+---------+-------

### Change Delays from String to Doubles
Finally, now that we have a Spark dataframe, some variables need to be transformed into numerical values, so we convert the string columns DEP_DELAY, ARR_DELAY and DEP_HR into doubles.

In [7]:
from pyspark.sql.types import DateType
# Cast the columns that were loaded as strings but hold numbers to doubles,
# one column at a time, then show the rows and the resulting schema.
for numeric_col in ("DEP_DELAY", "ARR_DELAY", "DEP_HR"):
    flts = flts.withColumn(numeric_col, flts[numeric_col].cast(DoubleType()))
flts.show()
print(flts.schema)

+----------+--------+-------+------+--------------------+----+------------------+--------+---------+--------+---------+---------+-----------------+--------+-------------+-------------+---------+--------------+-------------------+-----------+------+
|   FL_DATE|TAIL_NUM|CARRIER|ORIGIN|    ORIGIN_CITY_NAME|DEST|    DEST_CITY_NAME|DEP_TIME|DEP_DELAY|ARR_TIME|ARR_DELAY|CANCELLED|CANCELLATION_CODE|DIVERTED|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|Unnamed: 19|DEP_HR|
+----------+--------+-------+------+--------------------+----+------------------+--------+---------+--------+---------+---------+-----------------+--------+-------------+-------------+---------+--------------+-------------------+-----------+------+
|2019-01-01|  N8974C|     9E|   AVL|       Asheville, NC| ATL|       Atlanta, GA|    1658|     -7.0|    1758|    -22.0|      0.0|                0|     0.0|          0.0|          0.0|      0.0|           0.0|                0.0|        0.0|  16.0|
|201

As the final step of the data preparation, we create a temporary SQL view named "flights". We will use this in the tasks to more easily manipulate the data in a "SQL-esque" environment.

In [8]:
# Register the Spark DataFrame as the temporary view "flights" so that it can
# be queried by name through spark.sql(...).
flts.createOrReplaceTempView("flights")

# Task 1 [20 points]
Your first task is to calculate the average flight delays in the dataset. Your supervisor made it
clear that you should choose SparkSQL with Python and DataFrames, so that your code should
be compatible with other software products of your company. Your deliverables for this task
are the following:
- A Python file (named “task1.py”) containing the code to produce the desired result.
- A report (named “task1.pdf”) explaining the basic intuition of your code.
- A screenshot (named “task1.png”) of the produced output (e.g., showing the result in
the console). 

## Departure Delay
Here we leverage the convenience of Spark and use simple SQL syntax to get the average of the DEP_DELAY column from our flights table.

In [9]:
# Average departure delay over every row of the "flights" view.
# NOTE(review): missing values were replaced with 0 when the CSV was loaded
# (df.fillna(0)), so rows without a recorded delay count as zero-delay in this
# average. Confirm that is intended.
dep_delay_sql = """
    SELECT avg(DEP_DELAY)
    FROM flights 
    """
query = spark.sql(dep_delay_sql)
query.show()

+--------------+
|avg(DEP_DELAY)|
+--------------+
|         6.596|
+--------------+



## Arrival Delay
Likewise, we use the same technique to find the average of ARR_DELAY.

In [10]:
# Average arrival delay over every row of the "flights" view.
# NOTE(review): missing values were replaced with 0 when the CSV was loaded
# (df.fillna(0)), so rows without a recorded delay count as zero-delay in this
# average. Confirm that is intended.
arr_delay_sql = """
    SELECT avg(ARR_DELAY)
    FROM flights 
    """
query = spark.sql(arr_delay_sql)
query.show()

+--------------+
|avg(ARR_DELAY)|
+--------------+
|         0.532|
+--------------+

