In [0]:
# read in flights and weather data

from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, NullType, ShortType, DateType, BooleanType, BinaryType
from pyspark.sql import SQLContext

import numpy as np
import matplotlib.pyplot as plt
import datetime
from pyspark.sql.types import TimestampType, ArrayType, FloatType
import us
import pytz

sqlContext = SQLContext(sc)

airlines = spark.read.option("header", "true").parquet(f"dbfs:/mnt/mids-w261/datasets_final_project/parquet_airlines_data/201*.parquet")

weather = spark.read.option("header", "true")\
                    .parquet(f"dbfs:/mnt/mids-w261/datasets_final_project/weather_data/*.parquet")

In [0]:
# convert flight times to UTC

def convert_localToUTC(yearStr, monthStr, dayOfMonthStr, timeInt, USStateAbbrev):  
  year, month, dayOfMonth = (int(yearStr), int(monthStr), int(dayOfMonthStr))
  timeStr = str(timeInt).zfill(4)
  hour, minutes = (int(timeStr[0:2]), int(timeStr[2:]))

  if 0 < int(hour) < 23:
    localDatetime = datetime.datetime(year, month, dayOfMonth, hour, minutes)
  else:
    localDatetime = None

  stateObj = us.states.lookup(USStateAbbrev)

  if stateObj is None or localDatetime is None:
    utcDatetime = None
  else:  
    localTimezoneStr = stateObj.time_zones[0]
    pytzObj = pytz.timezone(localTimezoneStr)
    utcDatetime = pytzObj.localize(localDatetime).astimezone(pytz.utc)

  return utcDatetime

def floorAndSubtract_hours(inputDatetime, hoursToSubtract):
  if inputDatetime is None:
    flooredMinusHours = None
  else:
    flooredToHour = inputDatetime - datetime.timedelta(minutes=inputDatetime.minute)
    flooredMinusHours = flooredToHour - datetime.timedelta(hours=hoursToSubtract)
  
  return flooredMinusHours


udf_convert_localToUTC = f.udf(convert_localToUTC, TimestampType())
udf_subtract_hours = f.udf(floorAndSubtract_hours, TimestampType())

airlinesDf = airlines.withColumn('CRS_DEP_DATETIME_UTC', udf_convert_localToUTC('YEAR', 'MONTH', 'DAY_OF_MONTH', 'CRS_DEP_TIME', 'ORIGIN_STATE_ABR')) \
                      .withColumn('CRS_ARR_DATETIME_UTC', udf_convert_localToUTC('YEAR', 'MONTH', 'DAY_OF_MONTH', 'CRS_ARR_TIME', 'DEST_STATE_ABR')) \
                      .withColumn('datetime_2hoursBeforeDEP_floored', udf_subtract_hours('CRS_DEP_DATETIME_UTC', f.lit(2)))

In [0]:
# Feature Engineering - create a dataframe with metrics at each airport for each day

def mapper_convertToDict(reducedLine):
  date, airportStr = reducedLine[0]
  allDelayTimesStrs = reducedLine[1]
  allDelayTimes = [float(delay) for delay in allDelayTimesStrs]
  
  totNumFlights = len(allDelayTimesStrs)
  meanDelay = float(np.mean(allDelayTimes))
  propDelayed = float(sum(np.array(allDelayTimes) >= 15)/totNumFlights)
  
  airportDict = {airportStr: {'totNumFlights': totNumFlights, 'meanDelayMins': meanDelay, 'propDelayed': propDelayed}}
  
  return (date, airportDict)

def flatmap_dateAndAirport(line):
  flightDate, allAirportsDict = line
  
  for airport in list(allAirportsDict.keys()):
    yield (flightDate, airport, allAirportsDict[airport]['totNumFlights'], allAirportsDict[airport]['meanDelayMins'], allAirportsDict[airport]['propDelayed'])

# create a separate df with only delay data to work with
airlinesDelayedData = airlines.select(['ORIGIN', 'ORIGIN_AIRPORT_ID', 'FL_DATE', 'DEP_DELAY', 'DEP_DEL15', ])

delayMetricsDf = airlinesDelayedData.dropna().rdd.map(lambda line: [column for column in line]) \
                                                   .map(lambda line: ((datetime.datetime.strptime(line[2], '%Y-%m-%d'), line[0]), (str(line[3]),))) \
                                                   .reduceByKey(lambda currLine, nextLine: currLine + nextLine) \
                                                   .map(lambda line: mapper_convertToDict(line)) \
                                                   .flatMap(lambda line: flatmap_dateAndAirport(line)) \
                                                   .toDF(('flightDate', 'airportCode', 'totNumFlights', 'meanDelayAcrossFlights', 'propDelayed'))


In [0]:
# join delay metrics with flight data

def get_dayBeforeFlight(flightDateStr):
  flightDatetime = datetime.datetime.strptime(flightDateStr, '%Y-%m-%d')
  dayBeforeFlight = flightDatetime - datetime.timedelta(days=1)
  
  return dayBeforeFlight

udf_get_dayBeforeFlight = f.udf(get_dayBeforeFlight, TimestampType())

airlinesDf = airlinesDf.withColumn('dayBeforeFlightDate_forJoin', udf_get_dayBeforeFlight('FL_DATE')) \
                      
joinConds = [(airlinesDf.dayBeforeFlightDate_forJoin == delayMetricsDf.flightDate)
             , airlinesDf.ORIGIN == delayMetricsDf.airportCode]

airlinesJoinedFeatures = airlinesDf.join(delayMetricsDf, joinConds)

display(airlinesJoinedFeatures.sample(False, .00001))

YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,ORIGIN_CITY_NAME,ORIGIN_STATE_ABR,ORIGIN_STATE_FIPS,ORIGIN_STATE_NM,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,DEST_CITY_NAME,DEST_STATE_ABR,DEST_STATE_FIPS,DEST_STATE_NM,DEST_WAC,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DELAY_NEW,DEP_DEL15,DEP_DELAY_GROUP,DEP_TIME_BLK,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DELAY_NEW,ARR_DEL15,ARR_DELAY_GROUP,ARR_TIME_BLK,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,FIRST_DEP_TIME,TOTAL_ADD_GTIME,LONGEST_ADD_GTIME,DIV_AIRPORT_LANDINGS,DIV_REACHED_DEST,DIV_ACTUAL_ELAPSED_TIME,DIV_ARR_DELAY,DIV_DISTANCE,DIV1_AIRPORT,DIV1_AIRPORT_ID,DIV1_AIRPORT_SEQ_ID,DIV1_WHEELS_ON,DIV1_TOTAL_GTIME,DIV1_LONGEST_GTIME,DIV1_WHEELS_OFF,DIV1_TAIL_NUM,DIV2_AIRPORT,DIV2_AIRPORT_ID,DIV2_AIRPORT_SEQ_ID,DIV2_WHEELS_ON,DIV2_TOTAL_GTIME,DIV2_LONGEST_GTIME,DIV2_WHEELS_OFF,DIV2_TAIL_NUM,DIV3_AIRPORT,DIV3_AIRPORT_ID,DIV3_AIRPORT_SEQ_ID,DIV3_WHEELS_ON,DIV3_TOTAL_GTIME,DIV3_LONGEST_GTIME,DIV3_WHEELS_OFF,DIV3_TAIL_NUM,DIV4_AIRPORT,DIV4_AIRPORT_ID,DIV4_AIRPORT_SEQ_ID,DIV4_WHEELS_ON,DIV4_TOTAL_GTIME,DIV4_LONGEST_GTIME,DIV4_WHEELS_OFF,DIV4_TAIL_NUM,DIV5_AIRPORT,DIV5_AIRPORT_ID,DIV5_AIRPORT_SEQ_ID,DIV5_WHEELS_ON,DIV5_TOTAL_GTIME,DIV5_LONGEST_GTIME,DIV5_WHEELS_OFF,DIV5_TAIL_NUM,CRS_DEP_DATETIME_UTC,CRS_ARR_DATETIME_UTC,datetime_2hoursBeforeDEP_floored,dayBeforeFlightDate_forJoin,flightDate,airportCode,totNumFlights,meanDelayAcrossFlights,propDelayed
2016,2,5,21,6,2016-05-21,NK,20416,NK,N532NK,278,13577,1357702,31135,MYR,"Myrtle Beach, SC",SC,45,South Carolina,37,11146,1114604,31146,CRW,"Charleston/Dunbar, WV",WV,54,West Virginia,39,921,919.0,-2.0,0.0,0.0,-1.0,0900-0959,10.0,929.0,1024.0,5.0,1033,1029.0,-4.0,0.0,0.0,-1.0,1000-1059,0.0,,0.0,72.0,70.0,55.0,1.0,356.0,2,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2016-05-21T13:21:00.000+0000,2016-05-21T14:33:00.000+0000,2016-05-21T11:00:00.000+0000,2016-05-20T00:00:00.000+0000,2016-05-20T00:00:00.000+0000,MYR,18,3.2222222222222223,0.0555555555555555
2018,4,10,4,4,2018-10-04,AA,19805,AA,N803AW,1792,11066,1106606,31066,CMH,"Columbus, OH",OH,39,Ohio,44,14107,1410702,30466,PHX,"Phoenix, AZ",AZ,4,Arizona,81,1812,1807.0,-5.0,0.0,0.0,-1.0,1800-1859,10.0,1817.0,1856.0,19.0,1923,1915.0,-8.0,0.0,0.0,-1.0,1900-1959,0.0,,0.0,251.0,248.0,219.0,1.0,1670.0,7,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-10-04T22:12:00.000+0000,2018-10-05T02:23:00.000+0000,2018-10-04T20:00:00.000+0000,2018-10-03T00:00:00.000+0000,2018-10-03T00:00:00.000+0000,CMH,133,1.413533834586466,0.150375939849624
2016,4,12,13,2,2016-12-13,DL,19790,DL,N925AT,733,10529,1052904,30529,BDL,"Hartford, CT",CT,9,Connecticut,11,11433,1143302,31295,DTW,"Detroit, MI",MI,26,Michigan,43,1620,1616.0,-4.0,0.0,0.0,-1.0,1600-1659,12.0,1628.0,1805.0,5.0,1822,1810.0,-12.0,0.0,0.0,-1.0,1800-1859,0.0,,0.0,122.0,114.0,97.0,1.0,549.0,3,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2016-12-13T21:20:00.000+0000,2016-12-13T23:22:00.000+0000,2016-12-13T19:00:00.000+0000,2016-12-12T00:00:00.000+0000,2016-12-12T00:00:00.000+0000,BDL,61,11.672131147540984,0.2295081967213114
2017,3,8,21,1,2017-08-21,UA,19977,UA,N14237,2004,13930,1393004,30977,ORD,"Chicago, IL",IL,17,Illinois,41,14576,1457604,34576,ROC,"Rochester, NY",NY,36,New York,22,1347,1342.0,-5.0,0.0,0.0,-1.0,1300-1359,33.0,1415.0,1627.0,6.0,1632,1633.0,1.0,1.0,0.0,0.0,1600-1659,0.0,,0.0,105.0,111.0,72.0,1.0,528.0,3,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2017-08-21T18:47:00.000+0000,2017-08-21T20:32:00.000+0000,2017-08-21T16:00:00.000+0000,2017-08-20T00:00:00.000+0000,2017-08-20T00:00:00.000+0000,ORD,845,9.398816568047335,0.1928994082840236
2019,2,4,10,3,2019-04-10,YX,20452,YX,N878RW,5910,10721,1072102,30721,BOS,"Boston, MA",MA,25,Massachusetts,13,14122,1412202,30198,PIT,"Pittsburgh, PA",PA,42,Pennsylvania,23,1042,1035.0,-7.0,0.0,0.0,-1.0,1000-1059,15.0,1050.0,1221.0,5.0,1236,1226.0,-10.0,0.0,0.0,-1.0,1200-1259,0.0,,0.0,114.0,111.0,91.0,1.0,496.0,2,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019-04-10T14:42:00.000+0000,2019-04-10T16:36:00.000+0000,2019-04-10T12:00:00.000+0000,2019-04-09T00:00:00.000+0000,2019-04-09T00:00:00.000+0000,BOS,444,8.842342342342342,0.2162162162162162
2018,3,7,6,5,2018-07-06,B6,20409,B6,N589JB,1729,12478,1247805,31703,JFK,"New York, NY",NY,36,New York,22,14635,1463502,31714,RSW,"Fort Myers, FL",FL,12,Florida,33,1451,1655.0,124.0,124.0,1.0,8.0,1400-1459,22.0,1717.0,1950.0,3.0,1804,1953.0,109.0,109.0,1.0,7.0,1800-1859,0.0,,0.0,193.0,178.0,153.0,1.0,1074.0,5,11.0,0.0,0.0,0.0,98.0,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2018-07-06T18:51:00.000+0000,2018-07-06T22:04:00.000+0000,2018-07-06T16:00:00.000+0000,2018-07-05T00:00:00.000+0000,2018-07-05T00:00:00.000+0000,JFK,355,14.394366197183098,0.2901408450704225
2015,4,11,24,2,2015-11-24,AA,19805,AA,N3JPAA,1017,13303,1330303,32467,MIA,"Miami, FL",FL,12,Florida,33,13930,1393004,30977,ORD,"Chicago, IL",IL,17,Illinois,41,1215,1211.0,-4.0,0.0,0.0,-1.0,1200-1259,10.0,1221.0,1410.0,10.0,1440,1420.0,-20.0,0.0,0.0,-2.0,1400-1459,0.0,,0.0,205.0,189.0,169.0,1.0,1197.0,5,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2015-11-24T17:15:00.000+0000,2015-11-24T20:40:00.000+0000,2015-11-24T15:00:00.000+0000,2015-11-23T00:00:00.000+0000,2015-11-23T00:00:00.000+0000,MIA,193,4.497409326424871,0.1502590673575129
2015,3,8,15,6,2015-08-15,WN,19393,WN,N759GS,1736,14893,1489302,33192,SMF,"Sacramento, CA",CA,6,California,91,11292,1129202,30325,DEN,"Denver, CO",CO,8,Colorado,82,1000,1001.0,1.0,1.0,0.0,0.0,1000-1059,9.0,1010.0,1306.0,8.0,1315,1314.0,-1.0,0.0,0.0,-1.0,1300-1359,0.0,,0.0,135.0,133.0,116.0,1.0,909.0,4,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2015-08-15T17:00:00.000+0000,2015-08-15T19:15:00.000+0000,2015-08-15T15:00:00.000+0000,2015-08-14T00:00:00.000+0000,2015-08-14T00:00:00.000+0000,SMF,123,19.45528455284553,0.2926829268292683
2016,2,4,15,5,2016-04-15,NK,20416,NK,N505NK,330,11298,1129804,30194,DFW,"Dallas/Fort Worth, TX",TX,48,Texas,74,11042,1104203,30647,CLE,"Cleveland, OH",OH,39,Ohio,44,2029,2026.0,-3.0,0.0,0.0,-1.0,2000-2059,14.0,2040.0,2348.0,8.0,5,2356.0,-9.0,0.0,0.0,-1.0,0001-0559,0.0,,0.0,156.0,150.0,128.0,1.0,1021.0,5,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2016-04-16T01:29:00.000+0000,,2016-04-15T23:00:00.000+0000,2016-04-14T00:00:00.000+0000,2016-04-14T00:00:00.000+0000,DFW,555,3.1333333333333333,0.1207207207207207
2015,1,1,15,4,2015-01-15,AA,19805,AA,N3APAA,1670,10721,1072102,30721,BOS,"Boston, MA",MA,25,Massachusetts,13,13930,1393003,30977,ORD,"Chicago, IL",IL,17,Illinois,41,1758,1759.0,1.0,1.0,0.0,0.0,1700-1759,73.0,1912.0,2025.0,13.0,2000,2038.0,38.0,38.0,1.0,2.0,2000-2059,0.0,,0.0,182.0,219.0,133.0,1.0,867.0,4,0.0,0.0,38.0,0.0,0.0,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2015-01-15T22:58:00.000+0000,2015-01-16T02:00:00.000+0000,2015-01-15T20:00:00.000+0000,2015-01-14T00:00:00.000+0000,2015-01-14T00:00:00.000+0000,BOS,299,4.832775919732441,0.1404682274247491
