<a href="https://colab.research.google.com/github/NicoleLund/flight_delay_prediction/blob/nrl_210817/data_manipulation_modeling/feature_assessment/feature_assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

feature_assessment.ipynb
-----

Written in Google Colab

by Nicole Lund

This workbook investigates the correlation between features in 2017 flight performance prior to building a model from the data.

In [32]:
# Import dependencies
import pandas as pd
import datetime
from datetime import timedelta
import math

In [2]:
# Read the CSV file from AWS to Pandas Dataframe
url = "https://finalproject-3.s3.us-west-1.amazonaws.com/2017_TUS.csv"
df = pd.read_csv(url)

df.head(3)

Unnamed: 0,origin_city_name,dest_city_name,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,CRS_DEP_HM,DEP_TIME_HM,WHEELS_OFF_HM,WHEELS_ON_HM,CRS_ARR_TIME_HM,ARR_TIME_HM,day_of_week,DELAY
0,"TUCSON, ARIZONA, USA","ATLANTA, GEORGIA, USA",2017-01-01,DL,2066,TUS,ATL,1430,1459.0,29.0,10.0,1509.0,2011.0,17.0,2003,2028.0,25.0,0.0,,0.0,213.0,209.0,182.0,1541.0,11.0,0.0,0.0,0.0,14.0,14:30,14:59,15:09,20:11,20:03,20:28,Sunday,0
1,"TUCSON, ARIZONA, USA","ATLANTA, GEORGIA, USA",2017-01-02,DL,1127,TUS,ATL,600,637.0,37.0,10.0,647.0,1157.0,6.0,1129,1203.0,34.0,0.0,,0.0,209.0,206.0,190.0,1541.0,34.0,0.0,0.0,0.0,0.0,06:00,06:37,06:47,11:57,11:29,12:03,Monday,1
2,"TUCSON, ARIZONA, USA","ATLANTA, GEORGIA, USA",2017-01-02,DL,2066,TUS,ATL,1430,1447.0,17.0,10.0,1457.0,1943.0,9.0,2005,1952.0,-13.0,0.0,,0.0,215.0,185.0,166.0,1541.0,,,,,,14:30,14:47,14:57,19:43,20:05,19:52,Monday,0


In [19]:
# Check for any international destinations
df.dest_city_name.unique()

array(['ATLANTA, GEORGIA, USA              ',
       'DENVER, COLORADO, USA              ',
       'DALLAS/FT.WORTH, TEXAS, USA        ',
       'HOUSTON, TEXAS, USA                ',
       'NEW YORK, NEW YORK, USA            ',
       'LAS VEGAS, NEVADA, USA             ',
       'LOS ANGELES, CALIFORNIA, USA       ',
       'CHICAGO, ILLINOIS, USA             ',
       'MINNEAPOLIS/ST.PAUL, MINNESOTA, USA',
       'OAKLAND, CALIFORNIA, USA           ',
       'PORTLAND, OREGON, USA              ',
       'PHOENIX, ARIZONA, USA              ',
       'SAN DIEGO, CALIFORNIA, USA         ',
       'SEATTLE, WASHINGTON, USA           ',
       'SAN FRANCISCO, CALIFORNIA, USA     ',
       'SAN JOSE, CALIFORNIA, USA          ',
       'SALT LAKE CITY, UTAH, USA          '], dtype=object)

In [56]:
# Identify row to verify calculations with
row = 0
print(f'Flight path: {df.origin_city_name[row]} to {df.dest_city_name[row]}')

Flight path: TUCSON, ARIZONA, USA                to ATLANTA, GEORGIA, USA              


In [48]:
# Verify DEP_DELAY calculation
# DEP_DELAY = DEP_TIME - CRS_DEP_HM
print(f'Actual Departure Time: {df.DEP_TIME_HM[row]}')
print(f'Scheduled Departure Time: {df.CRS_DEP_HM[row]}')
print(f'Reported Departure Delay: {df.DEP_DELAY[row]}')

Actual Departure Time: 14:59
Scheduled Departure Time: 14:30
Reported Departure Delay: 29.0


In [50]:
# Verify TAXI_OUT calculation
# TAXI_OUT = WHEELS_OFF - DEP_TIME
print(f'WHEELS OFF time: {df.WHEELS_OFF_HM[row]}')
print(f'Departure time: {df.DEP_TIME_HM[row]}')
print(f'Taxi time: {df.TAXI_OUT[row]}')

WHEELS OFF time: 15:09
Departure time: 14:59
Taxi time: 10.0


In [51]:
# Verify TAXI_IN calculation
# TAXI_IN = ARR_TIME - WHEELS_ON
print(f'Arrival time: {df.ARR_TIME_HM[row]}')
print(f'Wheels on time: {df.WHEELS_ON_HM[row]}')
print(f'Taxi time: {df.TAXI_IN[row]}')

Arrival time: 20:28
Wheels on time: 20:11
Taxi time: 17.0


In [58]:
# Verify AIR_TIME calculation
# AIR_TIME = WHEELS_ON - WHEELS_OFF + time zone difference
print(f'Wheels on time: {df.WHEELS_ON_HM[row]}')
print(f'WHEELS OFF time: {df.WHEELS_OFF_HM[row]}')
print(f'Air time: {df.AIR_TIME[row]}')
print(f'Air time hours: {math.floor(df.AIR_TIME[row]/60)}')
print(f'Air time minutes: {df.AIR_TIME[row] - 60*math.floor(df.AIR_TIME[row]/60)}')

Wheels on time: 20:11
WHEELS OFF time: 15:09
Air time: 182.0
Air time hours: 3
Air time minutes: 2.0


In [39]:
# Verify ACTUAL_ELAPSED_TIME calculation
# ACTUAL_ELAPSED_TIME = TAXI_OUT + TAXI_IN + AIR_TIME
print(f'Reported elapsed time: {df.ACTUAL_ELAPSED_TIME[row]}')
actual_elapsed_time = df.TAXI_OUT[row]+df.TAXI_IN[row]+df.AIR_TIME[row]
print(f'Calculated elapsed time: {actual_elapsed_time}')

Reported elapsed time: 209.0
Calculated elapsed time: 209.0


In [59]:
# Verify Arrival time calculation
# ARR_TIME = DEP_TIME + ACTUAL_ELAPSED_TIME + time zone difference
print(f'Departure time: {df.DEP_TIME_HM[row]}')
print(f'Arrival time: {df.ARR_TIME_HM[row]}')
print(f'Reported elapsed time: {df.ACTUAL_ELAPSED_TIME[row]}')
print(f'Elapsed time hours: {math.floor(df.ACTUAL_ELAPSED_TIME[row]/60)}')
print(f'Elapsed time minutes: {df.ACTUAL_ELAPSED_TIME[row] - 60*math.floor(df.ACTUAL_ELAPSED_TIME[row]/60)}')

Departure time: 14:59
Arrival time: 20:28
Reported elapsed time: 209.0
Elapsed time hours: 3
Elapsed time minutes: 29.0


In [41]:
# Verify ARR_DELAY calculation from subset delays
# ARR_DELAY = CARRIER_DELAY + WEATHER_DELAY + NAS_DELAY + SECURITY_DELAY + LATE_AIRCRAFT_DELAY
print(f'Reported Total Arrival Delay: {df.ARR_DELAY[row]}')
arr_delay = df.CARRIER_DELAY[row]+df.WEATHER_DELAY[row] + df.NAS_DELAY[row] + df.SECURITY_DELAY[row] + df.LATE_AIRCRAFT_DELAY[row]
print(f'Calculated Total Arrival Delay: {arr_delay}')

Reported Total Arrival Delay: 25.0
Calculated Total Arrival Delay: 25.0


In [60]:
# Verify if ARR_DELAY is directly calculable from reported times
# ARR_DELAY = ARR_TIME - CRS_ARR_HM = CRS_DEP_HM + DEP_DELAY + TAXI_OUT + AIR_TIME + TAXI_IN + time zone difference
print(f'Scheduled Departure Time: {df.CRS_DEP_HM[row]}')
transit_time = df.DEP_DELAY[row] + df.TAXI_OUT[row] + df.AIR_TIME[row] + df.TAXI_IN[row]
print(f'Calculated Transit time: {transit_time}')
print(f'Transit time hours: {math.floor(transit_time/60)}')
print(f'Transit time minutes: {transit_time - 60*math.floor(transit_time/60)}')
print('')
print(f'Actual Arrival time: {df.ARR_TIME_HM[row]}')
print(f'Scheduled Arrival Time: {df.CRS_ARR_TIME_HM[row]}')
print(f'Arrival Delay: {df.ARR_DELAY[row]}')

Scheduled Departure Time: 14:30
Calculated Transit time: 238.0
Transit time hours: 3
Transit time minutes: 58.0

Actual Arrival time: 20:28
Scheduled Arrival Time: 20:03
Arrival Delay: 25.0


In [64]:
# Review Diverted Values
print(f'DIVERTED values: {df.DIVERTED.unique()}')
diverted_df = df.loc[df.DIVERTED == 1]
diverted_df.head(3)

DIVERTED values: [0. 1.]


Unnamed: 0,origin_city_name,dest_city_name,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,CRS_DEP_HM,DEP_TIME_HM,WHEELS_OFF_HM,WHEELS_ON_HM,CRS_ARR_TIME_HM,ARR_TIME_HM,day_of_week,DELAY
51,"TUCSON, ARIZONA, USA","ATLANTA, GEORGIA, USA",2017-01-29,DL,652,TUS,ATL,1451,1602.0,71.0,11.0,1613.0,123.0,11.0,2027,134.0,,0.0,,1.0,216.0,,,1541.0,,,,,,14:51,16:02,16:13,12:03,20:27,13:04,Sunday,1
329,"TUCSON, ARIZONA, USA","ATLANTA, GEORGIA, USA",2017-06-23,DL,1240,TUS,ATL,1225,1230.0,5.0,32.0,1302.0,127.0,8.0,1915,135.0,,0.0,,1.0,230.0,,,1541.0,,,,,,12:25,12:30,13:02,12:07,19:15,13:05,Friday,1
362,"TUCSON, ARIZONA, USA","ATLANTA, GEORGIA, USA",2017-07-11,DL,1240,TUS,ATL,1223,1232.0,9.0,10.0,1242.0,2000.0,21.0,1914,2021.0,,0.0,,1.0,231.0,,,1541.0,,,,,,12:23,12:32,12:42,20:00,19:14,20:21,Tuesday,1


In [65]:
# Review Cancelled Values
print(f'CANCELLED values: {df.CANCELLED.unique()}')
cancelled_df = df.loc[df.CANCELLED == 1]
cancelled_df.head(3)

CANCELLED values: [0. 1.]


Unnamed: 0,origin_city_name,dest_city_name,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,CRS_DEP_HM,DEP_TIME_HM,WHEELS_OFF_HM,WHEELS_ON_HM,CRS_ARR_TIME_HM,ARR_TIME_HM,day_of_week,DELAY
41,"TUCSON, ARIZONA, USA","ATLANTA, GEORGIA, USA",2017-01-23,DL,2636,TUS,ATL,700,,,,,,,1230,,,1.0,B,0.0,210.0,,,1541.0,,,,,,07:00,,,,12:30,,Monday,1
172,"TUCSON, ARIZONA, USA","ATLANTA, GEORGIA, USA",2017-04-05,DL,1240,TUS,ATL,1330,,,,,,,2012,,,1.0,A,0.0,222.0,,,1541.0,,,,,,13:30,,,,20:12,,Wednesday,1
176,"TUCSON, ARIZONA, USA","ATLANTA, GEORGIA, USA",2017-04-07,DL,1240,TUS,ATL,1330,,,,,,,2012,,,1.0,A,0.0,222.0,,,1541.0,,,,,,13:30,,,,20:12,,Friday,1


In [67]:
# Review CANCELLATION_CODE Values
codes = cancelled_df.CANCELLATION_CODE.unique()
for code in codes:
  num_found = cancelled_df.FL_DATE[cancelled_df.CANCELLATION_CODE == code].count()
  print(f"Cancellation Code {code}: {num_found}")

Cancellation Code B: 61
Cancellation Code A: 20
Cancellation Code C: 10


## Logical assessment of features

**Features to include in the model**

X values, Values known by customer
* 'OP_CARRIER': airline designation
* 'OP_CARRIER_FL_NUM': flight number
* 'day_of_week': flight day of the week
* 'DEST': destination airport code
* 'CRS_DEP_TIME': scheduled departure time 
* 'CRS_ARR_TIME': scheduled arrival time
* 'DISTANCE': flight distance

Y values, Values customer wants predicted
* 'CANCELLED': flight cancelled, [0, 1]
* 'DIVERTED': flight diverted, [0, 1]
* 'DELAY': arrival time delay, [0, 1]
  * 0 = Delayed <30 minutes
  * 1 = Delayed >=30 minutes

**Features not to include in the model due to irrelevance**
* 'ORIGIN': departure city, filtered for TUS only
* 'origin_city_name': departure city calculated from ORIGIN
* 'dest_city_name': destination city calculated from DEST
* 'FL_DATE': flight date provides day_of_week calculation
* 'CANCELLATION_CODE': reason for cancellation
* 'CARRIER_DELAY': arrival delay time due to carrier (minutes)
* 'WEATHER_DELAY': arrival delay time due to weather (minutes)
* 'NAS_DELAY': arrival delay time due to NAS (minutes)
* 'SECURITY_DELAY': arrival delay time due to security (minutes)
* 'LATE_AIRCRAFT_DELAY': arrival delay time due to aircraft (minutes)
* 'CRS_ELAPSED_TIME': scheduled elapsed time (minutes)
* 'WHEELS_OFF': actual wheels off time, DEP_TIME + TAXI_OUT
* 'WHEELS_ON': actual wheels on time, ARR_TIME - TAXI_IN

**Features not to include in the model due to they directly reveal Y values**
* 'DEP_DELAY': departure delay time (minutes)
* 'DEP_TIME': actual departure time, CRS_DEP_TIME + DEP_DELAY
* 'TAXI_OUT': calculated time spent in taxi between departure time and wheels off (minutes)
* 'TAXI_IN': calculated time spent in taxi between wheels on and arrival time
* 'AIR_TIME': calculated time spent in the air (minutes)
* 'ACTUAL_ELAPSED_TIME': TAXI_OUT + TAXI_IN + AIR_TIME (minutes)
* 'ARR_TIME': actual arrival time, DEP_TIME + ACTUAL_ELAPSED_TIME + time zone difference
* ARR_DELAY: actual arrival time delay (minutes)

**Time features in human readable format**
* 'CRS_DEP_HM', Scheduled departure time
* 'DEP_TIME_HM', Actual departure time
* 'WHEELS_OFF_HM', Wheels off time
* 'WHEELS_ON_HM', Wheels on time
* 'CRS_ARR_TIME_HM', Scheduled arrival time
* 'ARR_TIME_HM', Actual arrival time

In [20]:
# List all of the column headers
df.columns

Index(['origin_city_name', 'dest_city_name', 'FL_DATE', 'OP_CARRIER',
       'OP_CARRIER_FL_NUM', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'DEP_TIME',
       'DEP_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'WHEELS_ON', 'TAXI_IN',
       'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY', 'CANCELLED',
       'CANCELLATION_CODE', 'DIVERTED', 'CRS_ELAPSED_TIME',
       'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 'DISTANCE', 'CARRIER_DELAY',
       'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY',
       'CRS_DEP_HM', 'DEP_TIME_HM', 'WHEELS_OFF_HM', 'WHEELS_ON_HM',
       'CRS_ARR_TIME_HM', 'ARR_TIME_HM', 'day_of_week', 'DELAY'],
      dtype='object')

# Define model

In [101]:
# Define model variables

# Model input
X_df = df[['OP_CARRIER', 'OP_CARRIER_FL_NUM', 'day_of_week', 'DEST', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'DISTANCE']]
X_df.DISTANCE = X_df.DISTANCE.astype(int)

# Model output
y_df = df[['CANCELLED', 'DIVERTED', 'DELAY']]
y_df.CANCELLED = y_df.CANCELLED.astype(int)
y_df.DIVERTED = y_df.DIVERTED.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [102]:
# Review model input
print(X_df.info())
X_df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15406 entries, 0 to 15405
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   OP_CARRIER         15406 non-null  object
 1   OP_CARRIER_FL_NUM  15406 non-null  int64 
 2   day_of_week        15406 non-null  object
 3   DEST               15406 non-null  object
 4   CRS_DEP_TIME       15406 non-null  int64 
 5   CRS_ARR_TIME       15406 non-null  int64 
 6   DISTANCE           15406 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 842.6+ KB
None


Unnamed: 0,OP_CARRIER,OP_CARRIER_FL_NUM,day_of_week,DEST,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE
0,DL,2066,Sunday,ATL,1430,2003,1541
1,DL,1127,Monday,ATL,600,1129,1541
2,DL,2066,Monday,ATL,1430,2005,1541


In [103]:
# Review model output
print(y_df.info())
y_df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15406 entries, 0 to 15405
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   CANCELLED  15406 non-null  int64
 1   DIVERTED   15406 non-null  int64
 2   DELAY      15406 non-null  int64
dtypes: int64(3)
memory usage: 361.2 KB
None


Unnamed: 0,CANCELLED,DIVERTED,DELAY
0,0,0,0
1,0,0,1
2,0,0,0


In [104]:
# Review correlation of numerical input values
correlation_df = X_df.corr()
correlation_df

Unnamed: 0,OP_CARRIER_FL_NUM,CRS_DEP_TIME,CRS_ARR_TIME,DISTANCE
OP_CARRIER_FL_NUM,1.0,-0.066724,-0.195604,-0.285792
CRS_DEP_TIME,-0.066724,1.0,0.912463,-0.16338
CRS_ARR_TIME,-0.195604,0.912463,1.0,0.199614
DISTANCE,-0.285792,-0.16338,0.199614,1.0


The Pearson correlation between CRS_DEP_TIME and CRS_ARR_TIME as might be suspected. Modeling efforts will need to evaluate if CRS_ARR_TIME should be removed.