In [2]:
#importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#Read the flights csv and look at the data
'''The data is a sample data of 200000 datapoints pulled by another group member'''
df = pd.read_csv('data/flights.csv')
df.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2018-05-31,WN,WN,WN,2081,WN,N402WN,2081,10800,BUR,...,326,0.0,0.0,51.0,0.0,0.0,,,,
1,2019-04-20,AA,AA,AA,2244,AA,N895NN,2244,13930,ORD,...,801,,,,,,,,,
2,2019-02-13,WN,WN,WN,1731,WN,N8507C,1731,13495,MSY,...,302,0.0,0.0,0.0,0.0,18.0,,,,
3,2018-11-20,AA,AA,AA,2620,AA,N961AN,2620,11298,DFW,...,761,,,,,,,,,
4,2018-08-28,DL,DL_CODESHARE,DL,4060,9E,N833AY,4060,12478,JFK,...,228,,,,,,,,,


In [4]:
#Looking at columns and null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 42 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   fl_date              200000 non-null  object 
 1   mkt_unique_carrier   200000 non-null  object 
 2   branded_code_share   200000 non-null  object 
 3   mkt_carrier          200000 non-null  object 
 4   mkt_carrier_fl_num   200000 non-null  int64  
 5   op_unique_carrier    200000 non-null  object 
 6   tail_num             199399 non-null  object 
 7   op_carrier_fl_num    200000 non-null  int64  
 8   origin_airport_id    200000 non-null  int64  
 9   origin               200000 non-null  object 
 10  origin_city_name     200000 non-null  object 
 11  dest_airport_id      200000 non-null  int64  
 12  dest                 200000 non-null  object 
 13  dest_city_name       200000 non-null  object 
 14  crs_dep_time         200000 non-null  int64  
 15  dep_time         

In [5]:
#Removing all features deemed irrelevant to the delay predictions

'''Some features have far too many null values, some others like dup, 
flights and tail_num do not seem to be relevant to flight delay.
There are also a lot of repetition of information such as taxi in/out and wheels in/out'''

to_drop = ['branded_code_share',
          'mkt_carrier',
          'mkt_carrier_fl_num',
          'op_unique_carrier',
          'tail_num',
          'op_carrier_fl_num',
          'origin_airport_id',
          'origin_city_name',
          'dest_airport_id',
          'dest_city_name',
          'wheels_off',
          'wheels_on',
          'cancellation_code',
          'dup',
          'air_time',
          'flights',
          'first_dep_time',
          'total_add_gtime',
          'longest_add_gtime',
          'no_name']

df_clean1 = df.drop(to_drop, axis=1)
df_clean1.head()

Unnamed: 0,fl_date,mkt_unique_carrier,origin,dest,crs_dep_time,dep_time,dep_delay,taxi_out,taxi_in,crs_arr_time,...,cancelled,diverted,crs_elapsed_time,actual_elapsed_time,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2018-05-31,WN,BUR,SFO,1100,1138.0,38.0,18.0,7.0,1220,...,0,0,80,93.0,326,0.0,0.0,51.0,0.0,0.0
1,2019-04-20,AA,ORD,DFW,1311,1303.0,-8.0,17.0,8.0,1544,...,0,0,153,126.0,801,,,,,
2,2019-02-13,WN,MSY,HOU,2130,2159.0,29.0,5.0,4.0,2245,...,0,0,75,64.0,302,0.0,0.0,0.0,0.0,18.0
3,2018-11-20,AA,DFW,IND,1834,1833.0,-1.0,30.0,10.0,2142,...,0,0,128,142.0,761,,,,,
4,2018-08-28,DL,JFK,IAD,1359,1353.0,-6.0,16.0,3.0,1537,...,0,0,98,70.0,228,,,,,


In [6]:
#Remove all the rows with cancelled flights then dropping the cancelled feature
df_clean2 = df_clean1[df_clean1['cancelled']==0]
df_clean2.drop('cancelled', axis=1, inplace=True)
df_clean2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 196633 entries, 0 to 199999
Data columns (total 21 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   fl_date              196633 non-null  object 
 1   mkt_unique_carrier   196633 non-null  object 
 2   origin               196633 non-null  object 
 3   dest                 196633 non-null  object 
 4   crs_dep_time         196633 non-null  int64  
 5   dep_time             196633 non-null  float64
 6   dep_delay            196586 non-null  float64
 7   taxi_out             196551 non-null  float64
 8   taxi_in              196496 non-null  float64
 9   crs_arr_time         196633 non-null  int64  
 10  arr_time             196578 non-null  float64
 11  arr_delay            196160 non-null  float64
 12  diverted             196633 non-null  int64  
 13  crs_elapsed_time     196633 non-null  int64  
 14  actual_elapsed_time  196193 non-null  float64
 15  distance         

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean2.drop('cancelled', axis=1, inplace=True)


In [7]:
#Remove all the rows with diverted flights then dropping the diverted feature
df_clean3 = df_clean2[df_clean2['diverted']==0]
df_clean3.drop('diverted', axis=1, inplace=True)
df_clean3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 196193 entries, 0 to 199999
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   fl_date              196193 non-null  object 
 1   mkt_unique_carrier   196193 non-null  object 
 2   origin               196193 non-null  object 
 3   dest                 196193 non-null  object 
 4   crs_dep_time         196193 non-null  int64  
 5   dep_time             196193 non-null  float64
 6   dep_delay            196146 non-null  float64
 7   taxi_out             196111 non-null  float64
 8   taxi_in              196111 non-null  float64
 9   crs_arr_time         196193 non-null  int64  
 10  arr_time             196193 non-null  float64
 11  arr_delay            196160 non-null  float64
 12  crs_elapsed_time     196193 non-null  int64  
 13  actual_elapsed_time  196193 non-null  float64
 14  distance             196193 non-null  int64  
 15  carrier_delay    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean3.drop('diverted', axis=1, inplace=True)


In [8]:
#Replace the null values in taxi-in/out

'''
From the exploratory data anaysis done by the other members. It is found that values of taxi in and out are
due to small airports that does not require the taxing process. We decided to set the value to 0 for those cases. 
'''

df_clean3['taxi_out'].fillna(0, inplace=True)
df_clean3['taxi_in'].fillna(0, inplace=True)
df_clean3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 196193 entries, 0 to 199999
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   fl_date              196193 non-null  object 
 1   mkt_unique_carrier   196193 non-null  object 
 2   origin               196193 non-null  object 
 3   dest                 196193 non-null  object 
 4   crs_dep_time         196193 non-null  int64  
 5   dep_time             196193 non-null  float64
 6   dep_delay            196146 non-null  float64
 7   taxi_out             196193 non-null  float64
 8   taxi_in              196193 non-null  float64
 9   crs_arr_time         196193 non-null  int64  
 10  arr_time             196193 non-null  float64
 11  arr_delay            196160 non-null  float64
 12  crs_elapsed_time     196193 non-null  int64  
 13  actual_elapsed_time  196193 non-null  float64
 14  distance             196193 non-null  int64  
 15  carrier_delay    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean3['taxi_out'].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean3['taxi_in'].fillna(0, inplace=True)


In [9]:
#fixing the null values in arrival and departure delay

'''
Our investigation showed that arrival delay and departure delay have a null value when the arrival/departure time
is the same as the schedule time. We decided to set those values to 0.
'''

df_clean3['dep_delay'].fillna(0, inplace=True)
df_clean3['arr_delay'].fillna(0, inplace=True)
df_clean3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 196193 entries, 0 to 199999
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   fl_date              196193 non-null  object 
 1   mkt_unique_carrier   196193 non-null  object 
 2   origin               196193 non-null  object 
 3   dest                 196193 non-null  object 
 4   crs_dep_time         196193 non-null  int64  
 5   dep_time             196193 non-null  float64
 6   dep_delay            196193 non-null  float64
 7   taxi_out             196193 non-null  float64
 8   taxi_in              196193 non-null  float64
 9   crs_arr_time         196193 non-null  int64  
 10  arr_time             196193 non-null  float64
 11  arr_delay            196193 non-null  float64
 12  crs_elapsed_time     196193 non-null  int64  
 13  actual_elapsed_time  196193 non-null  float64
 14  distance             196193 non-null  int64  
 15  carrier_delay    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean3['dep_delay'].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean3['arr_delay'].fillna(0, inplace=True)


In [10]:
#Dropping carrier/weather/nas/security/late aircraft delay

'''
We decided that those features have too many missing values and the data for these features will not be available
when predicting future delays. The timeframe for the project is also too short to properly incorporate these features.
'''
to_drop2 = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']
df_clean4 = df_clean3.drop(to_drop2, axis=1)
df_clean4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 196193 entries, 0 to 199999
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   fl_date              196193 non-null  object 
 1   mkt_unique_carrier   196193 non-null  object 
 2   origin               196193 non-null  object 
 3   dest                 196193 non-null  object 
 4   crs_dep_time         196193 non-null  int64  
 5   dep_time             196193 non-null  float64
 6   dep_delay            196193 non-null  float64
 7   taxi_out             196193 non-null  float64
 8   taxi_in              196193 non-null  float64
 9   crs_arr_time         196193 non-null  int64  
 10  arr_time             196193 non-null  float64
 11  arr_delay            196193 non-null  float64
 12  crs_elapsed_time     196193 non-null  int64  
 13  actual_elapsed_time  196193 non-null  float64
 14  distance             196193 non-null  int64  
dtypes: float64(7), in

In [11]:
#Dropping departure/arrival time and actual elapsed time

'''
Those three features will be unavailable when attempting to predict future delays, thus we decided
to remove them from the feature lists
'''

to_drop3 = ['dep_time', 'arr_time', 'actual_elapsed_time']
df_clean5 = df_clean4.drop(to_drop3, axis=1)
df_clean5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 196193 entries, 0 to 199999
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   fl_date             196193 non-null  object 
 1   mkt_unique_carrier  196193 non-null  object 
 2   origin              196193 non-null  object 
 3   dest                196193 non-null  object 
 4   crs_dep_time        196193 non-null  int64  
 5   dep_delay           196193 non-null  float64
 6   taxi_out            196193 non-null  float64
 7   taxi_in             196193 non-null  float64
 8   crs_arr_time        196193 non-null  int64  
 9   arr_delay           196193 non-null  float64
 10  crs_elapsed_time    196193 non-null  int64  
 11  distance            196193 non-null  int64  
dtypes: float64(4), int64(4), object(4)
memory usage: 19.5+ MB


In [12]:
#Saving the cleaned data into a csv file before moving to feature engineering
df_clean5.to_csv('flights_cleaned.csv', index=False)