In [66]:
import warnings
warnings.filterwarnings("ignore")

#import required libraries
import pandas as pd
import numpy as np
import datetime

from sklearn.preprocessing import StandardScaler

In [67]:
#Read the flights
df = pd.read_csv('../CleaningTableFlights/data/model_df_version_1.csv')
df.head()

Unnamed: 0,fl_date,mkt_unique_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,cancelled_flights
0,2019-09-23,AA,4847,PT,N646AE,CAE,CLT,1949,2055,66.0,88,0.0,1
1,2019-05-18,AA,5983,YV,N959LR,DFW,HOU,1630,1740,70.0,247,0.0,1
2,2018-05-15,AA,5002,PT,N625AE,PHL,DAY,1540,1736,116.0,477,0.0,1
3,2019-02-27,WN,1140,WN,N462WN,SJC,SAN,955,1110,75.0,417,0.0,1
4,2019-06-29,UA,4380,EV,N13553,EWR,GSO,1000,1145,105.0,445,0.0,1


In [68]:
pass_fuel_df = pd.read_csv('../EDA_questions/data/passenger_fuel_df.csv')

In [69]:
arrivals_df = pd.read_csv('../EDA_questions/data/arrivals_by_airport.csv')
arrivals_df = arrivals_df.drop('dest_city_name',axis=1)

In [37]:
arr_delay = df[['dest','arr_delay']]
arr_delay = arr_delay.groupby('dest').mean()
arr_delay = arr_delay.rename({'arr_delay':'average_arrival_delay_by_ap'},axis=1)
arr_delay

Unnamed: 0_level_0,average_arrival_delay_by_ap
dest,Unnamed: 1_level_1
ABE,8.855422
ABI,1.906250
ABQ,4.626506
ABR,0.000000
ABY,1.000000
...,...
WYS,-13.000000
XNA,1.271676
YAK,-7.000000
YKM,-0.157895


In [38]:
delay = []
for i in df['arr_delay']:
    if i > 0:
        delay.append(1)
    else:
        delay.append(0)# no delay
df['flight_delay']= delay
df.head(2)

Unnamed: 0,fl_date,mkt_unique_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,cancelled_flights,flight_delay
0,2019-09-23,AA,4847,PT,N646AE,CAE,CLT,1949,2055,66.0,88,0.0,1,0
1,2019-05-18,AA,5983,YV,N959LR,DFW,HOU,1630,1740,70.0,247,0.0,1,0


In [39]:
# Function to convert integer format HM to time
def convert_to_secunds(time_int):
    time_str = str(time_int).zfill(4)
    hour = int(time_str[:-2])
    minute = int(time_str[-2:])
    if hour >= 24:
      hour %= 24
    total_seconds = hour * 3600 + minute * 60
    return total_seconds

In [40]:
def convert_to_time(time_int):
    time_str = str(time_int).zfill(4)
    hour = int(time_str[:-2])
    minute = int(time_str[-2:])
    if hour >= 24:
        hour %= 24
    return "{:02d}:{:02d}".format(hour, minute)

In [41]:
df['crs_dep_time_convert'] = df['crs_dep_time'].apply(convert_to_secunds)

In [42]:
df['crs_arr_time_convert'] = df['crs_arr_time'].apply(convert_to_secunds)

In [43]:
df['duration'] = df['crs_elapsed_time']

In [44]:
df['average_speed'] = (df['distance'].round(2) / df['duration'].round(2)).round(2)

In [45]:
def get_month_and_day_of_week(fl_date):
    date_obj = datetime.datetime.strptime(fl_date, '%Y-%m-%d')
    month = date_obj.month
    day_of_week = date_obj.strftime('%A')
    day_of_week_number = date_obj.isoweekday()
    return month, day_of_week_number

In [46]:
df['month'], df['day_of_week'] = zip(*df['fl_date'].apply(get_month_and_day_of_week))

In [47]:
#dropping variables that weren't significant off the bat (high multicollinearity)
pass_fuel_df = pass_fuel_df.drop(columns=['avg_distance_per_month_by_carrier','total_gallons','passengers_by_carrier_per_month','monthly_distance_per_passenger'])

In [48]:
#add in fuel and passenger info by carrier
df = df.merge(pass_fuel_df, how = 'left', left_on='mkt_unique_carrier',right_on='mkt_unique_carrier')

In [49]:
#add in arrival and passenger info by arrival airport
df = df.merge(arrivals_df, how = 'left', on = 'dest')

#add in average arrival delay by arrival location
df = df.merge(arr_delay, how = 'left', left_on = 'origin', right_index = True)

In [50]:
df.head()

Unnamed: 0,fl_date,mkt_unique_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,...,crs_arr_time_convert,duration,average_speed,month,day_of_week,avg_dep_delay_by_carrier,avgfuel_percustomer_perdistance,yearly_arrivals_per_ap,yearly_arriving_passengers_per_ap,average_arrival_delay_by_ap
0,2019-09-23,AA,4847,PT,N646AE,CAE,CLT,1949,2055,66.0,...,75300,66.0,1.33,9,1,17462.958333,588814400.0,256747.8,22632806.6,7.741497
1,2019-05-18,AA,5983,YV,N959LR,DFW,HOU,1630,1740,70.0,...,63600,70.0,3.53,5,6,17462.958333,588814400.0,60534.4,6831562.6,2.581706
2,2018-05-15,AA,5002,PT,N625AE,PHL,DAY,1540,1736,116.0,...,63360,116.0,4.11,5,2,17462.958333,588814400.0,17636.8,942524.6,2.73047
3,2019-02-27,WN,1140,WN,N462WN,SJC,SAN,955,1110,75.0,...,40200,75.0,5.56,2,3,11777.75,218335100.0,95973.2,11371940.8,1.048227
4,2019-06-29,UA,4380,EV,N13553,EWR,GSO,1000,1145,105.0,...,42300,105.0,4.24,6,6,17035.25,640502100.0,19290.4,920324.6,8.625


In [51]:
# missing values percent in columns
def missing_values_table(df):
        '''function that checks dataframe for missing values and returns a table with the results'''
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        print(mis_val_table_ren_columns)
        
missing_values_table(df)

                                   Missing Values  % of Total Values
fl_date                                         0           0.000000
mkt_unique_carrier                              0           0.000000
mkt_carrier_fl_num                              0           0.000000
op_unique_carrier                               0           0.000000
tail_num                                        0           0.000000
origin                                          0           0.000000
dest                                            0           0.000000
crs_dep_time                                    0           0.000000
crs_arr_time                                    0           0.000000
crs_elapsed_time                                0           0.000000
distance                                        0           0.000000
arr_delay                                       0           0.000000
cancelled_flights                               0           0.000000
flight_delay                      

In [52]:
#fill arrival/departure average nans with 0
df = df.fillna(0)

In [53]:
missing_values_table(df)

                                   Missing Values  % of Total Values
fl_date                                         0                0.0
mkt_unique_carrier                              0                0.0
mkt_carrier_fl_num                              0                0.0
op_unique_carrier                               0                0.0
tail_num                                        0                0.0
origin                                          0                0.0
dest                                            0                0.0
crs_dep_time                                    0                0.0
crs_arr_time                                    0                0.0
crs_elapsed_time                                0                0.0
distance                                        0                0.0
arr_delay                                       0                0.0
cancelled_flights                               0                0.0
flight_delay                      

In [54]:
columns_to_drop = ['fl_date',
                   'mkt_carrier_fl_num',
                   'op_unique_carrier',
                   'tail_num',
                   'origin',
                   'dest',
                   'crs_elapsed_time',
                   'crs_dep_time',
                   'crs_arr_time',
                   ] 
df = df.drop(columns=columns_to_drop)
df.head()

Unnamed: 0,mkt_unique_carrier,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,cancelled_flights,flight_delay,crs_dep_time_convert,crs_arr_time_convert,duration,average_speed,month,day_of_week,avg_dep_delay_by_carrier,avgfuel_percustomer_perdistance,yearly_arrivals_per_ap,yearly_arriving_passengers_per_ap,average_arrival_delay_by_ap
0,AA,1949,2055,66.0,88,0.0,1,0,71340,75300,66.0,1.33,9,1,17462.958333,588814400.0,256747.8,22632806.6,7.741497
1,AA,1630,1740,70.0,247,0.0,1,0,59400,63600,70.0,3.53,5,6,17462.958333,588814400.0,60534.4,6831562.6,2.581706
2,AA,1540,1736,116.0,477,0.0,1,0,56400,63360,116.0,4.11,5,2,17462.958333,588814400.0,17636.8,942524.6,2.73047
3,WN,955,1110,75.0,417,0.0,1,0,35700,40200,75.0,5.56,2,3,11777.75,218335100.0,95973.2,11371940.8,1.048227
4,UA,1000,1145,105.0,445,0.0,1,0,36000,42300,105.0,4.24,6,6,17035.25,640502100.0,19290.4,920324.6,8.625


In [55]:
print(df.dtypes)

mkt_unique_carrier                    object
crs_dep_time                           int64
crs_arr_time                           int64
crs_elapsed_time                     float64
distance                               int64
arr_delay                            float64
cancelled_flights                      int64
flight_delay                           int64
crs_dep_time_convert                   int64
crs_arr_time_convert                   int64
duration                             float64
average_speed                        float64
month                                  int64
day_of_week                            int64
avg_dep_delay_by_carrier             float64
avgfuel_percustomer_perdistance      float64
yearly_arrivals_per_ap               float64
yearly_arriving_passengers_per_ap    float64
average_arrival_delay_by_ap          float64
dtype: object


In [56]:
num_col = [ 'distance', 'arr_delay','flight_delay','cancelled_flights','crs_dep_time_convert','crs_arr_time_convert','duration','average_speed',
           'avg_dep_delay_by_carrier','avgfuel_percustomer_perdistance','yearly_arrivals_per_ap','yearly_arriving_passengers_per_ap','average_arrival_delay_by_ap']
df_num = df[num_col]

In [57]:
df_num.describe()

Unnamed: 0,distance,arr_delay,flight_delay,cancelled_flights,crs_dep_time_convert,crs_arr_time_convert,duration,average_speed,avg_dep_delay_by_carrier,avgfuel_percustomer_perdistance,yearly_arrivals_per_ap,yearly_arriving_passengers_per_ap,average_arrival_delay_by_ap
count,90938.0,90938.0,90938.0,90938.0,90938.0,90938.0,90938.0,90938.0,90938.0,90938.0,90938.0,90938.0,90938.0
mean,694.666685,3.041996,0.192142,0.450175,49902.478612,55248.278168,130.230179,4.766363,13605.122916,449870000.0,162353.738895,16623340.0,3.044945
std,541.155769,36.685268,0.393986,0.497514,17864.872027,18928.71618,66.216644,1.464277,5135.195189,208606100.0,129555.100408,13881110.0,2.937825
min,16.0,-81.0,0.0,0.0,60.0,0.0,-31.0,-10.71,182.666667,20397330.0,133.4,3646.4,-13.307692
25%,309.0,-8.0,0.0,0.0,34560.0,40320.0,85.0,3.72,11777.75,218335100.0,52838.0,3804419.0,1.666877
50%,549.0,0.0,0.0,0.0,50700.0,56880.0,113.0,4.84,17035.25,588814400.0,157078.2,14825830.0,2.73047
75%,919.0,0.0,0.0,1.0,64800.0,70800.0,158.0,5.86,17462.958333,588814400.0,225690.8,23521080.0,4.078277
max,5095.0,1467.0,1.0,1.0,86340.0,86340.0,1509.0,8.94,17462.958333,2293246000.0,435839.8,51557300.0,48.428571


In [60]:
#Use StandardScaler for numerical values

scaler = StandardScaler()

num_col = [ 'distance', 'arr_delay','crs_dep_time_convert','crs_arr_time_convert','duration','average_speed',
           'avg_dep_delay_by_carrier','avgfuel_percustomer_perdistance','yearly_arrivals_per_ap','yearly_arriving_passengers_per_ap','average_arrival_delay_by_ap']
df_num = df[num_col]
df_cat = df.drop(num_col, axis=1)
df_num_scaled = pd.DataFrame(scaler.fit_transform(df_num), columns=df_num.columns)
df_scaled = df_num_scaled.join(df_cat)
df_scaled.head()

Unnamed: 0,distance,arr_delay,crs_dep_time_convert,crs_arr_time_convert,duration,average_speed,avg_dep_delay_by_carrier,avgfuel_percustomer_perdistance,yearly_arrivals_per_ap,yearly_arriving_passengers_per_ap,average_arrival_delay_by_ap,mkt_unique_carrier,crs_dep_time,crs_arr_time,crs_elapsed_time,cancelled_flights,flight_delay,month,day_of_week
0,-1.121064,-0.082922,1.199988,1.059334,-0.970006,-2.346811,0.751258,0.666065,0.728606,0.432927,1.598658,AA,1949,2055,66.0,1,0,9,1
1,-0.827246,-0.082922,0.531634,0.441222,-0.909598,-0.844355,0.751258,0.666065,-0.78592,-0.705407,-0.157682,AA,1630,1740,70.0,1,0,5,6
2,-0.402228,-0.082922,0.363706,0.428543,-0.214905,-0.448253,0.751258,0.666065,-1.117036,-1.129657,-0.107044,AA,1540,1736,116.0,1,0,5,2
3,-0.513102,-0.082922,-0.794999,-0.795002,-0.834088,0.542002,-0.355855,-1.10992,-0.512376,-0.378314,-0.679662,WN,955,1110,75.0,1,0,2,3
4,-0.461361,-0.082922,-0.778206,-0.684059,-0.381027,-0.359471,0.667968,0.913843,-1.104272,-1.131256,1.899393,UA,1000,1145,105.0,1,0,6,6


In [61]:
#setting up dummy variables for the categories

df_encoded = pd.get_dummies(df_scaled, columns=['month','day_of_week'])
df_encoded.head()

Unnamed: 0,distance,arr_delay,crs_dep_time_convert,crs_arr_time_convert,duration,average_speed,avg_dep_delay_by_carrier,avgfuel_percustomer_perdistance,yearly_arrivals_per_ap,yearly_arriving_passengers_per_ap,...,month_10,month_11,month_12,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7
0,-1.121064,-0.082922,1.199988,1.059334,-0.970006,-2.346811,0.751258,0.666065,0.728606,0.432927,...,0,0,0,1,0,0,0,0,0,0
1,-0.827246,-0.082922,0.531634,0.441222,-0.909598,-0.844355,0.751258,0.666065,-0.78592,-0.705407,...,0,0,0,0,0,0,0,0,1,0
2,-0.402228,-0.082922,0.363706,0.428543,-0.214905,-0.448253,0.751258,0.666065,-1.117036,-1.129657,...,0,0,0,0,1,0,0,0,0,0
3,-0.513102,-0.082922,-0.794999,-0.795002,-0.834088,0.542002,-0.355855,-1.10992,-0.512376,-0.378314,...,0,0,0,0,0,1,0,0,0,0
4,-0.461361,-0.082922,-0.778206,-0.684059,-0.381027,-0.359471,0.667968,0.913843,-1.104272,-1.131256,...,0,0,0,0,0,0,0,0,1,0


In [62]:
print(df_encoded.dtypes)

distance                             float64
arr_delay                            float64
crs_dep_time_convert                 float64
crs_arr_time_convert                 float64
duration                             float64
average_speed                        float64
avg_dep_delay_by_carrier             float64
avgfuel_percustomer_perdistance      float64
yearly_arrivals_per_ap               float64
yearly_arriving_passengers_per_ap    float64
average_arrival_delay_by_ap          float64
mkt_unique_carrier                    object
crs_dep_time                           int64
crs_arr_time                           int64
crs_elapsed_time                     float64
cancelled_flights                      int64
flight_delay                           int64
month_1                                uint8
month_2                                uint8
month_3                                uint8
month_4                                uint8
month_5                                uint8
month_6   

In [63]:
df_encoded = df_encoded.drop(columns=['mkt_unique_carrier'])
df_encoded.head()

Unnamed: 0,distance,arr_delay,crs_dep_time_convert,crs_arr_time_convert,duration,average_speed,avg_dep_delay_by_carrier,avgfuel_percustomer_perdistance,yearly_arrivals_per_ap,yearly_arriving_passengers_per_ap,...,month_10,month_11,month_12,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7
0,-1.121064,-0.082922,1.199988,1.059334,-0.970006,-2.346811,0.751258,0.666065,0.728606,0.432927,...,0,0,0,1,0,0,0,0,0,0
1,-0.827246,-0.082922,0.531634,0.441222,-0.909598,-0.844355,0.751258,0.666065,-0.78592,-0.705407,...,0,0,0,0,0,0,0,0,1,0
2,-0.402228,-0.082922,0.363706,0.428543,-0.214905,-0.448253,0.751258,0.666065,-1.117036,-1.129657,...,0,0,0,0,1,0,0,0,0,0
3,-0.513102,-0.082922,-0.794999,-0.795002,-0.834088,0.542002,-0.355855,-1.10992,-0.512376,-0.378314,...,0,0,0,0,0,1,0,0,0,0
4,-0.461361,-0.082922,-0.778206,-0.684059,-0.381027,-0.359471,0.667968,0.913843,-1.104272,-1.131256,...,0,0,0,0,0,0,0,0,1,0


In [64]:
df_encoded.to_csv('data/EDA_for_models.csv', index=False)
pd.read_csv('data/EDA_for_models.csv').head()

Unnamed: 0,distance,arr_delay,crs_dep_time_convert,crs_arr_time_convert,duration,average_speed,avg_dep_delay_by_carrier,avgfuel_percustomer_perdistance,yearly_arrivals_per_ap,yearly_arriving_passengers_per_ap,...,month_10,month_11,month_12,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7
0,-1.121064,-0.082922,1.199988,1.059334,-0.970006,-2.346811,0.751258,0.666065,0.728606,0.432927,...,0,0,0,1,0,0,0,0,0,0
1,-0.827246,-0.082922,0.531634,0.441222,-0.909598,-0.844355,0.751258,0.666065,-0.78592,-0.705407,...,0,0,0,0,0,0,0,0,1,0
2,-0.402228,-0.082922,0.363706,0.428543,-0.214905,-0.448253,0.751258,0.666065,-1.117036,-1.129657,...,0,0,0,0,1,0,0,0,0,0
3,-0.513102,-0.082922,-0.794999,-0.795002,-0.834088,0.542002,-0.355855,-1.10992,-0.512376,-0.378314,...,0,0,0,0,0,1,0,0,0,0
4,-0.461361,-0.082922,-0.778206,-0.684059,-0.381027,-0.359471,0.667968,0.913843,-1.104272,-1.131256,...,0,0,0,0,0,0,0,0,1,0
