In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
import seaborn as sns

In [46]:
#Pull in base dataframe and add avg departure delay
df = pd.read_csv('../CleaningTableFlights/data/model_df_version_2.csv')
dep_delay = pd.read_csv('../CleaningTableFlights/data/flights.csv')[['dep_delay','origin']]

In [47]:
#departure delay
dep_delay = dep_delay.groupby('origin').mean()
dep_delay = dep_delay.rename({'dep_delay':'average_departure_delay_by_ap'},axis=1)

In [48]:
df

Unnamed: 0,fl_date,mkt_unique_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,cancelled_flights
0,2019-02-12,DL,2070,DL,N915DN,GSP,ATL,1020,1126,66.0,153,-5.0,0
1,2019-05-25,AS,1438,AS,N844VA,LAX,SJC,2005,2130,85.0,308,53.0,0
2,2018-01-02,DL,2297,DL,N922DX,JAX,ATL,1540,1658,78.0,270,-13.0,0
3,2019-11-24,UA,4808,ZW,N419AW,IAD,ALB,1715,1854,99.0,325,-33.0,0
4,2018-07-17,AA,1815,AA,N703UW,CLT,DCA,1310,1437,87.0,331,-12.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
158777,2019-11-23,B6,746,B6,N966JT,PSE,JFK,319,618,239.0,1617,22.0,0
158778,2018-04-15,UA,5994,OO,N124SY,DFW,SFO,550,800,250.0,1464,-12.0,0
158779,2018-11-27,AA,4963,PT,N603KC,RIC,PHL,1703,1814,71.0,198,18.0,0
158780,2018-10-04,AS,1387,AS,N284VA,SFO,PDX,1805,1951,106.0,550,-10.0,0


In [49]:
#add constant
df['constant'] = 1
first_column = df.pop('constant')
df.insert(0, 'constant', first_column)

In [50]:
#arr_delay data by airport
arr_delay = df[['dest','arr_delay']]
arr_delay = arr_delay.groupby('dest').mean()
arr_delay = arr_delay.rename({'arr_delay':'average_arrival_delay_by_ap_full'},axis=1)

In [51]:
arr_delay.head()

Unnamed: 0_level_0,average_arrival_delay_by_ap_full
dest,Unnamed: 1_level_1
ABE,21.464286
ABI,6.235294
ABQ,4.498246
ABR,47.0
ABY,34.1875


In [52]:
#passenger_fuel_info by carrier
pass_fuel_df = pd.read_csv('data/passenger_fuel_df_full.csv')
pass_fuel_df.head()

Unnamed: 0,mkt_unique_carrier,avg_distance_per_month_by_carrier,avg_dep_delay_by_carrier,total_gallons,passengers_by_carrier_per_month,monthly_distance_per_passenger,avgfuel_percustomer_perdistance
0,AA,119806200.0,1732355.0,529073.835069,12697770.0,0.105986,4991926.0
1,AS,34933910.0,129033.6,121682.671875,2920384.0,0.083597,1455580.0
2,B6,27338350.0,414630.5,147726.59375,3545438.0,0.129687,1139098.0
3,DL,101480200.0,1347398.0,547151.206597,13131630.0,0.129401,4228340.0
4,F9,10992850.0,177585.8,74017.159722,1776412.0,0.161597,458035.3


In [53]:
#departures info by year per airport
departures_df = pd.read_csv('data/departures_by_airport_full.csv')
departures_df = departures_df.drop('origin_city_name',axis=1)
departures_df.head()

Unnamed: 0,origin,yearly_departures_per_ap,yearly_passengers_per_ap
0,05A,75.4,149.6
1,06A,7.2,3.8
2,08A,2.0,3.0
3,09A,7.2,8.6
4,1B1,1.8,5.8


In [54]:
#arrival info by year per airport
arrivals_df = pd.read_csv('data/arrivals_by_airport.csv')
arrivals_df = arrivals_df.drop('dest_city_name',axis=1)
arrivals_df.head()

Unnamed: 0,dest,yearly_arrivals_per_ap,yearly_arriving_passengers_per_ap
0,05A,76.6,168.6
1,06A,7.0,3.6
2,08A,2.2,3.4
3,09A,7.4,13.0
4,1B1,1.2,3.2


In [55]:
#monthly destination and origin info
month_dest_df = pd.read_csv('data/features_dest_monthly_params.csv')
month_origin_df = pd.read_csv('data/features_origin_monthly_params.csv')

In [56]:
#create month and weekday variables
df['month'] = pd.DatetimeIndex(df['fl_date']).month
df['weekday'] = pd.DatetimeIndex(df['fl_date']).weekday

In [57]:
#create combo variables for grouping
month_dest_df['combo1']=month_dest_df['dest'].astype('str')+month_dest_df['fl_month'].astype('str')
df['combo1']=df['dest'].astype('str')+df['month'].astype('str')

#rename columns for clarity and drop useless columns
month_dest_df = month_dest_df.drop(['fl_month','dest'],axis=1)
month_dest_df = month_dest_df.rename(columns={'nas_delay':'avg_nas_delay_by_month_by_dest_ap','carrier_delay':'avg_carrier_delay_by_month_by_dest_ap','arr_delay':'avg_arr_delay_by_month_by_dest_ap','cancelled':'cancelled_flights_by_month_by_dest_ap','weather_delay':'avg_weather_delay_by_month_by_dest_ap','security_delay':'security_delay_by_month_by_dest_ap'})
month_dest_df.head()

Unnamed: 0,avg_arr_delay_by_month_by_dest_ap,avg_carrier_delay_by_month_by_dest_ap,avg_nas_delay_by_month_by_dest_ap,late_aircraft_delay,avg_weather_delay_by_month_by_dest_ap,security_delay_by_month_by_dest_ap,cancelled_flights_by_month_by_dest_ap,combo1
0,6.361596,4.544888,2.058603,5.75187,1.372818,0.004988,0.024938,ABE1
1,7.712614,6.074122,1.954486,6.189857,1.872562,0.0,0.027308,ABE2
2,3.069915,3.960805,2.456568,3.204449,0.309322,0.0,0.040254,ABE3
3,2.710451,2.215724,1.658677,4.325983,1.352828,0.00767,0.019175,ABE4
4,3.248503,3.714571,1.660679,3.884232,0.717565,0.003992,0.016966,ABE5


In [58]:
#create combo variables for grouping
month_origin_df['combo2']=month_origin_df['origin'].astype('str')+month_origin_df['fl_month'].astype('str')
df['combo2']=df['origin'].astype('str')+df['month'].astype('str')

#rename columns for clarity and drop useless columns
month_origin_df = month_origin_df.drop(['fl_month','origin'],axis=1)
month_origin_df = month_origin_df.rename(columns={'dep_delay':'avg_dep_delay_by_month_by_origin_ap','nas_delay':'avg_nas_delay_by_month_by_origin_ap','carrier_delay':'avg_carrier_delay_by_month_by_origin_ap','arr_delay':'avg_arr_delay_by_month_by_origin_ap','cancelled':'cancelled_flights_by_month_by_origin_ap','weather_delay':'avg_weather_delay_by_month_by_origin_ap','security_delay':'security_delay_by_month_by_origin_ap'})
month_origin_df.head()

Unnamed: 0,avg_dep_delay_by_month_by_origin_ap,avg_arr_delay_by_month_by_origin_ap,avg_carrier_delay_by_month_by_origin_ap,avg_nas_delay_by_month_by_origin_ap,late_aircraft_delay,avg_weather_delay_by_month_by_origin_ap,security_delay_by_month_by_origin_ap,cancelled_flights_by_month_by_origin_ap,combo2
0,12.18625,8.98,6.17,4.2125,5.995,0.4975,0.0025,0.02625,ABE1
1,15.41039,14.646753,5.358442,4.472727,8.574026,2.341558,0.0,0.031169,ABE2
2,7.479873,5.648305,3.481992,4.149364,5.019068,0.307203,0.020127,0.042373,ABE3
3,9.223823,7.659942,5.782901,2.552354,4.845341,0.872238,0.013449,0.01537,ABE4
4,8.562874,3.111776,2.921158,2.324351,6.011976,0.756487,0.0,0.015968,ABE5


In [59]:
#daily arrival info - unused
daily_arrivals = pd.read_csv('data/num_arrivals_daily.csv')
daily_departures = pd.read_csv('data/num_departures_daily.csv')

In [60]:
daily_arrivals.head()

Unnamed: 0,fl_date,dest,num_flights_arriving
0,2018-01-01,ABE,10
1,2018-01-01,ABI,6
2,2018-01-01,ABQ,77
3,2018-01-01,ABR,2
4,2018-01-01,ABY,2


In [61]:
#tailnumber info - unused
tailnum_df = pd.read_csv('data/features_tailnum_delay_taxi_median_params.csv')
tailnum_df = tailnum_df.rename(columns={'dep_delay':'dep_delay_by_plane','taxi_out':'taxi_out_by_plane','arr_delay':'arr_delay_by_plane','taxi_in':'taxi_in_by_plane'})

In [62]:
tailnum_df.head()

Unnamed: 0,tail_num,dep_delay_by_plane,taxi_out_by_plane,arr_delay_by_plane,taxi_in_by_plane
0,215NV,-1.0,11.0,-4.0,6.0
1,216NV,-3.0,12.0,-5.0,6.0
2,217NV,-2.0,11.0,-3.0,7.0
3,218NV,-2.0,11.0,-4.0,6.0
4,219NV,-4.0,11.0,-6.0,6.0


In [63]:
#convert time to # of minute since midnight
df['crs_arr_time'] = (df['crs_arr_time']//100)*60 + df['crs_arr_time'] - (df['crs_arr_time']//100)*100
df['crs_dep_time'] = (df['crs_dep_time']//100)*60 + df['crs_dep_time'] - (df['crs_dep_time']//100)*100

In [64]:
#add in fuel and passenger info by carrier
df = df.merge(pass_fuel_df, how = 'left', left_on='mkt_unique_carrier',right_on='mkt_unique_carrier')

In [65]:
#add in departure and passenger info by airport
df = df.merge(departures_df, how = 'left', on = 'origin')

#add in arrival and passenger info by arrival airport
df = df.merge(arrivals_df, how = 'left', on = 'dest')

#add in average departure delay by departure location
df = df.merge(dep_delay, how = 'left', left_on = 'origin', right_index = True)

#add in average arrival delay by arrival location
df = df.merge(arr_delay, how = 'left', left_on = 'origin', right_index = True)

#add in monthly data
df = df.merge(month_dest_df, how = 'left', on = 'combo1')
df = df.merge(month_origin_df, how = 'left', on = 'combo2')

#Below excluded because their addition INCREASED RMSE
# #add in departures & arrivals per day
# df = df.merge(daily_arrivals, how = 'left', on = ['fl_date','dest'])
# df = df.merge(daily_departures, how = 'left', on = ['fl_date','origin'])

# #add in plane level info
# df = df.merge(tailnum_df, how='left', on = 'tail_num')

In [66]:
df = df.drop(['fl_date','tail_num','mkt_carrier_fl_num'], axis = 1)

In [67]:
#create variable where 1 if op_carrier different from mkt_carrier
df['op_unique_carrier_check'] = df['op_unique_carrier'] != df['mkt_unique_carrier']
df['op_unique_carrier_check'] = df['op_unique_carrier_check'].astype('int')

In [68]:
#fill arrival/departure average nans with 0
df = df.fillna(0)

In [69]:
df = df.drop(['origin','dest','op_unique_carrier','combo1','combo2','cancelled_flights'],axis=1)

In [70]:
df.columns

Index(['constant', 'mkt_unique_carrier', 'crs_dep_time', 'crs_arr_time',
       'crs_elapsed_time', 'distance', 'arr_delay', 'month', 'weekday',
       'avg_distance_per_month_by_carrier', 'avg_dep_delay_by_carrier',
       'total_gallons', 'passengers_by_carrier_per_month',
       'monthly_distance_per_passenger', 'avgfuel_percustomer_perdistance',
       'yearly_departures_per_ap', 'yearly_passengers_per_ap',
       'yearly_arrivals_per_ap', 'yearly_arriving_passengers_per_ap',
       'average_departure_delay_by_ap', 'average_arrival_delay_by_ap_full',
       'avg_arr_delay_by_month_by_dest_ap',
       'avg_carrier_delay_by_month_by_dest_ap',
       'avg_nas_delay_by_month_by_dest_ap', 'late_aircraft_delay_x',
       'avg_weather_delay_by_month_by_dest_ap',
       'security_delay_by_month_by_dest_ap',
       'cancelled_flights_by_month_by_dest_ap',
       'avg_dep_delay_by_month_by_origin_ap',
       'avg_arr_delay_by_month_by_origin_ap',
       'avg_carrier_delay_by_month_by_origin_

In [71]:
df.head()

Unnamed: 0,constant,mkt_unique_carrier,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,month,weekday,avg_distance_per_month_by_carrier,...,cancelled_flights_by_month_by_dest_ap,avg_dep_delay_by_month_by_origin_ap,avg_arr_delay_by_month_by_origin_ap,avg_carrier_delay_by_month_by_origin_ap,avg_nas_delay_by_month_by_origin_ap,late_aircraft_delay_y,avg_weather_delay_by_month_by_origin_ap,security_delay_by_month_by_origin_ap,cancelled_flights_by_month_by_origin_ap,op_unique_carrier_check
0,1,DL,620,686,66.0,153,-5.0,2,1,101480200.0,...,0.006185,11.312448,4.526793,3.995054,3.525556,6.062242,0.645507,0.0,0.028854,0
1,1,AS,1205,1290,85.0,308,53.0,5,5,34933910.0,...,0.019439,8.986239,1.990982,3.388559,2.506986,4.166428,0.081394,0.010356,0.010262,0
2,1,DL,940,1018,78.0,270,-13.0,1,1,101480200.0,...,0.005505,9.814683,3.909325,3.649802,3.181944,5.326984,0.321429,0.0,0.012103,0
3,1,UA,1035,1134,99.0,325,-33.0,11,6,101368600.0,...,0.014629,9.555175,5.925672,4.845193,2.577689,5.778856,0.577055,0.0,0.00723,1
4,1,AA,790,877,87.0,331,-12.0,7,1,119806200.0,...,0.037967,14.977925,11.530916,4.291598,3.141092,7.255694,1.582681,0.049149,0.021958,0


In [72]:
dummy_df = pd.get_dummies(df,columns=['mkt_unique_carrier','month','weekday'])

In [73]:
#drop final category of each dummy variable
dummy_df = dummy_df.drop(columns=['month_12','weekday_6','mkt_unique_carrier_WN'])

In [74]:
X = dummy_df.drop('arr_delay',axis=1)
y = dummy_df['arr_delay']

In [75]:
#split data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .2)

In [76]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(127035, 60) (31759, 60) (127035,) (31759,)


XGBoost Regression Model

In [77]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error,r2_score

In [78]:
model = xgb.XGBRegressor(random_state = 13)

In [79]:
parameters = {'nthread':[4], 
              'objective':['reg:squarederror'],
              'learning_rate': [.01, .03, 0.05, .07, .09], 
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

In [80]:
# xgb_grid = GridSearchCV(model, parameters, cv=3, n_jobs = 5, verbose = True)

# xgb_grid.fit(X_train, y_train)

# print(xgb_grid.best_score_)
# print(xgb_grid.best_params_)

In [81]:
model = xgb.XGBRegressor(colsample_bytree = .7, 
                         learning_rate = .01, 
                         max_depth = 5, 
                         min_child_weight = 4, 
                         n_estimators = 500, 
                         nthread=4, 
                         objective = 'reg:squarederror', 
                         subsample = .7, random_state=13, 
                         reg_lambda = 100, 
                         )

In [82]:
model.fit(X_train,y_train)

In [83]:
score = model.score(X_train,y_train)
print("Training Scores: ", score)

Training Scores:  0.04778163682373815


In [84]:
feature_scores_rf = pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)

feature_scores_rf.head(10)

avg_arr_delay_by_month_by_dest_ap      0.061509
avg_arr_delay_by_month_by_origin_ap    0.057562
crs_dep_time                           0.054459
crs_arr_time                           0.032173
avg_dep_delay_by_month_by_origin_ap    0.030086
avg_nas_delay_by_month_by_dest_ap      0.029021
passengers_by_carrier_per_month        0.025625
late_aircraft_delay_y                  0.024661
mkt_unique_carrier_UA                  0.022746
total_gallons                          0.022161
dtype: float32

In [85]:
test_score = model.score(X_test,y_test)
print("Testing Scores: ", test_score)

Testing Scores:  0.029150604783334977


In [86]:

ypred = model.predict(X_test)
mse = mean_squared_error(y_test, ypred)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % (mse**(1/2.0)))

MSE: 2339.24
RMSE: 48.37


In [87]:
score = model.score(X,y)
print("Total Scores: ", score)

Total Scores:  0.04417675292434464


In [88]:
ypred = model.predict(X)
mse = mean_squared_error(y, ypred)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % (mse**(1/2.0)))

MSE: 2380.56
RMSE: 48.79
