In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [108]:
df = pd.read_csv('../CleaningTableFlights/data/model_df_version_2.csv')
dep_delay = pd.read_csv('../CleaningTableFlights/data/flights.csv')[['dep_delay','origin']]

In [109]:
dep_delay = dep_delay.groupby('origin').mean()
dep_delay = dep_delay.rename({'dep_delay':'average_departure_delay_by_ap'},axis=1)

In [110]:
df

Unnamed: 0,fl_date,mkt_unique_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,cancelled_flights
0,2019-02-12,DL,2070,DL,N915DN,GSP,ATL,1020,1126,66.0,153,-5.0,0
1,2019-05-25,AS,1438,AS,N844VA,LAX,SJC,2005,2130,85.0,308,53.0,0
2,2018-01-02,DL,2297,DL,N922DX,JAX,ATL,1540,1658,78.0,270,-13.0,0
3,2019-11-24,UA,4808,ZW,N419AW,IAD,ALB,1715,1854,99.0,325,-33.0,0
4,2018-07-17,AA,1815,AA,N703UW,CLT,DCA,1310,1437,87.0,331,-12.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
158777,2019-11-23,B6,746,B6,N966JT,PSE,JFK,319,618,239.0,1617,22.0,0
158778,2018-04-15,UA,5994,OO,N124SY,DFW,SFO,550,800,250.0,1464,-12.0,0
158779,2018-11-27,AA,4963,PT,N603KC,RIC,PHL,1703,1814,71.0,198,18.0,0
158780,2018-10-04,AS,1387,AS,N284VA,SFO,PDX,1805,1951,106.0,550,-10.0,0


In [111]:
arr_delay = df[['dest','arr_delay']]
arr_delay = arr_delay.groupby('dest').mean()
arr_delay = arr_delay.rename({'arr_delay':'average_arrival_delay_by_ap_full'},axis=1)

In [112]:
df = df.drop(['tail_num','mkt_carrier_fl_num'], axis = 1)

In [113]:
pass_fuel_df = pd.read_csv('data/passenger_fuel_df_full.csv')

In [114]:
departures_df = pd.read_csv('data/departures_by_airport_full.csv')
departures_df = departures_df.drop('origin_city_name',axis=1)

In [115]:
arrivals_df = pd.read_csv('data/arrivals_by_airport.csv')
arrivals_df = arrivals_df.drop('dest_city_name',axis=1)

In [116]:
month_dest_df = pd.read_csv('data/features_dest_monthly_params.csv')
month_origin_df = pd.read_csv('data/features_origin_monthly_params.csv')

In [117]:
df['month'] = pd.DatetimeIndex(df['fl_date']).month
df['weekday'] = pd.DatetimeIndex(df['fl_date']).weekday

In [118]:
month_dest_df['combo1']=month_dest_df['dest'].astype('str')+month_dest_df['fl_month'].astype('str')
df['combo1']=df['dest'].astype('str')+df['month'].astype('str')

month_dest_df = month_dest_df.drop(['fl_month','dest'],axis=1)
month_dest_df = month_dest_df.rename(columns={'arr_delay':'avg_arr_delay_by_month_by_dest_ap','cancelled':'cancelled_flights_by_month_by_dest_ap','weather_delay':'avg_weather_delay_by_month_by_dest_ap','security_delay':'security_delay_by_month_by_dest_ap'})

In [119]:
month_origin_df['combo2']=month_origin_df['origin'].astype('str')+month_origin_df['fl_month'].astype('str')
df['combo2']=df['origin'].astype('str')+df['month'].astype('str')

month_origin_df = month_origin_df.drop(['fl_month','origin'],axis=1)
month_origin_df = month_origin_df.rename(columns={'arr_delay':'avg_arr_delay_by_month_by_origin_ap','cancelled':'cancelled_flights_by_month_by_origin_ap','weather_delay':'avg_weather_delay_by_month_by_origin_ap','security_delay':'security_delay_by_month_by_origin_ap'})

In [120]:
df = df.drop(['fl_date'],axis=1)

In [121]:
df['crs_arr_time'] = (df['crs_arr_time']//100)*60 + df['crs_arr_time'] - (df['crs_arr_time']//100)*100
df['crs_dep_time'] = (df['crs_dep_time']//100)*60 + df['crs_dep_time'] - (df['crs_dep_time']//100)*100

In [122]:
#add in fuel and passenger info by carrier
df = df.merge(pass_fuel_df, how = 'left', left_on='mkt_unique_carrier',right_on='mkt_unique_carrier')

In [123]:
#add in departure and passenger info by airport
df = df.merge(departures_df, how = 'left', on = 'origin')

#add in arrival and passenger info by arrival airport
df = df.merge(arrivals_df, how = 'left', on = 'dest')

#add in average departure delay by departure location
df = df.merge(dep_delay, how = 'left', left_on = 'origin', right_index = True)

#add in average arrival delay by arrival location
df = df.merge(arr_delay, how = 'left', left_on = 'origin', right_index = True)

#add in monthly data
df = df.merge(month_dest_df, how = 'left', on = 'combo1')
df = df.merge(month_origin_df, how = 'left', on = 'combo2')

In [124]:
#add constant
df['constant'] = 1
first_column = df.pop('constant')
df.insert(0, 'constant', first_column)

In [125]:
#create variable where 1 if op_carrier different from mkt_carrier
df['op_unique_carrier_check'] = df['op_unique_carrier'] != df['mkt_unique_carrier']
df['op_unique_carrier_check'] = df['op_unique_carrier_check'].astype('int')

In [126]:
#fill arrival/departure average nans with 0
df = df.fillna(0)

In [127]:
df = df.drop(['origin','dest','op_unique_carrier','combo1','combo2','cancelled_flights'],axis=1)

In [128]:
df.columns

Index(['constant', 'mkt_unique_carrier', 'crs_dep_time', 'crs_arr_time',
       'crs_elapsed_time', 'distance', 'arr_delay', 'month', 'weekday',
       'avg_distance_per_month_by_carrier', 'avg_dep_delay_by_carrier',
       'total_gallons', 'passengers_by_carrier_per_month',
       'monthly_distance_per_passenger', 'avgfuel_percustomer_perdistance',
       'yearly_departures_per_ap', 'yearly_passengers_per_ap',
       'yearly_arrivals_per_ap', 'yearly_arriving_passengers_per_ap',
       'average_departure_delay_by_ap', 'average_arrival_delay_by_ap_full',
       'avg_arr_delay_by_month_by_dest_ap', 'carrier_delay_x', 'nas_delay_x',
       'late_aircraft_delay_x', 'avg_weather_delay_by_month_by_dest_ap',
       'security_delay_by_month_by_dest_ap',
       'cancelled_flights_by_month_by_dest_ap', 'dep_delay',
       'avg_arr_delay_by_month_by_origin_ap', 'carrier_delay_y', 'nas_delay_y',
       'late_aircraft_delay_y', 'avg_weather_delay_by_month_by_origin_ap',
       'security_delay_by_m

In [129]:
df.head()

Unnamed: 0,constant,mkt_unique_carrier,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,month,weekday,avg_distance_per_month_by_carrier,...,cancelled_flights_by_month_by_dest_ap,dep_delay,avg_arr_delay_by_month_by_origin_ap,carrier_delay_y,nas_delay_y,late_aircraft_delay_y,avg_weather_delay_by_month_by_origin_ap,security_delay_by_month_by_origin_ap,cancelled_flights_by_month_by_origin_ap,op_unique_carrier_check
0,1,DL,620,686,66.0,153,-5.0,2,1,101480200.0,...,0.006185,11.312448,4.526793,3.995054,3.525556,6.062242,0.645507,0.0,0.028854,0
1,1,AS,1205,1290,85.0,308,53.0,5,5,34933910.0,...,0.019439,8.986239,1.990982,3.388559,2.506986,4.166428,0.081394,0.010356,0.010262,0
2,1,DL,940,1018,78.0,270,-13.0,1,1,101480200.0,...,0.005505,9.814683,3.909325,3.649802,3.181944,5.326984,0.321429,0.0,0.012103,0
3,1,UA,1035,1134,99.0,325,-33.0,11,6,101368600.0,...,0.014629,9.555175,5.925672,4.845193,2.577689,5.778856,0.577055,0.0,0.00723,1
4,1,AA,790,877,87.0,331,-12.0,7,1,119806200.0,...,0.037967,14.977925,11.530916,4.291598,3.141092,7.255694,1.582681,0.049149,0.021958,0


In [130]:
dummy_df = pd.get_dummies(df,columns=['mkt_unique_carrier','month','weekday'])

In [131]:
#drop final category of each dummy variable
dummy_df = dummy_df.drop(columns=['month_12','weekday_6','mkt_unique_carrier_WN'])

In [132]:
X = dummy_df.drop('arr_delay',axis=1)
y = dummy_df['arr_delay']

In [133]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .2)

In [134]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(127035, 60) (31759, 60) (127035,) (31759,)


XGBoost Regression Model

In [135]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error

In [136]:
model = xgb.XGBRegressor(random_state = 13)

In [137]:
parameters = {'nthread':[4], 
              'objective':['reg:squarederror'],
              'learning_rate': [.01, .03, 0.05, .07, .09], 
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500],
              'alpha': [0,.1,.25,.5,1,2,5]}

In [138]:
# xgb_grid = GridSearchCV(model, parameters, cv=3, n_jobs = 5, verbose = True)

# xgb_grid.fit(X_train, y_train)

# print(xgb_grid.best_score_)
# print(xgb_grid.best_params_)

In [185]:
model = xgb.XGBRegressor(colsample_bytree = .7, 
                         learning_rate = .01, 
                         max_depth = 5, 
                         min_child_weight = 4, 
                         n_estimators = 500, 
                         nthread=4, 
                         objective = 'reg:squarederror', 
                         subsample = .5, random_state=13, 
                         reg_lambda = 50, 
                         )

In [186]:
model.fit(X_train,y_train)

In [187]:
score = model.score(X_train,y_train)
print("Training Scores: ", score)

Training Scores:  0.05010389022846751


In [188]:
test_score = model.score(X_test,y_test)
print("Testing Scores: ", test_score)

Testing Scores:  0.02803082909344634


In [189]:
#so far slight improvement over linear model
ypred = model.predict(X_test)
mse = mean_squared_error(y_test, ypred)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % (mse**(1/2.0)))

MSE: 2410.18
RMSE: 49.09


In [190]:
score = model.score(X,y)
print("Total Scores: ", score)

Total Scores:  0.04571008677214217


In [191]:
ypred = model.predict(X)
mse = mean_squared_error(y, ypred)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % (mse**(1/2.0)))

MSE: 2376.74
RMSE: 48.75
