In [156]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [168]:
df = pd.read_csv('../CleaningTableFlights/data/model_df_version_1.csv')
dep_delay = pd.read_csv('../CleaningTableFlights/data/flights.csv')[['dep_delay','origin']]

In [169]:
dep_delay = dep_delay.groupby('origin').mean()
dep_delay = dep_delay.rename({'dep_delay':'average_departure_delay_by_ap'},axis=1)

In [170]:
arr_delay = df[['dest','arr_delay']]
arr_delay = arr_delay.groupby('dest').mean()
arr_delay = arr_delay.rename({'arr_delay':'average_arrival_delay_by_ap'},axis=1)

In [171]:
df = df.drop(['tail_num','mkt_carrier_fl_num'], axis = 1)

In [172]:
pass_fuel_df = pd.read_csv('../EDA_questions/data/passenger_fuel_df.csv')

In [173]:
departures_df = pd.read_csv('../EDA_questions/data/departures_by_airport.csv')
departures_df = departures_df.drop('origin_city_name',axis=1)

In [174]:
arrivals_df = pd.read_csv('../EDA_questions/data/arrivals_by_airport.csv')
arrivals_df = arrivals_df.drop('dest_city_name',axis=1)

In [175]:
month_df = pd.read_csv('data/features_dest_monthly_params.csv')


In [176]:
df['month'] = pd.DatetimeIndex(df['fl_date']).month
df['weekday'] = pd.DatetimeIndex(df['fl_date']).weekday

In [177]:
month_df['combo']=month_df['dest'].astype('str')+month_df['fl_month'].astype('str')
df['combo']=df['dest'].astype('str')+df['month'].astype('str')

month_df = month_df.drop(['fl_month','dest'],axis=1)
month_df = month_df.rename(columns={'arr_delay':'avg_arr_delay_by_month_by_ap','cancelled':'cancelled_flights_by_month_by_airport','weather_delay':'avg_weather_delay_by_month_by_ap','security_delay':'security_delay_by_month_by_ap'})

In [178]:
df = df.drop(['fl_date'],axis=1)

In [179]:
df['crs_arr_time'] = (df['crs_arr_time']//100)*60 + df['crs_arr_time'] - (df['crs_arr_time']//100)*100
df['crs_dep_time'] = (df['crs_dep_time']//100)*60 + df['crs_dep_time'] - (df['crs_dep_time']//100)*100

In [180]:
#add in fuel and passenger info by carrier
df = df.merge(pass_fuel_df, how = 'left', left_on='mkt_unique_carrier',right_on='mkt_unique_carrier')

In [181]:
#add in departure and passenger info by airport
df = df.merge(departures_df, how = 'left', on = 'origin')

#add in arrival and passenger info by arrival airport
df = df.merge(arrivals_df, how = 'left', on = 'dest')

#add in average departure delay by departure location
df = df.merge(dep_delay, how = 'left', left_on = 'origin', right_index = True)

#add in average arrival delay by arrival location
df = df.merge(arr_delay, how = 'left', left_on = 'origin', right_index = True)

#add in monthly data
df = df.merge(month_df, how = 'left', on = 'combo')

In [182]:
#add constant
df['constant'] = 1
first_column = df.pop('constant')
df.insert(0, 'constant', first_column)

In [183]:
#create variable where 1 if op_carrier different from mkt_carrier
df['op_unique_carrier_check'] = df['op_unique_carrier'] != df['mkt_unique_carrier']
df['op_unique_carrier_check'] = df['op_unique_carrier_check'].astype('int')

In [184]:
#fill arrival/departure average nans with 0
df = df.fillna(0)

In [185]:
df

Unnamed: 0,constant,mkt_unique_carrier,op_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,...,yearly_passengers_per_ap,yearly_arrivals_per_ap,yearly_arriving_passengers_per_ap,average_departure_delay_by_ap,average_arrival_delay_by_ap,avg_arr_delay_by_month_by_ap,cancelled_flights_by_month_by_airport,avg_weather_delay_by_month_by_ap,security_delay_by_month_by_ap,op_unique_carrier_check
0,1,DL,DL,GSP,ATL,620,686,66.0,153,-5.0,...,1092760.2,435839.8,51557297.6,12.627358,6.142405,1.770915,0.006184,0.707568,0.018977,0
1,1,AS,AS,LAX,SJC,1205,1290,85.0,308,53.0,...,41149051.0,60583.2,6336858.6,8.952404,2.602834,3.435302,0.019439,0.146229,0.009720,0
2,1,DL,DL,JAX,ATL,940,1018,78.0,270,-13.0,...,2985380.0,435839.8,51557297.6,6.891824,7.182946,1.034708,0.005504,1.010663,0.004603,0
3,1,UA,ZW,IAD,ALB,1035,1134,99.0,325,-33.0,...,11166124.4,20357.4,1403378.0,11.155459,8.303014,6.140531,0.014629,0.682689,0.020899,1
4,1,AA,AA,CLT,DCA,790,877,87.0,331,-12.0,...,22567735.0,143488.4,11486941.8,11.424054,2.215822,11.185224,0.037967,0.952428,0.021230,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156212,1,B6,B6,PSE,JFK,199,378,239.0,1617,22.0,...,97931.4,218442.4,30030837.0,30.125000,28.800000,2.030151,0.007778,0.291784,0.011717,0
156213,1,UA,OO,DFW,SFO,350,480,250.0,1464,-12.0,...,32777698.0,210816.0,26804926.0,12.765763,8.201957,4.195335,0.011228,0.378460,0.002832,1
156214,1,AA,PT,RIC,PHL,1023,1094,71.0,198,18.0,...,1930904.8,179280.0,15189416.0,13.911175,5.887719,1.300662,0.015607,0.227837,0.010617,1
156215,1,AS,AS,SFO,PDX,1085,1191,106.0,550,-10.0,...,26653215.2,98685.6,9321944.4,12.294051,9.776985,0.241091,0.007271,0.207611,0.018281,0


In [186]:
df = df.drop(['origin','dest','op_unique_carrier','combo'],axis=1)

In [187]:
df.head()

Unnamed: 0,constant,mkt_unique_carrier,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,month,weekday,avg_distance_per_month_by_carrier,...,yearly_passengers_per_ap,yearly_arrivals_per_ap,yearly_arriving_passengers_per_ap,average_departure_delay_by_ap,average_arrival_delay_by_ap,avg_arr_delay_by_month_by_ap,cancelled_flights_by_month_by_airport,avg_weather_delay_by_month_by_ap,security_delay_by_month_by_ap,op_unique_carrier_check
0,1,DL,620,686,66.0,153,-5.0,2,1,1008219.0,...,1092760.2,435839.8,51557297.6,12.627358,6.142405,1.770915,0.006184,0.707568,0.018977,0
1,1,AS,1205,1290,85.0,308,53.0,5,5,350707.6,...,41149051.0,60583.2,6336858.6,8.952404,2.602834,3.435302,0.019439,0.146229,0.00972,0
2,1,DL,940,1018,78.0,270,-13.0,1,1,1008219.0,...,2985380.0,435839.8,51557297.6,6.891824,7.182946,1.034708,0.005504,1.010663,0.004603,0
3,1,UA,1035,1134,99.0,325,-33.0,11,6,1004148.0,...,11166124.4,20357.4,1403378.0,11.155459,8.303014,6.140531,0.014629,0.682689,0.020899,1
4,1,AA,790,877,87.0,331,-12.0,7,1,1165293.0,...,22567735.0,143488.4,11486941.8,11.424054,2.215822,11.185224,0.037967,0.952428,0.02123,0


In [188]:
dummy_df = pd.get_dummies(df,columns=['mkt_unique_carrier','month','weekday'])

In [189]:
#drop final category of each dummy variable
dummy_df = dummy_df.drop(columns=['month_12','weekday_6','mkt_unique_carrier_WN'])

In [190]:
X = dummy_df.drop('arr_delay',axis=1)
y = dummy_df['arr_delay']

In [191]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .2)

In [192]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(124973, 49) (31244, 49) (124973,) (31244,)


XGBoost Regression Model

In [212]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error

In [194]:
model = xgb.XGBRegressor(random_state = 13)

In [195]:
parameters = {'nthread':[4], 
              'objective':['reg:squarederror'],
              'learning_rate': [.01, .03, 0.05, .07, .09], 
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

In [205]:
# xgb_grid = GridSearchCV(model, parameters, cv=5, n_jobs = 5, verbose = True)

# xgb_grid.fit(X_train, y_train)

print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
0.02459843635837038
{'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:squarederror', 'subsample': 0.7}


In [206]:
model = xgb.XGBRegressor(colsample_bytree = .7, learning_rate = .01, max_depth = 5, min_child_weight = 4, n_estimators = 500, nthread=4, objective = 'reg:squarederror', subsample = .7, random_state=13)

In [207]:
model.fit(X_train,y_train)

In [210]:
score = model.score(X_train,y_train)
print("Training Scores: ", score)

Training Scores:  0.0640188129611573


In [211]:
test_score = model.score(X_test,y_test)
print("Testing Scores: ", test_score)

Testing Scores:  0.024121071839827368


In [214]:
#so far slight improvement over linear model
ypred = model.predict(X_test)
mse = mean_squared_error(y_test, ypred)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % (mse**(1/2.0)))

MSE: 2420.44
RMSE: 49.20
