In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../CleaningTableFlights/data/model_df_version_1.csv')
dep_delay = pd.read_csv('../CleaningTableFlights/data/flights.csv')[['dep_delay','origin']]

In [3]:
dep_delay = dep_delay.groupby('origin').mean()
dep_delay = dep_delay.rename({'dep_delay':'average_departure_delay_by_ap'},axis=1)

In [4]:
arr_delay = df[['dest','arr_delay']]
arr_delay = arr_delay.groupby('dest').mean()
arr_delay = arr_delay.rename({'arr_delay':'average_arrival_delay_by_ap'},axis=1)

In [5]:
df = df.drop(['tail_num','mkt_carrier_fl_num'], axis = 1)

In [6]:
pass_fuel_df = pd.read_csv('../EDA_questions/data/passenger_fuel_df.csv')

In [7]:
departures_df = pd.read_csv('../EDA_questions/data/departures_by_airport.csv')
departures_df = departures_df.drop('origin_city_name',axis=1)

In [8]:
arrivals_df = pd.read_csv('../EDA_questions/data/arrivals_by_airport.csv')
arrivals_df = arrivals_df.drop('dest_city_name',axis=1)

In [9]:
month_dest_df = pd.read_csv('data/features_dest_monthly_params.csv')
month_origin_df = pd.read_csv('data/features_origin_monthly_params.csv')

In [10]:
df['month'] = pd.DatetimeIndex(df['fl_date']).month
df['weekday'] = pd.DatetimeIndex(df['fl_date']).weekday

In [11]:
month_dest_df['combo1']=month_dest_df['dest'].astype('str')+month_dest_df['fl_month'].astype('str')
df['combo1']=df['dest'].astype('str')+df['month'].astype('str')

month_dest_df = month_dest_df.drop(['fl_month','dest'],axis=1)
month_dest_df = month_dest_df.rename(columns={'arr_delay':'avg_arr_delay_by_month_by_dest_ap','cancelled':'cancelled_flights_by_month_by_dest_ap','weather_delay':'avg_weather_delay_by_month_by_dest_ap','security_delay':'security_delay_by_month_by_dest_ap'})

In [12]:
month_origin_df['combo2']=month_origin_df['origin'].astype('str')+month_origin_df['fl_month'].astype('str')
df['combo2']=df['origin'].astype('str')+df['month'].astype('str')

month_origin_df = month_origin_df.drop(['fl_month','origin'],axis=1)
month_origin_df = month_origin_df.rename(columns={'arr_delay':'avg_arr_delay_by_month_by_origin_ap','cancelled':'cancelled_flights_by_month_by_origin_ap','weather_delay':'avg_weather_delay_by_month_by_origin_ap','security_delay':'security_delay_by_month_by_origin_ap'})

In [13]:
df = df.drop(['fl_date'],axis=1)

In [14]:
df['crs_arr_time'] = (df['crs_arr_time']//100)*60 + df['crs_arr_time'] - (df['crs_arr_time']//100)*100
df['crs_dep_time'] = (df['crs_dep_time']//100)*60 + df['crs_dep_time'] - (df['crs_dep_time']//100)*100

In [15]:
#add in fuel and passenger info by carrier
df = df.merge(pass_fuel_df, how = 'left', left_on='mkt_unique_carrier',right_on='mkt_unique_carrier')

In [16]:
#add in departure and passenger info by airport
df = df.merge(departures_df, how = 'left', on = 'origin')

#add in arrival and passenger info by arrival airport
df = df.merge(arrivals_df, how = 'left', on = 'dest')

#add in average departure delay by departure location
df = df.merge(dep_delay, how = 'left', left_on = 'origin', right_index = True)

#add in average arrival delay by arrival location
df = df.merge(arr_delay, how = 'left', left_on = 'origin', right_index = True)

#add in monthly data
df = df.merge(month_dest_df, how = 'left', on = 'combo1')
df = df.merge(month_origin_df, how = 'left', on = 'combo2')

In [17]:
#add constant
df['constant'] = 1
first_column = df.pop('constant')
df.insert(0, 'constant', first_column)

In [18]:
#create variable where 1 if op_carrier different from mkt_carrier
df['op_unique_carrier_check'] = df['op_unique_carrier'] != df['mkt_unique_carrier']
df['op_unique_carrier_check'] = df['op_unique_carrier_check'].astype('int')

In [19]:
#fill arrival/departure average nans with 0
df = df.fillna(0)

In [20]:
df = df.drop(['origin','dest','op_unique_carrier','combo1','combo2'],axis=1)

In [21]:
df.head()

Unnamed: 0,constant,mkt_unique_carrier,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,cancelled_flights,month,weekday,...,avg_weather_delay_by_month_by_dest_ap,security_delay_by_month_by_dest_ap,dep_delay,cancelled_flights_by_month_by_origin_ap,carrier_delay,nas_delay,late_aircraft_delay,avg_weather_delay_by_month_by_origin_ap,security_delay_by_month_by_origin_ap,op_unique_carrier_check
0,1,AA,1189,1255,66.0,88,0.0,1,9,0,...,0.307875,0.01894,5.271769,0.035496,3.159179,2.154742,3.652801,0.39157,0.0,1
1,1,AA,990,1060,70.0,247,0.0,1,5,5,...,0.449975,0.052752,15.296294,0.034971,5.05282,3.096598,7.28402,1.814784,0.003468,1
2,1,AA,940,1056,116.0,477,0.0,1,5,1,...,1.276755,0.008862,12.74743,0.027785,4.24734,3.37831,6.843642,0.830531,0.00908,1
3,1,WN,595,670,75.0,417,0.0,1,2,2,...,0.589523,0.033526,9.822994,0.021385,3.519373,2.293352,5.45109,0.210036,0.017362,0
4,1,UA,600,705,105.0,445,0.0,1,6,5,...,1.949057,0.000331,22.953682,0.030129,5.896596,5.225984,12.57971,1.143489,0.012169,1


In [22]:
dummy_df = pd.get_dummies(df,columns=['mkt_unique_carrier','month','weekday'])

In [23]:
#drop final category of each dummy variable
dummy_df = dummy_df.drop(columns=['month_12','weekday_6','mkt_unique_carrier_WN'])

In [24]:
X = dummy_df.drop('arr_delay',axis=1)
y = dummy_df['arr_delay']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .2)

In [26]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(72751, 57) (18188, 57) (72751,) (18188,)


XGBoost Regression Model

In [27]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error

In [28]:
model = xgb.XGBRegressor(random_state = 13)

In [29]:
parameters = {'nthread':[4], 
              'objective':['reg:squarederror'],
              # 'learning_rate': [.01, .03, 0.05, .07, .09], 
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500],
              'alpha': [0,.1,.25,.5,1,2,5]}

In [None]:
# xgb_grid = GridSearchCV(model, parameters, cv=3, n_jobs = 5, verbose = True)

# xgb_grid.fit(X_train, y_train)

# print(xgb_grid.best_score_)
# print(xgb_grid.best_params_)

In [71]:
model = xgb.XGBRegressor(colsample_bytree = .7, 
                         learning_rate = .01, 
                         max_depth = 5, 
                         min_child_weight = 4, 
                         n_estimators = 500, 
                         nthread=4, 
                         objective = 'reg:squarederror', 
                         subsample = .7, random_state=13, 
                         reg_lambda = 10, alpha = 5, 
                         gamma=5)

In [72]:
model.fit(X_train,y_train)

In [73]:
score = model.score(X_train,y_train)
print("Training Scores: ", score)

Training Scores:  0.07568226799539812


In [74]:
test_score = model.score(X_test,y_test)
print("Testing Scores: ", test_score)

Testing Scores:  0.03145678761287263


In [75]:
#so far slight improvement over linear model
ypred = model.predict(X_test)
mse = mean_squared_error(y_test, ypred)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % (mse**(1/2.0)))

MSE: 1431.75
RMSE: 37.84
