In [3]:
#importing libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
#Read the feature selection csv
df = pd.read_csv('flights_features.csv')
df.head()

Unnamed: 0,carrier_name,origin,dest,dep_delay,taxi_out,taxi_in,arr_delay,crs_elapsed_time,distance,dep_timeframe,arr_timeframe,fl_month,delay_status
0,WN,BUR,SFO,38.0,18.0,7.0,51.0,80,326,6am-12pm,12pm-6pm,May,3
1,AA,ORD,DFW,-8.0,17.0,8.0,-35.0,153,801,12pm-6pm,12pm-6pm,April,0
2,WN,MSY,HOU,29.0,5.0,4.0,18.0,75,302,6pm-12am,6pm-12am,February,2
3,AA,DFW,IND,-1.0,30.0,10.0,13.0,128,761,6pm-12am,6pm-12am,November,2
4,DL,JFK,IAD,-6.0,16.0,3.0,-34.0,98,228,12pm-6pm,12pm-6pm,August,0


## Random Forest

In [17]:
#importing libraries to set up RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [28]:
#Use StandardScaler for numerical values
scaler = StandardScaler()

num_list = ['dep_delay', 'taxi_out', 'taxi_in', 'crs_elapsed_time', 'distance']
df_num = df[num_list]
df_cat = df.drop(num_list, axis=1)
df_num_scaled = pd.DataFrame(scaler.fit_transform(df_num), columns=df_num.columns)
df_scaled = df_num_scaled.join(df_cat)
df_scaled.head()

Unnamed: 0,dep_delay,taxi_out,taxi_in,crs_elapsed_time,distance,carrier_name,origin,dest,arr_delay,dep_timeframe,arr_timeframe,fl_month,delay_status
0,0.55983,0.047859,-0.110639,-0.810741,-0.757964,WN,BUR,SFO,51.0,6am-12pm,12pm-6pm,May,3
1,-0.379671,-0.051905,0.051789,0.208956,0.052864,AA,ORD,DFW,-35.0,12pm-6pm,12pm-6pm,April,0
2,0.376014,-1.249067,-0.597925,-0.880583,-0.798932,WN,MSY,HOU,18.0,6pm-12am,6pm-12am,February,2
3,-0.236703,1.245021,0.376646,-0.140255,-0.015416,AA,DFW,IND,13.0,6pm-12am,6pm-12am,November,2
4,-0.338823,-0.151668,-0.760353,-0.559309,-0.925251,DL,JFK,IAD,-34.0,12pm-6pm,12pm-6pm,August,0


In [29]:
#setting up dummy variables for the categories
'''
The origin and destination will be dropped for this part because of how many unique values it has.
The concern is that making all of those into dummy variable will mess up the table.
'''

df_scaled2 = df_scaled.drop(['origin', 'dest'], axis=1)
df_encoded = pd.get_dummies(df_scaled2, columns=['carrier_name','dep_timeframe', 'arr_timeframe', 'fl_month'])
df_encoded.head()

Unnamed: 0,dep_delay,taxi_out,taxi_in,crs_elapsed_time,distance,arr_delay,delay_status,carrier_name_AA,carrier_name_AS,carrier_name_B6,...,fl_month_December,fl_month_February,fl_month_January,fl_month_July,fl_month_June,fl_month_March,fl_month_May,fl_month_November,fl_month_October,fl_month_September
0,0.55983,0.047859,-0.110639,-0.810741,-0.757964,51.0,3,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,-0.379671,-0.051905,0.051789,0.208956,0.052864,-35.0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.376014,-1.249067,-0.597925,-0.880583,-0.798932,18.0,2,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,-0.236703,1.245021,0.376646,-0.140255,-0.015416,13.0,2,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,-0.338823,-0.151668,-0.760353,-0.559309,-0.925251,-34.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
#Set delay status as target variable
x1_rf = df_encoded.drop(['arr_delay','delay_status', 'dep_delay'], axis=1)
y1_rf = df_encoded['delay_status']

#Set arrival delay as target variable
x2_rf = df_encoded.drop(['arr_delay','delay_status', 'dep_delay'], axis=1)
y2_rf = df_encoded['arr_delay']

print(x1_rf.shape, x2_rf.shape, y1_rf.shape, y2_rf.shape)

(196193, 35) (196193, 35) (196193,) (196193,)


In [31]:
#splitting data into test/train
x_train, x_test, y_train, y_test = train_test_split(x1_rf, y1_rf, test_size = 0.2)
x_train2, x_test2, y_train2, y_test2 = train_test_split(x2_rf, y2_rf, test_size = 0.2)

x_train.shape, x_test.shape, x_train2.shape, x_test2.shape

((156954, 35), (39239, 35), (156954, 35), (39239, 35))

In [32]:
#set up a grid search
param = {'max_depth':[2,5,10], 'n_estimators':[50,100,200]}

rf = RandomForestClassifier(class_weight='balanced')
rf2 = RandomForestClassifier()

grid_search = GridSearchCV(estimator=rf, param_grid=param)
grid_search = GridSearchCV(estimator=rf2, param_grid=param)

In [19]:
#Run the first grid search for categorical delay
grid_search.fit(x_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

{'max_depth': 10, 'n_estimators': 50}
0.49192119509116383


In [33]:
grid_search2.fit(x_train2, y_train2)
print(grid_search2.best_params_)
print(grid_search2.best_score_)



{'max_depth': 10, 'n_estimators': 100}
0.032965068758534985


In [34]:
#Setting up the RFC for testing using the result from the grid search
clf = RandomForestClassifier(max_depth=10, n_estimators=50, class_weight='balanced')
clf2 = RandomForestClassifier(max_depth=10, n_estimators=100)

In [35]:
#training prediction model

%%time
clf.fit(x_train, y_train)
clf2.fit(x_train2, y_train2)

CPU times: total: 11.1 s
Wall time: 19.8 s


In [37]:
#making predictions and checking accuracy
from sklearn import metrics

y_pred = clf.predict(x_test)
y_pred2 = clf.predict(x_test2)

print("Parameters: ")
print("Accuracy \t{:.3f}".format(metrics.accuracy_score(y_test, y_pred)))
print("Accuracy_test2 \t{:.3f}".format(metrics.accuracy_score(y_test2, y_pred2)))

Parameters: 
Accuracy 	0.484
Accuracy_test2 	0.018
