In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
%matplotlib inline

In [3]:
data = pd.read_csv('T100_2019.csv')
data = data.dropna(subset=['UNIQUE_CARRIER_NAME', 'ORIGIN', 'DEST'])

In [4]:
data['LOAD_FACTOR'] = data['PASSENGERS']/data['SEATS']
data[data['LOAD_FACTOR']>1]=1

In [5]:
# list(data)

In [6]:
scheduledflights = data[(data['CLASS'].isin(['A','C','E','F'])) & (data['SEATS']>0)] #Select only scheduled commercial flights and flights with available seats

In [7]:
import csv

df_airports = scheduledflights[['PASSENGERS','ORIGIN']]
df_airports = df_airports.groupby(['ORIGIN']).sum()
df_airports = df_airports.sort_values('PASSENGERS', axis=0, ascending=False)
df_airports = df_airports[df_airports['PASSENGERS']>350000]
print(df_airports.shape)
airports = df_airports.index.to_numpy()
airports = np.append(airports, 'CMI')
airports = np.sort(airports)
with open('Top_Airports.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    for item in airports:
        writer.writerow([item])

df_carrier = scheduledflights[['PASSENGERS','UNIQUE_CARRIER_NAME']]
df_carrier = df_carrier.groupby(['UNIQUE_CARRIER_NAME']).sum()
df_carrier = df_carrier.sort_values('PASSENGERS', axis=0, ascending=False)
df_carrier = df_carrier[df_carrier['PASSENGERS']>100000]
print(df_carrier.shape)
carriers = np.sort(df_carrier.index.to_numpy())
carriers = np.append('No Selection', carriers)
with open('Top_Carriers.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    for item in carriers:
        writer.writerow([item])

(154, 1)
(36, 1)


In [8]:
scheduledflights = scheduledflights[(scheduledflights['ORIGIN'].isin(airports)) & (scheduledflights['DEST'].isin(airports)) & (scheduledflights['UNIQUE_CARRIER_NAME'].isin(carriers))]
scheduledflights = scheduledflights.reset_index(drop=True)
# datasubset = scheduledflights[['UNIQUE_CARRIER_NAME','ORIGIN','DEST','DISTANCE','DEPARTURES_SCHEDULED','MONTH']]
datasubset = scheduledflights[['UNIQUE_CARRIER_NAME','ORIGIN','DEST','SEATS','DISTANCE','DEPARTURES_SCHEDULED','MONTH','AIRCRAFT_TYPE']]
datasubset.to_csv('All_Flights.csv', index = False)

In [9]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
# ct = ColumnTransformer([('OH-ENCODE', OneHotEncoder(), [0,1,2,3])], remainder = 'passthrough')
ct = ColumnTransformer([('OH-ENCODE', OneHotEncoder(), [0,1,2,6,7]),('MinMax', MinMaxScaler(),[3,4,5])], remainder = 'drop')
X = ct.fit_transform(datasubset)
y = scheduledflights['LOAD_FACTOR']*100
pickle.dump(ct, open('OneHotEncoder.pkl', 'wb'))

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

## Random Forest Regressor

In [173]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pickle

RF_model = RandomForestRegressor(max_depth=25, n_estimators=50)
# RF_model = RandomForestRegressor()
RF_model.fit(X_train, y_train)
score = RF_model.score(X_train, y_train)
print("R-squared:", score)
y_pred = RF_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Error: ", np.sqrt(mse))
pickle.dump(RF_model, open('Load_Factor_RFR_Model.pkl', 'wb'))

R-squared: 0.6360004433250757
Mean Error:  13.465582552431353


In [174]:
errors = np.abs((y_test.values - y_pred))
print('Min Error =',min(errors))
print('Max Error =',max(errors))
print('Mean Error =',np.mean(errors))
print('Cases with less than 5% error =',100*np.sum(errors<5)/len(y_pred),'%')
print('Cases with less than 10% error =',100*np.sum(errors<10)/len(y_pred),'%')
print('Cases with less than 20% error =',100*np.sum(errors<20)/len(y_pred),'%')

Min Error = 0.0
Max Error = 94.28571428571428
Mean Error = 8.884679270474246
Cases with less than 5% error = 43.73463857319917 %
Cases with less than 10% error = 70.92410142217928 %
Cases with less than 20% error = 90.59092914312123 %


## SGD Regressor

In [14]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

# SGD_model = SGDRegressor(alpha = 1e-06, l1_ratio = 0.6, max_iter = 1000, penalty = 'elasticnet')
SGD_model = SGDRegressor()
SGD_model.fit(X_train, y_train)
score = SGD_model.score(X_train, y_train)
print("R-squared:", score)
y_pred = SGD_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Error: ", np.sqrt(mse))
pickle.dump(SGD_model, open('Load_Factor_SGDR_Model.pkl', 'wb'))

R-squared: 0.17593061562723344
Mean Error:  14.996130804558849


In [15]:
errors = np.abs((y_test.values - y_pred))
print('Min Error =',min(errors))
print('Max Error =',max(errors))
print('Mean Error =',np.mean(errors))
print('Cases with less than 5% error =',100*np.sum(errors<5)/len(y_pred),'%')
print('Cases with less than 10% error =',100*np.sum(errors<10)/len(y_pred),'%')
print('Cases with less than 20% error =',100*np.sum(errors<20)/len(y_pred),'%')

Min Error = 7.696352187736011e-05
Max Error = 91.00349350965492
Mean Error = 10.071205027992367
Cases with less than 5% error = 36.68871391076115 %
Cases with less than 10% error = 64.37375328083989 %
Cases with less than 20% error = 89.36272965879265 %


## Ridge Regressor

In [16]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Ridge_model = Ridge(alpha = 10, max_iter = 1000, tol = 0.0000001)
Ridge_model = Ridge()
Ridge_model.fit(X_train, y_train)
score = Ridge_model.score(X_train, y_train)
print("R-squared:", score)
y_pred = Ridge_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Error: ", np.sqrt(mse))
pickle.dump(Ridge_model, open('Load_Factor_RR_Model.pkl', 'wb'))

R-squared: 0.17741967187227126
Mean Error:  14.987143094066214


In [17]:
errors = np.abs((y_test.values - y_pred))
print('Min Error =',min(errors))
print('Max Error =',max(errors))
print('Mean Error =',np.mean(errors))
print('Cases with less than 5% error =',100*np.sum(errors<5)/len(y_pred),'%')
print('Cases with less than 10% error =',100*np.sum(errors<10)/len(y_pred),'%')
print('Cases with less than 20% error =',100*np.sum(errors<20)/len(y_pred),'%')

Min Error = 0.00028437904674660786
Max Error = 91.14775654117496
Mean Error = 10.065608019021314
Cases with less than 5% error = 36.888188976377954 %
Cases with less than 10% error = 64.449343832021 %
Cases with less than 20% error = 89.23464566929134 %


## Support Vector Regressor

In [18]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import pickle

SVR_model = SVR()
SVR_model.fit(X_train, y_train)
score = SVR_model.score(X_train, y_train)
print("R-squared:", score)
y_pred = SVR_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Error: ", np.sqrt(mse))
pickle.dump(SVR_model, open('Load_Factor_SVR_Model.pkl', 'wb'))

R-squared: 0.18394945552141095
Mean Error:  15.069733499568917


In [19]:
errors = np.abs((y_test.values - y_pred))
print('Min Error =',min(errors))
print('Max Error =',max(errors))
print('Mean Error =',np.mean(errors))
print('Cases with less than 5% error =',100*np.sum(errors<5)/len(y_pred),'%')
print('Cases with less than 10% error =',100*np.sum(errors<10)/len(y_pred),'%')
print('Cases with less than 20% error =',100*np.sum(errors<20)/len(y_pred),'%')

Min Error = 0.0006187632494913942
Max Error = 93.82713387456499
Mean Error = 8.818762084365845
Cases with less than 5% error = 48.86719160104987 %
Cases with less than 10% error = 73.96535433070866 %
Cases with less than 20% error = 90.91653543307086 %


In [130]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

parameters = {'n_estimators':[50, 75, 100], 'max_depth':[50, 75, 100], 'max_features':[None, 'auto', 'log2']}
rfr = RandomForestRegressor()
cv = GridSearchCV(rfr, parameters, verbose=4, n_jobs = 5, cv = 3)
cv.fit(X_train, y_train)
# Fitting 3 folds for each of 75 candidates, totalling 225 fits
# [Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
# [Parallel(n_jobs=5)]: Done  22 tasks      | elapsed: 54.2min
# [Parallel(n_jobs=5)]: Done 118 tasks      | elapsed: 161.1min
# [Parallel(n_jobs=5)]: Done 225 out of 225 | elapsed: 382.4min finished


print(cv.best_params_)
#{'max_depth': None, 'max_features': 'log2', 'n_estimators': 200}

import csv
with open('results_cv_RFregressor2019.csv', 'w') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=cv.cv_results_.keys())
    writer.writeheader()
    writer.writerow(cv.cv_results_)

y_pred = cv.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Error: ", np.sqrt(mse))
# MSE:  144.03809115310145

Fitting 3 folds for each of 27 candidates, totalling 81 fits
{'max_depth': 100, 'max_features': 'log2', 'n_estimators': 100}
Mean Error:  14.188879079400344


## SGD Regressor GridSearch

In [127]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

parameters = {'penalty':['elasticnet'], 
              'alpha':[0.000001, 0.000005, 0.00001, 0.00005, 0.0001], 
              'l1_ratio':[0, 0.2, 0.4, 0.6, 0.8, 1], 
              'max_iter':[1000,2500,5000,10000]}
sgdr = SGDRegressor()
best_sgdr = GridSearchCV(sgdr, parameters, verbose=4, n_jobs = 5, cv = 3)
best_sgdr.fit(X_train, y_train)

print(best_sgdr.best_params_)

import csv
with open('results_cv_SGDregressor.csv', 'w') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=best_sgdr.cv_results_.keys())
    writer.writeheader()
    writer.writerow(best_sgdr.cv_results_)

y_pred = best_sgdr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Error: ", np.sqrt(mse))

Fitting 3 folds for each of 120 candidates, totalling 360 fits
{'alpha': 1e-06, 'l1_ratio': 0.6, 'max_iter': 1000, 'penalty': 'elasticnet'}
MSE:  235.61281943419226


## Ridge Grid Search

In [128]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

parameters = {'alpha':[0.1, 0.5, 1, 5, 10], 
              'max_iter':[1000,2500,5000,10000],
              'tol':[0.00005, 0.0005, 0.0001, 0.005]}
rr = Ridge()
best_rr = GridSearchCV(rr, parameters, verbose=4, n_jobs = 5, cv = 3)
best_rr.fit(X_train, y_train)

print(best_rr.best_params_)

import csv
with open('results_cv_Ridgeregressor.csv', 'w') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=best_rr.cv_results_.keys())
    writer.writeheader()
    writer.writerow(best_rr.cv_results_)

y_pred = best_rr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Error: ", np.sqrt(mse))

Fitting 3 folds for each of 80 candidates, totalling 240 fits
{'alpha': 10, 'max_iter': 1000, 'tol': 0.0005}
MSE:  235.4622751029074


## SVR Grid Search

In [129]:
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

parameters = {'kernel':['poly','rbf','sigmoid'],
              'degree':[3, 5, 7, 9], 
              'gamma':['scale','auto'],
              'C':[1, 3, 10, 33]}
svr = SVR()
best_svr = GridSearchCV(svr, parameters, verbose=4, n_jobs = 5, cv = 3)
best_svr.fit(X_train, y_train)

print(best_svr.best_params_)

import csv
with open('results_cv_SVRregressor.csv', 'w') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=best_svr.cv_results_.keys())
    writer.writeheader()
    writer.writerow(best_svr.cv_results_)

y_pred = best_rr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Error: ", np.sqrt(mse))

Fitting 3 folds for each of 96 candidates, totalling 288 fits
{'C': 33, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
Mean Error:  15.344780060427956


In [11]:
def getflightdata(data, origin, destination, carrier, month):
    if carrier == 'No Selection':
        allcarrierdata = data[(data['ORIGIN']==origin) & (data['DEST']==destination) &\
                       (data['MONTH']==month)]
        singlecarrierdata = []
        numflights = allcarrierdata.shape[0]
        numcarriers = len(allcarrierdata['UNIQUE_CARRIER_NAME'].unique())
    else:
        allcarrierdata = data[(data['ORIGIN']==origin) & (data['DEST']==destination) &\
                       (data['MONTH']==month)]
        singlecarrierdata = allcarrierdata[allcarrierdata['UNIQUE_CARRIER_NAME']==carrier]
        numflights = singlecarrierdata.shape[0]
        numcarriers = 1
    return [allcarrierdata, singlecarrierdata, numflights, numcarriers]