## ML model testing

* logistic / multinomial logistic regression



In [1240]:
import pandas as pd
pd.set_option('display.max_columns',50)
#pd.set_option('display.width',1000)
pd.set_option('display.max_rows', 50)
import numpy as np
from numpy import argmax
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style='whitegrid')

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
import xgboos as xgb
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import norm
from sklearn.utils import resample
from sklearn import linear_model

from collections import Counter

In [1179]:
df = pd.read_csv('/Users/wjdol/Desktop/LighthouseLabs/Flight_delays/data/flights_complete_raw.csv', parse_dates=['fl_date'])

In [1180]:
df['fl_day'] = df['fl_date'].dt.weekday

In [1182]:
df.head()

Unnamed: 0,mkt_unique_carrier,origin,dest,fl_day,week_num,dep_time_of_day,arr_time_of_day,departures_performed,payload,passengers,distance_group,arr_delay,tdomt_gallons,tdomt_cost,AWND,PRCP,SNOW,SNWD,TAVG
0,DL,FLL,ATL,6,2,evening,evening,3.0,118728.0,312.5,2.0,-8.0,149458650.6,288453038.8,31.0,43.0,0.0,0.0,234.0
1,AA,DCA,DFW,0,3,morning,morning,111.5,4801900.0,14754.5,3.0,75.0,163140817.6,265791415.6,30.0,0.0,3.0,250.0,-2.0
2,AA,ORD,IND,3,2,afternoon,afternoon,23.0,349600.0,1603.0,1.0,-7.0,6214610.8,12885113.2,20.0,0.0,0.0,0.0,-78.0
3,UA,PIT,SFO,4,2,afternoon,evening,3.0,104033.0,429.0,5.0,-7.0,115971276.2,193359267.6,29.0,0.0,0.0,0.0,-52.0
4,DL,MSP,GFK,3,1,evening,overnight,29.0,565920.0,1876.0,1.0,-15.0,6214610.8,12885113.2,33.0,0.0,0.0,0.0,-23.0


In [1181]:
# drop columns not used

df = df.drop(['Unnamed: 0', 'op_unique_carrier', 'origin_city', 'dest_city', 'fl_date', 'delay_binary','identifier', 'index', 'date_orig_id', 'weather_id', 'date_str', 'aircraft_type', 'distance', 'sdomt_gallons', 'sdomt_cost', 'freight', 'air_time'], axis=1)

In [1019]:
df[df['tdomt_gallons']==0]['mkt_unique_carrier'].unique()

array(['AA', 'UA', 'DL', 'AS'], dtype=object)

In [1183]:
df.loc[df['tdomt_gallons']==0, 'tdomt_gallons'] = np.mean(df['tdomt_gallons'])

In [1184]:
df.loc[df['tdomt_cost']==0,'tdomt_cost'] = np.mean(df['tdomt_cost'])

In [1185]:

df = df.drop(df[df['passengers']==0].index)

In [1186]:
dflreg = df.copy()

In [1187]:
# bins for numerical data

dflreg['SNWD'] = pd.cut(dflreg['SNWD'], bins=[0, 0.1, 60, 150, np.inf], labels=['0','1','2','3'], include_lowest=True)

dflreg['PRCP'] = pd.cut(dflreg['PRCP'], bins=[0, 0.1, 75, 250, np.inf], labels=['0','1','2','3'], include_lowest=True)

In [1188]:
# make bins for grouping

airlines_list = dict(pd.qcut(dflreg['mkt_unique_carrier'].value_counts(), q=[0, 0.45, 0.9, 1.0], labels=['1','2','3']))

origin_list = dict(pd.qcut(dflreg['origin'].value_counts(), q=[0, 0.25, 0.5, 1.0], labels=['1','2','3']))

dest_list = dict(pd.qcut(dflreg['dest'].value_counts(), q=[0, 0.25, 0.5, 1.0], labels=['1','2','3']))


In [1189]:
dflreg['mkt_unique_carrier'].replace(airlines_list, inplace=True)
dflreg['origin'].replace(origin_list, inplace=True)
dflreg['dest'].replace(dest_list, inplace=True)

In [719]:
# create list of value counts by columns

# identifier_count = Counter(dict(dflreg['identifier'].value_counts()))
# carrier_count = Counter(dict(dflreg['mkt_unique_carrier'].value_counts()))
# #originc_count = Counter(dict(dflreg['origin_city'].value_counts()))
# origin_count = Counter(dict(dflreg['origin'].value_counts()))
# #destc_count = Counter(dict(dflreg['dest_city'].value_counts()))
# dest_count = Counter(dict(dflreg['dest'].value_counts()))
# wkday_count = Counter(dict(dflreg['fl_day'].value_counts()))
# #type_count = Counter(dict(dflreg['aircraft_type'].value_counts()))
# payl_count = Counter(dict(dflreg['payload'].value_counts()))

In [1192]:
dflreg.head()

Unnamed: 0,mkt_unique_carrier,origin,dest,fl_day,week_num,dep_time_of_day,arr_time_of_day,departures_performed,payload,passengers,distance_group,arr_delay,tdomt_gallons,tdomt_cost,AWND,PRCP,SNOW,SNWD,TAVG,costPerGallon,costPerPerson,gallonPerPerson
0,2,3,3,6,2,evening,evening,3.0,118728.0,312.5,2.0,-8.0,149458650.6,288453038.8,31.0,1,0.0,0,234.0,1.929986,923049.72416,478267.68192
1,3,3,3,0,3,morning,morning,111.5,4801900.0,14754.5,3.0,75.0,163140817.6,265791415.6,30.0,0,3.0,3,-2.0,1.629215,18014.261114,11057.021085
2,3,3,3,3,2,afternoon,afternoon,23.0,349600.0,1603.0,1.0,-7.0,6214610.8,12885113.2,20.0,0,0.0,0,-78.0,2.073358,8038.124267,3876.862633
3,2,1,3,4,2,afternoon,evening,3.0,104033.0,429.0,5.0,-7.0,115971276.2,193359267.6,29.0,0,0.0,0,-52.0,1.667303,450720.903497,270329.315152
4,2,3,2,3,1,evening,overnight,29.0,565920.0,1876.0,1.0,-15.0,6214610.8,12885113.2,33.0,0,0.0,0,-23.0,2.073358,6868.397228,3312.692324


In [1191]:
# add new features based on fuel consumption, fuel cost, and passengers

dflreg['costPerGallon'] = dflreg['tdomt_cost']/dflreg['tdomt_gallons']
dflreg['costPerPerson'] = dflreg['tdomt_cost']/dflreg['passengers']
dflreg['gallonPerPerson'] = dflreg['tdomt_gallons']/dflreg['passengers']


In [645]:
# determine quartile splits

# top_carriers = top_categories(carrier_count, 0.90)
# top_dest = top_categories(dest_count, 0.5)
# top_origin = top_categories(origin_count, 0.5)
# top_identifier = top_categories(identifier_count, 0.05)

In [990]:
dflreg.head()

Unnamed: 0,mkt_unique_carrier,origin,dest,fl_day,week_num,dep_time_of_day,arr_time_of_day,departures_performed,payload,passengers,distance_group,delay_binary,tdomt_gallons,tdomt_cost,AWND,PRCP,SNOW,SNWD,TAVG,costPerGallon,costPerPerson,gallonPerPerson
0,2,3,3,6,2,evening,evening,3.0,118728.0,312.5,2.0,0.0,149458650.6,288453038.8,31.0,1,0.0,0,234.0,1.929986,923049.72416,478267.68192
1,3,3,3,0,3,morning,morning,111.5,4801900.0,14754.5,3.0,1.0,163140817.6,265791415.6,30.0,0,3.0,3,-2.0,1.629215,18014.261114,11057.021085
2,3,3,3,3,2,afternoon,afternoon,23.0,349600.0,1603.0,1.0,0.0,6214610.8,12885113.2,20.0,0,0.0,0,-78.0,2.073358,8038.124267,3876.862633
3,2,1,3,4,2,afternoon,evening,3.0,104033.0,429.0,5.0,0.0,115971276.2,193359267.6,29.0,0,0.0,0,-52.0,1.667303,450720.903497,270329.315152
4,2,3,2,3,1,evening,overnight,29.0,565920.0,1876.0,1.0,0.0,6214610.8,12885113.2,33.0,0,0.0,0,-23.0,2.073358,6868.397228,3312.692324


In [1193]:
dflreg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33503 entries, 0 to 33510
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   mkt_unique_carrier    33503 non-null  object  
 1   origin                33503 non-null  object  
 2   dest                  33503 non-null  object  
 3   fl_day                33503 non-null  int64   
 4   week_num              33503 non-null  int64   
 5   dep_time_of_day       33503 non-null  object  
 6   arr_time_of_day       33503 non-null  object  
 7   departures_performed  33503 non-null  float64 
 8   payload               33503 non-null  float64 
 9   passengers            33503 non-null  float64 
 10  distance_group        33503 non-null  float64 
 11  arr_delay             33503 non-null  float64 
 12  tdomt_gallons         33503 non-null  float64 
 13  tdomt_cost            33503 non-null  float64 
 14  AWND                  33503 non-null  float64 
 15  PR

In [1194]:
dflreg = pd.get_dummies(dflreg, columns=['dep_time_of_day','arr_time_of_day'], drop_first=True)

In [1195]:
# split features and target

y = dflreg['arr_delay']
X = dflreg.drop('arr_delay', axis=1)

In [1196]:
# train / test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [1197]:
num_X_train = X_train.select_dtypes(include=['float64'])
num_X_test = X_test.select_dtypes(include=['float64'])

In [977]:
num_X_train

Unnamed: 0,fl_day,week_num,departures_performed,payload,passengers,air_time,distance_group,tdomt_gallons,tdomt_cost,AWND,SNOW,TAVG,costPerGallon,costPerPerson,gallonPerPerson,dep_time_of_day_evening,dep_time_of_day_morning,dep_time_of_day_overnight,arr_time_of_day_evening,arr_time_of_day_morning,arr_time_of_day_overnight
10787,2,2,214.0,7404400.0,22294.5,13395.0,1.0,156532592.0,293803944.4,18.0,0.0,152.0,1.876951,13178.315028,7021.130413,0,0,0,1,0,0
6024,2,3,30.0,1088100.0,2706.0,3191.0,2.0,163140817.6,265791415.6,13.0,0.0,137.0,1.629215,98222.991722,60288.550480,1,0,0,0,0,1
5920,2,5,1.0,34860.0,111.0,267.0,5.0,44216757.8,83115270.4,11.0,0.0,56.0,1.879723,748786.219820,398349.169369,0,0,0,1,0,0
6783,3,1,38.0,1525000.0,4750.0,1466.0,1.0,156532592.0,293803944.4,47.0,0.0,110.0,1.876951,61853.461979,32954.229895,0,1,0,0,0,0
9278,3,4,79.5,749500.0,2910.5,4269.5,1.0,54337.0,144824.4,16.0,0.0,59.0,2.665300,49.759285,18.669301,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14904,5,4,27.5,1427600.0,4192.5,7743.5,4.0,163140817.6,265791415.6,14.0,0.0,8.0,1.629215,63396.879094,38912.538485,0,0,0,1,0,0
12963,2,3,35.0,1597300.0,5380.0,10172.0,6.0,163140817.6,265791415.6,99.0,0.0,124.0,1.629215,49403.608848,30323.572045,0,1,0,0,0,0
17367,3,2,3.0,118728.0,312.5,255.0,2.0,149458650.6,288453038.8,45.0,0.0,223.0,1.929986,923049.724160,478267.681920,0,0,0,1,0,0
1092,3,4,128.0,2790784.0,8922.0,7417.0,1.0,5327632.2,7937670.4,62.0,0.0,154.0,1.489906,889.673885,597.134297,1,0,0,1,0,0


In [1198]:
# dflreg.isnull().sum()

In [1199]:
cat_X_train = X_train.select_dtypes(exclude=['float64'])
cat_X_test = X_test.select_dtypes(exclude=['float64'])

In [1200]:
scaler = StandardScaler()

scaler.fit(num_X_train)

scaled_X_train = scaler.transform(num_X_train)
scaled_X_test = scaler.transform(num_X_test)


In [1201]:
cat_X_train_f = cat_X_train.to_numpy()
cat_X_test_f = cat_X_test.to_numpy()

In [1202]:
X_train_f = np.hstack((cat_X_train_f, scaled_X_train))
X_test_f = np.hstack((cat_X_test_f, scaled_X_test))

In [1212]:
# create xgb initial model

xgb_0 = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)

xgb_0.fit(X_train_f, y_train)

LinearRegression(n_jobs=-1)

In [1218]:
y_pred = xgb_0.predict(X_test_f)
mse = mean_squared_error(y_test, y_pred)

0.03334078319974143

In [1213]:
xgb_0.coef_

array([ 1.47434078, -0.22612743, -0.07543796, -1.02260451, -1.27317749,
        4.57765722,  2.75210605, -0.44371827, -2.11621184, -5.47247965,
        4.21850789, -2.03412155,  4.33138477, -0.35887425, -2.61143139,
        3.71699052, -1.5256487 ,  4.10874168, -7.74407892,  1.91128615,
        3.87809002, -2.04161445,  0.34794298, -1.76961979,  1.7960179 ])

In [1214]:
# coeffs = np.hstack((xgb_0.intercept_, xgb_0.coef_))
# coeffs_tbl = pd.DataFrame(data={'variable': ['intercept'] + list(X.columns) , 'coefficient': coeffs})

In [1]:
# plt.bar([x for x in range(len(xgb_0.coef_))], xgb_0.coef_)

In [2]:
# coeffs_tbl.sort_values(by='coefficient', ascending=False)

In [1224]:
# y_pred = xgb_0.predict(X_test_f)
# plt.scatter((X_test_f).tolist(), y_test)
# # plt.plot((X_test_f).tolist(), y_pred)


In [1241]:
params = {'max_depth': [5, 10, 15],
           'learning_rate': [0.01, 0.1, 0.2],
           'subsample': np.arange(0.5, 1.0, 0.1),
           'colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'n_estimators': [100, 500]}

xgb_1 = XGBRegressor()

reg_1 = RandomizedSearchCV(estimator=xgb_1, param_distributions=params, scoring='neg_mean_squared_error', n_iter=15)

In [1242]:
reg_1.fit(X_train_f, y_train)

Lasso(alpha=0.1)

In [1245]:
xgb_1.best_params_(X_test_f, y_test)
xgb_1.best_score_

0.03349809760423428