In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

In [None]:
# Regression Models
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier, Pool
import xgboost as xgb

# ANN
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils import to_categorical

# Regression Metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Classification Metrics
from sklearn.metrics import cohen_kappa_score, confusion_matrix, roc_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# !pip3 install --upgrade plotly

In [None]:
df = pd.read_csv("../eda/model_2018.csv")

In [None]:
print(df.shape)
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.columns

# Regression Models

In [None]:
dep_delay_features = [
    'Weather Code Intensity', 'Month', 'Irregular Departure Time',
    'Is Holiday', 'Is Labour Day', 'Is Xmas', 'Departure Delay',
    'Origin Precipitation', 'Origin Windspeed',
    'Monthly Median Departure Delay', 'Airplane Age',
    'Origin Total Operations',
]

X_dep = df[dep_delay_features]

y_dep = df['Departure Delay']

X_train_dep, X_test_dep, y_train_dep, y_test_dep = train_test_split(X_dep, y_dep, test_size=0.3, random_state=123)

In [None]:
print(X_dep.shape)
print(y_dep.shape)

In [None]:
linear = LinearRegression()
linear.fit(X_train_dep, y_train_dep)

lasso = Lasso(alpha=0.1)
lasso.fit(X_train_dep, y_train_dep)

gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=123)
gbr.fit(X_train_dep, y_train_dep)

rfr = RandomForestRegressor(n_estimators=100, max_depth=1, random_state=123)
rfr.fit(X_train_dep, y_train_dep)

y_pred_linear = linear.predict(X_test_dep)
y_pred_lasso = lasso.predict(X_test_dep)
y_pred_gbr = gbr.predict(X_test_dep)
y_pred_rfr = rfr.predict(X_test_dep)

In [None]:
mse_linear = mean_squared_error(y_test_dep, y_pred_linear, squared=False)
r2_linear = r2_score(y_test_dep, y_pred_linear)
mae_linear = mean_absolute_error(y_test_dep, y_pred_linear)

mse_lasso = mean_squared_error(y_test_dep, y_pred_lasso, squared=False)
r2_lasso = r2_score(y_test_dep, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test_dep, y_pred_lasso)

mse_gbr = mean_squared_error(y_test_dep, y_pred_gbr, squared=False)
r2_gbr = r2_score(y_test_dep, y_pred_gbr)
mae_gbr = mean_absolute_error(y_test_dep, y_pred_gbr)

mse_rfr = mean_squared_error(y_test_dep, y_pred_rfr, squared=False)
r2_rfr = r2_score(y_test_dep, y_pred_rfr)
mae_rfr = mean_absolute_error(y_test_dep, y_pred_rfr)


fig = go.Figure()
fig.add_trace(go.Bar(x=['Linear Regression', 'Lasso Regression', 'Gradient Boosting Regression', 'Random Forest Regression'],
                     y=[mse_linear, mse_lasso, mse_gbr, mse_rfr],
                     name='Root Mean Squared Error',
                     text=[f'{mse_linear:.5f}', f'{mse_lasso:.5f}', f'{mse_gbr:.5f}', f'{mse_rfr:.5f}'],
                     textposition='auto'))
fig.add_trace(go.Bar(x=['Linear Regression', 'Lasso Regression', 'Gradient Boosting Regression', 'Random Forest Regression'],
                     y=[mae_linear, mae_lasso, mae_gbr, mae_rfr],
                     name='Mean Absolute Error',
                     text=[f'{mse_linear:.5f}', f'{mse_lasso:.5f}', f'{mse_gbr:.5f}', f'{mse_rfr:.5f}'],
                     textposition='auto'))
fig.add_trace(go.Bar(x=['Linear Regression', 'Lasso Regression', 'Gradient Boosting Regression', 'Random Forest Regression'],
                     y=[r2_linear, r2_lasso, r2_gbr, r2_rfr],
                     name='R-squared',
                     text=[f'{r2_linear:.5f}', f'{r2_lasso:.5f}', f'{r2_gbr:.5f}', f'{r2_rfr:.5f}'],
                     textposition='auto'))
fig.update_layout(title='Linear vs Lasso vs Gradient Boosting vs Random Forest Regression Performance Metrics',
                  xaxis_title='Regression Model',
                  yaxis_title='Value')

fig.show()

In [None]:
# Get the coefficients
coef = pd.Series(linear.coef_, index= X_dep.columns)

fig = px.bar(coef, x=coef.index, y=coef.values)

fig.update_layout(
    title='Linear Regression Coefficients',
    xaxis_title='Features',
    yaxis_title='Coefficient Values'
)

fig.show()

# Classification Models

In [None]:
dep_delay_features = [
    'Weather Code Intensity', 'Month', 'Irregular Departure Time',
    'Is Holiday', 'Is Labour Day', 'Is Xmas', 'Departure Delay',
    'Origin Precipitation', 'Origin Windspeed',
    'Monthly Median Departure Delay', 'Airplane Age',
    'Origin Total Operations',
]

X_dep = df[dep_delay_features]

y_dep = df['Classified Departure Delay (5)']

y_dep_equal = df['Classified Departure Delay (5 Equal)']

X_train_dep, X_test_dep, y_train_dep, y_test_dep = train_test_split(X_dep, y_dep, test_size=0.3, random_state=123)

X_train_dep_eq, X_test_dep_eq, y_train_dep_eq, y_test_dep_eq = train_test_split(X_dep, y_dep_equal, test_size=0.3, random_state=123)

## Using SMOTE

In [None]:
smote = SMOTE()

X_train_dep, y_train_dep, = smote.fit_resample(X_train_dep, y_train_dep)

In [None]:
lr = LogisticRegression(max_iter=300, random_state=42)
lr.fit(X_train_dep, y_train_dep)

rfc = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
rfc.fit(X_train_dep, y_train_dep)

y_pred_lr = lr.predict(X_test_dep)
y_pred_rfc = rfc.predict(X_test_dep)

In [None]:
lr_acc = accuracy_score(y_test_dep, y_pred_lr)
rfc_acc = accuracy_score(y_test_dep, y_pred_rfc)

lr_prec = precision_score(y_test_dep, y_pred_lr, average=None)
rfc_prec = precision_score(y_test_dep, y_pred_rfc, average=None)

lr_rec = recall_score(y_test_dep, y_pred_lr, average=None)
rfc_rec = recall_score(y_test_dep, y_pred_rfc, average=None)

lr_f1 = f1_score(y_test_dep, y_pred_lr, average=None)
rfc_f1 = f1_score(y_test_dep, y_pred_rfc, average=None)

print("LR Accuracy:", lr_acc)
print("RFC Accuracy:", rfc_acc)
print()
print("LR Precision:", lr_prec)
print("RFC Precision:", rfc_prec)
print()
print("LR Recall:", lr_rec)
print("RFC Recall:", rfc_rec)
print()
print("LR F1:", lr_f1)
print("RFC F1:", rfc_f1)

In [None]:
rfc_cm = confusion_matrix(y_test_dep, y_pred_rfc)
rfc_cm

In [None]:
fig = go.Figure(data=[go.Heatmap(z=rfc_cm, x=['class ' + str(i) for i in range(len(rfc_cm))],
                                  y=['class ' + str(i) for i in range(len(rfc_cm))],
                                  colorscale='viridis')])
fig.update_layout(title='Confusion Matrix', xaxis_title='Predicted outputs', yaxis_title='Actual outputs')
fig.show()

In [None]:
kappa = cohen_kappa_score(y_test_dep, y_pred_rfc, weights='quadratic')

fig = go.Figure(go.Indicator(
    mode = "number",
    value = kappa,
    title = {"text": "Cohen's kappa score"},
    domain = {'x': [0, 1], 'y': [0, 1]},
    ))
fig.show()

## Using Equal Binning

In [None]:
lr = LogisticRegression(max_iter=300, random_state=42)
lr.fit(X_train_dep_eq, y_train_dep_eq)

rfc = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
rfc.fit(X_train_dep_eq, y_train_dep_eq)

y_pred_lr = lr.predict(X_test_dep_eq)
y_pred_rfc = rfc.predict(X_test_dep_eq)

In [None]:
lr_acc = accuracy_score(y_test_dep_eq, y_pred_lr)
rfc_acc = accuracy_score(y_test_dep_eq, y_pred_rfc)

lr_prec = precision_score(y_test_dep_eq, y_pred_lr, average=None)
rfc_prec = precision_score(y_test_dep_eq, y_pred_rfc, average=None)

lr_rec = recall_score(y_test_dep_eq, y_pred_lr, average=None)
rfc_rec = recall_score(y_test_dep_eq, y_pred_rfc, average=None)

lr_f1 = f1_score(y_test_dep_eq, y_pred_lr, average=None)
rfc_f1 = f1_score(y_test_dep_eq, y_pred_rfc, average=None)

print("LR Accuracy:", lr_acc)
print("RFC Accuracy:", rfc_acc)
print()
print("LR Precision:", lr_prec)
print("RFC Precision:", rfc_prec)
print()
print("LR Recall:", lr_rec)
print("RFC Recall:", rfc_rec)
print()
print("LR F1:", lr_f1)
print("RFC F1:", rfc_f1)

## Hyperparameter Tuning
* repeat with equal binning dataset

### XGBoost

In [None]:
# Baseline
xg_model0 = xgb.XGBClassifier(
    reg_lambda=3,
    learning_rate= 0.1,
    max_depth=5,
    n_estimators=5,
    objective='multi:softmax',
#   scale_pos_weight=0.2,
    booster='gbtree',
    eval_metric='auc',
    tree_method='hist',
    grow_policy='lossguide'
)
xg_model0.fit(X_train, y_train_dep)

xg_predictions0 = xg_model0.predict(X_test)
print(np.unique(xg_predictions0))
conf_matrix = confusion_matrix(y_test_dep, xg_predictions0)
# print(conf_matrix)
cfmx_disp = ConfusionMatrixDisplay(conf_matrix, display_labels=xg_model0.classes_)
cfmx_disp.plot()
plt.show()

In [None]:
# xg_model = xgb.XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
xg_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
#   num_boost_round=1000,
    booster='gbtree',
    eval_metric='auc',
    tree_method='hist',
    grow_policy='lossguide'
)
# xg_model_0.fit(X_train, y_train_dep)

In [None]:
param_grid = {
#   'n_estimators': [5, 10, 15, 20],
    'max_depth': [None, 4, 8, 12], # 16],
    'min_child_weight': [0, 1, 4, 8, 10]
#   'lambda': [3,5], # [3,5,7],
#   'scale_pos_weight': [0.1, 0.2, 0.4, 0.8, 1],
#   'objective': ['multi:softmax', 'multi:softprob'],
#   'objective': ['reg:logistic', 'binary:logistic'],
#   'learning_rate': [0.1, 0.05]
}

grid_search = GridSearchCV(xg_model, param_grid, cv=5, verbose=True)

# Time
print('Timer start.')
st = time.time()

# main program
grid_search.fit(X_train, y_train_dep)

et = time.time()
print("Best hyperparameters:", grid_search.best_params_)
print('Execution time:', et-st, 'seconds')

param_save = {}
param_save['max_depth'] = grid_search.best_params_['max_depth']
param_save['min_child_weight'] = grid_search.best_params_['min_child_weight']

In [None]:
xg_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
#   num_boost_round=1000,
    max_depth= param_save['max_depth'],
    min_child_weight= param_save['min_child_weight'],
    booster='gbtree',
    eval_metric='auc',
    tree_method='hist',
    grow_policy='lossguide'
)

param_grid = {
#   'n_estimators': [12], # [3, 6, 9, 12],
#   'max_depth': [None, 4, 8, 12, 16],
#   'min_child_weight': [0, 1, 4, 8, 10]
    'reg_alpha': [0,5,10], # [3,5,7],
    'reg_lambda': [0,5,10], # [3,5,7],
    'gamma': [0,4,8,12]
#   'scale_pos_weight': [0.1, 0.2, 0.4, 0.8, 1],
#   'objective': ['multi:softmax', 'multi:softprob'],
#   'objective': ['reg:logistic', 'binary:logistic'],
#   'learning_rate': [0.1, 0.05]
}

grid_search = GridSearchCV(xg_model, param_grid, cv=5, verbose=True)

# Time
print('Timer start.')
st = time.time()

# main program
grid_search.fit(X_train, y_train_dep)

et = time.time()
print("Best hyperparameters:", grid_search.best_params_)
print('Execution time:', et-st, 'seconds')

param_save['reg_alpha'] = 0 # grid_search.best_params_['reg_alpha']
param_save['reg_lambda'] = 0 # grid_search.best_params_['reg_lambda']
param_save['gamma'] = grid_search.best_params_['gamma']

In [None]:
xg_model = xgb.XGBClassifier(
    max_depth= param_save['max_depth'],
    min_child_weight= param_save['min_child_weight'],
    reg_alpha= param_save['reg_alpha'],
    reg_lambda= param_save['reg_lambda'],
    gamma= param_save['gamma'],
    booster='gbtree',
    eval_metric='auc',
    tree_method='hist',
    grow_policy='lossguide'
)

param_grid = {
    'n_estimators': [50, 100, 150],
#   'max_depth': [None, 4, 8, 12, 16],
#   'min_child_weight': [0, 1, 4, 8, 10]
#   'reg_alpha': [0,1,4,7,10], # [3,5,7],
#   'reg_lambda': [0,1,4,7,10], # [3,5,7],
#   'scale_pos_weight': [0.1, 0.2, 0.4, 0.8, 1],
#   'objective': ['multi:softmax', 'multi:softprob'],
#   'objective': ['reg:logistic', 'binary:logistic'],
    'learning_rate': [0.1, 0.04],
#   'num_boost_round': [400, 800, 1000, 1300]
}

grid_search = GridSearchCV(xg_model, param_grid, cv=5, verbose=True)

# Time
print('Timer start.')
st = time.time()

# main program
grid_search.fit(X_train, y_train)

et = time.time()
print("Best hyperparameters:", grid_search.best_params_)
print('Execution time:', et-st, 'seconds')

param_save['learning_rate'] = grid_search.best_params_['learning_rate']
param_save['n_estimators'] = grid_search.best_params_['n_estimators']

In [None]:
xg_model = xgb.XGBClassifier(
    n_estimators=param_save['n_estimators'],
    learning_rate= param_save['learning_rate'],
#   num_boost_round= param_save['num_boost_round']
    max_depth=  param_save['max_depth'],
    min_child_weight= param_save['min_child_weight'],
    reg_alpha= param_save['reg_alpha'],
    reg_lambda= param_save['reg_lambda'],
    gamma= param_save['gamma'],
    booster='gbtree',
    eval_metric='auc',
    tree_method='hist',
    grow_policy='lossguide'
)

xg_model.fit(X_train, y_train_dep)
xg_predictions = xg_model.predict(X_test)

In [None]:
y_test = y_test_dep

print(np.unique(xg_predictions))
conf_matrix = confusion_matrix(y_test, xg_predictions)
# print(conf_matrix)
cfmx_disp = ConfusionMatrixDisplay(conf_matrix, display_labels=xg_model.classes_)
cfmx_disp.plot()
plt.show()

acc = accuracy_score(y_test, xg_predictions)
# RF Precision and Recall
print('accuracy:', acc, conf_matrix[0][0] + conf_matrix[1][1] + conf_matrix[2][2] + conf_matrix[3][3] + conf_matrix[4][4])
print('Class 0:')
print('Precision:', conf_matrix[0][0]/(conf_matrix[0][0]+conf_matrix[1][0]+conf_matrix[2][0]+conf_matrix[3][0]+conf_matrix[4][0]))
print('Recall:', conf_matrix[0][0]/sum(conf_matrix[0]))
print('Class 1:')
print('Precision:', conf_matrix[1][1]/(conf_matrix[0][1]+conf_matrix[1][1]+conf_matrix[2][1]+conf_matrix[3][1]+conf_matrix[4][1]))
print('Recall:', conf_matrix[1][1]/sum(conf_matrix[1]))

combined_nom = conf_matrix[0][0] + conf_matrix[0][1] + conf_matrix[1][0] + conf_matrix[1][1] 
combined_prec_denom = (conf_matrix[0][0]+conf_matrix[1][0]+conf_matrix[2][0]+conf_matrix[3][0]+conf_matrix[4][0]) + (conf_matrix[0][1]+conf_matrix[1][1]+conf_matrix[2][1]+conf_matrix[3][1]+conf_matrix[4][1])
combined_rec_denom = sum(conf_matrix[0]) + sum(conf_matrix[1])
print('Combined Classes 0 and 1:')
print('Precision: ', combined_nom/combined_prec_denom)
print('Recall: ', combined_nom/combined_rec_denom)

print('Class 3:')
print('Precision:', conf_matrix[3][3]/(conf_matrix[0][3]+conf_matrix[1][3]+conf_matrix[2][3]+conf_matrix[3][3]+conf_matrix[4][3]))
print('Recall:', conf_matrix[3][3]/sum(conf_matrix[3]))
print('Class 4:')
print('Precision:', conf_matrix[4][4]/(conf_matrix[4][4]+conf_matrix[3][4]+conf_matrix[2][4]+conf_matrix[1][4]+conf_matrix[0][4]))
print('Recall:', conf_matrix[4][4]/sum(conf_matrix[4]))

combined_nom = conf_matrix[3][3] + conf_matrix[3][4] + conf_matrix[4][3] + conf_matrix[4][4] 
combined_prec_denom = (conf_matrix[0][3]+conf_matrix[1][3]+conf_matrix[2][3]+conf_matrix[3][3]+conf_matrix[4][3]) + (conf_matrix[4][4]+conf_matrix[3][4]+conf_matrix[2][4]+conf_matrix[1][4]+conf_matrix[0][4])
combined_rec_denom = sum(conf_matrix[3]) + sum(conf_matrix[4])
print('Combined Classes 3 and 4:')
print('Precision: ', combined_nom/combined_prec_denom)
print('Recall: ', combined_nom/combined_rec_denom)

## Catboost

In [None]:
# Baseline model
catBoostModel0 = CatBoostClassifier()
catBoostModel0.fit(X_train, y_train_dep)

In [None]:
cat_predictions0 = catBoostModel0.predict(X_test)

print(np.unique(cat_predictions0))
conf_matrix = confusion_matrix(y_test_dep, cat_predictions0)
# print(conf_matrix)
cfmx_disp = ConfusionMatrixDisplay(conf_matrix, display_labels=catBoostModel0.classes_)
cfmx_disp.plot()
plt.show()

In [None]:
catBoostModel = CatBoostClassifier(iterations=600)
# iterations: 500. bestiter is consistently low
cat_grid = {
    'learning_rate': [0.1, 0.04],
    'depth': [4, 9, 14],
    'l2_leaf_reg': [1, 3, 5, 7, 9]}

# Time
print('Timer start.')
st = time.time()

catBoostModel.grid_search(cat_grid, X_train, y_train)

et = time.time()
print('Execution time:', et-st, 'seconds')

# "After searching, the model is trained and ready to use."
# https://catboost.ai/en/docs/concepts/python-reference_catboost_grid_search

In [None]:
catBoostModel.fit(X_train,y_train_dep)
cat_predictions = catBoostModel.predict(X_test)

In [None]:
y_test = y_test_dep

print(np.unique(cat_predictions))
conf_matrix = confusion_matrix(y_test, cat_predictions)
# print(conf_matrix)
cfmx_disp = ConfusionMatrixDisplay(conf_matrix, display_labels=catBoostModel.classes_)
cfmx_disp.plot()
plt.show()

acc = accuracy_score(y_test, cat_predictions)
# RF Precision and Recall
print('accuracy:', acc, conf_matrix[0][0] + conf_matrix[1][1] + conf_matrix[2][2] + conf_matrix[3][3] + conf_matrix[4][4])
print('Class 0:')
print('Precision:', conf_matrix[0][0]/(conf_matrix[0][0]+conf_matrix[1][0]+conf_matrix[2][0]+conf_matrix[3][0]+conf_matrix[4][0]))
print('Recall:', conf_matrix[0][0]/sum(conf_matrix[0]))
print('Class 1:')
print('Precision:', conf_matrix[1][1]/(conf_matrix[0][1]+conf_matrix[1][1]+conf_matrix[2][1]+conf_matrix[3][1]+conf_matrix[4][1]))
print('Recall:', conf_matrix[1][1]/sum(conf_matrix[1]))

combined_nom = conf_matrix[0][0] + conf_matrix[0][1] + conf_matrix[1][0] + conf_matrix[1][1] 
combined_prec_denom = (conf_matrix[0][0]+conf_matrix[1][0]+conf_matrix[2][0]+conf_matrix[3][0]+conf_matrix[4][0]) + (conf_matrix[0][1]+conf_matrix[1][1]+conf_matrix[2][1]+conf_matrix[3][1]+conf_matrix[4][1])
combined_rec_denom = sum(conf_matrix[0]) + sum(conf_matrix[1])
print('Combined Classes 0 and 1:')
print('Precision: ', combined_nom/combined_prec_denom)
print('Recall: ', combined_nom/combined_rec_denom)

print('Class 3:')
print('Precision:', conf_matrix[3][3]/(conf_matrix[0][3]+conf_matrix[1][3]+conf_matrix[2][3]+conf_matrix[3][3]+conf_matrix[4][3]))
print('Recall:', conf_matrix[3][3]/sum(conf_matrix[3]))
print('Class 4:')
print('Precision:', conf_matrix[4][4]/(conf_matrix[4][4]+conf_matrix[3][4]+conf_matrix[2][4]+conf_matrix[1][4]+conf_matrix[0][4]))
print('Recall:', conf_matrix[4][4]/sum(conf_matrix[4]))

combined_nom = conf_matrix[3][3] + conf_matrix[3][4] + conf_matrix[4][3] + conf_matrix[4][4] 
combined_prec_denom = (conf_matrix[0][3]+conf_matrix[1][3]+conf_matrix[2][3]+conf_matrix[3][3]+conf_matrix[4][3]) + (conf_matrix[4][4]+conf_matrix[3][4]+conf_matrix[2][4]+conf_matrix[1][4]+conf_matrix[0][4])
combined_rec_denom = sum(conf_matrix[3]) + sum(conf_matrix[4])
print('Combined Classes 3 and 4:')
print('Precision: ', combined_nom/combined_prec_denom)
print('Recall: ', combined_nom/combined_rec_denom)

## Random Forest

In [None]:
rf_model0 = RandomForestClassifier(criterion="entropy", random_state=424)
rf_model0.fit(X_train, y_train_dep)

rf_predictions0 = rf_model0.predict(X_test)
print(np.unique(rf_predictions0))
conf_matrix = confusion_matrix(y_test_dep, rf_predictions0)
# print(conf_matrix)
cfmx_disp = ConfusionMatrixDisplay(conf_matrix, display_labels=rf_model0.classes_)
cfmx_disp.plot()
plt.show()

In [None]:
rf_model = RandomForestClassifier(criterion="entropy", random_state=424)
param_grid = {
    'n_estimators': [40, 100, 140],
    'max_depth': [None, 4, 8, 12, 16],
    'class_weight': ['balanced', 'balanced_subsample']
#   'min_samples_split': [70000],
}

grid_search_rf = GridSearchCV(rf_model, param_grid, cv=5, verbose=True)

# Time
print('Timer start.')
st = time.time()

# main program
grid_search_rf.fit(X_train, y_train_dep)

et = time.time()
print("Best hyperparameters:", grid_search_rf.best_params_)
print('Execution time:', et-st, 'seconds')
# rf_model0.fit(X_train, y_train)

In [None]:
rf_param_save = {}
rf_param_save['n_estimators'] = grid_search_rf.best_params_['n_estimators']
rf_param_save['max_depth'] = grid_search_rf.best_params_['max_depth']
rf_param_save['class_weight'] = grid_search_rf.best_params_['class_weight']
rf_param_save

In [None]:
rf_model = RandomForestClassifier(
    n_estimators=rf_param_save['n_estimators'],
    max_depth=rf_param_save['max_depth'],
    class_weight=rf_param_save['class_weight'],
    criterion="entropy", random_state=424
)

rf_model.fit(X_train, y_train_dep)
rf_predictions = rf_model.predict(X_test)

In [None]:
y_test = y_test_dep

print(np.unique(rf_predictions))
conf_matrix = confusion_matrix(y_test, rf_predictions)
# print(conf_matrix)
cfmx_disp = ConfusionMatrixDisplay(conf_matrix, display_labels=rf_model.classes_)
cfmx_disp.plot()
plt.show()

acc = accuracy_score(y_test, rf_predictions)
# RF Precision and Recall
print('accuracy:', acc, conf_matrix[0][0] + conf_matrix[1][1] + conf_matrix[2][2] + conf_matrix[3][3] + conf_matrix[4][4])
print('Class 0:')
print('Precision:', conf_matrix[0][0]/(conf_matrix[0][0]+conf_matrix[1][0]+conf_matrix[2][0]+conf_matrix[3][0]+conf_matrix[4][0]))
print('Recall:', conf_matrix[0][0]/sum(conf_matrix[0]))
print('Class 1:')
print('Precision:', conf_matrix[1][1]/(conf_matrix[0][1]+conf_matrix[1][1]+conf_matrix[2][1]+conf_matrix[3][1]+conf_matrix[4][1]))
print('Recall:', conf_matrix[1][1]/sum(conf_matrix[1]))

combined_nom = conf_matrix[0][0] + conf_matrix[0][1] + conf_matrix[1][0] + conf_matrix[1][1] 
combined_prec_denom = (conf_matrix[0][0]+conf_matrix[1][0]+conf_matrix[2][0]+conf_matrix[3][0]+conf_matrix[4][0]) + (conf_matrix[0][1]+conf_matrix[1][1]+conf_matrix[2][1]+conf_matrix[3][1]+conf_matrix[4][1])
combined_rec_denom = sum(conf_matrix[0]) + sum(conf_matrix[1])
print('Combined Classes 0 and 1:')
print('Precision: ', combined_nom/combined_prec_denom)
print('Recall: ', combined_nom/combined_rec_denom)

print('Class 3:')
print('Precision:', conf_matrix[3][3]/(conf_matrix[0][3]+conf_matrix[1][3]+conf_matrix[2][3]+conf_matrix[3][3]+conf_matrix[4][3]))
print('Recall:', conf_matrix[3][3]/sum(conf_matrix[3]))
print('Class 4:')
print('Precision:', conf_matrix[4][4]/(conf_matrix[4][4]+conf_matrix[3][4]+conf_matrix[2][4]+conf_matrix[1][4]+conf_matrix[0][4]))
print('Recall:', conf_matrix[4][4]/sum(conf_matrix[4]))

combined_nom = conf_matrix[3][3] + conf_matrix[3][4] + conf_matrix[4][3] + conf_matrix[4][4] 
combined_prec_denom = (conf_matrix[0][3]+conf_matrix[1][3]+conf_matrix[2][3]+conf_matrix[3][3]+conf_matrix[4][3]) + (conf_matrix[4][4]+conf_matrix[3][4]+conf_matrix[2][4]+conf_matrix[1][4]+conf_matrix[0][4])
combined_rec_denom = sum(conf_matrix[3]) + sum(conf_matrix[4])
print('Combined Classes 3 and 4:')
print('Precision: ', combined_nom/combined_prec_denom)
print('Recall: ', combined_nom/combined_rec_denom)

# Neural Networks

In [None]:
y_dep.value_counts()

In [None]:
y_bins = 5

In [None]:
y_train_arr = y_train_dep.to_numpy()
y_test_arr = y_test_dep.to_numpy()
y_train = to_categorical(y_train_arr, y_bins)
y_test = to_categorical(y_test_arr, y_bins)

In [None]:
model = Sequential()
model.add(Dense(32, input_dim=len(dep_delay_features)))
model.add(Activation('relu'))
model.add(Dense(y_bins))
model.add(Activation('softmax'))

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_train_dep, y_train, epochs=10, batch_size=32, validation_split=0.2)

In [None]:
y_pred = np.argmax(model.predict(X_test_dep), axis=-1)

In [None]:
y_test = np.argmax(y_test, axis=-1)

In [None]:
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average=None)
rec = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

print("Accuracy:", acc)
for i in range(y_bins):  
    print(f"Class {i} Precision: {prec[i]}")
    print(f"Class {i} Recall: {rec[i]}")
    print(f"Class {i} F1: {f1[i]}")
    print()

## Using Smote

In [None]:
smote = SMOTE()

X_train_resampled, y_train_resampled = smote.fit_resample(X_train_dep, y_train_dep)

y_train_arr_resampled = y_train_resampled.to_numpy()

y_train_resampled = to_categorical(y_train_arr_resampled, y_bins)

In [22]:
model = Sequential()
model.add(Dense(32, input_dim=len(dep_delay_features)))
model.add(Activation('relu'))
model.add(Dense(y_bins))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train_resampled, y_train_resampled, epochs=10, batch_size=32, validation_split=0.2)

Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd460785d10>

In [23]:
y_pred = np.argmax(model.predict(X_test_dep), axis=-1)
y_test = to_categorical(y_test_arr, y_bins)
y_test = np.argmax(y_test, axis=-1)



In [24]:
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average=None)
rec = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

print("Accuracy:", acc)
for i in range(y_bins):  
    print(f"Class {i} Precision: {prec[i]}")
    print(f"Class {i} Recall: {rec[i]}")
    print(f"Class {i} F1: {f1[i]}")
    print()

Accuracy: 0.9993252721106554
Class 0 Precision: 0.9665948275862069
Class 0 Recall: 0.999777084262149
Class 0 F1: 0.982905982905983

Class 1 Precision: 0.9999879861119454
Class 1 Recall: 0.9981412862299289
Class 1 F1: 0.9990637827976091

Class 2 Precision: 1.0
Class 2 Recall: 1.0
Class 2 F1: 1.0

Class 3 Precision: 0.9943404776366914
Class 3 Recall: 1.0
Class 3 F1: 0.9971622085462483

Class 4 Precision: 1.0
Class 4 Recall: 0.983036888265977
Class 4 F1: 0.9914458919879923



## Using Equal Binning

In [25]:
y_dep_equal.value_counts()

0    1476424
1    1435784
2    1315475
4    1258655
3    1084212
Name: Classified Departure Delay (5 Equal), dtype: int64

In [26]:
y_bins = 5

In [27]:
y_train_arr_eq = y_train_dep_eq.to_numpy()
y_test_arr_eq = y_test_dep_eq.to_numpy()

y_train_eq = to_categorical(y_train_arr_eq, y_bins)
y_test_eq = to_categorical(y_test_arr_eq, y_bins)

In [28]:
model = Sequential()
model.add(Dense(32, input_dim=len(dep_delay_features)))
model.add(Activation('relu'))
model.add(Dense(y_bins))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train_dep_eq, y_train_eq, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd460785a90>

In [29]:
y_pred = np.argmax(model.predict(X_test_dep_eq), axis=-1)

y_test = np.argmax(y_test_eq, axis=-1)



In [30]:
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average=None)
rec = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

print("Accuracy:", acc)
for i in range(y_bins):  
    print(f"Class {i} Precision: {prec[i]}")
    print(f"Class {i} Recall: {rec[i]}")
    print(f"Class {i} F1: {f1[i]}")
    print()

Accuracy: 1.0
Class 0 Precision: 1.0
Class 0 Recall: 1.0
Class 0 F1: 1.0

Class 1 Precision: 1.0
Class 1 Recall: 1.0
Class 1 F1: 1.0

Class 2 Precision: 1.0
Class 2 Recall: 1.0
Class 2 F1: 1.0

Class 3 Precision: 1.0
Class 3 Recall: 1.0
Class 3 F1: 1.0

Class 4 Precision: 1.0
Class 4 Recall: 1.0
Class 4 F1: 1.0



# Visualisations

## Fisher's Score

In [None]:
from skfeature.function.similarity_based.fisher_score import fisher_score

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.feature_selection import chi2


In [None]:
df.head()

In [None]:
X = df[[
    'Weather Code Intensity', 'Month', 'Irregular Departure Time',
    'Is Holiday', 'Is Labour Day', 'Is Xmas',
    'Origin Precipitation', 'Origin Windspeed',
    'Monthly Median Departure Delay', 'Airplane Age',
    'Origin Total Operations'
]]
y = df['Classified Departure Delay (5)']

In [None]:
fisher_scores = SelectKBest(score_func=f_classif, k='all').fit(X, y).scores_

fig = go.Figure()
fig.add_trace(go.Bar(x=list(X.columns), y=fisher_scores))

fig.update_layout(title="Fisher's Score of Features", xaxis_title='Features', yaxis_title="Fisher's Score")

fig.show()

In [None]:
sorted_features

In [None]:
fisher_scores

In [None]:
X.columns

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Bar(x=list(X.columns), y=fisher_scores))

fig.update_layout(title="Fisher's Score of Features", xaxis_title='Features', yaxis_title="Fisher's Score")

fig.show()