In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

In [None]:
# Regression Models
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# ANN
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils import to_categorical

# Regression Metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Classification Metrics
from sklearn.metrics import cohen_kappa_score, confusion_matrix, roc_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# !pip3 install --upgrade plotly

In [3]:
# Read the 2018 modelling table from the sibling `eda` directory
# (the path is relative to this notebook's working directory).
model_csv_path = "../eda/model_2018.csv"
df = pd.read_csv(model_csv_path)

In [None]:
# Sanity check on the loaded frame: dimensions first, then the leading rows.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(n=5)

# Regression Models

In [None]:
# Predictor columns used by the departure-delay regression models.
dep_delay_features = [
    'Airplane Age',
    'Distance',
    'Origin Total Operations',
    'Destination Total Operations',
    'PCA Origin Weather 1',
    'PCA Origin Weather 2',
    'PCA Dest Weather 1',
    'PCA Dest Weather 2',
    'Morning Dep Time',
    'Regular Dep Time',
    'Night Dep Time',
    'Monthly Median Departure Delay',
]

# Design matrix and continuous target.
X_dep = df[dep_delay_features]
y_dep = df['Departure Delay']

# 70/30 hold-out split; the fixed seed keeps the partition reproducible.
regression_split = train_test_split(X_dep, y_dep, test_size=0.3, random_state=123)
X_train_dep, X_test_dep, y_train_dep, y_test_dep = regression_split

In [None]:
# Report the dimensions of the feature matrix, then of the target.
for dep_part in (X_dep, y_dep):
    print(dep_part.shape)

In [None]:
# Fit each regressor on the training split (`fit` returns the estimator itself,
# so construction and fitting are chained).
linear = LinearRegression().fit(X_train_dep, y_train_dep)

lasso = Lasso(alpha=0.1).fit(X_train_dep, y_train_dep)

gbr = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=1,
    random_state=123,
).fit(X_train_dep, y_train_dep)

rfr = RandomForestRegressor(
    n_estimators=100,
    max_depth=1,
    random_state=123,
).fit(X_train_dep, y_train_dep)

# Predict the held-out test split with every fitted model, in the same order.
y_pred_linear, y_pred_lasso, y_pred_gbr, y_pred_rfr = (
    fitted.predict(X_test_dep) for fitted in (linear, lasso, gbr, rfr)
)

In [None]:
# Score every regressor on the held-out test split.
#
# RMSE is computed as sqrt(MSE) instead of `mean_squared_error(..., squared=False)`:
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE: the `mse_*` names are kept so other cells keep working, but they hold
# the *root* mean squared error (as the chart legend states).
mse_linear = np.sqrt(mean_squared_error(y_test_dep, y_pred_linear))
r2_linear = r2_score(y_test_dep, y_pred_linear)
mae_linear = mean_absolute_error(y_test_dep, y_pred_linear)

mse_lasso = np.sqrt(mean_squared_error(y_test_dep, y_pred_lasso))
r2_lasso = r2_score(y_test_dep, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test_dep, y_pred_lasso)

mse_gbr = np.sqrt(mean_squared_error(y_test_dep, y_pred_gbr))
r2_gbr = r2_score(y_test_dep, y_pred_gbr)
mae_gbr = mean_absolute_error(y_test_dep, y_pred_gbr)

mse_rfr = np.sqrt(mean_squared_error(y_test_dep, y_pred_rfr))
r2_rfr = r2_score(y_test_dep, y_pred_rfr)
mae_rfr = mean_absolute_error(y_test_dep, y_pred_rfr)

# One bar group per model, one trace per metric.
model_names = [
    'Linear Regression',
    'Lasso Regression',
    'Gradient Boosting Regression',
    'Random Forest Regression',
]
metric_traces = [
    ('Root Mean Squared Error', [mse_linear, mse_lasso, mse_gbr, mse_rfr]),
    ('Mean Absolute Error', [mae_linear, mae_lasso, mae_gbr, mae_rfr]),
    ('R-squared', [r2_linear, r2_lasso, r2_gbr, r2_rfr]),
]

fig = go.Figure()
for metric_name, metric_values in metric_traces:
    # The bar labels are derived from the very values being plotted, so a trace
    # can no longer display another metric's numbers (the MAE bars previously
    # showed the RMSE values).
    fig.add_trace(go.Bar(x=model_names,
                         y=metric_values,
                         name=metric_name,
                         text=[f'{value:.5f}' for value in metric_values],
                         textposition='auto'))
fig.update_layout(title='Linear vs Lasso vs Gradient Boosting vs Random Forest Regression Performance Metrics',
                  xaxis_title='Regression Model',
                  yaxis_title='Value')

fig.show()

In [None]:
# Pair every fitted linear-regression weight with the feature it belongs to.
coef = pd.Series(data=linear.coef_, index=X_dep.columns)

# Bar chart of the coefficients, one bar per feature.
fig = px.bar(coef, x=coef.index, y=coef.values)

layout_options = {
    'title': 'Linear Regression Coefficients',
    'xaxis_title': 'Features',
    'yaxis_title': 'Coefficient Values',
}
fig.update_layout(**layout_options)

fig.show()

# Classification Models

In [4]:
# Predictor columns for the departure-delay classifiers.
# NOTE(review): unlike the regression feature list, this one leaves out the two
# 'PCA Dest Weather' columns — confirm that this is intentional.
dep_delay_features = [
    'Airplane Age',
    'Distance',
    'Origin Total Operations',
    'Destination Total Operations',
    'PCA Origin Weather 1',
    'PCA Origin Weather 2',
    'Morning Dep Time',
    'Regular Dep Time',
    'Night Dep Time',
    'Monthly Median Departure Delay',
]

X_dep = df[dep_delay_features]

# Two alternative class targets taken from the same frame.
y_dep = df['Classified Departure Delay']
y_dep_equal = df['Classified Departure Delay (Equal)']

# Both targets are split with identical settings (70/30, fixed seed).
split_options = dict(test_size=0.3, random_state=123)

X_train_dep, X_test_dep, y_train_dep, y_test_dep = train_test_split(
    X_dep, y_dep, **split_options
)

X_train_dep_eq, X_test_dep_eq, y_train_dep_eq, y_test_dep_eq = train_test_split(
    X_dep, y_dep_equal, **split_options
)

## Using SMOTE

In [None]:
# Oversample the training split with SMOTE. The sampler is seeded so the
# synthetic rows (and every metric computed downstream) are reproducible
# across kernel restarts.
# NOTE: this overwrites X_train_dep / y_train_dep with the resampled data;
# the test split is left untouched.
smote = SMOTE(random_state=123)

X_train_dep, y_train_dep = smote.fit_resample(X_train_dep, y_train_dep)

In [None]:
# Train both classifiers on the current training split (`fit` returns the
# estimator, so it is chained onto the constructor).
lr = LogisticRegression(max_iter=300, random_state=42).fit(X_train_dep, y_train_dep)

rfc = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=42,
).fit(X_train_dep, y_train_dep)

# Class predictions for the held-out test split.
y_pred_lr, y_pred_rfc = (clf.predict(X_test_dep) for clf in (lr, rfc))

In [None]:
# Predictions in a fixed order: logistic regression first, random forest second.
model_predictions = (y_pred_lr, y_pred_rfc)

# Overall accuracy per model.
lr_acc, rfc_acc = (accuracy_score(y_test_dep, preds) for preds in model_predictions)

# `average=None` yields one score per class rather than a single aggregate.
per_class = dict(average=None)
lr_prec, rfc_prec = (precision_score(y_test_dep, preds, **per_class) for preds in model_predictions)
lr_rec, rfc_rec = (recall_score(y_test_dep, preds, **per_class) for preds in model_predictions)
lr_f1, rfc_f1 = (f1_score(y_test_dep, preds, **per_class) for preds in model_predictions)

# Print the metrics in pairs, separated by a blank line between metric groups.
metric_report = (
    ("Accuracy", lr_acc, rfc_acc),
    ("Precision", lr_prec, rfc_prec),
    ("Recall", lr_rec, rfc_rec),
    ("F1", lr_f1, rfc_f1),
)
for position, (metric_label, lr_value, rfc_value) in enumerate(metric_report):
    if position:
        print()
    print(f"LR {metric_label}:", lr_value)
    print(f"RFC {metric_label}:", rfc_value)

In [None]:
# Confusion matrix of the random-forest predictions against the test labels.
rfc_cm = confusion_matrix(y_true=y_test_dep, y_pred=y_pred_rfc)
rfc_cm

In [None]:
# Axis tick labels: one "class <i>" entry per row/column of the matrix.
class_labels = ['class ' + str(index) for index in range(len(rfc_cm))]

# Render the confusion matrix as a heatmap.
cm_heatmap = go.Heatmap(z=rfc_cm, x=class_labels, y=class_labels, colorscale='viridis')
fig = go.Figure(data=[cm_heatmap])
fig.update_layout(
    title='Confusion Matrix',
    xaxis_title='Predicted outputs',
    yaxis_title='Actual outputs',
)
fig.show()

In [None]:
# Quadratically weighted Cohen's kappa between test labels and random-forest predictions.
kappa = cohen_kappa_score(y_test_dep, y_pred_rfc, weights='quadratic')

# Show the score as a single-number indicator filling the whole figure.
kappa_indicator = go.Indicator(
    mode="number",
    value=kappa,
    title={"text": "Cohen's kappa score"},
    domain={'x': [0, 1], 'y': [0, 1]},
)
fig = go.Figure(kappa_indicator)
fig.show()

## Using Equal Binning

In [None]:
# Re-train both classifiers, this time on the equal-binned target.
lr = LogisticRegression(max_iter=300, random_state=42).fit(X_train_dep_eq, y_train_dep_eq)

rfc = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=42,
).fit(X_train_dep_eq, y_train_dep_eq)

# Class predictions for the equal-binned test split.
y_pred_lr, y_pred_rfc = (clf.predict(X_test_dep_eq) for clf in (lr, rfc))

In [None]:
# Evaluate both classifiers against the equal-binned test labels.
eq_predictions = [y_pred_lr, y_pred_rfc]

lr_acc, rfc_acc = [accuracy_score(y_test_dep_eq, guess) for guess in eq_predictions]

# Per-class scores (`average=None` disables averaging across classes).
lr_prec, rfc_prec = [precision_score(y_test_dep_eq, guess, average=None) for guess in eq_predictions]
lr_rec, rfc_rec = [recall_score(y_test_dep_eq, guess, average=None) for guess in eq_predictions]
lr_f1, rfc_f1 = [f1_score(y_test_dep_eq, guess, average=None) for guess in eq_predictions]

# Emit LR/RFC pairs per metric with a blank line between consecutive groups.
needs_separator = False
for metric_label, lr_value, rfc_value in [
    ("Accuracy", lr_acc, rfc_acc),
    ("Precision", lr_prec, rfc_prec),
    ("Recall", lr_rec, rfc_rec),
    ("F1", lr_f1, rfc_f1),
]:
    if needs_separator:
        print()
    print(f"LR {metric_label}:", lr_value)
    print(f"RFC {metric_label}:", rfc_value)
    needs_separator = True

# Neural Networks

In [None]:
# Class frequencies of the 'Classified Departure Delay' target.
y_dep.value_counts()

In [None]:
# Number of target classes: used as the one-hot width passed to `to_categorical`
# and as the size of the network's output layer.
y_bins = 3

In [None]:
# Restore the raw (un-resampled) split before building the baseline network.
# The classical-model SMOTE cell overwrites X_train_dep / y_train_dep with
# oversampled data; without this reset the "baseline" network would silently
# train on SMOTE data and the dedicated SMOTE comparison below would just
# resample an already balanced set. Same arguments and seed as the original
# classification split, so the partition is identical.
X_train_dep, X_test_dep, y_train_dep, y_test_dep = train_test_split(
    X_dep, y_dep, test_size=0.3, random_state=123
)

# Integer class labels as NumPy arrays ...
y_train_arr = y_train_dep.to_numpy()
y_test_arr = y_test_dep.to_numpy()

# ... one-hot encoded to `y_bins` columns for the categorical cross-entropy loss.
y_train = to_categorical(y_train_arr, y_bins)
y_test = to_categorical(y_test_arr, y_bins)

In [None]:
# Small feed-forward classifier: one 32-unit ReLU hidden layer followed by a
# softmax output with one unit per class.
model = Sequential([
    Dense(32, input_dim=len(dep_delay_features)),
    Activation('relu'),
    Dense(y_bins),
    Activation('softmax'),
])

In [None]:
# Configure training: Adam optimiser, categorical cross-entropy on the one-hot
# targets, and accuracy reported during fitting.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs in mini-batches of 32, holding out 20% of the training
# data for validation.
model.fit(
    X_train_dep,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
)

In [None]:
# Predicted class = index of the largest softmax output for each test row.
class_probabilities = model.predict(X_test_dep)
y_pred = class_probabilities.argmax(axis=-1)

In [None]:
# Integer test labels recovered from the one-hot encoding.
# They are derived from `y_test_arr` rather than from `y_test` itself so the
# cell is idempotent: the previous `y_test = np.argmax(y_test, ...)` collapsed
# the already-decoded 1-D labels to a single scalar when the cell was re-run.
y_test = np.argmax(to_categorical(y_test_arr, y_bins), axis=-1)

In [None]:
# Overall accuracy plus per-class precision / recall / F1 (`average=None`).
acc = accuracy_score(y_test, y_pred)
prec, rec, f1 = (
    scorer(y_test, y_pred, average=None)
    for scorer in (precision_score, recall_score, f1_score)
)

print("Accuracy:", acc)
for class_id in range(y_bins):
    # One print per class: three metric lines followed by an empty line.
    print(
        f"Class {class_id} Precision: {prec[class_id]}",
        f"Class {class_id} Recall: {rec[class_id]}",
        f"Class {class_id} F1: {f1[class_id]}",
        "",
        sep="\n",
    )

## Using SMOTE

In [None]:
# Oversample the training split with SMOTE; the sampler is seeded so the
# synthetic rows are reproducible across kernel restarts.
smote = SMOTE(random_state=123)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train_dep, y_train_dep)

# Keras carves `validation_split` out of the *last* rows before any shuffling.
# NOTE(review): SMOTE appears to return the original rows followed by the
# synthetic ones, so without a shuffle the validation slice would be dominated
# by synthetic minority samples — confirm against the imbalanced-learn docs.
# Features and labels are permuted together with a seeded generator.
shuffled_order = np.random.default_rng(123).permutation(len(y_train_resampled))
X_train_resampled = X_train_resampled.iloc[shuffled_order]

y_train_arr_resampled = y_train_resampled.to_numpy()[shuffled_order]

# One-hot encode the (shuffled) labels for the categorical cross-entropy loss.
y_train_resampled = to_categorical(y_train_arr_resampled, y_bins)

In [None]:
# Fresh network with the same architecture: 32-unit ReLU hidden layer and a
# softmax output with one unit per class.
model = Sequential([
    Dense(32, input_dim=len(dep_delay_features)),
    Activation('relu'),
    Dense(y_bins),
    Activation('softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the resampled data: 10 epochs, batches of 32, 20% held out for validation.
model.fit(
    X_train_resampled,
    y_train_resampled,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
)

In [None]:
# Predicted class = position of the largest softmax output per test row.
predicted_probs = model.predict(X_test_dep)
y_pred = predicted_probs.argmax(axis=-1)

# Rebuild the integer test labels by one-hot encoding and decoding `y_test_arr`.
y_test = np.argmax(to_categorical(y_test_arr, y_bins), axis=-1)

In [None]:
# Score the SMOTE-trained network: overall accuracy and per-class metrics.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average=None)
rec = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

print("Accuracy:", acc)
for label in range(y_bins):
    # Assemble the three metric lines for this class, then a trailing blank line.
    class_lines = [
        f"Class {label} Precision: {prec[label]}",
        f"Class {label} Recall: {rec[label]}",
        f"Class {label} F1: {f1[label]}",
    ]
    print("\n".join(class_lines))
    print()

## Using Equal Binning

In [None]:
# Class frequencies of the 'Classified Departure Delay (Equal)' target.
y_dep_equal.value_counts()

In [None]:
# Number of classes for the equal-binned target: used as the one-hot width
# passed to `to_categorical` and as the size of the network's output layer.
y_bins = 4

In [None]:
# Equal-binned integer labels as NumPy arrays (train first, then test).
y_train_arr_eq, y_test_arr_eq = (
    labels.to_numpy() for labels in (y_train_dep_eq, y_test_dep_eq)
)

# One-hot encode both label arrays to `y_bins` columns.
y_train_eq, y_test_eq = (
    to_categorical(encoded, y_bins) for encoded in (y_train_arr_eq, y_test_arr_eq)
)

In [None]:
# Same architecture as before, with the output layer sized by the current `y_bins`.
network_layers = [
    Dense(32, input_dim=len(dep_delay_features)),
    Activation('relu'),
    Dense(y_bins),
    Activation('softmax'),
]
model = Sequential(network_layers)

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the equal-binned split: 10 epochs, batches of 32, 20% held out for validation.
model.fit(
    X_train_dep_eq,
    y_train_eq,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
)

In [None]:
# Predicted class = position of the largest softmax output per test row.
eq_probabilities = model.predict(X_test_dep_eq)
y_pred = np.argmax(eq_probabilities, axis=-1)

# Decode the one-hot test targets back to integer class labels.
y_test = y_test_eq.argmax(axis=-1)

In [None]:
# Score the equal-binning network: overall accuracy and per-class metrics.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average=None)
rec = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

print("Accuracy:", acc)
for bin_id in range(y_bins):
    # Precision, recall and F1 for this class, followed by a blank line.
    for metric_label, class_scores in (("Precision", prec), ("Recall", rec), ("F1", f1)):
        print(f"Class {bin_id} {metric_label}: {class_scores[bin_id]}")
    print()

# Visualisations

In [5]:
from skfeature.function.similarity_based.fisher_score import fisher_score

from sklearn.feature_selection import mutual_info_classif

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
