In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split

# Regression Models
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# Classification Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Regression Metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Classification Metrics
from sklearn.metrics import cohen_kappa_score, confusion_matrix, roc_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

SyntaxError: trailing comma not allowed without surrounding parentheses (1309210781.py, line 20)

In [None]:
# !pip3 install --upgrade plotly

In [None]:
# Load the CSV at ../scripts/eda_2018.csv into `df`. Later cells read the 'Delay',
# 'Classified Delay' and several 'Origin ...' columns from this frame.
df = pd.read_csv("../scripts/eda_2018.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

# Regression Models

In [None]:
# Input columns for the regression models.
# NOTE: the classification section further down rebinds `features`, `X`, `y` and the
# train/test names, so re-run this cell before re-running any regression cell.
features = [
    'Origin Total Operations',
    'Origin Precipitation',
    'Origin Rain',
    'Origin Snowfall',
    'Origin Windspeed',
    'Origin Windgusts',
    'Origin Evapotranspiration',
    'Origin Shortwave Radiation',
]

# Regression target is the 'Delay' column.
X = df[features]
y = df['Delay']

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

In [None]:
# Fit the four regressors on the training split. `fit` returns the estimator itself,
# so construction and fitting are chained into one expression per model.
linear = LinearRegression().fit(X_train, y_train)

lasso = Lasso(alpha=0.1).fit(X_train, y_train)

gbr = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=1,
    random_state=123,
).fit(X_train, y_train)

rfr = RandomForestRegressor(
    n_estimators=100,
    max_depth=1,
    random_state=123,
).fit(X_train, y_train)

# Predictions on the held-out split, one array per model.
y_pred_linear = linear.predict(X_test)
y_pred_lasso = lasso.predict(X_test)
y_pred_gbr = gbr.predict(X_test)
y_pred_rfr = rfr.predict(X_test)

In [None]:
# Hold-out metrics for each regressor: RMSE, R-squared and MAE.
# The `mse_*` names are kept so any other cell that reads them keeps working, but each
# one stores the ROOT mean squared error. np.sqrt(MSE) is used instead of the
# `squared=False` argument, which scikit-learn deprecated in 1.4 and later removed.
mse_linear = np.sqrt(mean_squared_error(y_test, y_pred_linear))
r2_linear = r2_score(y_test, y_pred_linear)
mae_linear = mean_absolute_error(y_test, y_pred_linear)

mse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
r2_lasso = r2_score(y_test, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)

mse_gbr = np.sqrt(mean_squared_error(y_test, y_pred_gbr))
r2_gbr = r2_score(y_test, y_pred_gbr)
mae_gbr = mean_absolute_error(y_test, y_pred_gbr)

mse_rfr = np.sqrt(mean_squared_error(y_test, y_pred_rfr))
r2_rfr = r2_score(y_test, y_pred_rfr)
mae_rfr = mean_absolute_error(y_test, y_pred_rfr)


# One x-axis category per model, shared by all three bar traces.
model_names = ['Linear Regression', 'Lasso Regression', 'Gradient Boosting Regression', 'Random Forest Regression']

# Grouped bar chart: one trace per metric, one bar per model, value printed on each bar.
fig = go.Figure()
fig.add_trace(go.Bar(x=model_names,
                     y=[mse_linear, mse_lasso, mse_gbr, mse_rfr],
                     name='Root Mean Squared Error',
                     text=[f'{mse_linear:.5f}', f'{mse_lasso:.5f}', f'{mse_gbr:.5f}', f'{mse_rfr:.5f}'],
                     textposition='auto'))
fig.add_trace(go.Bar(x=model_names,
                     y=[mae_linear, mae_lasso, mae_gbr, mae_rfr],
                     name='Mean Absolute Error',
                     # Bar labels must match the plotted MAE values (the RMSE values were
                     # previously shown on these bars by mistake).
                     text=[f'{mae_linear:.5f}', f'{mae_lasso:.5f}', f'{mae_gbr:.5f}', f'{mae_rfr:.5f}'],
                     textposition='auto'))
fig.add_trace(go.Bar(x=model_names,
                     y=[r2_linear, r2_lasso, r2_gbr, r2_rfr],
                     name='R-squared',
                     text=[f'{r2_linear:.5f}', f'{r2_lasso:.5f}', f'{r2_gbr:.5f}', f'{r2_rfr:.5f}'],
                     textposition='auto'))
fig.update_layout(title='Linear vs Lasso vs Gradient Boosting vs Random Forest Regression Performance Metrics',
                  xaxis_title='Regression Model',
                  yaxis_title='Value')

fig.show()

In [None]:
# Pair each fitted linear-regression coefficient with its feature name.
# NOTE: labels come from the current `X`, which the classification section rebinds;
# run this cell while `X` is still the regression feature frame.
coef = pd.Series(data=linear.coef_, index=X.columns)

# Bar chart of coefficient value per feature; `update_layout` returns the figure,
# so the layout is applied in the same expression that builds it.
fig = px.bar(coef, x=coef.index, y=coef.values).update_layout(
    title='Linear Regression Coefficients',
    xaxis_title='Features',
    yaxis_title='Coefficient Values',
)

fig.show()

# Classification Models

In [2]:
# Input columns for the classification models.
# NOTE(review): unlike the regression feature list, this one does not include
# 'Origin Shortwave Radiation' — confirm the omission is intentional.
# This cell rebinds `features`, `X`, `y` and the train/test names used by the
# regression section above.
features = [
    'Origin Total Operations',
    'Origin Precipitation',
    'Origin Rain',
    'Origin Snowfall',
    'Origin Windspeed',
    'Origin Windgusts',
    'Origin Evapotranspiration',
]

# Classification target is the 'Classified Delay' column.
X = df[features]
y = df['Classified Delay']

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

NameError: name 'df' is not defined

In [None]:
# Fit both classifiers on the training split; `fit` returns the estimator itself,
# so construction and fitting are chained.
lr = LogisticRegression(max_iter=300, random_state=42).fit(X_train, y_train)

rfc = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=42,
).fit(X_train, y_train)

# Predicted class labels on the held-out split.
y_pred_lr = lr.predict(X_test)
y_pred_rfc = rfc.predict(X_test)

In [None]:
# Score both classifiers on the held-out split, one metric at a time
# (logistic regression first, random forest second).
# NOTE(review): precision/recall/F1 are called with scikit-learn's default averaging,
# which expects a binary target — confirm 'Classified Delay' has exactly two classes.
lr_acc, rfc_acc = (accuracy_score(y_test, pred) for pred in (y_pred_lr, y_pred_rfc))

lr_prec, rfc_prec = (precision_score(y_test, pred) for pred in (y_pred_lr, y_pred_rfc))

lr_rec, rfc_rec = (recall_score(y_test, pred) for pred in (y_pred_lr, y_pred_rfc))

lr_f1, rfc_f1 = (f1_score(y_test, pred) for pred in (y_pred_lr, y_pred_rfc))

# Print every score as "<label> <value>", grouped by metric.
report = [
    ("LR Accuracy:", lr_acc),
    ("RFC Accuracy:", rfc_acc),
    ("LR Precision:", lr_prec),
    ("RFC Precision:", rfc_prec),
    ("LR Recall:", lr_rec),
    ("RFC Recall:", rfc_rec),
    ("LR F1 Score:", lr_f1),
    ("RFC F1 Score:", rfc_f1),
]
for label, score in report:
    print(label, score)

In [None]:
# Confusion matrix for the random-forest classifier on the held-out split;
# the bare name on the last line displays it as the cell output.
rfc_cm = confusion_matrix(y_true=y_test, y_pred=y_pred_rfc)
rfc_cm

In [None]:
# Tick labels 'class 1' .. 'class N', one per row/column of the confusion matrix.
# They are sized from `rfc_cm`; the name `cm` is never defined in this notebook, so
# using it raised a NameError.
class_labels = ['class ' + str(i + 1) for i in range(len(rfc_cm))]

# Heatmap of the random-forest confusion matrix, with the same labels on both axes.
fig = go.Figure(data=[go.Heatmap(z=rfc_cm,
                                 x=class_labels,
                                 y=class_labels,
                                 colorscale='viridis')])
fig.update_layout(title='Confusion Matrix', xaxis_title='Predicted outputs', yaxis_title='Actual outputs')
fig.show()

In [None]:
# Quadratic-weighted Cohen's kappa between the true labels and the
# random-forest predictions on the held-out split.
kappa = cohen_kappa_score(y_test, y_pred_rfc, weights='quadratic')

# Show the score as a single large number filling the whole figure area.
indicator = go.Indicator(
    mode="number",
    value=kappa,
    title={"text": "Cohen's kappa score"},
    domain={'x': [0, 1], 'y': [0, 1]},
)
fig = go.Figure(indicator)
fig.show()