#### Machine Learning

**Random Forest**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Read the on-time CSV from the shared data folder and preview its first five rows.
df_airlines = pd.read_csv('../data/ontime_2025.csv')
df_airlines.head()

In [None]:
# Expose the `dep_del15` column under the name `is_delayed`; the modelling cells
# below use `is_delayed` as the prediction target.
# NOTE(review): presumably dep_del15 is a 0/1 "departure delayed by 15+ minutes"
# flag — confirm against the dataset's data dictionary, including how missing
# values are represented.
df_airlines['is_delayed'] = df_airlines['dep_del15']

In [None]:
# Keep only the rows whose `cancelled` value equals 0. The explicit copy makes
# df_ml an independent frame, so later edits to it never touch df_airlines.
df_ml = df_airlines.loc[df_airlines['cancelled'].eq(0)].copy()

In [None]:
# Split the modelling frame into the columns that get scaled (numeric) and the
# columns that get one-hot encoded (categorical).
df_ml_numeric = df_ml.loc[:, ['month', 'day_of_month', 'day_of_week']]

df_ml_cat = df_ml.loc[:, ['dep_bin', 'season', 'origin', 'dest', 'op_unique_carrier']]

*Scale numeric features and one-hot encode categorical features*

In [None]:
# Standardise the numeric columns: wrap a StandardScaler in a ColumnTransformer,
# fit it on the numeric frame, and transform that same frame.
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, df_ml_numeric.columns)],
)
ml_numeric_scaled = preprocessor.fit_transform(df_ml_numeric)

# Re-wrap the transformed values with the original column labels and row index
# so the result lines up with the unscaled frame.
df_ml_numeric_scaled = pd.DataFrame(
    data=ml_numeric_scaled,
    index=df_ml_numeric.index,
    columns=df_ml_numeric.columns,
)

In [None]:
# Preview the first rows only instead of rendering the whole scaled frame.
df_ml_numeric_scaled.head()

In [None]:
# One-hot encode the categorical columns. drop='first' leaves the first level of
# each column as the implicit baseline; handle_unknown='ignore' encodes levels
# not seen during fit as all zeros at transform time.
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# NOTE: this rebinds `preprocessor`; from here on it refers to this
# categorical-only transformer, not the numeric one built earlier.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, df_ml_cat.columns)
    ]
)

ml_cat_scaled = preprocessor.fit_transform(df_ml_cat)

feature_names = preprocessor.get_feature_names_out()

# The encoder normally returns a SciPy sparse matrix. Calling .toarray() on it
# would allocate rows x encoded-columns dense floats, which is prohibitive for
# high-cardinality columns, so keep the data sparse inside the DataFrame.
if hasattr(ml_cat_scaled, "toarray"):
    df_ml_cat_scaled = pd.DataFrame.sparse.from_spmatrix(
        ml_cat_scaled,
        index=df_ml_cat.index,
        columns=feature_names,
    )
else:
    # Already a dense ndarray (the transformer decided the output was dense enough).
    df_ml_cat_scaled = pd.DataFrame(
        ml_cat_scaled,
        columns=feature_names,
        index=df_ml_cat.index,
    )

In [None]:
# Put the scaled numeric columns and the encoded categorical columns side by
# side, aligned on the shared row index.
df_ml_concat = pd.concat(
    objs=[df_ml_numeric_scaled, df_ml_cat_scaled],
    axis='columns',
)

In [None]:
# Model inputs: the three calendar columns followed by the five categorical ones.
features = [
    'month', 'day_of_month', 'day_of_week',
    'dep_bin', 'season', 'origin', 'dest', 'op_unique_carrier',
]
target = 'is_delayed'

# X and y are taken straight from df_ml (untransformed); any scaling/encoding
# is expected to happen inside the model pipeline.
X = df_ml.loc[:, features]
y = df_ml.loc[:, target]

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Build a preprocessor dedicated to the model. The notebook-level `preprocessor`
# was last rebound to the categorical-only ColumnTransformer, and with the
# default remainder='drop' it would silently discard month / day_of_month /
# day_of_week even though they are listed in `features`.
rf_preprocessor = ColumnTransformer(
    transformers=[
        # Scaling is not required by tree models but keeps the numeric handling
        # identical to the exploratory scaling cell.
        ('num', StandardScaler(), list(df_ml_numeric.columns)),
        ('cat',
         OneHotEncoder(drop='first', handle_unknown='ignore'),
         list(df_ml_cat.columns)),
    ]
)

# Preprocessing lives inside the pipeline so it is fitted on the training split
# only and re-applied automatically at predict time.
rf = Pipeline(steps=[
    ('preprocessor', rf_preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=20,
        max_depth=6,
        min_samples_leaf=500,
        n_jobs=-1,          # use every available core
        random_state=42     # reproducible trees
    ))
])

*Split into Training and Testing sets*

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

*Fit the model*

In [None]:
# Fit the whole pipeline (preprocessing step, then the forest) on the training split.
rf.fit(X=X_train, y=y_train)

In [None]:
# Preview the first rows of the training features rather than rendering the whole frame.
X_train.head()

*Predict and Evaluate*

In [None]:
# Predict on the held-out rows, then print the per-class precision / recall / F1 table.
y_pred_rf = rf.predict(X_test)
report_rf = classification_report(y_true=y_test, y_pred=y_pred_rf)
print(report_rf)

*Visualize the performance*

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Let scikit-learn run the predictions and draw the confusion matrix for the
# held-out split, then title the figure and render it.
ConfusionMatrixDisplay.from_estimator(
    estimator=rf,
    X=X_test,
    y=y_test,
    cmap='Blues',
)
plt.title("Flight Delay Prediction Performance")
plt.show()

In [None]:
import joblib

# Serialise the pipeline object (preprocessing step + forest) to rf_model.pkl
# in the notebook's working directory.
joblib.dump(value=rf, filename="rf_model.pkl")

#### Oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

# A bare RandomForestClassifier cannot consume the un-encoded categorical
# columns, so transform both splits with the preprocessor that the baseline
# `rf` pipeline fitted on X_train. Reusing it keeps the feature space identical
# to the baseline and avoids fitting anything on the test split.
encoder = rf.named_steps['preprocessor']
X_train_enc = encoder.transform(X_train)
X_test_enc = encoder.transform(X_test)

# Randomly re-sample minority-class rows with replacement. Only the training
# split is resampled, so evaluation still reflects the real class balance.
over_sampling = RandomOverSampler(random_state=42)
X_over, y_over = over_sampling.fit_resample(X_train_enc, y_train)

# Same hyper-parameters as the baseline forest, for a like-for-like comparison.
rf_over = RandomForestClassifier(
    n_estimators=20,
    max_depth=6,
    min_samples_leaf=500,
    n_jobs=-1,
    random_state=42,
).fit(X_over, y_over)

# Score the model trained in this cell (the original referenced an undefined `lr_over`).
print(f"Oversampling Recall: {recall_score(y_test, rf_over.predict(X_test_enc))}")

#### Undersampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# A bare RandomForestClassifier cannot consume the un-encoded categorical
# columns, so transform both splits with the preprocessor that the baseline
# `rf` pipeline fitted on X_train. Reusing it keeps the feature space identical
# to the baseline and avoids fitting anything on the test split.
encoder = rf.named_steps['preprocessor']
X_train_enc = encoder.transform(X_train)
X_test_enc = encoder.transform(X_test)

# Randomly discard majority-class rows. Only the training split is resampled,
# so evaluation still reflects the real class balance.
under_sampling = RandomUnderSampler(random_state=42)
X_under, y_under = under_sampling.fit_resample(X_train_enc, y_train)

# Same hyper-parameters as the baseline forest, for a like-for-like comparison.
rf_under = RandomForestClassifier(
    n_estimators=20,
    max_depth=6,
    min_samples_leaf=500,
    n_jobs=-1,
    random_state=42,
).fit(X_under, y_under)

# Score the model trained in this cell (the original referenced an undefined `lr_under`).
print(f"Undersampling Recall: {recall_score(y_test, rf_under.predict(X_test_enc))}")

#### SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE interpolates between neighbouring minority samples, so it needs numeric
# input. Transform both splits with the preprocessor that the baseline `rf`
# pipeline fitted on X_train; this keeps the feature space identical to the
# baseline and avoids fitting anything on the test split.
encoder = rf.named_steps['preprocessor']
X_train_enc = encoder.transform(X_train)
X_test_enc = encoder.transform(X_test)

# Synthesise minority-class rows from the training split only.
# NOTE(review): interpolating one-hot columns yields fractional indicator
# values; SMOTENC may be a better fit for this feature mix — confirm.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train_enc, y_train)

# Same hyper-parameters as the baseline forest, for a like-for-like comparison.
rf_smote = RandomForestClassifier(
    n_estimators=20,
    max_depth=6,
    min_samples_leaf=500,
    n_jobs=-1,
    random_state=42,
).fit(X_smote, y_smote)

# Score the model trained in this cell (the original referenced an undefined `lr_smote`).
print(f"SMOTE Recall: {recall_score(y_test, rf_smote.predict(X_test_enc))}")