In [None]:
# Only needed on Google Colab; fall back to None elsewhere so this import
# cell also runs on a local kernel.
try:
    from google.colab import drive
except ImportError:
    drive = None

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report,
    roc_auc_score, balanced_accuracy_score,
)

# imbalanced-learn is optional: when it cannot be imported the resampling
# names are bound to None and IMB_INSTALLED tells later cells to skip them.
try:
    from imblearn.over_sampling import RandomOverSampler
    from imblearn.pipeline import Pipeline as ImbPipeline
except ImportError:
    RandomOverSampler = ImbPipeline = None
    IMB_INSTALLED = False
else:
    IMB_INSTALLED = True

# Notebook-wide settings.
RANDOM_STATE = 42  # seed handed to samplers and estimators
USE_OVERSAMPLING = True  # enable RandomOverSampler if imblearn is available

In [None]:
# Column roles referenced by the split cell below.
TARGET = "delay_15"
DATE_COL = "FL_DATE"

# Read the prepared table from the working directory and report its size.
df = pd.read_csv("flights_prepared.csv")
print("Loaded CSV with shape:", df.shape)

In [None]:
# Order rows chronologically so the hold-out sets are strictly later than the
# training rows. Dates are parsed only for the comparison (the column itself
# is left untouched) so the order does not depend on the text format of the
# date column, and a stable sort keeps the file order of rows sharing a date.
df = df.sort_values(by=DATE_COL, key=pd.to_datetime, kind="stable")

# 70% train / 15% validation / 15% test, cut in order (shuffle=False).
train_df, temp_df = train_test_split(df, test_size=0.30, shuffle=False)
val_df, test_df = train_test_split(temp_df, test_size=0.50, shuffle=False)

print("Split sizes -> train:", len(train_df), "val:", len(val_df), "test:", len(test_df))

# The date column only drives the split and is removed from the features.
# read_csv is not asked to parse dates, so the column is presumably text and
# would be one-hot encoded; validation/test dates never occur in the training
# period, so those indicator columns would carry no usable signal.
X_train = train_df.drop(columns=[TARGET, DATE_COL])
y_train = train_df[TARGET]

X_val = val_df.drop(columns=[TARGET, DATE_COL])
y_val = val_df[TARGET]

X_test = test_df.drop(columns=[TARGET, DATE_COL])
y_test = test_df[TARGET]

In [None]:
# Partition the training columns by dtype: numeric columns go through the
# "num" branch, everything else through the "cat" branch.
numeric_features = list(X_train.select_dtypes(include="number").columns)
categorical_features = list(X_train.select_dtypes(exclude="number").columns)

preprocessor = ColumnTransformer(
    transformers=[
        # Numeric branch: fill gaps with the column median, then standardise.
        (
            "num",
            SkPipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler()),
                ]
            ),
            numeric_features,
        ),
        # Categorical branch: fill gaps with the most frequent value, then
        # one-hot encode; categories unseen during fit are ignored.
        (
            "cat",
            SkPipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("onehot", OneHotEncoder(handle_unknown="ignore")),
                ]
            ),
            categorical_features,
        ),
    ]
)

print("Preprocessing steps defined for numeric and categorical features.")

In [None]:
all_metrics = []


def train_and_eval(model, model_name):
    """Fit ``model`` on the training split and report validation metrics.

    The estimator is placed at the end of a pipeline that starts with a copy
    of the notebook-level ``preprocessor``. When ``USE_OVERSAMPLING`` is set
    and imbalanced-learn is importable, a ``RandomOverSampler`` step is
    inserted between the two; otherwise a plain scikit-learn pipeline is used.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``. ``predict_proba`` is used
        for ROC-AUC when the estimator provides it.
    model_name : str
        Label used in the printed report and in the ``all_metrics`` row.

    Returns
    -------
    tuple
        ``(pipe, cm)``: the fitted pipeline and the 2x2 validation confusion
        matrix with rows/columns ordered as label 0, label 1.

    Side effects: prints the report and appends one dict to ``all_metrics``.
    """
    from sklearn.base import clone  # local import keeps this cell self-contained

    # Clone so every returned pipeline owns its transformers instead of
    # aliasing (and refitting) the single global preprocessor instance.
    steps = [("preprocessor", clone(preprocessor))]
    pipeline_cls = SkPipeline
    if USE_OVERSAMPLING:
        if IMB_INSTALLED:
            steps.append(("oversample", RandomOverSampler(random_state=RANDOM_STATE)))
            pipeline_cls = ImbPipeline
        else:
            print(f"imblearn not installed; running {model_name} without oversampling.")

    steps.append(("model", model))
    pipe = pipeline_cls(steps=steps)

    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)

    # Estimators without probability output simply skip ROC-AUC.
    try:
        proba = pipe.predict_proba(X_val)[:, 1]
    except AttributeError:
        proba = None

    acc = accuracy_score(y_val, preds)
    prec = precision_score(y_val, preds, zero_division=0)
    rec = recall_score(y_val, preds, zero_division=0)
    f1 = f1_score(y_val, preds, zero_division=0)
    bal_acc = balanced_accuracy_score(y_val, preds)

    # Pin the label order so the matrix is always 2x2; without it a split in
    # which only one class appears yields a 1x1 matrix and the unpacking
    # below fails. The metrics above already treat 1 as the positive class.
    cm = confusion_matrix(y_val, preds, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else np.nan

    # ROC-AUC needs scores and both classes present in the validation labels.
    if proba is not None and np.unique(y_val).size > 1:
        roc_auc = roc_auc_score(y_val, proba)
    else:
        roc_auc = np.nan

    print(f"\n===== {model_name} =====")
    print("Accuracy           :", acc)
    print("Balanced Accuracy  :", bal_acc)
    print("Precision (positive class=1):", prec)
    print("Recall (Sensitivity):", rec)
    print("Specificity        :", specificity)
    print("F1 Score           :", f1)
    print("ROC-AUC            :", roc_auc)
    print("Confusion Matrix:\n", cm)
    print("\nClassification report:\n",
          classification_report(y_val, preds, digits=3))

    # Store metrics for the summary table.
    all_metrics.append({
        "model": model_name,
        "accuracy": acc,
        "balanced_accuracy": bal_acc,
        "precision": prec,
        "recall": rec,
        "specificity": specificity,
        "f1": f1,
        "roc_auc": roc_auc,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "tp": tp,
    })

    return pipe, cm

print("The 'train_and_eval' function has been defined.")

In [None]:
def plot_cm(cm, title):
    """Draw a confusion matrix as an annotated heat map.

    Parameters
    ----------
    cm : array-like, 2-D
        Matrix of counts; row index is the true class, column index the
        predicted class (matching the axis labels set below).
    title : str
        Figure title.

    Each cell is annotated with its value. Cells above half of the largest
    value are rendered dark by the "Blues" colormap, so their annotation is
    drawn in white instead of black to stay readable.
    """
    cm = np.asarray(cm)  # also accept nested lists

    plt.imshow(cm, cmap="Blues")
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")

    # Guard the empty matrix: max() of an empty array raises.
    threshold = cm.max() / 2 if cm.size else 0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            text_color = "white" if cm[i, j] > threshold else "black"
            # x is the column (predicted), y is the row (true).
            plt.text(j, i, cm[i, j], ha="center", va="center", color=text_color)

    plt.tight_layout()
    plt.show()

print("The 'plot_cm' function has been defined.")

In [None]:
# Baseline: logistic regression. Keeps the fitted pipeline and its validation
# confusion matrix, then plots the matrix.
lr_model, lr_cm = train_and_eval(
    model=LogisticRegression(
        random_state=RANDOM_STATE,
        class_weight="balanced",
        max_iter=1000,
    ),
    model_name="Logistic Regression",
)
plot_cm(cm=lr_cm, title="Logistic Regression CM")


In [None]:
# Baseline: depth-limited decision tree, evaluated and plotted like the other models.
dt_model, dt_cm = train_and_eval(
    model=DecisionTreeClassifier(
        random_state=RANDOM_STATE,
        class_weight="balanced",
        max_depth=12,
    ),
    model_name="Decision Tree",
)
plot_cm(cm=dt_cm, title="Decision Tree CM")

In [None]:
# Baseline: random forest (50 trees, depth <= 20) trained on all available cores.
rf_classifier_kwargs = None  # placeholder removed below; kept out of the namespace
del rf_classifier_kwargs
rf_model, rf_cm = train_and_eval(
    model=RandomForestClassifier(
        n_jobs=-1,
        class_weight="balanced",
        random_state=RANDOM_STATE,
        max_depth=20,
        n_estimators=50,
    ),
    model_name="Random Forest",
)
plot_cm(cm=rf_cm, title="Random Forest CM")

In [None]:
# Baseline: two-hidden-layer MLP with early stopping.
mlp_model, mlp_cm = train_and_eval(
    model=MLPClassifier(
        early_stopping=True,
        random_state=RANDOM_STATE,
        max_iter=500,
        hidden_layer_sizes=(100, 50),
    ),
    model_name="MLP Classifier",
)
plot_cm(cm=mlp_cm, title="MLP Classifier CM")

In [None]:
# Summary table, one row per model. all_metrics grows by one entry per
# training call, so re-running a model cell would leave duplicate rows;
# keep only the most recent entry for each model name.
metrics_df = (
    pd.DataFrame(all_metrics)
    .drop_duplicates(subset="model", keep="last")
    .set_index("model")
)
display(metrics_df.round(3))


In [None]:
# SMOTE comes from the optional imbalanced-learn package. Guard the import so
# a kernel without it can still run the notebook; the SMOTE name is bound to
# None in that case.
try:
    from imblearn.over_sampling import SMOTE
except ImportError:
    SMOTE = None
    print("imblearn not installed; SMOTE is unavailable.")
else:
    print("SMOTE imported successfully.")

In [None]:
def train_and_eval_with_smote(model, model_name):
    """Fit ``model`` with SMOTE resampling and report validation metrics.

    The pipeline is: a copy of the notebook-level ``preprocessor``, then
    ``SMOTE`` (only when imbalanced-learn is importable), then ``model``.
    Without imbalanced-learn a plain scikit-learn pipeline is built and a
    notice is printed, so the call still completes.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``. ``predict_proba`` is used
        for ROC-AUC when the estimator provides it.
    model_name : str
        Base label for the report. " (SMOTE)" is appended to the stored row
        name only when SMOTE was actually applied.

    Returns
    -------
    tuple
        ``(pipe, cm)``: the fitted pipeline and the 2x2 validation confusion
        matrix with rows/columns ordered as label 0, label 1.

    Side effects: prints the report and appends one dict to ``all_metrics``.
    """
    from sklearn.base import clone  # local import keeps this cell self-contained

    # Clone so every returned pipeline owns its transformers instead of
    # aliasing the single global preprocessor instance.
    steps = [("preprocessor", clone(preprocessor))]

    smote_applied = bool(IMB_INSTALLED)
    if smote_applied:
        steps.append(("smote", SMOTE(random_state=RANDOM_STATE)))
        pipeline_cls = ImbPipeline
    else:
        # ImbPipeline is None when imblearn is missing, so fall back to the
        # scikit-learn pipeline rather than calling None.
        print(f"imblearn not installed; running {model_name} without SMOTE.")
        pipeline_cls = SkPipeline

    steps.append(("model", model))
    pipe = pipeline_cls(steps=steps)

    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)

    # Estimators without probability output simply skip ROC-AUC.
    try:
        proba = pipe.predict_proba(X_val)[:, 1]
    except AttributeError:
        proba = None

    acc = accuracy_score(y_val, preds)
    prec = precision_score(y_val, preds, zero_division=0)
    rec = recall_score(y_val, preds, zero_division=0)
    f1 = f1_score(y_val, preds, zero_division=0)
    bal_acc = balanced_accuracy_score(y_val, preds)

    # Pin the label order so the matrix is always 2x2; without it a split in
    # which only one class appears yields a 1x1 matrix and the unpacking
    # below fails. The metrics above already treat 1 as the positive class.
    cm = confusion_matrix(y_val, preds, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else np.nan

    # ROC-AUC needs scores and both classes present in the validation labels.
    if proba is not None and np.unique(y_val).size > 1:
        roc_auc = roc_auc_score(y_val, proba)
    else:
        roc_auc = np.nan

    if smote_applied:
        print(f"\n===== {model_name} (with SMOTE) =====")
        stored_name = model_name + " (SMOTE)"
    else:
        print(f"\n===== {model_name} =====")
        stored_name = model_name
    print("Accuracy           :", acc)
    print("Balanced Accuracy  :", bal_acc)
    print("Precision (positive class=1):", prec)
    print("Recall (Sensitivity):", rec)
    print("Specificity        :", specificity)
    print("F1 Score           :", f1)
    print("ROC-AUC            :", roc_auc)
    print("Confusion Matrix:\n", cm)
    print("\nClassification report:\n",
          classification_report(y_val, preds, digits=3))

    # Store metrics for the summary table.
    all_metrics.append({
        "model": stored_name,
        "accuracy": acc,
        "balanced_accuracy": bal_acc,
        "precision": prec,
        "recall": rec,
        "specificity": specificity,
        "f1": f1,
        "roc_auc": roc_auc,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "tp": tp,
    })

    return pipe, cm

print("The 'train_and_eval_with_smote' function has been defined.")

In [None]:
# Logistic regression trained through the SMOTE pipeline; plot its validation
# confusion matrix.
lr_smote_model, lr_smote_cm = train_and_eval_with_smote(
    model=LogisticRegression(
        random_state=RANDOM_STATE,
        class_weight="balanced",
        max_iter=1000,
    ),
    model_name="Logistic Regression",
)
plot_cm(cm=lr_smote_cm, title="Logistic Regression (SMOTE) CM")

In [None]:
# Depth-limited decision tree trained through the SMOTE pipeline.
dt_smote_model, dt_smote_cm = train_and_eval_with_smote(
    model=DecisionTreeClassifier(
        random_state=RANDOM_STATE,
        class_weight="balanced",
        max_depth=12,
    ),
    model_name="Decision Tree",
)
plot_cm(cm=dt_smote_cm, title="Decision Tree (SMOTE) CM")

In [None]:
# Random forest (50 trees, depth <= 20, all cores) trained through the SMOTE pipeline.
rf_smote_model, rf_smote_cm = train_and_eval_with_smote(
    model=RandomForestClassifier(
        n_jobs=-1,
        class_weight="balanced",
        random_state=RANDOM_STATE,
        max_depth=20,
        n_estimators=50,
    ),
    model_name="Random Forest",
)
plot_cm(cm=rf_smote_cm, title="Random Forest (SMOTE) CM")

In [None]:
# Two-hidden-layer MLP with early stopping, trained through the SMOTE pipeline.
mlp_smote_model, mlp_smote_cm = train_and_eval_with_smote(
    model=MLPClassifier(
        early_stopping=True,
        random_state=RANDOM_STATE,
        max_iter=500,
        hidden_layer_sizes=(100, 50),
    ),
    model_name="MLP Classifier",
)
plot_cm(cm=mlp_smote_cm, title="MLP Classifier (SMOTE) CM")

In [None]:
# Refresh the summary table now that the SMOTE runs have been recorded.
# all_metrics is append-only, so a re-run model cell would show up twice;
# keep only the most recent entry for each model name.
metrics_df = (
    pd.DataFrame(all_metrics)
    .drop_duplicates(subset="model", keep="last")
    .set_index("model")
)
display(metrics_df.round(3))

In [None]:
# RandomizedSearchCV is used by tune_and_eval_model, defined in a later cell.
from sklearn.model_selection import RandomizedSearchCV
print("RandomizedSearchCV imported successfully.")

In [None]:
def _model_params(**grid):
    """Prefix every key with ``model__`` so it targets the pipeline step named "model"."""
    return {f"model__{name}": values for name, values in grid.items()}


# Logistic regression: regularisation strength on a log grid, two solvers.
lr_param_dist = _model_params(
    C=np.logspace(-4, 4, 20),
    solver=["liblinear", "saga"],
)

# Decision tree: depth and split/leaf size limits.
dt_param_dist = _model_params(
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Random forest: ensemble size plus the same kind of tree limits.
rf_param_dist = _model_params(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# MLP: layer layout, L2 penalty, initial learning rate and optimiser.
mlp_param_dist = _model_params(
    hidden_layer_sizes=[(50,), (100,), (50, 25), (100, 50)],
    alpha=np.logspace(-5, -1, 5),
    learning_rate_init=[0.001, 0.01, 0.1],
    solver=["adam", "sgd"],
)

print("Hyperparameter distributions for models have been defined.")

In [None]:
def tune_and_eval_model(base_model, param_dist, model_name, use_smote=True,
                        n_iter=10, cv=3, scoring="f1"):
    """Tune ``base_model`` with a randomized search and score the winner on validation.

    A pipeline of the notebook-level ``preprocessor``, an optional ``SMOTE``
    step and ``base_model`` is searched with ``RandomizedSearchCV`` on the
    training split. The refitted best pipeline is then evaluated on the
    validation split, reported, recorded in ``all_metrics`` and plotted.

    Parameters
    ----------
    base_model : estimator
        Classifier placed in the pipeline step named "model".
    param_dist : dict
        Search space; keys address pipeline parameters (e.g. ``model__C``).
    model_name : str
        Base label; " (Tuned)" and, when SMOTE is applied, " (SMOTE)" are
        appended for the report and the stored row.
    use_smote : bool, default True
        Request SMOTE resampling. It is applied only if imbalanced-learn is
        importable; otherwise a notice is printed and tuning runs without it.
    n_iter : int, default 10
        Number of parameter settings sampled by the search.
    cv : int or cross-validation splitter, default 3
        Forwarded to ``RandomizedSearchCV``. NOTE(review): the training rows
        are date-ordered by the split cell, and an integer here does not use a
        chronological scheme; pass a time-ordered splitter (for example
        ``sklearn.model_selection.TimeSeriesSplit``) if folds should respect
        that order.
    scoring : str or callable, default "f1"
        Metric optimised by the search.

    Returns
    -------
    tuple
        ``(best_estimator, cm)``: the best refitted pipeline and the 2x2
        validation confusion matrix with rows/columns ordered as label 0, 1.
    """
    smote_applied = bool(use_smote and IMB_INSTALLED)

    steps = [("preprocessor", preprocessor)]
    pipeline_cls = SkPipeline
    if smote_applied:
        steps.append(("smote", SMOTE(random_state=RANDOM_STATE)))
        pipeline_cls = ImbPipeline
    elif use_smote:
        print(f"imblearn not installed; running {model_name} without SMOTE for tuning.")

    steps.append(("model", base_model))
    pipe = pipeline_cls(steps=steps)

    print(f"\nStarting RandomizedSearchCV for {model_name}...")
    random_search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring=scoring,
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    random_search.fit(X_train, y_train)

    best_estimator = random_search.best_estimator_
    preds = best_estimator.predict(X_val)

    # Estimators without probability output simply skip ROC-AUC.
    try:
        proba = best_estimator.predict_proba(X_val)[:, 1]
    except AttributeError:
        proba = None

    acc = accuracy_score(y_val, preds)
    prec = precision_score(y_val, preds, zero_division=0)
    rec = recall_score(y_val, preds, zero_division=0)
    f1 = f1_score(y_val, preds, zero_division=0)
    bal_acc = balanced_accuracy_score(y_val, preds)

    # Pin the label order so the matrix is always 2x2; without it a split in
    # which only one class appears yields a 1x1 matrix and the unpacking
    # below fails. The metrics above already treat 1 as the positive class.
    cm = confusion_matrix(y_val, preds, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else np.nan

    # ROC-AUC needs scores and both classes present in the validation labels.
    if proba is not None and np.unique(y_val).size > 1:
        roc_auc = roc_auc_score(y_val, proba)
    else:
        roc_auc = np.nan

    tuned_model_name = f"{model_name} (Tuned)"
    if smote_applied:
        tuned_model_name += " (SMOTE)"

    print(f"\n===== {tuned_model_name} =====")
    print("Best Parameters:", random_search.best_params_)
    print("Accuracy           :", acc)
    print("Balanced Accuracy  :", bal_acc)
    print("Precision (positive class=1):", prec)
    print("Recall (Sensitivity):", rec)
    print("Specificity        :", specificity)
    print("F1 Score           :", f1)
    print("ROC-AUC            :", roc_auc)
    print("Confusion Matrix:\n", cm)
    print("\nClassification report:\n",
          classification_report(y_val, preds, digits=3))

    # Store metrics for the summary table.
    all_metrics.append({
        "model": tuned_model_name,
        "accuracy": acc,
        "balanced_accuracy": bal_acc,
        "precision": prec,
        "recall": rec,
        "specificity": specificity,
        "f1": f1,
        "roc_auc": roc_auc,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "tp": tp,
    })

    plot_cm(cm, f"{tuned_model_name} CM")

    return best_estimator, cm

print("The 'tune_and_eval_model' function has been defined.")