In [None]:
# ============================================================
# 1.5.3 — ALGORITHM EVALUATION & BEST MODEL SELECTION
# ============================================================

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    roc_auc_score, precision_recall_curve, auc
)

import pandas as pd

# --------------------------------------------
# CANDIDATE CLASSIFIERS
# --------------------------------------------
# Keyword groups shared by several estimators, spelled out once so every
# candidate is configured the same way.
_balanced = {"class_weight": "balanced"}
_seeded = {"random_state": 42}

# Display name -> unfitted estimator. Insertion order is the order in
# which the candidates are later trained and listed.
# Gradient Boosting is the only entry without class weighting (its
# constructor takes no class_weight argument); Logistic Regression is the
# only entry without an explicit seed.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, **_balanced),
    "Decision Tree": DecisionTreeClassifier(**_balanced, **_seeded),
    "Random Forest": RandomForestClassifier(n_estimators=300, **_balanced, **_seeded),
    "Gradient Boosting": GradientBoostingClassifier(**_seeded),
    # probability=True is what makes predict_proba available on SVC.
    "SVM": SVC(probability=True, **_balanced, **_seeded),
}

# One [name, precision, recall, f1, roc_auc, pr_auc] row per model.
results = []

# --------------------------------------------
# FIT EVERY CANDIDATE AND SCORE IT ON THE TEST SPLIT
# --------------------------------------------
# X_train / y_train / X_test / y_test are not defined in this cell; they
# are expected to exist already when it runs.
for name, model in models.items():
    # fit() hands back the estimator itself, so the hard-label
    # predictions can be chained straight off it.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    # Second predict_proba column = score for the class listed second in
    # model.classes_ (the positive class for 0/1 labels -- TODO confirm
    # the label encoding used upstream).
    y_proba = model.predict_proba(X_test)[:, 1]

    # Threshold-dependent metrics, computed from the hard labels.
    precision, recall, f1 = (
        score(y_test, y_pred)
        for score in (precision_score, recall_score, f1_score)
    )
    # Ranking metric, computed from the continuous scores.
    roc_auc = roc_auc_score(y_test, y_proba)

    # PR-AUC: trapezoidal area under the precision-recall curve.
    # NOTE(review): linear interpolation between PR points can read
    # optimistic; consider average_precision_score if that matters here.
    pr_prec, pr_rec, _ = precision_recall_curve(y_test, y_proba)
    pr_auc = auc(pr_rec, pr_prec)

    # NOTE(review): these test-split scores are also what the best model
    # is chosen from -- confirm a separate hold-out exists for the final
    # performance estimate.
    results.append([name, precision, recall, f1, roc_auc, pr_auc])

# --------------------------------------------
# TABULATE THE SCORES
# --------------------------------------------
# Labels follow the position of each value inside the rows of `results`.
_result_columns = ["Model", "Precision", "Recall", "F1", "ROC-AUC", "PR-AUC"]

# One row per evaluated model, in evaluation order.
results_df = pd.DataFrame(data=results, columns=_result_columns)

print("=== MODEL PERFORMANCE ===")
# display is not imported in this cell; presumably the IPython/Jupyter
# rich-display builtin -- verify before running as a plain script.
display(results_df)

# --------------------------------------------
# PICK THE WINNER BY PR-AUC
# --------------------------------------------
# idxmax() yields the index label of the first row holding the largest
# PR-AUC; that label is then used to pull the full row.
_best_idx = results_df["PR-AUC"].idxmax()
best_model_row = results_df.loc[_best_idx]
best_model_name = best_model_row["Model"]

# Announce the choice inside a ruled banner (single write, same text as
# four consecutive print calls).
_rule = "===================================="
print(
    "\n" + _rule,
    f" BEST MODEL: {best_model_name}",
    " (Selection based on highest PR-AUC)",
    _rule,
    sep="\n",
)
