In [None]:
import sys
from pathlib import Path
import pandas as pd

# Make the repository root importable so that `src.*` modules resolve.
# The root is taken to be the parent of the kernel's working directory
# (NOTE(review): assumes the notebook runs from a folder directly under the
# project root — confirm for other launch locations).
PROJECT_ROOT = Path.cwd().resolve().parent

# Re-running this cell must not stack duplicate entries on sys.path, so the
# root is only inserted when it is not already the highest-priority entry.
if sys.path[:1] != [str(PROJECT_ROOT)]:
    sys.path.insert(0, str(PROJECT_ROOT))


In [None]:

from src.config import DATA_PROCESSED
from src.model_training import (
    make_split,
    train_logistic_regression,
    # train_random_forest,
    train_random_forest_tuned,
    train_hgb,
    evaluate_for_lift,
    save_model,
)

In [None]:

# Load the processed dataset from the location configured in src.config
# (DATA_PROCESSED) and show its (rows, columns) shape as the cell output.
df = pd.read_csv(DATA_PROCESSED)
df.shape

In [None]:

# Partition `df` with the project's make_split helper, passing test_size=0.2
# and a fixed random_state (presumably so the partition is repeatable between
# runs — the exact split semantics live in src.model_training; confirm there).
split = make_split(df, test_size=0.2, random_state=42)
# Cell output: shapes of the training and test feature matrices.
split.X_train.shape, split.X_test.shape

In [None]:
# Missing-value audit of the training features (works on a copy of X_train).
X = split.X_train.copy()
na_mask = X.isna()

# Per-column NaN totals, largest first, keeping only columns that have any.
sorted_counts = na_mask.sum().sort_values(ascending=False)
na_counts = sorted_counts[sorted_counts > 0]

print("Columns with NaN:", len(na_counts))
display(na_counts)

# Preview up to ten rows that contain at least one missing value.
rows_with_nan = na_mask.any(axis=1)
display(X[rows_with_nan].head(10))


In [None]:

# Fit every candidate on the same training split. The logistic-regression and
# random-forest families are each trained under several class_weight settings
# so the variants can be compared side by side in the evaluation cell.
X_train, y_train = split.X_train, split.y_train

# Logistic regression: no class weights vs. "balanced".
lr = train_logistic_regression(X_train, y_train, class_weight=None)
lr_bal = train_logistic_regression(X_train, y_train, class_weight="balanced")

# Tuned random forest: no class weights, "balanced_subsample", then "balanced".
rf = train_random_forest_tuned(X_train, y_train, class_weight=None)
rf_bal_sub = train_random_forest_tuned(
    X_train, y_train, class_weight="balanced_subsample"
)
rf_bal = train_random_forest_tuned(X_train, y_train, class_weight="balanced")

# Gradient-boosting candidate, trained with train_hgb's own defaults.
hgb = train_hgb(X_train, y_train)

# Registry of fitted estimators; the keys are the labels used in the results
# table and passed to save_model.
models = {
    "logistic_regression": lr,
    "logistic_regression_balanced": lr_bal,
    "random_forest": rf,
    "random_forest_balanced": rf_bal,
    "random_forest_balanced_subsample": rf_bal_sub,
    "hist_gradient_boosting": hgb,
}

In [None]:

# Test-set scores from the class_weight="balanced" logistic regression:
# column 1 of predict_proba (presumably the positive class — confirm against
# the estimator's classes_).
# NOTE(review): no other cell visible in this notebook reads y_pred_proba; it
# may be leftover exploration.
y_pred_proba = lr_bal.predict_proba(split.X_test)[:, 1]


In [None]:
# Score every fitted model on the held-out test set at two targeting depths
# (top 10% and top 20%) and collect one summary row per model.
rows = []
for model_name, estimator in models.items():
    at_10, at_20 = (
        evaluate_for_lift(estimator, split.X_test, split.y_test, k_frac=frac)
        for frac in (0.10, 0.20)
    )
    # "baseline" and "pr_auc" are read from the 10% evaluation only.
    summary = {
        "model": model_name,
        "baseline": at_10["baseline"],
        "lift@10%": at_10["lift_k"],
        "precision@10%": at_10["precision_k"],
        "recall@10%": at_10["recall_k"],
        "lift@20%": at_20["lift_k"],
        "precision@20%": at_20["precision_k"],
        "recall@20%": at_20["recall_k"],
        "pr_auc": at_10["pr_auc"],
    }
    rows.append(summary)

# Best model first: rank by lift@10%, breaking ties on lift@20% and then PR-AUC.
results = pd.DataFrame(rows).sort_values(
    by=["lift@10%", "lift@20%", "pr_auc"],
    ascending=False,
)

# Persist the comparison table next to the project root, then show it.
results.to_csv(PROJECT_ROOT / "model_comparison_results.csv", index=False)
results

In [None]:
# `results` is ordered by Lift@10% (descending), so its first row names the
# model to persist.
first_name = results["model"].iloc[0]
first_model = models[first_name]
out_path = save_model(first_model, first_name)

In [None]:

# Rank the models by a weighted blend of min-max normalized metrics and save
# the top three. This reassigns `out_path`, which the previous cell also sets.

# Required columns
score_cols = ["lift@10%", "lift@20%", "pr_auc"]
missing = [c for c in score_cols if c not in results.columns]
if missing:
    raise ValueError(
        f"results is missing required columns: {missing}. Available: {list(results.columns)}"
    )

# Min-max normalize each metric (0..1), higher is better
results_scored = results.copy()

for c in score_cols:
    cmin, cmax = results_scored[c].min(), results_scored[c].max()
    if cmax == cmin:
        # Every model ties on this metric, so it contributes nothing to the
        # ranking (and dividing by a zero range is avoided).
        results_scored[c + "_norm"] = 0.0
    else:
        results_scored[c + "_norm"] = (results_scored[c] - cmin) / (cmax - cmin)

# Weighted score: lift@10% dominates, lift@20% second, PR-AUC as a minor term.
w10, w20, wpr = 0.60, 0.30, 0.10
results_scored["weighted_score"] = (
    w10 * results_scored["lift@10%_norm"]
    + w20 * results_scored["lift@20%_norm"]
    + wpr * results_scored["pr_auc_norm"]
)

# Sort best-first; the fresh 0..n-1 index makes the label slice below pick
# the first three rows.
results_sorted = results_scored.sort_values(
    "weighted_score", ascending=False
).reset_index(drop=True)

# .loc label slices are inclusive, so :2 yields at most three rows.
top3 = results_sorted.loc[
    :2, ["model", "weighted_score", "lift@10%", "lift@20%", "pr_auc"]
]

# Save each top-ranked model in rank order with a single loop.
top3_names = top3["model"].tolist()
top3_paths = [save_model(models[name], name) for name in top3_names]

# Keep the per-rank variable names. With fewer than three ranked models the
# unused slots are None rather than raising KeyError on a missing row.
best_name, best_name_2, best_name_3 = (top3_names + [None] * 3)[:3]
out_path, out_path_2, out_path_3 = (top3_paths + [None] * 3)[:3]

# Show the ranked shortlist as the cell output.
top3
