In [None]:
import sys
from pathlib import Path
import pandas as pd

# Project root = parent of the current working directory. This assumes the
# notebook is launched from a first-level subfolder of the repository.
PROJECT_ROOT = Path.cwd().resolve().parent

# Make the project root importable (so `src` resolves), but only add it once:
# re-running this cell must not stack duplicate entries on sys.path.
_project_root_str = str(PROJECT_ROOT)
if _project_root_str not in sys.path:
    sys.path.insert(0, _project_root_str)

# Quick visual check that the root was resolved to the expected folder.
print("PROJECT_ROOT:", PROJECT_ROOT)
print("src exists:", (PROJECT_ROOT / "src").exists())

PROJECT_ROOT: C:\Users\User\Predicting-Customer-Churn-in-Telecom_worksheet
src exists: True


In [None]:
from src.config import DATA_PROCESSED
from src.model_training import (
    make_split,
    train_logistic_regression,
    train_random_forest,
    train_hgb,
    evaluate_for_lift,
    save_model,
)

In [None]:
# Load the processed dataset from the location given by DATA_PROCESSED
# (imported from src.config) and display its (rows, columns) shape.
df = pd.read_csv(DATA_PROCESSED)
df.shape

(7043, 28)

In [None]:
# Build the train/test split with 20% of the rows requested for the test set.
# random_state=42 is presumably there to make the split reproducible; confirm
# in src.model_training.make_split.
split = make_split(df, test_size=0.2, random_state=42)
# Display both feature-matrix shapes to verify the partition sizes.
split.X_train.shape, split.X_test.shape

((5634, 26), (1409, 26))

In [None]:
# Inspect a copy so this diagnostic cannot alter the training features.
X = split.X_train.copy()

# Missing values per column, largest count first, restricted to the columns
# that contain at least one NaN.
na_counts = (
    X.isna()
    .sum()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 0]
)

print("Columns with NaN:", len(na_counts))
display(na_counts)

# Preview (at most 10) rows that have a NaN in any column.
display(X.loc[X.isna().any(axis="columns")].head(10))

Columns with NaN: 0


Series([], dtype: int64)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,charges_ratio,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_Yes,...,StreamingTV_Yes,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,HighSpender,HighChurnRisk


In [None]:
# Every candidate is fitted on the same training features and labels.
_fit_inputs = (split.X_train, split.y_train)

# Logistic regression: once without class weighting, once with "balanced".
lr = train_logistic_regression(*_fit_inputs, class_weight=None)
lr_bal = train_logistic_regression(*_fit_inputs, class_weight="balanced")

# Random forest: same two class-weight settings.
rf = train_random_forest(*_fit_inputs, class_weight=None)
rf_bal = train_random_forest(*_fit_inputs, class_weight="balanced")

# Histogram gradient boosting, using the helper's own defaults.
hgb = train_hgb(*_fit_inputs)

# Registry of fitted candidates keyed by the name used in reports and when
# saving; insertion order fixes the row order of the comparison table.
models = dict(
    logistic_regression=lr,
    logistic_regression_balanced=lr_bal,
    random_forest=rf,
    random_forest_balanced=rf_bal,
    hist_gradient_boosting=hgb,
)

In [None]:
# Predicted probabilities from the class-weighted logistic regression on the
# test features, keeping column 1 of predict_proba's output.
# NOTE(review): column 1 is presumably the positive (churn) class — confirm
# against the model's class ordering. This variable is not referenced by any
# other cell visible here.
y_pred_proba = lr_bal.predict_proba(split.X_test)[:, 1]

In [None]:
# Score every candidate on the held-out test set at two cut-offs
# (top 10% and top 20%) and collect one comparison row per model.
def _lift_row(model_name, model):
    """Return the comparison-table row (a dict) for one fitted model.

    Calls evaluate_for_lift twice on the test split, with k_frac=0.10 and
    k_frac=0.20. "baseline" and "pr_auc" are read from the 10% evaluation
    (presumably they do not depend on k_frac; confirm in evaluate_for_lift).
    """
    at_10 = evaluate_for_lift(model, split.X_test, split.y_test, k_frac=0.10)
    at_20 = evaluate_for_lift(model, split.X_test, split.y_test, k_frac=0.20)
    return {
        "model": model_name,
        "baseline": at_10["baseline"],
        "lift@10%": at_10["lift_k"],
        "precision@10%": at_10["precision_k"],
        "recall@10%": at_10["recall_k"],
        "lift@20%": at_20["lift_k"],
        "precision@20%": at_20["precision_k"],
        "recall@20%": at_20["recall_k"],
        "pr_auc": at_10["pr_auc"],
    }


rows = [_lift_row(model_name, model) for model_name, model in models.items()]

# Rank best-first: Lift@10% is the primary key, then Lift@20%, then PR AUC.
results = pd.DataFrame(rows).sort_values(
    by=["lift@10%", "lift@20%", "pr_auc"],
    ascending=False,
)
# Keep a copy of the comparison table in the project root.
results.to_csv(PROJECT_ROOT / "model_comparison_results.csv", index=False)
results

Unnamed: 0,model,baseline,lift@10%,precision@10%,recall@10%,lift@20%,precision@20%,recall@20%,pr_auc
4,hist_gradient_boosting,0.265436,2.885653,0.765957,0.28877,2.444789,0.648936,0.489305,0.640127
1,logistic_regression_balanced,0.265436,2.858934,0.758865,0.286096,2.565024,0.680851,0.513369,0.654469
0,logistic_regression,0.265436,2.832215,0.751773,0.283422,2.551665,0.677305,0.510695,0.652185
3,random_forest_balanced,0.265436,2.832215,0.751773,0.283422,2.377991,0.631206,0.475936,0.639781
2,random_forest,0.265436,2.778777,0.737589,0.278075,2.484867,0.659574,0.497326,0.638889


In [None]:
# Persist the model in the first row of `results`, i.e. the top entry of the
# comparison table in its current sort order.
first_name = results["model"].iloc[0]
first_model = models[first_name]
out_path = save_model(first_model, first_name)
print(f"✅ Saved best model: {first_name} -> {out_path}")

✅ Saved best model: hist_gradient_boosting -> C:\Users\User\Predicting-Customer-Churn-in-Telecom_worksheet\data\models\hist_gradient_boosting.pkl


In [None]:
# 1) Guard: every metric that feeds the weighted score must be in `results`
score_cols = ["lift@10%", "lift@20%", "pr_auc"]
missing = [col for col in score_cols if col not in results.columns]
if missing:
    raise ValueError(
        f"results is missing required columns: {missing}. Available: {list(results.columns)}"
    )

# 2) Min-max rescale each metric into 0..1 (higher is still better)
results_scored = results.copy()

for metric in score_cols:
    values = results_scored[metric]
    lowest, highest = values.min(), values.max()
    # When all models tie on a metric the range is zero; use 0.0 for every
    # row instead of dividing by zero.
    results_scored[metric + "_norm"] = (
        0.0 if highest == lowest else (values - lowest) / (highest - lowest)
    )

# 3) Blend the normalised metrics: 60% Lift@10%, 30% Lift@20%, 10% PR AUC
w10, w20, wpr = 0.60, 0.30, 0.10
results_scored["weighted_score"] = (
    results_scored["lift@10%_norm"].mul(w10)
    .add(results_scored["lift@20%_norm"].mul(w20))
    .add(results_scored["pr_auc_norm"].mul(wpr))
)

# 4) Rank by blended score (best first) and renumber rows 0..n-1 so that
#    label-based lookups below address ranks directly
results_sorted = (
    results_scored
    .sort_values(by="weighted_score", ascending=False)
    .reset_index(drop=True)
)

# Label slice :2 is inclusive, so this keeps ranks 0, 1 and 2.
top3 = results_sorted.loc[
    :2,
    ["model", "weighted_score", "lift@10%", "lift@20%", "pr_auc"],
]
print("Top 3 by weighted score:\n", top3)

# Save the three highest-ranked models, one after another.
best_name = results_sorted.at[0, "model"]
out_path = save_model(models[best_name], best_name)
print(f"✅ Saved Champion: {best_name} -> {out_path}")

best_name_2 = results_sorted.at[1, "model"]
out_path_2 = save_model(models[best_name_2], best_name_2)
print(f"✅ Saved Challenger: {best_name_2} -> {out_path_2}")

best_name_3 = results_sorted.at[2, "model"]
out_path_3 = save_model(models[best_name_3], best_name_3)
print(f"✅ Saved 3rd: {best_name_3} -> {out_path_3}")

Top 3 by weighted score:
                           model  weighted_score  lift@10%  lift@20%    pr_auc
0  logistic_regression_balanced        0.850000  2.858934  2.565024  0.654469
1        hist_gradient_boosting        0.715092  2.885653  2.444789  0.640127
2           logistic_regression        0.663915  2.832215  2.551665  0.652185
✅ Saved Champion: logistic_regression_balanced -> C:\Users\User\Predicting-Customer-Churn-in-Telecom_worksheet\data\models\logistic_regression_balanced.pkl
✅ Saved Challenger: hist_gradient_boosting -> C:\Users\User\Predicting-Customer-Churn-in-Telecom_worksheet\data\models\hist_gradient_boosting.pkl
✅ Saved 3rd: logistic_regression -> C:\Users\User\Predicting-Customer-Churn-in-Telecom_worksheet\data\models\logistic_regression.pkl
