In [1]:
import sys
from pathlib import Path
import pandas as pd

# Locate the repository root so the local `src` package (imported in the next
# cell) can be found, then put that root on sys.path.
def find_project_root(start: Path, marker: str = "src") -> Path:
    """Return the directory that holds `marker`, searching upward from `start`.

    `start.parent` is tried first, which keeps the original assumption that the
    notebook lives one level below the repo root (e.g. <repo>/notebooks).
    If that fails, `start` itself and then each of its ancestors are checked,
    so the cell also works when the kernel is started from the repo root or
    from a deeper sub-folder.

    Parameters
    ----------
    start : Path
        Directory to begin the search from (normally the working directory).
    marker : str
        Name of a directory expected to sit directly inside the repo root.

    Returns
    -------
    Path
        The first matching directory, or `start.parent` when nothing matches.
    """
    for candidate in (start.parent, start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    # No marker anywhere above us: fall back to the original behaviour.
    return start.parent


PROJECT_ROOT = find_project_root(Path.cwd().resolve())

# Guard the insert so re-running this cell does not stack duplicate entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print("PROJECT_ROOT:", PROJECT_ROOT)
# Check the package that is actually imported below (`src`, not `src_v2`).
print("src exists:", (PROJECT_ROOT / "src").exists())


PROJECT_ROOT: C:\Users\Acer\OneDrive\Desktop\New folder\Predicting-Customer-Churn-in-Telecom-_v1
src_v2 exists: False


In [2]:
from src.config import DATA_PROCESSED
from src.model_training import (
    make_split,
    train_logistic_regression,
    train_random_forest,
    train_hgb,
    evaluate_for_lift,
    save_model,
)

In [3]:
# Load the processed dataset from the DATA_PROCESSED path (imported from src.config).
df = pd.read_csv(DATA_PROCESSED)
# Bare last expression so the notebook displays (rows, columns).
df.shape

(7043, 28)

In [4]:
# Split the frame into train/test parts with make_split; test_size and
# random_state are pinned so a re-run requests the same split.
split = make_split(df, random_state=42, test_size=0.2)
# Show the feature-matrix dimensions of both parts as (train, test).
(split.X_train.shape, split.X_test.shape)

((5634, 26), (1409, 26))

In [5]:
# Missing-value audit of the training features (works on a copy of X_train).
X = split.X_train.copy()

# Per-column NaN totals, largest first, restricted to columns that have any.
na_counts = X.isna().sum().sort_values(ascending=False)
na_counts = na_counts.loc[na_counts.gt(0)]

print("Columns with NaN:", na_counts.size)
display(na_counts)

# Optional: preview up to ten rows that contain at least one NaN.
rows_with_na = X.isna().any(axis=1)
display(X.loc[rows_with_na].head(10))


Columns with NaN: 0


Series([], dtype: int64)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,charges_ratio,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_Yes,...,StreamingTV_Yes,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,HighSpender,HighChurnRisk


In [6]:
# Fit the five candidate classifiers on the training split.
# Logistic regression and random forest are each trained twice: once with
# class_weight=None and once with class_weight="balanced".
X_train, y_train = split.X_train, split.y_train

lr = train_logistic_regression(X_train, y_train, class_weight=None)
lr_bal = train_logistic_regression(X_train, y_train, class_weight="balanced")

rf = train_random_forest(X_train, y_train, class_weight=None)
rf_bal = train_random_forest(X_train, y_train, class_weight="balanced")

hgb = train_hgb(X_train, y_train)

# Registry of fitted candidates; each key is the label shown in the results
# table and the name later handed to save_model.
models = {
    "logistic_regression": lr,
    "logistic_regression_balanced": lr_bal,
    "random_forest": rf,
    "random_forest_balanced": rf_bal,
    "hist_gradient_boosting": hgb,
}



In [8]:
# Score the test features with the class_weight="balanced" logistic regression
# and keep column 1 of predict_proba's output.
# NOTE(review): column 1 is presumably the positive (churn) class — confirm against lr_bal.classes_.
# NOTE(review): y_pred_proba is not referenced by any later cell visible here — confirm it is still needed.
y_pred_proba = lr_bal.predict_proba(split.X_test)[:, 1]


In [9]:
# Score every candidate on the held-out test split with evaluate_for_lift at
# k_frac=0.10 and k_frac=0.20, then rank the candidates best-first.
def _lift_summary(name, model):
    """Build one results-table row for `model` from its 10% and 20% evaluations."""
    top10 = evaluate_for_lift(model, split.X_test, split.y_test, k_frac=0.10)
    top20 = evaluate_for_lift(model, split.X_test, split.y_test, k_frac=0.20)
    return {
        "model": name,
        "baseline": top10["baseline"],
        "lift@10%": top10["lift_k"],
        "precision@10%": top10["precision_k"],
        "recall@10%": top10["recall_k"],
        "lift@20%": top20["lift_k"],
        "precision@20%": top20["precision_k"],
        "recall@20%": top20["recall_k"],
        "pr_auc": top10["pr_auc"],
    }


rows = [_lift_summary(name, model) for name, model in models.items()]

# lift@10% is the primary sort key; lift@20% and pr_auc break ties.
ranking_keys = ["lift@10%", "lift@20%", "pr_auc"]
results = pd.DataFrame(rows).sort_values(by=ranking_keys, ascending=False)
results

Unnamed: 0,model,baseline,lift@10%,precision@10%,recall@10%,lift@20%,precision@20%,recall@20%,pr_auc
1,logistic_regression_balanced,0.265436,2.885653,0.765957,0.28877,2.538305,0.673759,0.508021,0.654376
4,hist_gradient_boosting,0.265436,2.885653,0.765957,0.28877,2.444789,0.648936,0.489305,0.640127
0,logistic_regression,0.265436,2.832215,0.751773,0.283422,2.524946,0.670213,0.505348,0.651624
3,random_forest_balanced,0.265436,2.832215,0.751773,0.283422,2.377991,0.631206,0.475936,0.639781
2,random_forest,0.265436,2.778777,0.737589,0.278075,2.484867,0.659574,0.497326,0.638889


In [10]:
# Persist the top-ranked candidate: `results` is sorted descending by
# lift@10% (then lift@20%, pr_auc), so its first row names the winner.
best_name = results["model"].iloc[0]
best_model = models[best_name]
out_path = save_model(best_model, best_name)
print(f"✅ Saved best model: {best_name} -> {out_path}")

✅ Saved best model: logistic_regression_balanced -> C:\Users\Acer\OneDrive\Desktop\New folder\Predicting-Customer-Churn-in-Telecom-_v1\data\models\logistic_regression_balanced.pkl


In [11]:
# Persist the runner-up as well: the second row of the descending-sorted
# `results` table names the 2nd-ranked candidate.
best_name_2 = results["model"].iloc[1]
best_model_2 = models[best_name_2]
out_path_2 = save_model(best_model_2, best_name_2)
print(f"✅ Saved 2nd best model: {best_name_2} -> {out_path_2}")


✅ Saved 2nd best model: hist_gradient_boosting -> C:\Users\Acer\OneDrive\Desktop\New folder\Predicting-Customer-Churn-in-Telecom-_v1\data\models\hist_gradient_boosting.pkl
