In [54]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, roc_curve
)

import joblib
import matplotlib.pyplot as plt
import seaborn as sns


In [55]:
# Load the pre-split feature matrices (train / validation / test) from disk.
X_train, X_val, X_test = [
    np.load(feature_path)
    for feature_path in (
        "../data/processed/X_train.npy",
        "../data/processed/X_val.npy",
        "../data/processed/X_test.npy",
    )
]

# Load the matching label vectors, in the same split order.
y_train, y_val, y_test = [
    np.load(label_path)
    for label_path in (
        "../data/processed/y_train.npy",
        "../data/processed/y_val.npy",
        "../data/processed/y_test.npy",
    )
]

# Quick sanity output: the shape of each feature split, then the mean of the
# training labels (reported under the "Churn rate" label).
print(*[split.shape for split in (X_train, X_val, X_test)])
print("Churn rate:", y_train.mean())


(2256, 18) (483, 18) (484, 18)
Churn rate: 0.36436170212765956


In [56]:
# Learn per-column medians from the training split only, then apply those same
# medians to every split so validation/test statistics never reach the imputer.
imputer = SimpleImputer(strategy="median").fit(X_train)

X_train, X_val, X_test = [
    imputer.transform(split) for split in (X_train, X_val, X_test)
]

print("✅ NaNs handled")


✅ NaNs handled


In [57]:
def evaluate_model(model, X, y):
    """Score a fitted classifier on one labelled split.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X
        Feature matrix handed to the model.
    y
        True labels that the predictions are compared against.

    Returns
    -------
    dict
        ``accuracy``, ``precision``, ``recall`` and ``f1_score`` computed from
        ``model.predict(X)``, plus ``roc_auc`` computed from column index 1 of
        ``model.predict_proba(X)``.
    """
    hard_labels = model.predict(X)
    # Column 1 of predict_proba is used as the ranking score for ROC-AUC.
    positive_scores = model.predict_proba(X)[:, 1]

    # Metrics that consume hard label predictions, in reporting order.
    label_metrics = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )

    report = {key: metric_fn(y, hard_labels) for key, metric_fn in label_metrics}
    report["roc_auc"] = roc_auc_score(y, positive_scores)
    return report


In [58]:
# Baseline model: class-weighted logistic regression trained on the training split.
logreg_params = dict(max_iter=1000, class_weight="balanced", random_state=42)
log_reg = LogisticRegression(**logreg_params).fit(X_train, y_train)

# Validation-split metrics; the trailing bare name displays them as cell output.
baseline_metrics = evaluate_model(log_reg, X_val, y_val)
baseline_metrics


{'accuracy': 0.6211180124223602,
 'precision': 0.4876325088339223,
 'recall': 0.7840909090909091,
 'f1_score': 0.6013071895424836,
 'roc_auc': 0.7271246668640804}

In [59]:
# Depth-limited, class-weighted decision tree; the depth and leaf-size limits
# constrain how finely the tree can split the training data.
dt = DecisionTreeClassifier(
    random_state=42,
    class_weight="balanced",
    max_depth=6,
    min_samples_leaf=20,
).fit(X_train, y_train)

# Validation-split metrics, shown as the cell's output.
dt_metrics = evaluate_model(dt, X_val, y_val)
dt_metrics


{'accuracy': 0.650103519668737,
 'precision': 0.5165876777251185,
 'recall': 0.6193181818181818,
 'f1_score': 0.5633074935400517,
 'roc_auc': 0.702130219129405}

In [60]:
# Class-weighted random forest: 200 trees, each limited in depth and leaf size.
# n_jobs=-1 asks scikit-learn to use every available core.
rf_params = {
    "n_estimators": 200,
    "max_depth": 8,
    "min_samples_leaf": 10,
    "class_weight": "balanced",
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# Validation-split metrics, shown as the cell's output.
rf_metrics = evaluate_model(rf, X_val, y_val)
rf_metrics


{'accuracy': 0.6832298136645962,
 'precision': 0.5511111111111111,
 'recall': 0.7045454545454546,
 'f1_score': 0.6184538653366584,
 'roc_auc': 0.7390065146579805}

In [61]:
# Gradient boosting with 200 depth-3 trees and a 0.05 learning rate.
# Unlike the models above, no class_weight is passed here.
gb = GradientBoostingClassifier(
    random_state=42,
    max_depth=3,
    learning_rate=0.05,
    n_estimators=200,
).fit(X_train, y_train)

# Validation-split metrics, shown as the cell's output.
gb_metrics = evaluate_model(gb, X_val, y_val)
gb_metrics


{'accuracy': 0.6915113871635611,
 'precision': 0.5859872611464968,
 'recall': 0.5227272727272727,
 'f1_score': 0.5525525525525525,
 'roc_auc': 0.7338336541308854}

In [62]:
# Multi-layer perceptron with two hidden layers (64 then 32 units).
# NOTE(review): MLPs are sensitive to feature scale; confirm the processed
# arrays were standardized upstream, since no scaler is applied in this notebook.
mlp_params = dict(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
nn = MLPClassifier(**mlp_params)
nn.fit(X_train, y_train)

# Validation-split metrics, shown as the cell's output.
nn_metrics = evaluate_model(nn, X_val, y_val)
nn_metrics


{'accuracy': 0.6873706004140787,
 'precision': 0.5683060109289617,
 'recall': 0.5909090909090909,
 'f1_score': 0.5793871866295265,
 'roc_auc': 0.7036941071957359}

In [63]:
# Validation metrics for every candidate, keyed by display name (row order
# of the resulting table follows the insertion order of this dict).
validation_scores = {
    "Logistic Regression": baseline_metrics,
    "Decision Tree": dt_metrics,
    "Random Forest": rf_metrics,
    "Gradient Boosting": gb_metrics,
    "Neural Network": nn_metrics,
}

# One row per model: a "model" column followed by the metric columns.
model_comparison = pd.DataFrame(
    [{"model": label, **scores} for label, scores in validation_scores.items()]
)

# Display the table ranked by ROC-AUC, best first; model_comparison itself
# is left in its original (unsorted) row order.
model_comparison.sort_values("roc_auc", ascending=False)


Unnamed: 0,model,accuracy,precision,recall,f1_score,roc_auc
2,Random Forest,0.68323,0.551111,0.704545,0.618454,0.739007
3,Gradient Boosting,0.691511,0.585987,0.522727,0.552553,0.733834
0,Logistic Regression,0.621118,0.487633,0.784091,0.601307,0.727125
4,Neural Network,0.687371,0.568306,0.590909,0.579387,0.703694
1,Decision Tree,0.650104,0.516588,0.619318,0.563307,0.70213


In [64]:
# Persist the comparison table alongside the processed data.
model_comparison.to_csv("../data/processed/model_comparison.csv", index=False)

# Choose the model to persist from the comparison table rather than
# hard-coding it: the notebook ranks candidates by validation ROC-AUC, so the
# artifact named "best_model" should be the top row of that ranking.
fitted_models = {
    "Logistic Regression": log_reg,
    "Decision Tree": dt,
    "Random Forest": rf,
    "Gradient Boosting": gb,
    "Neural Network": nn,
}
best_row = model_comparison["roc_auc"].idxmax()
best_name = model_comparison.loc[best_row, "model"]

# Save the selected model together with the imputer fitted on the training
# split, so the same preprocessing can be replayed at inference time.
joblib.dump(fitted_models[best_name], "../models/best_model.pkl")
joblib.dump(imputer, "../models/imputer.pkl")

print(f"Best model by validation ROC-AUC: {best_name}")
print("✅ Models and comparison saved")


✅ Models and comparison saved
