In [1]:
import pandas as pd
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix
from scipy.stats import randint
import pickle


In [2]:
# ------------------------------------------------------------------
# 1. Load & prep
# ------------------------------------------------------------------
# Read the churn table, then discard the "Unnamed: 0" column
# (presumably a row index written out by an earlier to_csv -- confirm).
df = pd.read_csv("tel_churn.csv")
df = df.drop(columns=["Unnamed: 0"])

# Target is the "Churn" column; every remaining column is a feature.
y = df["Churn"]
X = df.drop(columns=["Churn"])

# Column order is kept so it can be stored next to the model later on.
feature_names = X.columns

In [3]:
# ------------------------------------------------------------------
# 2. Re-sampling (SMOTE-ENN)
# ------------------------------------------------------------------
X_res, y_res = SMOTEENN(random_state=100).fit_resample(X, y)

In [4]:
# ------------------------------------------------------------------
# 3. Stratified split
# ------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res,
    test_size=0.20,
    stratify=y_res,
    random_state=100
)

In [5]:
# ------------------------------------------------------------------
# 4-A.  Base RF model
# ------------------------------------------------------------------
# Starting-point forest. It is handed to the randomized search in 4-B,
# which samples its own n_estimators, max_depth and min_samples_leaf.
rf_base = RandomForestClassifier(
    # --- forest size / tree shape ---
    n_estimators=400,
    max_depth=None,                      # None = no depth cap
    min_samples_leaf=2,                  # at least two samples per leaf
    # --- class imbalance: weights recomputed per bootstrap sample ---
    class_weight="balanced_subsample",
    # --- out-of-bag estimate is computed on fit; no cell shown reads it ---
    oob_score=True,
    # --- reproducibility / parallelism ---
    random_state=100,
    n_jobs=-1,
)

In [6]:
# ------------------------------------------------------------------
# 4-B.  Quick hyper-tune
# ------------------------------------------------------------------
# Integer distributions to sample from; scipy's randint(low, high)
# draws from [low, high), i.e. the upper bound is excluded.
param_grid = dict(
    n_estimators=randint(300, 800),
    max_depth=randint(10, 25),           # allow deeper trees
    min_samples_leaf=randint(1, 6),
)

# 25 random candidates, each scored by F1 under 3-fold cross-validation.
search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=param_grid,
    n_iter=25,
    scoring="f1",
    cv=3,
    n_jobs=-1,
    random_state=100,
    verbose=0,
)
search.fit(X_train, y_train)

# Winning candidate; refit is left at its default, so it is already fitted.
rf_best = search.best_estimator_

In [7]:
# ------------------------------------------------------------------
# 4-C.  Probability calibration (sigmoid / Platt scaling)
# ------------------------------------------------------------------
# The calibrator wraps the tuned forest and cross-fits it with 3 folds.
# method="sigmoid" is what actually runs; put "isotonic" here instead
# to try the non-parametric alternative.
cal_rf = CalibratedClassifierCV(estimator=rf_best, method="sigmoid", cv=3)
cal_rf = cal_rf.fit(X_train, y_train)   # fit() returns the same object

In [8]:
# ------------------------------------------------------------------
# 5.  Evaluation on hold-out test set
# ------------------------------------------------------------------
print("\n==  Calibrated Random-Forest  ==")
test_accuracy = cal_rf.score(X_test, y_test)
print(f"Accuracy : {test_accuracy:.4f}")

# Hard class predictions feed both the per-class report and the matrix.
y_pred = cal_rf.predict(X_test)

report_text = classification_report(y_test, y_pred)
print("\nClassification report\n", report_text)

conf_mat = confusion_matrix(y_test, y_pred)
print("Confusion matrix\n", conf_mat)

# ------------------------------------------------------------------
# 6.  Persist calibrated model + feature order
# ------------------------------------------------------------------
# One pickle holding a (model, feature_names) tuple, so the column order
# used for training travels together with the estimator.
artifact = (cal_rf, feature_names)
with open("model.sav", "wb") as fh:
    pickle.dump(artifact, fh)

print("\n✔  Calibrated model saved to model.sav")


==  Calibrated Random-Forest  ==
Accuracy : 0.9547

Classification report
               precision    recall  f1-score   support

           0       0.96      0.94      0.95       528
           1       0.95      0.97      0.96       643

    accuracy                           0.95      1171
   macro avg       0.96      0.95      0.95      1171
weighted avg       0.95      0.95      0.95      1171

Confusion matrix
 [[496  32]
 [ 21 622]]

✔  Calibrated model saved to model.sav
