In [1]:
# PIPELINES & MODELS - TELCO CUSTOMER CHURN

import pandas as pd
import numpy as np
import joblib
import os, sys

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix
)
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline as SKPipeline
from sklearn.base import clone

sys.path.append(os.path.abspath(".."))
from app.pipelines_transf import DataFramePreparer, ColumnFilter

In [2]:
# LOAD CLEAN DATA
# Read the already-cleaned churn table and echo its dimensions and column
# names so the reader can sanity-check the input before any modelling.
CLEAN_DATA_PATH = "../cleaned_dataset.csv"
df = pd.read_csv(CLEAN_DATA_PATH)

print("Shape:", df.shape)
print("Columns:", list(df.columns))

Shape: (7043, 20)
Columns: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [3]:
# SPLIT FEATURES & TARGET
# `target` names the label column; it is reused by the later cells.
target = "Churn"

# Label vector: copied so it is independent of `df`.
y = df[target].copy()

# Feature matrix: every column except the label.
X = df.drop(columns=[target])

In [4]:
# TRAIN/VAL/TEST SPLIT
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None,
                         val_size=0.2, test_size=0.2):
    """Split a DataFrame into train / validation / test partitions.

    The split is done in two stages: first a hold-out of
    ``val_size + test_size`` is carved off the full frame, then that hold-out
    is divided into validation and test. With the defaults this is the
    60 / 20 / 20 split (hold-out 0.4, then halved).

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the column named by ``stratify`` if given.
    rstate : int, default 42
        Seed forwarded as ``random_state`` to both ``train_test_split`` calls.
    shuffle : bool, default True
        Forwarded to both ``train_test_split`` calls.
    stratify : str or None, default None
        Name of the column whose class proportions should be preserved in
        every partition. ``None`` (or any falsy value) disables stratification.
    val_size : float, default 0.2
        Fraction of the full dataset assigned to the validation partition.
    test_size : float, default 0.2
        Fraction of the full dataset assigned to the test partition.

    Returns
    -------
    tuple
        ``(train_set, val_set, test_set)``.

    Raises
    ------
    ValueError
        If either fraction is not positive or together they leave no
        training data.
    """
    holdout_size = val_size + test_size
    if not (val_size > 0 and test_size > 0 and holdout_size < 1):
        raise ValueError(
            "val_size and test_size must be positive and sum to less than 1; "
            f"got val_size={val_size}, test_size={test_size}"
        )

    # Stage 1: train vs. everything that is held out.
    strat = df[stratify] if stratify else None
    train_set, holdout_set = train_test_split(
        df, test_size=holdout_size, random_state=rstate, shuffle=shuffle, stratify=strat)

    # Stage 2: divide the hold-out. The stratification labels must be re-read
    # from the hold-out itself so they line up with its rows.
    strat = holdout_set[stratify] if stratify else None
    val_set, test_set = train_test_split(
        holdout_set, test_size=test_size / holdout_size, random_state=rstate,
        shuffle=shuffle, stratify=strat)
    return (train_set, val_set, test_set)

def _features_and_label(part):
    """Return ``(features, label)`` for one partition, keyed on the global ``target``."""
    return part.drop(columns=[target]), part[target]


# Stratify on the label so each partition keeps the overall class balance.
train_set, val_set, test_set = train_val_test_split(df, stratify=target)

X_train, y_train = _features_and_label(train_set)
X_val, y_val = _features_and_label(val_set)
X_test, y_test = _features_and_label(test_set)

In [5]:
# FIT PIPELINE (PREPROCESSING)
# The preparer is fitted on the training partition only; validation and test
# are transformed with that already-fitted object.
prep = DataFramePreparer()
prep.fit(X_train)

X_train_prep, X_val_prep, X_test_prep = (
    prep.transform(part) for part in (X_train, X_val, X_test)
)

print("Original shape:", X_train.shape)
print("Transformed shape:", X_train_prep.shape)

Original shape: (4225, 19)
Transformed shape: (4225, 46)


In [6]:
# FEATURE SELECTION (TOP FEATURES)
def seeded_mutual_info(X, y):
    """Mutual-information score function with a fixed seed.

    ``mutual_info_classif`` is stochastic, so without ``random_state`` the
    scores, and therefore the k selected features, can differ between runs.
    Seeding keeps the selection reproducible under Restart & Run All.
    """
    # NOTE(review): discrete_features is left at its default; if the
    # preprocessed matrix is dense, one-hot columns are presumably scored as
    # continuous. Confirm whether that is intended.
    return mutual_info_classif(X, y, random_state=42)


# Keep the 15 preprocessed columns that score highest against the label.
selector = SelectKBest(score_func=seeded_mutual_info, k=15)
selector.fit(X_train_prep, y_train.map({"No": 0, "Yes": 1}))

# Translate the selected column positions back to the preparer's column names.
top_indices = selector.get_support(indices=True)
feature_names = prep._columns
top_features = [feature_names[i] for i in top_indices]

print("Top features:", top_features)

Top features: ['tenure', 'MonthlyCharges', 'TotalCharges', 'InternetService_Fiber optic', 'OnlineSecurity_No', 'OnlineSecurity_No internet service', 'OnlineBackup_No', 'OnlineBackup_No internet service', 'DeviceProtection_No internet service', 'TechSupport_No', 'TechSupport_No internet service', 'Contract_Month-to-month', 'Contract_Two year', 'PaperlessBilling_No', 'PaymentMethod_Electronic check']


In [7]:
# MODELS DEFINED
def _base_learners():
    """Build a fresh ``(name, estimator)`` list for the stacking and voting ensembles.

    A new list with new estimator instances is returned on every call so the
    two ensembles never share estimator objects.
    """
    forest = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    boosted = GradientBoostingClassifier(random_state=42)
    return [("rf", forest), ("gb", boosted)]


# Registry of candidate classifiers, keyed by the name used in exported file names.
models = {}

models["bagging"] = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=3,
    random_state=42,
    n_jobs=-1,
)

models["boosting"] = CatBoostClassifier(
    iterations=400,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=5,
    random_seed=42,
    verbose=False,
    loss_function="Logloss",
)

models["stacking"] = StackingClassifier(
    estimators=_base_learners(),
    final_estimator=LogisticRegression(max_iter=5000, random_state=42),
    n_jobs=-1,
)

models["voting"] = VotingClassifier(
    estimators=_base_learners(),
    voting="soft",
    n_jobs=-1,
)

models["decisiontree"] = DecisionTreeClassifier(max_depth=7, random_state=42)

In [8]:
# TRAINING FUNCTION & EVALUATION
def train_and_evaluate(model, X_tr, y_tr, X_va, y_va, pos_label="Yes"):
    """Fit ``model`` on the training data and score it on the validation data.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``; ``predict_proba`` is optional.
        It is fitted in place.
    X_tr, y_tr :
        Training features and labels passed to ``model.fit``.
    X_va, y_va :
        Validation features and labels. ``y_va`` must support element-wise
        ``==`` followed by ``.astype(int)`` (e.g. a pandas Series).
    pos_label : default "Yes"
        Label treated as the positive class for F1 / precision / recall / AUC.

    Returns
    -------
    tuple
        ``(model, metrics, cm)`` where ``metrics`` is a dict with keys
        ``accuracy``, ``f1``, ``precision``, ``recall`` and ``auc``
        (``auc`` is ``None`` when the model has no ``predict_proba``), and
        ``cm`` is the result of ``confusion_matrix(y_va, y_pred)``.
    """
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_va)

    y_proba = None
    if hasattr(model, "predict_proba"):
        # predict_proba columns follow model.classes_, so look the positive
        # class up there instead of assuming it is column 1. Fall back to
        # column 1 when classes_ is missing or does not contain pos_label.
        classes = list(getattr(model, "classes_", []))
        pos_col = classes.index(pos_label) if pos_label in classes else 1
        y_proba = model.predict_proba(X_va)[:, pos_col]

    metrics = {
        "accuracy": accuracy_score(y_va, y_pred),
        "f1": f1_score(y_va, y_pred, pos_label=pos_label),
        "precision": precision_score(y_va, y_pred, pos_label=pos_label),
        "recall": recall_score(y_va, y_pred, pos_label=pos_label),
        # AUC needs a 0/1 ground truth: 1 where the label equals pos_label.
        "auc": (
            roc_auc_score((y_va == pos_label).astype(int), y_proba)
            if y_proba is not None else None
        ),
    }
    cm = confusion_matrix(y_va, y_pred)
    return model, metrics, cm

In [9]:
# EXPORT TRAINED MODELS (ALL & TOP FEATURES)
# For every candidate model, train one copy on all preprocessed columns and one
# on the selected top columns, score both on the validation split, and dump
# each as a joblib bundle (pipeline + metadata) under `models_dir`.
export_data = []
models_dir = "../models"
os.makedirs(models_dir, exist_ok=True)

RAW_FEATURES = list(X_train.columns)
all_feature_names = prep._columns

# Loop-invariant: positions of the selected columns and the sliced matrices
# are the same for every model, so compute them once.
top_indices = [all_feature_names.index(f) for f in top_features]
X_train_top = X_train_prep[:, top_indices]
X_val_top = X_val_prep[:, top_indices]

for name, model in models.items():
    # clone() gives each variant its own unfitted copy, so the templates in
    # `models` are never fitted themselves.

    # ==================== ALL FEATURES ====================
    print(f"\nTraining {name.upper()} (ALL FEATURES)...")
    model_all, met_all, cm_all = train_and_evaluate(
        clone(model), X_train_prep, y_train, X_val_prep, y_val)

    # ==================== TOP FEATURES ====================
    print(f"Training {name.upper()} (TOP FEATURES)...")
    model_top, met_top, cm_top = train_and_evaluate(
        clone(model), X_train_top, y_train, X_val_top, y_val)

    for version, m, met, cm, feats in [
        ("all", model_all, met_all, cm_all, all_feature_names),
        ("top", model_top, met_top, cm_top, top_features)
    ]:
        # Assemble an inference pipeline from the already-fitted parts.
        steps = [("preprocessing", prep)]
        if version == "top":
            # NOTE(review): the top model was fitted on positional slices of
            # the preparer's output; this assumes ColumnFilter picks the same
            # columns in the same order from that output. Confirm.
            steps.append(("select", ColumnFilter(columns=list(feats))))
        steps.append(("model", m))
        pipe = SKPipeline(steps)

        bundle = {
            "pipeline": pipe,
            "raw_features": RAW_FEATURES,
            "selected_features": feats if version == "top" else None,
            "target_name": target,
            "metrics_val": met,
            "confusion_val": cm.tolist(),
            # Not every estimator exposes importances (e.g. the ensembles
            # wrapping other estimators); store an empty list in that case.
            "feature_importances": (
                m.feature_importances_.tolist() if hasattr(m, "feature_importances_") else []
            ),
        }

        path = os.path.join(models_dir, f"{name}_{version}.pkl")
        joblib.dump(bundle, path)
        print(f"Saved: {path}")

        export_data.append({"Model": name, "Version": version, **met})


Training BAGGING (ALL FEATURES)...
Training BAGGING (TOP FEATURES)...
Saved: ../models\bagging_all.pkl
Saved: ../models\bagging_top.pkl

Training BOOSTING (ALL FEATURES)...
Training BOOSTING (TOP FEATURES)...
Saved: ../models\boosting_all.pkl
Saved: ../models\boosting_top.pkl

Training STACKING (ALL FEATURES)...
Training STACKING (TOP FEATURES)...
Saved: ../models\stacking_all.pkl
Saved: ../models\stacking_top.pkl

Training VOTING (ALL FEATURES)...
Training VOTING (TOP FEATURES)...
Saved: ../models\voting_all.pkl
Saved: ../models\voting_top.pkl

Training DECISIONTREE (ALL FEATURES)...
Training DECISIONTREE (TOP FEATURES)...
Saved: ../models\decisiontree_all.pkl
Saved: ../models\decisiontree_top.pkl


In [10]:
# SUMMARY OF RESULTS
# One row per (model, version) with its validation metrics, shown inline and
# persisted next to the exported model bundles.
results = pd.DataFrame(export_data)

print("\nSummary of validation metrics:")
display(results)

summary_csv = os.path.join(models_dir, "model_metrics_summary.csv")
results.to_csv(summary_csv, index=False)
print(f"Metrics saved to {models_dir}/model_metrics_summary.csv")


Summary of validation metrics:


Unnamed: 0,Model,Version,accuracy,f1,precision,recall,auc
0,bagging,all,0.804116,0.576687,0.676259,0.502674,0.852467
1,bagging,top,0.809084,0.588055,0.688172,0.513369,0.850745
2,boosting,all,0.801278,0.578313,0.662069,0.513369,0.838264
3,boosting,top,0.808375,0.597015,0.675676,0.534759,0.845688
4,stacking,all,0.809794,0.574603,0.707031,0.483957,0.85321
5,stacking,top,0.808375,0.574132,0.7,0.486631,0.852817
6,voting,all,0.804826,0.573643,0.682657,0.494652,0.849237
7,voting,top,0.807665,0.584992,0.684588,0.510695,0.847601
8,decisiontree,all,0.776437,0.574899,0.580381,0.569519,0.817695
9,decisiontree,top,0.789212,0.618742,0.595062,0.644385,0.817553


Metrics saved to ../models/model_metrics_summary.csv
