In [3]:
# PIPELINES & MODELS - TELCO CUSTOMER CHURN
# This notebook trains, evaluates and exports three models:
# Logistic Regression, Random Forest and Decision Tree.
# Each one comes in two versions: All features / Top features

import sys, os
from functools import partial

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# The parent directory must be on sys.path before the local `app` package is imported.
sys.path.append(os.path.abspath(".."))
from app.pipelines_transf import DataFramePreparer

In [None]:
# LOAD CLEAN DATA
# Read the dataset CSV from the parent directory; the path is relative to the
# notebook's working directory.
df = pd.read_csv("../cleaned_dataset.csv")
# Print (rows, columns) as a quick check that the expected file was loaded.
print("Shape:", df.shape)

Shape: (7043, 20)


In [7]:
# Name of the label column; later cells reuse `target` for splitting and exporting.
target = "Churn"
# Feature frame (every column except the label) and label column for the full dataset.
# NOTE(review): no later cell visible here reads X or y -- the split cell works
# from `df` directly -- confirm whether these are still needed.
X = df.drop(columns=[target])
y = df[target].copy()

In [None]:
# TRAIN/VAL/TEST SPLIT
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    """Split ``df`` into train, validation and test frames.

    The split is done in two passes: 40% of the rows are first held out, and
    that hold-out is then halved into validation and test.

    Args:
        df: DataFrame to split.
        rstate: Seed forwarded as ``random_state`` to both split passes.
        shuffle: Forwarded as ``shuffle`` to both split passes.
        stratify: Name of the column to stratify on, or a falsy value to
            disable stratification.

    Returns:
        Tuple ``(train_set, val_set, test_set)``.
    """
    def _strata(frame):
        # Stratification labels have to come from the frame being split.
        return frame[stratify] if stratify else None

    train_set, holdout = train_test_split(
        df,
        test_size=0.4,
        random_state=rstate,
        shuffle=shuffle,
        stratify=_strata(df),
    )
    val_set, test_set = train_test_split(
        holdout,
        test_size=0.5,
        random_state=rstate,
        shuffle=shuffle,
        stratify=_strata(holdout),
    )
    return (train_set, val_set, test_set)

# Stratify on the label column so each split is drawn with the same stratification.
train_set, val_set, test_set = train_val_test_split(df, stratify=target)

# Separate features from the label for each split, in train / val / test order.
X_train, X_val, X_test = (
    split.drop(columns=[target]) for split in (train_set, val_set, test_set)
)
y_train, y_val, y_test = (
    split[target] for split in (train_set, val_set, test_set)
)

In [10]:
# FIT PIPELINE (PREPROCESSING)
# The preparer is fitted on the training features only; validation and test
# are transformed with that same fitted object.
prep = DataFramePreparer()
prep.fit(X_train)

X_train_prep, X_val_prep, X_test_prep = (
    prep.transform(features) for features in (X_train, X_val, X_test)
)

# Compare shapes before and after preprocessing.
print("Original shape:", X_train.shape)
print("Transformed shape:", X_train_prep.shape)

Original shape: (4225, 19)
Transformed shape: (4225, 45)


In [11]:
# FEATURE SELECTION (TOP FEATURES)
# mutual_info_classif uses random number generation internally, so without a
# fixed random_state the selected columns can change between runs. Pin the seed
# (42, as used for the splits and models) so the "top" feature set is reproducible.
selector = SelectKBest(score_func=partial(mutual_info_classif, random_state=42), k=10)
# Scores are computed on the training split only.
selector.fit(X_train_prep, y_train)
# Map the positions of the selected columns back to column names; this relies on
# X_train_prep exposing `.columns`.
top_indices = selector.get_support(indices=True)
top_features = X_train_prep.columns[top_indices]
print("Top features:", list(top_features))

Top features: ['tenure', 'MonthlyCharges', 'InternetService_Fiber optic', 'OnlineSecurity_No', 'OnlineSecurity_No internet service', 'OnlineBackup_No internet service', 'TechSupport_No', 'Contract_Month-to-month', 'Contract_Two year', 'PaymentMethod_Electronic check']


In [12]:
# TRAINING FUNCTIONS & EVALUATION
def train_and_evaluate(model, X_tr, y_tr, X_va, y_va):
    """Fit ``model`` on the training data and score it on the validation data.

    Args:
        model: Estimator with ``fit`` and ``predict``; ``predict_proba`` is
            optional. It is fitted in place, and the same object is returned.
        X_tr: Training features.
        y_tr: Training labels.
        X_va: Validation features.
        y_va: Validation labels. Must support ``.map``; the metrics treat
            ``"Yes"`` as the positive class.

    Returns:
        Tuple ``(model, metrics, cm)`` where ``metrics`` is a dict with keys
        ``accuracy``, ``f1``, ``precision``, ``recall`` and ``auc`` (``None``
        when the model has no ``predict_proba``) and ``cm`` is the confusion
        matrix of the validation predictions.
    """
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_va)

    # Column 1 of predict_proba is used as the positive-class score
    # (assumes the fitted classes are ordered "No", "Yes").
    can_score = hasattr(model, "predict_proba")
    positive_scores = model.predict_proba(X_va)[:, 1] if can_score else None

    metrics = {"accuracy": accuracy_score(y_va, predictions)}
    for metric_name, scorer in (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    ):
        metrics[metric_name] = scorer(y_va, predictions, pos_label="Yes")

    if positive_scores is None:
        metrics["auc"] = None
    else:
        # The labels are mapped to 0/1 before computing the AUC.
        binary_truth = y_va.map({"No":0, "Yes":1})
        metrics["auc"] = roc_auc_score(binary_truth, positive_scores)

    cm = confusion_matrix(y_va, predictions)
    return model, metrics, cm

In [None]:
# TRAIN ALL MODELS
# Unfitted estimator templates. Every training run below works on a fresh clone,
# so the objects stored in `models` themselves are never fitted.
models = {
    "logreg": LogisticRegression(max_iter=5000, random_state=42),
    "rf": RandomForestClassifier(n_estimators=200, random_state=42),
    "dt": DecisionTreeClassifier(max_depth=10, min_samples_leaf=5, random_state=42),
}

# One (model name, feature-set version, validation metrics) tuple per saved bundle.
# This must be a list: rows are appended below and unpacked by the summary cell.
export_data = []

# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs("../models", exist_ok=True)

for name, model in models.items():
    # clone() returns an unfitted copy with the same hyperparameters. Fitting the
    # same instance twice would make m_all and m_top the same object, so the "all"
    # bundle would store the estimator refitted on the top features only.
    print(f"\nTraining {name.upper()} (ALL FEATURES)...")
    m_all, met_all, cm_all = train_and_evaluate(
        clone(model), X_train_prep, y_train, X_val_prep, y_val)

    print(f"{name.upper()} (TOP FEATURES)...")
    m_top, met_top, cm_top = train_and_evaluate(
        clone(model), X_train_prep[top_features], y_train, X_val_prep[top_features], y_val)

    # Save bundles
    for version, m, met, cm, feats in [
        ("all", m_all, met_all, cm_all, X_train_prep.columns),
        ("top", m_top, met_top, cm_top, top_features)
    ]:
        bundle = {
            # The fitted preprocessor is stored with the model so a consumer can
            # transform raw inputs and then select `features_in_`.
            "pipeline": prep,
            "model": m,
            "features_in_": list(feats),
            "target_name": target,
            "metrics_val": met,
            "confusion_val": cm.tolist(),
            # Only estimators that expose feature_importances_ get a non-empty list.
            "feature_importances": (
                m.feature_importances_.tolist() if hasattr(m, "feature_importances_") else []
            ),
        }
        fname = f"../models/{name}_{version}.pkl"
        joblib.dump(bundle, fname)
        export_data.append((name, version, met))
        print(f"Saved: {fname}")

In [None]:
# SUMMARY OF RESULTS
# One row per (model, feature-set version); each metric name becomes a column.
results = pd.DataFrame(
    [
        {"Model": model_name, "Version": feature_set, **val_metrics}
        for model_name, feature_set, val_metrics in export_data
    ]
)
print("\nSummary of validation metrics:")
display(results)

# Write the same table to a CSV file in the models directory.
results.to_csv("../models/model_metrics_summary.csv", index=False)
print("Metrics saved to models/model_metrics_summary.csv")