In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import joblib
import shap

In [2]:
data_path = '/content/WA_Fn-UseC_-Telco-Customer-Churn.csv'
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Dataset not found at {data_path}. Please download the Telco churn CSV and place it there.")

df = pd.read_csv(data_path)
print('Loaded dataset with shape:', df.shape)

df.head()

Loaded dataset with shape: (7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
if 'customerID' in df.columns:
    df = df.drop(columns=['customerID'])

if 'Churn' not in df.columns:
    raise ValueError("No 'Churn' column found in the dataset. Please ensure the target column is named 'Churn' with values 'Yes'/'No' or 1/0.")

if df['Churn'].dtype == object:
    df['Churn'] = df['Churn'].map({'Yes':1, 'No':0})

print('Value counts for target:')
print(df['Churn'].value_counts())

df.info()

Value counts for target:
Churn
0    5174
1    1869
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBil

In [5]:
def preprocess_and_split(df, target='Churn', test_size=0.2, random_state=42):
    X = df.drop(columns=[target])
    y = df[target].astype(int)

    num_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
    cat_cols = X.select_dtypes(include=['object','category','bool']).columns.tolist()

    num_pipeline = Pipeline([
        ('impute', SimpleImputer(strategy='median')),
        ('scale', StandardScaler())
    ])
    cat_pipeline = Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer([
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)
    return preprocessor, X_train, X_test, y_train, y_test

preprocessor, X_train, X_test, y_train, y_test = preprocess_and_split(df)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)
print('Train target distribution:\n', y_train.value_counts(normalize=True))

Train shape: (5634, 19) Test shape: (1409, 19)
Train target distribution:
 Churn
0    0.734647
1    0.265353
Name: proportion, dtype: float64


In [6]:
pipe_lr = Pipeline([
    ('pre', preprocessor),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])
pipe_lr.fit(X_train, y_train)
probs_lr = pipe_lr.predict_proba(X_test)[:,1]
auc_lr = roc_auc_score(y_test, probs_lr)

pipe_rf = Pipeline([
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
])
pipe_rf.fit(X_train, y_train)
probs_rf = pipe_rf.predict_proba(X_test)[:,1]
auc_rf = roc_auc_score(y_test, probs_rf)

pipe_xgb = Pipeline([
    ('pre', preprocessor),
    ('clf', xgb.XGBClassifier(n_estimators=200, use_label_encoder=False, eval_metric='auc', random_state=42))
])
pipe_xgb.fit(X_train, y_train)
probs_xgb = pipe_xgb.predict_proba(X_test)[:,1]
auc_xgb = roc_auc_score(y_test, probs_xgb)

print('AUC scores:')
print(f'  Logistic: {auc_lr:.4f}')
print(f'  RandomForest: {auc_rf:.4f}')
print(f'  XGBoost: {auc_xgb:.4f}')

aucs = {'lr':auc_lr, 'rf':auc_rf, 'xgb':auc_xgb}
best = max(aucs, key=aucs.get)
print('Best model:', best)

best_pipe = {'lr':pipe_lr, 'rf':pipe_rf, 'xgb':pipe_xgb}[best]
preds = (best_pipe.predict_proba(X_test)[:,1] >= 0.5).astype(int)
print('\nClassification report for best model:')
print(classification_report(y_test, preds))
print('Confusion matrix:\n', confusion_matrix(y_test, preds))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


AUC scores:
  Logistic: 0.8403
  RandomForest: 0.8208
  XGBoost: 0.8110
Best model: lr

Classification report for best model:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1035
           1       0.63      0.54      0.58       374

    accuracy                           0.79      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.79      0.79      1409

Confusion matrix:
 [[917 118]
 [172 202]]


In [7]:
os.makedirs('models', exist_ok=True)
joblib.dump(pipe_lr, 'models/logistic_pipeline.joblib')
joblib.dump(pipe_rf, 'models/rf_pipeline.joblib')
joblib.dump(pipe_xgb, 'models/xgb_pipeline.joblib')

Saved models to ./models


In [8]:
sample_idx = 0
sample = X_test.iloc[[sample_idx]]
print('Inspecting test sample index', sample_idx)

pre = best_pipe.named_steps['pre']
clf = best_pipe.named_steps['clf']
X_trans = pre.transform(sample)

try:
    if hasattr(clf, 'predict_proba') and best in ['rf','xgb']:
        explainer = shap.TreeExplainer(clf)
        shap_values = explainer.shap_values(X_trans)
        def get_feature_names(column_transformer):
            feature_names = []
            for name, trans, cols in column_transformer.transformers_:
                if name == 'remainder':
                    continue
                if hasattr(trans, 'named_steps') and 'ohe' in trans.named_steps:
                    ohe = trans.named_steps['ohe']
                    cats = ohe.categories_
                    for col, cat in zip(cols, cats):
                        for c in cat:
                            feature_names.append(f"{col}__{c}")
                else:
                    for col in cols:
                        feature_names.append(col)
            return feature_names
        feat_names = get_feature_names(pre)
        row_shap = np.array(shap_values[1] if isinstance(shap_values, list) else shap_values)[0]
        contribs = sorted(zip(feat_names, row_shap), key=lambda x: -abs(x[1]))
        import pandas as pd
        pd.DataFrame(contribs[:20], columns=['feature','shap_value'])
    else:
        print('SHAP TreeExplainer available only for tree models in this demo. Use KernelExplainer for others (slower).')
except Exception as e:
    print('SHAP explanation error:', e)

Inspecting test sample index 0
SHAP TreeExplainer available only for tree models in this demo. Use KernelExplainer for others (slower).
