In [1]:
!pip install -q lightgbm shap lime scikit-learn pandas numpy matplotlib seaborn

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m266.2/275.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for lime (setup.py) ... [?25l[?25hdone


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
# -------------------- Imports --------------------
import os, json, warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import lightgbm as lgb
import shap
from lime.lime_tabular import LimeTabularExplainer
from datetime import datetime
np.random.seed(42)

In [4]:
# -------------------- Configuration --------------------
# Primary (uploaded) dataset path
UPLOADED_PATH = "/content/drive/MyDrive/Colab Notebooks/Telco_customer_churn.csv"
ALTERNATE_PATHS = [
    "/content/Telco_customer_churn.csv",
    "/content/drive/MyDrive/Telco_customer_churn.csv",
    "/content/drive/MyDrive/Telco_customer_churn/ Telco_customer_churn.csv"
]
DATA_PATH = None
for p in [UPLOADED_PATH] + ALTERNATE_PATHS:
    if os.path.exists(p):
        DATA_PATH = p
        break

if DATA_PATH is None:
    raise FileNotFoundError("Dataset not found. Upload Telco_customer_churn.csv to Colab or Drive and set DATA_PATH accordingly.")

TARGET_COL = 'Churn'      # Typical Telco churn column name; adjust if your dataset uses 'churn' or 'Churn Label'
OUTPUT_DIR = 'churn_outputs'
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Using dataset:", DATA_PATH)
print("Output folder:", OUTPUT_DIR)

Using dataset: /content/drive/MyDrive/Colab Notebooks/Telco_customer_churn.csv
Output folder: churn_outputs


In [5]:
# -------------------- Load dataset --------------------
df = pd.read_csv(DATA_PATH)
print("Dataset shape:", df.shape)
# preview columns
print("Columns:", df.columns.tolist()[:40])

# Try to unify typical Telco dataset target naming
if 'Churn' not in df.columns and 'churn' in df.columns:
    TARGET_COL = 'churn'
if 'Churn' not in df.columns and 'Churn Label' in df.columns:
    TARGET_COL = 'Churn Label'
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column not found. Expected 'Churn' (or similar). Found columns: {df.columns.tolist()}")

# Convert target to binary 0/1 if it's Yes/No or strings
if df[TARGET_COL].dtype == 'object':
    df[TARGET_COL] = df[TARGET_COL].str.strip().map(lambda x: 1 if str(x).lower() in ['yes','y','true','1'] else 0)

# Save target distribution
target_counts = df[TARGET_COL].value_counts().to_dict()
with open(os.path.join(OUTPUT_DIR, 'target_distribution.json'), 'w') as f:
    json.dump(target_counts, f, indent=2)
print("Target distribution:", target_counts)

Dataset shape: (7043, 33)
Columns: ['CustomerID', 'Count', 'Country', 'State', 'City', 'Zip Code', 'Lat Long', 'Latitude', 'Longitude', 'Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Tenure Months', 'Phone Service', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method', 'Monthly Charges', 'Total Charges', 'Churn Label', 'Churn Value', 'Churn Score', 'CLTV', 'Churn Reason']
Target distribution: {0: 5174, 1: 1869}


In [6]:
# -------------------- Basic preprocessing & feature engineering --------------------
# Identify numeric and categorical columns
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()
if TARGET_COL in num_cols: num_cols.remove(TARGET_COL)
if TARGET_COL in cat_cols: cat_cols.remove(TARGET_COL)

# Basic cleaning: fill numeric NaNs with median, text with 'missing'
for c in num_cols:
    df[c] = df[c].fillna(df[c].median())
for c in cat_cols:
    df[c] = df[c].fillna('missing')

# Example simple derived features if available (safe checks)
derived_cols = []
if 'tenure' in df.columns and 'MonthlyCharges' in df.columns:
    # normalize naming differences
    if 'MonthlyCharges' in df.columns:
        df['tenure_total_charges_proxy'] = df['tenure'] * df['MonthlyCharges']
        derived_cols.append('tenure_total_charges_proxy')

# Limit cardinality for one-hot: drop very high-cardinality categorical columns
MAX_CARDINALITY = 50
cat_small = [c for c in cat_cols if df[c].nunique() <= MAX_CARDINALITY]
cat_big = [c for c in cat_cols if df[c].nunique() > MAX_CARDINALITY]
if cat_big:
    print("Dropping high-cardinality categorical columns to keep preprocessing simple:", cat_big)

# Build feature matrix with one-hot for small cats
feature_df = pd.get_dummies(df[num_cols + cat_small + derived_cols], drop_first=True)
feature_names = feature_df.columns.tolist()
print("Final feature matrix shape:", feature_df.shape)

Dropping high-cardinality categorical columns to keep preprocessing simple: ['CustomerID', 'City', 'Lat Long', 'Total Charges']
Final feature matrix shape: (7043, 56)


In [7]:
# -------------------- Train-test split --------------------
X = feature_df.copy()
y = df[TARGET_COL].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print("Train/Test shapes:", X_train.shape, X_test.shape)


Train/Test shapes: (5634, 56) (1409, 56)


In [8]:
# -------------------- Pipeline & Randomized Search over LightGBM --------------------
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', lgb.LGBMClassifier(objective='binary', random_state=42, n_jobs=1))
])

param_dist = {
    'clf__num_leaves': [15, 31, 63],
    'clf__n_estimators': [100, 250, 500],
    'clf__learning_rate': [0.01, 0.05, 0.1],
    'clf__min_child_samples': [5, 10, 20]
}

rs = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=8, cv=3, scoring='roc_auc', random_state=42, verbose=1)
print("Starting RandomizedSearchCV...")
rs.fit(X_train, y_train)
print("Best CV AUC:", rs.best_score_)
print("Best params:", rs.best_params_)

best_pipeline = rs.best_estimator_
best_model = best_pipeline.named_steps['clf']

# Save best model params as text (deliverable)
params_text = "Best model parameters (LightGBM):\n" + "\n".join([f"{k}: {v}" for k,v in best_model.get_params().items()])
with open(os.path.join(OUTPUT_DIR, 'best_model_params.txt'), 'w') as f:
    f.write(params_text)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Number of positive: 996, number of negative: 2760
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1564
[LightGBM] [Info] Number of data points in the train set: 3756, number of used features: 51
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265176 -> initscore=-1.019239
[LightGBM] [Info] Start training from score -1.019239
[LightGBM] [Info] Number of positive: 997, number of negative: 2759
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001767 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1576
[LightGBM] [Info] Number of data points in the train 

In [9]:
# -------------------- Evaluation on Test set --------------------
X_test_proc = best_pipeline.named_steps['scaler'].transform(X_test)
y_pred = best_pipeline.named_steps['clf'].predict(X_test_proc)
y_prob = best_pipeline.named_steps['clf'].predict_proba(X_test_proc)[:,1]

metrics = {
    'auc': float(roc_auc_score(y_test, y_prob)),
    'f1': float(f1_score(y_test, y_pred)),
    'precision': float(precision_score(y_test, y_pred)),
    'recall': float(recall_score(y_test, y_pred))
}
with open(os.path.join(OUTPUT_DIR, 'metrics.json'), 'w') as f:
    json.dump(metrics, f, indent=2)
print("Test metrics:", metrics)

# Save classification report
cr = classification_report(y_test, y_pred)
with open(os.path.join(OUTPUT_DIR, 'classification_report.txt'), 'w') as f:
    f.write(cr)

# Confusion matrix image (small)
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(4,4))
plt.imshow(cm, interpolation='nearest')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
for (i, j), val in np.ndenumerate(cm):
    plt.text(j, i, val, ha='center', va='center', color='white')
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'confusion_matrix.png'), dpi=100)
plt.close()


Test metrics: {'auc': 1.0, 'f1': 1.0, 'precision': 1.0, 'recall': 1.0}


In [10]:
# -------------------- SHAP: Global & Local explanations --------------------
print("Running SHAP (using small sample to limit memory)...")
# Use TreeExplainer on the LightGBM model directly
explainer = shap.TreeExplainer(best_model)

# SHAP sample from training data (limit to 500 rows to keep small)
shap_sample = X_train.sample(n=min(500, len(X_train)), random_state=42)
shap_vals = explainer.shap_values(shap_sample)

# Global summary plot (save)
plt.figure(figsize=(8,5))
shap.summary_plot(shap_vals, shap_sample, show=False)
plt.title('SHAP summary (sample)')
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'shap_summary.png'), bbox_inches='tight', dpi=100)
plt.close()

# Save feature importance (mean absolute SHAP)
mean_abs_shap = np.abs(shap_vals).mean(axis=0)
feat_imp = pd.DataFrame({
    'feature': shap_sample.columns,
    'mean_abs_shap': mean_abs_shap
}).sort_values('mean_abs_shap', ascending=False)
feat_imp.to_csv(os.path.join(OUTPUT_DIR, 'shap_feature_importance.csv'), index=False)

# Short textual explanation of top 10 SHAP features
top10 = feat_imp.head(10)
text_lines = ["Top 10 features by mean |SHAP|:"]
for _, row in top10.iterrows():
    text_lines.append(f"{row['feature']}: {row['mean_abs_shap']:.6f}")
text_lines.append("\nInterpretation: Positive SHAP values increase predicted churn probability; negative decrease. Use domain context to craft retention actions.")
with open(os.path.join(OUTPUT_DIR, 'shap_global_summary.txt'), 'w') as f:
    f.write("\n".join(text_lines))

# Local SHAP explanations for 5 representative test examples (3 churn, 2 non-churn if available)
print("Creating local SHAP summaries for representative examples...")
local_idxs = []
# pick up to 3 churn examples and 2 non-churn examples from the test set
churn_idxs = list(y_test[y_test==1].sample(n=min(3, y_test.sum()), random_state=42).index) if y_test.sum()>0 else []
nonchurn_idxs = list(y_test[y_test==0].sample(n=min(2, (y_test==0).sum()), random_state=42).index)
local_idxs = churn_idxs + nonchurn_idxs

local_summary_list = []
for i, idx in enumerate(local_idxs):
    x_row = X_test.loc[[idx]]
    shap_local = explainer.shap_values(x_row)
    df_local = pd.DataFrame({
        'feature': x_row.columns,
        'value': x_row.values.flatten(),
        'shap_value': shap_local.flatten()
    })
    df_local['abs_shap'] = df_local['shap_value'].abs()
    df_local = df_local.sort_values('abs_shap', ascending=False).head(10)
    fname = os.path.join(OUTPUT_DIR, f'shap_local_top10_idx_{idx}.csv')
    df_local.to_csv(fname, index=False)
    local_summary_list.append({'index': int(idx), 'top_local': df_local[['feature','value','shap_value']].to_dict(orient='records')})

with open(os.path.join(OUTPUT_DIR, 'shap_local_summaries.json'), 'w') as f:
    json.dump(local_summary_list, f, indent=2)

Running SHAP (using small sample to limit memory)...
Creating local SHAP summaries for representative examples...


In [11]:
# -------------------- LIME: local explanations for same examples --------------------
print("Running LIME for selected examples...")
# LIME needs a prediction function that takes numpy array of raw features (not scaled)
def predict_proba_for_lime(x_numpy):
    # x_numpy: rows in original numeric-onehot feature space
    x_scaled = best_pipeline.named_steps['scaler'].transform(x_numpy)
    return best_pipeline.named_steps['clf'].predict_proba(x_scaled)

# Build Lime explainer on training data
explainer_lime = LimeTabularExplainer(training_data=X_train.values,
                                      feature_names=X_train.columns.tolist(),
                                      class_names=['no_churn','churn'],
                                      mode='classification',
                                      discretize_continuous=True)

lime_html_files = []
lime_summaries = []
for i, idx in enumerate(local_idxs):
    row = X_test.loc[idx:idx]
    exp = explainer_lime.explain_instance(row.values.flatten(), predict_proba_for_lime, num_features=10)
    html_path = os.path.join(OUTPUT_DIR, f'lime_explanation_idx_{idx}.html')
    exp.save_to_file(html_path)
    lime_html_files.append(html_path)
    lime_summaries.append({'index': int(idx), 'lime_top': exp.as_list(label=1)})

with open(os.path.join(OUTPUT_DIR, 'lime_local_summaries.json'), 'w') as f:
    json.dump(lime_summaries, f, indent=2)

# Create combined SHAP vs LIME comparison file for these examples
comparisons = []
for s in local_summary_list:
    idx = s['index']
    shap_top = s['top_local']
    lime_top = next((x['lime_top'] for x in lime_summaries if x['index']==idx), [])
    comparisons.append({'index': idx, 'shap_top': shap_top, 'lime_top': lime_top})
with open(os.path.join(OUTPUT_DIR, 'shap_lime_comparisons.json'), 'w') as f:
    json.dump(comparisons, f, indent=2)

Running LIME for selected examples...


In [12]:
# -------------------- Final textual analysis & README for GitHub (deliverables) --------------------
final_lines = []
final_lines.append("Cultus - Interpretable ML: Churn Prediction")
final_lines.append(f"Generated at: {datetime.utcnow().isoformat()} UTC")
final_lines.append("\nMethod summary:")
final_lines.append("- Preprocessing: median imputation for numerics, one-hot for small categorical columns")
final_lines.append("- Model: LightGBM classifier with RandomizedSearchCV (AUC scoring)")
final_lines.append("- Interpretability: SHAP (global + local) and LIME (local) on representative examples")
final_lines.append("\nTest metrics:")
final_lines.append(json.dumps(metrics, indent=2))
final_lines.append("\nTop SHAP features saved in: shap_feature_importance.csv")
final_lines.append("Local SHAP summaries: shap_local_summaries.json")
final_lines.append("Local LIME HTML files: " + ", ".join([os.path.basename(p) for p in lime_html_files]))
final_lines.append("\nNotes for Cultus submission and GitHub:")
final_lines.append("- Include only the notebook (.ipynb) or this script and the files inside churn_outputs/ (text, CSV, PNG, HTML).")
final_lines.append("- Do NOT include model binary files (joblib).")
final_lines.append("- Keep repo size <25MB: the script uses small SHAP samples and limited images to help ensure low size.")
final_lines.append("\nActionable recommendations (business):")
final_lines.append("- Use the top SHAP features to define retention cohorts and personalized incentives.")
final_lines.append("- Use LIME per-customer local outputs to craft specific outreach messages for high-risk customers.")
with open(os.path.join(OUTPUT_DIR, 'final_analysis.txt'), 'w') as f:
    f.write("\n".join(final_lines))

# Save a README and requirements for GitHub
readme = """# Cultus - Interpretable ML: Churn Prediction

This repository contains a Colab-friendly notebook/script to train a LightGBM churn model and produce interpretability deliverables (SHAP & LIME).

**Files to include in GitHub (keep <25MB):**
- churn_interpretability.ipynb (or this script)
- churn_outputs/ (include .txt, .csv, .png, .html but exclude large binaries)
- requirements.txt

How to run:
1. Upload `Telco_customer_churn.csv` to Colab or mount Google Drive and set DATA_PATH accordingly.
2. Run the notebook/script. Outputs will be in `churn_outputs/`.

"""
with open(os.path.join(OUTPUT_DIR, 'README_for_gitingest.txt'), 'w') as f:
    f.write(readme)
with open(os.path.join(OUTPUT_DIR, 'requirements.txt'), 'w') as f:
    f.write("\n".join(["numpy","pandas","scikit-learn","lightgbm","shap","lime","matplotlib"]))


In [13]:
# -------------------- Summary & size check --------------------
print("\nDeliverables created in:", OUTPUT_DIR)
for fn in sorted(os.listdir(OUTPUT_DIR)):
    path = os.path.join(OUTPUT_DIR, fn)
    print(fn, "-", os.path.getsize(path)//1024, "KB")

# Note: If the total size is > 25MB, remove or compress large artifacts (reduce png dpi, reduce SHAP sample)
total_bytes = sum(os.path.getsize(os.path.join(OUTPUT_DIR, f)) for f in os.listdir(OUTPUT_DIR))
print("Total churn_outputs size: {:.2f} MB".format(total_bytes / (1024*1024)))
print("\nDone. Inspect churn_outputs/ and then upload notebook + churn_outputs (text, csv, png, html) to GitHub.")


Deliverables created in: churn_outputs
README_for_gitingest.txt - 0 KB
best_model_params.txt - 0 KB
classification_report.txt - 0 KB
confusion_matrix.png - 16 KB
final_analysis.txt - 1 KB
lime_explanation_idx_3083.html - 1203 KB
lime_explanation_idx_33.html - 1203 KB
lime_explanation_idx_431.html - 1203 KB
lime_explanation_idx_5290.html - 1203 KB
lime_explanation_idx_891.html - 1203 KB
lime_local_summaries.json - 5 KB
metrics.json - 0 KB
requirements.txt - 0 KB
shap_feature_importance.csv - 1 KB
shap_global_summary.txt - 0 KB
shap_lime_comparisons.json - 11 KB
shap_local_summaries.json - 6 KB
shap_local_top10_idx_3083.csv - 0 KB
shap_local_top10_idx_33.csv - 0 KB
shap_local_top10_idx_431.csv - 0 KB
shap_local_top10_idx_5290.csv - 0 KB
shap_local_top10_idx_891.csv - 0 KB
shap_summary.png - 103 KB
target_distribution.json - 0 KB
Total churn_outputs size: 6.03 MB

Done. Inspect churn_outputs/ and then upload notebook + churn_outputs (text, csv, png, html) to GitHub.
