# Churn Prediction Model
This notebook trains an XGBoost-based churn model on the `churn.csv` dataset. It drops the UI-only columns (`customerID`, `variant`) and saves the trained model, feature scaler, and label encoders as artifacts for deployment.

In [ ]:
# Step 1: Imports
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix


In [ ]:
# Step 2: Load & Clean Data
# Read the raw customer table (adjust the relative path as needed).
df = pd.read_csv("../data/churn.csv")
# Discard the UI-only columns; errors="ignore" makes this a no-op when a
# column is already absent.
df = df.drop(columns=["customerID", "variant"], errors="ignore")
# Keep only rows whose TotalCharges is non-blank once whitespace is trimmed.
total_charges_text = df["TotalCharges"].astype(str).str.strip()
df = df[total_charges_text != ""].copy()
# Parse to numbers; anything that still fails to parse becomes NaN and is
# removed by the dropna below.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df.dropna(subset=["TotalCharges"], inplace=True)
# Replace TotalCharges with log(1 + TotalCharges).
df["TotalCharges"] = np.log1p(df["TotalCharges"])


In [ ]:
# Step 3: Encode Target & Categoricals
# Target: 1 where Churn equals 'Yes', 0 for every other value.
df['Churn'] = df['Churn'].eq('Yes').astype(int)

# Every remaining object-dtype column is treated as categorical. Churn is
# already an integer column at this point, so it is not selected.
categorical_cols = list(df.select_dtypes(include=['object']).columns)

# One encoder per categorical column, kept by column name so the fitted
# mappings can be persisted and reused later.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    # Fit on the column and overwrite it with its integer codes.
    df[col] = le.fit_transform(df[col])


In [ ]:
# Step 4: Split features & target
# The target is the encoded Churn column; the features are every other column.
y = df['Churn']
X = df.drop(columns='Churn')


In [ ]:
# Step 5: Train-test split
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [ ]:
# Step 6: Feature scaling
# Learn the scaling statistics from the training partition only, then apply
# that same fitted transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [ ]:
# Step 7: Train XGBoost with imbalance handling
# Weight the positive class by the negative-to-positive count ratio of the
# training labels; this value is passed to XGBoost as scale_pos_weight.
ratio = (y_train == 0).sum() / (y_train == 1).sum()

# NOTE: `use_label_encoder` is intentionally not passed. It is deprecated and
# ignored by recent XGBoost releases, where supplying it only produces a
# "Parameters: { "use_label_encoder" } are not used" warning. The target is
# already encoded as integer 0/1, so no label encoding is needed.
model = xgb.XGBClassifier(
    eval_metric='logloss',
    scale_pos_weight=ratio,
    learning_rate=0.1,
    max_depth=4,
    # Row and column subsampling per tree.
    subsample=0.8,
    colsample_bytree=0.8,
    # L2 and L1 regularisation strengths.
    reg_lambda=1.0,
    reg_alpha=0.3,
    n_estimators=100,
    random_state=42,
)
model.fit(X_train_scaled, y_train)


In [ ]:
# Step 8: Evaluate Model
# Class predictions from the model on the scaled test features.
y_pred = model.predict(X_test_scaled)
# Column 1 of predict_proba, used as the score for ROC AUC.
test_probabilities = model.predict_proba(X_test_scaled)
y_proba = test_probabilities[:, 1]

# AUC is computed from the probabilities rather than the class predictions.
auc_score = roc_auc_score(y_test, y_proba)
print("AUC:", auc_score)

# The classification report is computed from the class predictions.
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)


In [ ]:
# Step 9: Confusion Matrix
# Rows of cm are the actual classes, columns are the predicted classes.
cm = confusion_matrix(y_test, y_pred)

# Draw the integer counts as an annotated heatmap on an explicit axes object.
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()


In [ ]:
# Step 10: Save Artifacts
# Persist the model, the scaler, and the per-column label encoders, each to
# its own file in the current working directory.
artifacts = {
    'xgb_model.pkl': model,
    'scaler.pkl': scaler,
    'label_encoders.pkl': label_encoders,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, filename)
print('Artifacts saved: xgb_model.pkl, scaler.pkl, label_encoders.pkl')
