# Customer Churn Analysis - Machine Learning Modeling

This notebook builds and evaluates machine learning models for customer churn prediction.

**Educational Purpose**: This analysis is for learning and demonstration purposes only.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Import custom utilities
import sys
sys.path.append('..')
import utils

## 1. Data Loading and Preparation

In [None]:
# Prefer the cleaned dataset exported by the EDA notebook; when that file is
# missing, synthesize a seeded stand-in so the rest of the notebook still runs.
try:
    df = pd.read_csv('../data/cleaned_churn_data.csv')
except FileNotFoundError:
    print("Creating sample data for modeling...")
    np.random.seed(42)
    n_samples = 2000
    yes_no = ['Yes', 'No']

    # NOTE: the columns are drawn one after another in this exact order so the
    # seeded random stream (and therefore the generated data) stays reproducible.
    synthetic = {}
    synthetic['customerID'] = ['C%04d' % idx for idx in range(n_samples)]
    synthetic['gender'] = np.random.choice(['Male', 'Female'], n_samples)
    synthetic['SeniorCitizen'] = np.random.choice([0, 1], n_samples, p=[0.8, 0.2])
    synthetic['Partner'] = np.random.choice(yes_no, n_samples)
    synthetic['Dependents'] = np.random.choice(yes_no, n_samples, p=[0.3, 0.7])
    synthetic['tenure'] = np.random.randint(1, 73, n_samples)
    synthetic['PhoneService'] = np.random.choice(yes_no, n_samples, p=[0.9, 0.1])
    synthetic['InternetService'] = np.random.choice(
        ['DSL', 'Fiber optic', 'No'], n_samples, p=[0.4, 0.4, 0.2]
    )
    synthetic['Contract'] = np.random.choice(
        ['Month-to-month', 'One year', 'Two year'], n_samples, p=[0.5, 0.3, 0.2]
    )
    synthetic['PaymentMethod'] = np.random.choice(
        ['Electronic check', 'Mailed check', 'Bank transfer', 'Credit card'], n_samples
    )
    synthetic['MonthlyCharges'] = np.random.uniform(20, 120, n_samples)
    synthetic['TotalCharges'] = np.random.uniform(20, 8000, n_samples)

    # Churn score = fixed bumps for three risk conditions plus uniform noise;
    # a customer is labelled as churned when the score exceeds 0.5.
    is_monthly = synthetic['Contract'] == 'Month-to-month'
    is_new = synthetic['tenure'] < 12
    is_expensive = synthetic['MonthlyCharges'] > 80
    churn_score = (
        is_monthly * 0.3
        + is_new * 0.2
        + is_expensive * 0.2
        + np.random.random(n_samples) * 0.3
    )
    synthetic['Churn'] = (churn_score > 0.5).astype(int)

    df = pd.DataFrame(synthetic)
else:
    print("Loaded cleaned data from EDA notebook")

print(f"Dataset shape: {df.shape}")
print(f"Churn rate: {df['Churn'].mean():.2%}")

In [None]:
# Run the project's categorical encoding helper; it hands back the encoded
# frame together with an `encoders` object that is pickled later on.
# NOTE(review): the exact encoding scheme lives in utils - confirm there.
df_encoded, encoders = utils.encode_categorical_features(df)

# Split the encoded frame into model inputs, target and the input column names
X, y, feature_names = utils.prepare_features(df_encoded)

# Quick sanity summary of what will be fed to the models
print(
    f"Features shape: {X.shape}",
    f"Target distribution: {y.value_counts()}",
    f"Feature names: {feature_names}",
    sep="\n",
)

## 2. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the churn
# ratio comparable between the two partitions, and the fixed seed makes the
# split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report partition sizes and their churn rates
print(
    f"Training set size: {X_train.shape[0]}",
    f"Test set size: {X_test.shape[0]}",
    f"Training churn rate: {y_train.mean():.2%}",
    f"Test churn rate: {y_test.mean():.2%}",
    sep="\n",
)

## 3. Model Training and Evaluation

### 3.1 Logistic Regression

In [None]:
# Standardize the features for Logistic Regression. The scaler is fitted on
# the training partition only and then applied unchanged to the test partition.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier on the standardized training data
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

# Score on the standardized test data; the helper returns a name -> value mapping
lr_metrics = utils.evaluate_model(lr_model, X_test_scaled, y_test)
print("Logistic Regression Performance:")
for metric_name, score in lr_metrics.items():
    print(f"{metric_name.capitalize()}: {score:.4f}")

### 3.2 Random Forest

In [None]:
# Fit a 100-tree Random Forest on the unscaled training features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score on the held-out test partition; the helper returns a name -> value mapping
rf_metrics = utils.evaluate_model(rf_model, X_test, y_test)
print("Random Forest Performance:")
for metric_name, score in rf_metrics.items():
    print(f"{metric_name.capitalize()}: {score:.4f}")

### 3.3 XGBoost

In [None]:
# Fit an XGBoost classifier (log-loss eval metric) on the unscaled training features
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss').fit(X_train, y_train)

# Score on the held-out test partition; the helper returns a name -> value mapping
xgb_metrics = utils.evaluate_model(xgb_model, X_test, y_test)
print("XGBoost Performance:")
for metric_name, score in xgb_metrics.items():
    print(f"{metric_name.capitalize()}: {score:.4f}")

## 4. Model Comparison

In [None]:
# Collect the per-model metric dicts into one table: one row per model,
# one column per metric.
metrics_by_model = {
    'Logistic Regression': lr_metrics,
    'Random Forest': rf_metrics,
    'XGBoost': xgb_metrics,
}
results_df = pd.DataFrame(metrics_by_model).T

print("Model Comparison:")
print(results_df.round(4))

# Two side-by-side panels: threshold metrics on the left, ROC AUC on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
metrics_ax, auc_ax = axes

# Left panel - grouped bars for the classification metrics
threshold_metrics = ['accuracy', 'precision', 'recall', 'f1']
results_df[threshold_metrics].plot(kind='bar', ax=metrics_ax)
metrics_ax.set_title('Model Performance Comparison')
metrics_ax.set_ylabel('Score')
# Park the legend outside the axes so it does not cover the bars
metrics_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Right panel - ROC AUC per model
results_df['roc_auc'].plot(kind='bar', ax=auc_ax, color='orange')
auc_ax.set_title('ROC AUC Comparison')
auc_ax.set_ylabel('ROC AUC Score')

# Tilt the model names on both panels
for panel in (metrics_ax, auc_ax):
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 5. Feature Importance Analysis

In [None]:
# Rank features by the importance each tree-based model assigned to them
# (most important first).
rf_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

xgb_importance = pd.DataFrame({
    'feature': feature_names,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

# One panel per model, each showing its ten highest-ranked features
fig, axes = plt.subplots(1, 2, figsize=(16, 8))
importance_panels = [
    ('Random Forest', rf_importance),
    ('XGBoost', xgb_importance),
]

for ax, (model_label, ranking) in zip(axes, importance_panels):
    ranking.head(10).plot(x='feature', y='importance', kind='barh', ax=ax)
    # A horizontal bar chart draws the first row at the bottom; flip the axis
    # so the most important feature is at the top and the ranking reads downward.
    ax.invert_yaxis()
    ax.set_title(f'{model_label} - Top 10 Feature Importance')
    ax.set_xlabel('Importance')

plt.tight_layout()
plt.show()

print("Top 5 Important Features (Random Forest):")
print(rf_importance.head())

print("\nTop 5 Important Features (XGBoost):")
print(xgb_importance.head())

## 6. ROC Curve Analysis

In [None]:
# Overlay the ROC curves of all three models on one figure
plt.figure(figsize=(10, 8))

# Positive-class probabilities on the test partition; Logistic Regression is
# scored on the standardized features it was trained on.
lr_proba = lr_model.predict_proba(X_test_scaled)[:, 1]
rf_proba = rf_model.predict_proba(X_test)[:, 1]
xgb_proba = xgb_model.predict_proba(X_test)[:, 1]

roc_inputs = [
    ('Logistic Regression', lr_proba, lr_metrics),
    ('Random Forest', rf_proba, rf_metrics),
    ('XGBoost', xgb_proba, xgb_metrics),
]
for model_label, proba, metrics in roc_inputs:
    fpr, tpr, _ = roc_curve(y_test, proba)
    # The AUC shown in the legend is the one computed earlier by the evaluation helper
    plt.plot(fpr, tpr, label=f'{model_label} (AUC = {metrics["roc_auc"]:.3f})')

# Diagonal reference line for a random classifier
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves Comparison')
plt.legend()
plt.grid(True)
plt.show()

## 7. Confusion Matrix

In [None]:
# One confusion-matrix heatmap per model, side by side
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Each entry pairs a model with the test features in the form it was trained on
# (standardized for Logistic Regression, raw for the tree-based models).
models = [
    ('Logistic Regression', lr_model, X_test_scaled),
    ('Random Forest', rf_model, X_test),
    ('XGBoost', xgb_model, X_test)
]

for ax, (model_label, estimator, test_features) in zip(axes, models):
    predictions = estimator.predict(test_features)
    matrix = confusion_matrix(y_test, predictions)

    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'{model_label} Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.tight_layout()
plt.show()

## 8. Model Selection and Hyperparameter Tuning

In [None]:
# Select the final model based on ROC AUC.
#
# NOTE(review): the ranking uses metrics computed on the test partition, so the
# reported score of the winner is optimistic; a separate validation split or
# cross-validation would give a less biased comparison.

# Baseline estimators keyed by the row labels used in results_df
baseline_models = {
    'Logistic Regression': lr_model,
    'Random Forest': rf_model,
    'XGBoost': xgb_model,
}

# Rank the baseline rows only, so re-running this cell after the tuned row
# (added below) exists still makes the same choice.
best_model_name = results_df.loc[list(baseline_models), 'roc_auc'].idxmax()
print(f"Best model based on ROC AUC: {best_model_name}")

# Hyperparameter tuning is only performed when Random Forest is the winner
if best_model_name == 'Random Forest':
    print("\nPerforming hyperparameter tuning for Random Forest...")

    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10]
    }

    # Exhaustive search over the grid, 5-fold CV on the training partition,
    # scored by ROC AUC and parallelized over all available cores.
    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=5,
        scoring='roc_auc',
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

    # Evaluate the tuned model on the held-out test partition
    tuned_model = grid_search.best_estimator_
    tuned_metrics = utils.evaluate_model(tuned_model, X_test, y_test)

    print("\nTuned Random Forest Performance:")
    for metric, value in tuned_metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")

    # Use the tuned model as the final model, and record its metrics under its
    # own label so that anything reporting results_df.loc[best_model_name]
    # describes the model that is actually used, not the untuned baseline.
    final_model = tuned_model
    best_model_name = 'Random Forest (Tuned)'
    results_df.loc[best_model_name] = pd.Series(tuned_metrics)
else:
    # Use the winning baseline model as-is
    final_model = baseline_models[best_model_name]

## 9. Model Saving

In [None]:
# Persist every trained model, the selected final model and the preprocessing
# objects so they can be reloaded without retraining.
import os

MODEL_DIR = '../models'
os.makedirs(MODEL_DIR, exist_ok=True)

# File name -> object to pickle. `final_model` is the estimator chosen in the
# model-selection step; it may be a tuned estimator that is not any of the
# three baseline models, so it is stored under its own file.
# Logistic Regression was trained on standardized features, so it must be
# used together with the saved scaler.
artifacts = {
    'logistic_regression_model.pkl': lr_model,
    'random_forest_model.pkl': rf_model,
    'xgboost_model.pkl': xgb_model,
    'final_model.pkl': final_model,
    'scaler.pkl': scaler,
    'encoders.pkl': encoders,
    'feature_names.pkl': feature_names,
}

for filename, artifact in artifacts.items():
    with open(f'{MODEL_DIR}/{filename}', 'wb') as f:
        pickle.dump(artifact, f)

print("Models and preprocessing objects saved successfully!")

## 10. Business Insights and Recommendations

In [None]:
# Ask the project helper for a dict-like summary of business insights and
# print it as a five-part plain-text report.
insights = utils.generate_business_insights(df, final_model, feature_names)
best_row = results_df.loc[best_model_name]

print("BUSINESS INSIGHTS AND RECOMMENDATIONS")
print("=" * 50)

# Section 1 - headline metrics of the selected model
print("\n1. MODEL PERFORMANCE:")
print(f"   - Best model: {best_model_name}")
print(f"   - ROC AUC Score: {best_row['roc_auc']:.3f}")
print(f"   - Accuracy: {best_row['accuracy']:.3f}")

# Section 2 - drivers are optional in the insights mapping
print("\n2. KEY CHURN DRIVERS:")
if 'top_churn_drivers' in insights:
    for rank, driver in enumerate(insights['top_churn_drivers'], start=1):
        print(f"   {rank}. {driver}")

# Section 3 - the overall rate is read unconditionally; the contract segment is optional
print("\n3. BUSINESS METRICS:")
print(f"   - Overall churn rate: {insights['overall_churn_rate']:.1%}")
if 'highest_risk_contract' in insights:
    print(f"   - Highest risk segment: {insights['highest_risk_contract']} contracts ({insights['highest_risk_contract_rate']:.1%})")

# Sections 4 and 5 - static bullet lists
recommendations = [
    "Implement predictive churn scoring for all customers",
    "Focus retention campaigns on high-risk segments",
    "Develop early warning systems for new customers",
    "Create personalized retention offers based on churn probability",
    "Monitor and address the top churn drivers identified",
]
next_steps = [
    "Deploy model in production environment",
    "Set up automated model retraining pipeline",
    "Create customer-facing dashboard for retention teams",
    "A/B test retention strategies on predicted high-risk customers",
]

print("\n4. ACTIONABLE RECOMMENDATIONS:")
for bullet in recommendations:
    print(f"   - {bullet}")

print("\n5. NEXT STEPS:")
for bullet in next_steps:
    print(f"   - {bullet}")