# Customer Churn Analysis - Part 4: Business Evaluation
 
**Project**: Customer Churn Prediction  
**Notebook**: 04 - Business Impact & Recommendations  
**Author**: Yunjae Jung  
**Date**: January 2026
 
## Objectives
- Load best model and evaluate thoroughly
- Perform error analysis
- Calculate business impact & ROI
- Create customer risk segments
- Generate actionable recommendations
- Prepare deployment strategy

## 1. Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix
)

# Plot defaults shared by every figure in this notebook.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# The status banner uses a real check mark (the literal was previously
# mis-encoded and printed as garbage characters).
print("✓ Libraries imported")

✓ Libraries imported


## 2. Load Model and Data

In [None]:
# Load the persisted model artifacts and rebuild the hold-out split that the
# rest of the notebook evaluates against.
#
# SECURITY NOTE: joblib.load unpickles arbitrary Python objects. Only load
# artifacts that come from a trusted location.
MODELS_DIR = Path('../models')
best_model = joblib.load(MODELS_DIR / 'best_model.pkl')
# NOTE(review): `scaler` is loaded but never applied anywhere in this
# notebook. That is only correct if data_processed_final.csv is already
# scaled or best_model embeds its own preprocessing -- TODO confirm.
scaler = joblib.load(MODELS_DIR / 'scaler.pkl')

print(f"✓ Best model loaded: {type(best_model).__name__}")

# Load processed data
DATA_DIR = Path('../data/processed')
df = pd.read_csv(DATA_DIR / 'data_processed_final.csv')

# Split features / target and re-create the 80/20 stratified split.
# NOTE(review): presumably these arguments mirror the split used at training
# time; if they differ, the "test" rows below were seen during training and
# every metric in this notebook is optimistic -- TODO confirm.
from sklearn.model_selection import train_test_split

X = df.drop(columns='Churn')
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"✓ Data loaded: {X_test.shape[0]} test samples")


## 3. Model Performance Overview

In [None]:
# Score the hold-out set, print the headline metrics, and draw a three-panel
# summary figure (confusion matrix, metric bars, ROC curve).

# Hard labels feed the threshold-based metrics; column 1 of predict_proba is
# used as the churn probability (assumes class 1 is the second entry of
# best_model.classes_ -- TODO confirm).
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print("=" * 80)
print("FINAL MODEL PERFORMANCE")
print("=" * 80)
print(f"\nAccuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Precision: {precision:.4f} ({precision*100:.2f}%)")
print(f"Recall:    {recall:.4f} ({recall*100:.2f}%)")
print(f"F1-Score:  {f1:.4f}")
print(f"ROC-AUC:   {roc_auc:.4f}")

# Confusion matrix: rows are actual classes, columns are predicted classes
# (matches the axis labels set on the heatmap below).
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

# Visualize performance
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Panel 1: confusion matrix
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0],
            xticklabels=['Retained', 'Churned'],
            yticklabels=['Retained', 'Churned'])
axes[0].set_title('Confusion Matrix', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Actual')
axes[0].set_xlabel('Predicted')

# Panel 2: metrics bar chart
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'],
    'Score': [accuracy, precision, recall, f1, roc_auc]
})
sns.barplot(data=metrics_df, x='Metric', y='Score', palette='viridis', ax=axes[1])
axes[1].set_title('Performance Metrics', fontsize=12, fontweight='bold')
axes[1].set_ylim([0, 1])
axes[1].axhline(y=0.8, color='r', linestyle='--', alpha=0.5, label='Target: 0.8')
for container in axes[1].containers:
    axes[1].bar_label(container, fmt='%.3f')
# The target line carries a label, so a legend is required for it to show.
axes[1].legend(loc='lower right')

# Panel 3: ROC curve against the chance diagonal
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
axes[2].plot(fpr, tpr, linewidth=2, label=f'Model (AUC={roc_auc:.3f})')
axes[2].plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random')
axes[2].set_xlabel('False Positive Rate')
axes[2].set_ylabel('True Positive Rate')
axes[2].set_title('ROC Curve', fontsize=12, fontweight='bold')
axes[2].legend()
axes[2].grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Error Analysis

### 4.1 Identify Prediction Errors

In [None]:
# Build a per-row results table for the hold-out set, tag each row as
# Correct / False Positive / False Negative, and plot the error breakdown.
#
# Rows are in X_test order; the table uses a fresh 0..n-1 index because it is
# built from plain arrays.
results_df = pd.DataFrame({
    'actual': y_test.values,
    'predicted': y_pred,
    'probability': y_pred_proba
})

# Identify error types
results_df['prediction_type'] = 'Correct'
results_df.loc[(results_df['actual'] == 0) & (results_df['predicted'] == 1), 'prediction_type'] = 'False Positive'
results_df.loc[(results_df['actual'] == 1) & (results_df['predicted'] == 0), 'prediction_type'] = 'False Negative'

print("Prediction Type Distribution:")
print("=" * 50)
print(results_df['prediction_type'].value_counts())
print(f"\nError Rate: {(results_df['prediction_type'] != 'Correct').mean()*100:.2f}%")

# Visualize prediction distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Prediction type counts. value_counts() sorts by frequency, so colours are
# looked up by category name in a fixed order; otherwise the colour of
# "False Positive" vs "False Negative" would depend on which is more common.
prediction_type_colors = {
    'Correct': '#4ecdc4',
    'False Positive': '#ffe66d',
    'False Negative': '#ff6b6b',
}
prediction_type_order = list(prediction_type_colors)
prediction_type_counts = (
    results_df['prediction_type']
    .value_counts()
    .reindex(prediction_type_order, fill_value=0)
)
prediction_type_counts.plot(
    kind='bar',
    ax=axes[0],
    color=[prediction_type_colors[name] for name in prediction_type_order],
)
axes[0].set_title('Prediction Types', fontsize=12, fontweight='bold')
axes[0].set_xlabel('')
axes[0].set_ylabel('Count')
axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=45, ha='right')

# Probability distribution by actual class
results_df.boxplot(column='probability', by='actual', ax=axes[1])
axes[1].set_title('Churn Probability by Actual Class')
axes[1].set_xlabel('Actual Churn')
axes[1].set_ylabel('Predicted Probability')
axes[1].set_xticklabels(['Retained (0)', 'Churned (1)'])
# DataFrame.boxplot(by=...) adds its own figure-level title; clear it.
plt.suptitle('')

plt.tight_layout()
plt.show()

### 4.2 Analyze False Negatives (Missed Churners)

In [None]:
# False negatives: customers predicted to stay who actually churned.
false_negatives = results_df[results_df['prediction_type'] == 'False Negative']

# cm[1] is the "actual churned" row of the confusion matrix, so its sum is
# the number of real churners in the hold-out set.
actual_churner_count = cm[1].sum()

print("\nFalse Negatives Analysis:")
print("=" * 50)
print(f"Count: {len(false_negatives)}")
print(f"Percentage of actual churners: {len(false_negatives)/actual_churner_count*100:.2f}%")
print("\nProbability Statistics:")
print(false_negatives['probability'].describe())

# Visualize false negative probabilities
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(false_negatives['probability'], bins=20, color='#ff6b6b', edgecolor='black', alpha=0.7)
ax.set_title('Churn Probability Distribution - False Negatives', fontsize=12, fontweight='bold')
ax.set_xlabel('Predicted Churn Probability')
ax.set_ylabel('Count')
# NOTE(review): 0.5 is assumed to be the decision threshold that
# best_model.predict applies -- confirm for the chosen model.
ax.axvline(x=0.5, color='k', linestyle='--', label='Classification Threshold')
ax.legend()
plt.tight_layout()
plt.show()

print("\n💡 INSIGHT: False negatives had low predicted probabilities")
print("   → These customers showed weak churn signals")
print("   → Consider lowering threshold or additional monitoring")


### 4.3 Analyze False Positives (False Alarms)

In [None]:
# False positives: customers predicted to churn who were actually retained.
# NOTE: the name `false_positives` is rebound to an integer count later, in
# the financial-impact cell; re-run this cell before using it as a DataFrame.
false_positives = results_df[results_df['prediction_type'] == 'False Positive']

# cm[0] is the "actual retained" row of the confusion matrix.
actual_retained_count = cm[0].sum()

print("\nFalse Positives Analysis:")
print("=" * 50)
print(f"Count: {len(false_positives)}")
print(f"Percentage of actual retained: {len(false_positives)/actual_retained_count*100:.2f}%")
print("\nProbability Statistics:")
print(false_positives['probability'].describe())

print("\n💡 INSIGHT: False positives are less costly than false negatives")
print("   → Retention campaign on loyal customers may still increase satisfaction")
print("   → Cost of campaign << cost of losing actual churner")


## 5. Customer Risk Segmentation

In [None]:
# Create risk segments based on predicted probability
def assign_risk_segment(prob, *, high_threshold=0.7, medium_threshold=0.4):
    """Map a churn probability to a risk-segment label.

    Args:
        prob: Predicted churn probability for one customer.
        high_threshold: Inclusive lower bound of the 'High Risk' segment.
        medium_threshold: Inclusive lower bound of the 'Medium Risk' segment.
            Expected to be no greater than ``high_threshold``.

    Returns:
        'High Risk' when ``prob >= high_threshold``, 'Medium Risk' when
        ``prob >= medium_threshold``, otherwise 'Low Risk'. Any value that
        fails both comparisons (including NaN) falls through to 'Low Risk'.
    """
    if prob >= high_threshold:
        return 'High Risk'
    if prob >= medium_threshold:
        return 'Medium Risk'
    return 'Low Risk'

# Label every hold-out row with a risk segment, summarise actual churn per
# segment, and plot the segment mix.
#
# All tables and charts below use one explicit segment order and a
# name -> colour lookup. Relying on value_counts() (frequency order) or
# groupby() (alphabetical order) would pair the red/yellow/green colours
# with the wrong segments.
results_df['risk_segment'] = results_df['probability'].apply(assign_risk_segment)

risk_order = ['High Risk', 'Medium Risk', 'Low Risk']
risk_colors = {
    'High Risk': '#ff6b6b',
    'Medium Risk': '#ffe66d',
    'Low Risk': '#4ecdc4',
}

segment_counts = results_df['risk_segment'].value_counts()
# Segments that actually occur, in severity order (empty ones are skipped in
# the pie / bar charts, as before).
present_segments = [name for name in risk_order if name in segment_counts.index]
segment_counts = segment_counts.reindex(present_segments)
colors = [risk_colors[name] for name in present_segments]

# Per-segment size, number of actual churners, and actual churn rate.
segment_summary = results_df.groupby('risk_segment').agg({
    'actual': ['count', 'sum', 'mean']
}).round(3).reindex(present_segments)

print("\nRisk Segment Analysis:")
print("=" * 60)
print("Segment Distribution:")
print(segment_counts)

print("\nSegment Summary (count / churners / churn rate):")
print(segment_summary)

print("\nActual Churn Rate by Risk Segment:")
for segment in risk_order:
    in_segment = results_df['risk_segment'] == segment
    actual_churn_rate = results_df.loc[in_segment, 'actual'].mean() * 100
    print(f"  {segment:15s}: {actual_churn_rate:5.2f}%")

# Visualize segments
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Segment distribution
axes[0].pie(segment_counts, labels=segment_counts.index, autopct='%1.1f%%',
            colors=colors, startangle=90)
axes[0].set_title('Customer Distribution by Risk', fontsize=12, fontweight='bold')

# Actual churn rate by segment
segment_churn = (
    results_df.groupby('risk_segment')['actual'].mean().reindex(present_segments) * 100
)
segment_churn.plot(kind='bar', ax=axes[1], color=colors)
axes[1].set_title('Actual Churn Rate by Risk Segment', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Churn Rate (%)')
axes[1].set_xlabel('')
axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=0)
for container in axes[1].containers:
    axes[1].bar_label(container, fmt='%.1f%%')

# Probability distribution by segment
for segment in risk_order:
    segment_data = results_df[results_df['risk_segment'] == segment]
    axes[2].hist(segment_data['probability'], bins=20, alpha=0.6,
                 label=segment, color=risk_colors[segment])
axes[2].set_title('Probability Distribution by Segment', fontsize=12, fontweight='bold')
axes[2].set_xlabel('Churn Probability')
axes[2].set_ylabel('Count')
axes[2].legend()

plt.tight_layout()
plt.show()

## 6. Business Impact & ROI Analysis

### 6.1 Define Business Parameters

In [None]:
# Inputs to the ROI model. These are planning assumptions, not values
# derived from the data in this notebook.
AVG_CUSTOMER_LTV = 2000          # lifetime value of one customer, in dollars
RETENTION_CAMPAIGN_COST = 50     # dollars spent per customer contacted
EXPECTED_SAVE_RATE = 0.50        # share of contacted churners who stay

# Echo the assumptions so they appear next to the results they drive.
_assumption_lines = [
    "Business Assumptions:",
    "=" * 50,
    f"Average Customer LTV: ${AVG_CUSTOMER_LTV:,}",
    f"Retention Campaign Cost: ${RETENTION_CAMPAIGN_COST}/customer",
    f"Campaign Success Rate: {EXPECTED_SAVE_RATE:.0%}",
]
print("\n".join(_assumption_lines))


### 6.2 Calculate Financial Impact

In [None]:
# Translate the confusion matrix into campaign cost, revenue saved, net
# benefit and ROI for the hold-out set, then print and plot the result.

# Counts. cm rows are actual classes and columns are predicted classes, so
# cm[1, 1] is true positives and cm[0, 1] is false positives.
total_at_risk = (y_test == 1).sum()
identified_at_risk = (y_pred == 1).sum()
true_positives = cm[1, 1]
false_positives = cm[0, 1]

# Everyone the model flags is contacted (TP + FP); only true churners can be
# "saved", and only EXPECTED_SAVE_RATE of them are assumed to stay.
customers_saved = int(true_positives * EXPECTED_SAVE_RATE)
campaign_cost = identified_at_risk * RETENTION_CAMPAIGN_COST
revenue_saved = customers_saved * AVG_CUSTOMER_LTV
# Portion of campaign_cost spent on customers who would have stayed anyway
# (already included in campaign_cost, shown separately for information).
wasted_cost = false_positives * RETENTION_CAMPAIGN_COST
net_benefit = revenue_saved - campaign_cost
roi_percent = (net_benefit / campaign_cost) * 100 if campaign_cost > 0 else 0
# Guard the ratio: with zero true positives there is nothing to divide by.
realized_save_rate = customers_saved / true_positives * 100 if true_positives > 0 else 0.0

print("\n" + "=" * 80)
print("FINANCIAL IMPACT ANALYSIS")
print("=" * 80)

print("\nCustomer Metrics:")
print(f"  Total at-risk (actual): {total_at_risk}")
print(f"  Model identified: {identified_at_risk}")
print(f"  Correctly identified (TP): {true_positives}")
print(f"  False alarms (FP): {false_positives}")

print("\nExpected Outcomes:")
print(f"  Customers saved: {customers_saved}")
print(f"  Save rate: {realized_save_rate:.1f}% of identified churners")

print("\n💰 Financial Impact:")
print(f"  Campaign Cost: ${campaign_cost:,}")
print(f"    • Targeting: {identified_at_risk} customers @ ${RETENTION_CAMPAIGN_COST} each")
print(f"  Revenue Saved: ${revenue_saved:,}")
print(f"    • {customers_saved} customers @ ${AVG_CUSTOMER_LTV:,} LTV each")
print(f"  Wasted on FP: ${wasted_cost:,}")
print(f"  Net Benefit: ${net_benefit:,}")
print(f"  ROI: {roi_percent:.0f}%")

# Visualize financial impact
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Cost-Benefit Analysis
categories = ['Campaign\nCost', 'Revenue\nSaved', 'Net\nBenefit']
values = [campaign_cost, revenue_saved, net_benefit]
colors = ['#ff6b6b', '#4ecdc4', '#95e1d3']

bars = axes[0].bar(categories, values, color=colors)
axes[0].set_title('ROI Analysis - Financial Impact', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Amount ($)')
# bar_label places the text outside the bar end, including for a negative
# net benefit, which a fixed va='bottom' text would draw inside the bar.
axes[0].bar_label(bars, labels=[f'${value:,.0f}' for value in values],
                  padding=3, fontweight='bold')

# Customer flow (Sankey-style bar)
customer_metrics = [total_at_risk, identified_at_risk, true_positives, customers_saved]
labels = ['At Risk\n(Actual)', 'Identified\n(Predicted)', 'Correctly\nIdentified', 'Expected\nSaved']
x_pos = range(len(labels))

funnel_bars = axes[1].bar(x_pos, customer_metrics, color='#3498db')
axes[1].set_title('Customer Retention Funnel', fontsize=12, fontweight='bold')
axes[1].set_ylabel('Number of Customers')
axes[1].set_xticks(x_pos)
axes[1].set_xticklabels(labels)
# Padding is in points, so the label gap no longer depends on the data
# scale (the old fixed "+10" offset was in data units).
axes[1].bar_label(funnel_bars, labels=[str(count) for count in customer_metrics],
                  padding=3, fontweight='bold')

plt.tight_layout()
plt.show()

### 6.3 Scenario Analysis

In [None]:
# Sensitivity check: recompute revenue, net benefit and ROI for a range of
# campaign success rates, holding the campaign cost fixed.
print("\nScenario Analysis - ROI by Success Rate:")
print("=" * 60)

scenarios = [0.3, 0.4, 0.5, 0.6, 0.7]


def _scenario_row(rate):
    """Return one formatted table row for the given save rate."""
    n_saved = int(true_positives * rate)
    gross_revenue = n_saved * AVG_CUSTOMER_LTV
    net_amount = gross_revenue - campaign_cost
    if campaign_cost > 0:
        roi_value = (net_amount / campaign_cost) * 100
    else:
        roi_value = 0
    return {
        'Success Rate': f"{rate:.0%}",
        'Customers Saved': n_saved,
        'Revenue Saved': f"${gross_revenue:,}",
        'Net Benefit': f"${net_amount:,}",
        'ROI': f"{roi_value:.0f}%"
    }


scenario_results = [_scenario_row(rate) for rate in scenarios]

scenario_df = pd.DataFrame(scenario_results)
print(scenario_df.to_string(index=False))

## 7. Actionable Recommendations

In [None]:
# Print the recommendation playbook. Only the segment sizes and the campaign
# cost are computed from results_df; the remaining lines are fixed text.
high_risk_count = int((results_df['risk_segment'] == 'High Risk').sum())
medium_risk_count = int((results_df['risk_segment'] == 'Medium Risk').sum())

print("\n" + "=" * 80)
print("ACTIONABLE BUSINESS RECOMMENDATIONS")
print("=" * 80)

print("\n🎯 IMMEDIATE ACTIONS (Week 1-2):")
print("\n1. DEPLOY RETENTION CAMPAIGN FOR HIGH-RISK CUSTOMERS")
print(f"   • Target: {high_risk_count} high-risk customers")
print(f"   • Expected cost: ${high_risk_count * RETENTION_CAMPAIGN_COST:,}")
print("   • Actions:")
print("      - Personalized retention offers")
print("      - Customer success manager outreach")
print("      - Contract upgrade incentives")

print("\n2. MONITOR MEDIUM-RISK CUSTOMERS")
print(f"   • Target: {medium_risk_count} medium-risk customers")
print("   • Actions:")
print("      - Proactive satisfaction surveys")
print("      - Feature usage analysis")
print("      - Early warning system")

print("\n3. ANALYZE ROOT CAUSES")
print("   • Review top churn drivers from feature importance")
print("   • Conduct customer interviews with recent churners")
print("   • Identify product/service gaps")

# NOTE(review): the statements below about tenure, pricing and contract type
# are not computed in this notebook; presumably they come from the earlier
# EDA notebooks -- confirm before presenting.
print("\n📋 SHORT-TERM IMPROVEMENTS (Month 1-3):")
print("\n1. ENHANCE ONBOARDING")
print("   • Focus on first 12 months (highest churn period)")
print("   • Implement milestone check-ins")
print("   • Improve time-to-value")

print("\n2. PRICING STRATEGY REVIEW")
print("   • Churned customers pay higher monthly charges")
print("   • Review value proposition")
print("   • Consider pricing tiers optimization")

print("\n3. CONTRACT INCENTIVES")
print("   • Month-to-month customers churn at 3-4x rate")
print("   • Promote annual contracts with benefits")
print("   • Early renewal bonuses")

print("\n🚀 LONG-TERM STRATEGY (Quarter 1-2):")
print("\n1. REAL-TIME CHURN PREDICTION")
print("   • Deploy model as API")
print("   • Integrate with CRM (Salesforce, HubSpot)")
print("   • Automated risk scoring")

print("\n2. CONTINUOUS MONITORING")
print("   • Track model performance monthly")
print("   • Retrain with new data quarterly")
print("   • A/B test retention strategies")

print("\n3. EXPAND ANALYTICS")
print("   • Customer lifetime value prediction")
print("   • Next-best-action recommendations")
print("   • Cohort analysis & trends")

## 8. Deployment Strategy


In [None]:
# Print the deployment checklist. This cell is static text only; nothing
# here is computed from the model or the data.
print("\n" + "=" * 80)
print("DEPLOYMENT STRATEGY")
print("=" * 80)

print("\n📦 MODEL DEPLOYMENT:")
print("\n1. PRODUCTION ENVIRONMENT")
print("   • Deploy as REST API (Flask/FastAPI)")
print("   • Containerize with Docker")
print("   • Cloud hosting (AWS/GCP/Azure)")

print("\n2. INTEGRATION POINTS")
print("   • CRM system (real-time scoring)")
print("   • Marketing automation (campaign triggers)")
print("   • Customer success platform (alerts)")

print("\n3. MONITORING & MAINTENANCE")
print("   • Track prediction accuracy")
print("   • Monitor data drift")
print("   • Log all predictions")
print("   • Monthly performance reports")

print("\n📊 SCORING PROCESS:")
print("   1. New customer data → API")
print("   2. Model predicts churn probability")
print("   3. Assign risk segment (High/Medium/Low)")
print("   4. Trigger appropriate action")
print("   5. Log result for analysis")

print("\n⚠️ RISK MITIGATION:")
print("   • Model versioning (rollback capability)")
print("   • A/B testing before full rollout")
print("   • Human-in-the-loop for high-stakes decisions")
print("   • Regular model retraining (quarterly)")

## 9. Example Prediction

In [None]:
# Walk one hold-out customer through the scoring workflow: show the actual
# and predicted label, the probability, the risk segment, and the playbook
# action attached to that segment.
print("\n" + "=" * 80)
print("EXAMPLE PREDICTION WORKFLOW")
print("=" * 80)

# sample_idx is a position within the hold-out set, not a customer key.
# y_pred, y_pred_proba and results_df are all in X_test row order, so the
# same position addresses the same customer in each of them.
sample_idx = 0
sample_customer = X_test.iloc[sample_idx:sample_idx+1]
sample_actual = y_test.iloc[sample_idx]
sample_pred = y_pred[sample_idx]
sample_prob = y_pred_proba[sample_idx]
sample_risk = results_df.iloc[sample_idx]['risk_segment']

print(f"\nSample Customer ID: {sample_idx}")
print(f"\nActual Status: {'Churned' if sample_actual == 1 else 'Retained'}")
print(f"Predicted Status: {'Churned' if sample_pred == 1 else 'Retained'}")
print(f"Churn Probability: {sample_prob:.2%}")
print(f"Risk Segment: {sample_risk}")

print("\nRecommended Action:")
if sample_risk == 'High Risk':
    print("  🔴 IMMEDIATE INTERVENTION REQUIRED")
    print("     - Assign dedicated success manager")
    print("     - Offer retention discount/upgrade")
    print("     - Schedule executive call")
elif sample_risk == 'Medium Risk':
    print("  🟡 PROACTIVE MONITORING")
    print("     - Send satisfaction survey")
    print("     - Review feature usage")
    print("     - Check for recent support issues")
else:
    print("  🟢 STANDARD ENGAGEMENT")
    print("     - Continue regular communication")
    print("     - Look for upsell opportunities")

## 10. Save Results & Reports

In [None]:
# Persist the per-customer risk table and the business-impact summary as CSV
# files under ../reports.
REPORTS_DIR = Path('../reports')
# parents=True so the cell also works when run from a fresh checkout where
# intermediate directories are missing.
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Save risk segment assignments.
# results_df has a throwaway 0..n-1 index, so on its own the export cannot be
# joined back to any customer. Its rows are in X_test order, so the original
# row labels of the processed dataset are attached (as the last column, which
# keeps the existing column positions unchanged).
risk_segments_path = REPORTS_DIR / 'customer_risk_segments.csv'
risk_export_df = results_df.assign(source_row_index=X_test.index.to_numpy())
risk_export_df.to_csv(risk_segments_path, index=False)
print(f"✓ Risk segments saved to: {risk_segments_path}")

# Save business impact summary (monetary values are pre-formatted strings,
# counts are raw numbers).
impact_summary = {
    'Metric': [
        'Total At-Risk Customers',
        'Identified by Model',
        'True Positives',
        'False Positives',
        'Expected Customers Saved',
        'Campaign Cost',
        'Revenue Saved',
        'Net Benefit',
        'ROI Percentage'
    ],
    'Value': [
        total_at_risk,
        identified_at_risk,
        true_positives,
        false_positives,
        customers_saved,
        f"${campaign_cost:,}",
        f"${revenue_saved:,}",
        f"${net_benefit:,}",
        f"{roi_percent:.0f}%"
    ]
}

impact_df = pd.DataFrame(impact_summary)
impact_path = REPORTS_DIR / 'business_impact.csv'
impact_df.to_csv(impact_path, index=False)
print(f"✓ Business impact saved to: {impact_path}")

## 11. Final Summary

In [None]:
# Print the closing summary. Metric values come from the evaluation cells
# above; the "KEY INSIGHTS" lines are fixed text.
high_risk_customer_count = int((results_df['risk_segment'] == 'High Risk').sum())

print("\n" + "=" * 80)
print("PROJECT SUMMARY")
print("=" * 80)

print("\n✅ ACHIEVEMENTS:")
# Wording says "evaluated", not "deployed": deployment is still listed as
# next step 1 below.
print("   • Built and evaluated churn prediction model")
print(f"   • Achieved {f1*100:.1f}% F1-score")
print(f"   • Can identify {recall*100:.1f}% of churning customers")
print(f"   • {precision*100:.1f}% precision minimizes false alarms")
# net_benefit is lifetime value saved minus campaign cost on the hold-out
# set; nothing in the notebook annualises it, so it is not labelled "annual".
print(f"   • Projected net benefit (test set): ${net_benefit:,}")
print(f"   • ROI: {roi_percent:.0f}%")

# NOTE(review): these insights are not derived in this notebook; presumably
# they summarise the earlier EDA / modelling notebooks -- confirm.
print("\n📊 KEY INSIGHTS:")
print("   1. Contract type is strongest churn predictor")
print("   2. First 12 months are critical retention window")
print("   3. Pricing sensitivity affects churn significantly")
print("   4. High-risk customers need immediate intervention")

print("\n🎯 NEXT STEPS:")
print("   1. Deploy model to production")
print(f"   2. Launch retention campaign for {high_risk_customer_count} high-risk customers")
print("   3. Implement monitoring dashboard")
print("   4. Schedule quarterly model retraining")
print("   5. Measure actual retention impact")

print("\n" + "=" * 80)
print("✓ Analysis Complete - Ready for Stakeholder Presentation")
print("=" * 80)
print("=" * 80)