# Customer Churn Prediction Analysis

## 📊 Business Problem
This notebook analyzes customer churn for a telecommunications company. Our goal is to:
- Understand patterns in customer behavior
- Build predictive models to identify at-risk customers
- Provide actionable insights for customer retention strategies

## 📋 Analysis Outline
1. **Import Required Libraries**
2. **Data Loading & Overview**
3. **Exploratory Data Analysis (EDA)**
4. **Data Preprocessing**
5. **Model Training & Evaluation** (including Feature Importance Analysis)
6. **Business Insights & Recommendations**

## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import xgboost as xgb

# Model persistence
import joblib

# Seed NumPy's global RNG so randomized steps repeat from run to run
np.random.seed(42)

# Plot defaults: stock matplotlib style, seaborn "husl" palette, 12x8 figures.
# The style is applied first because it resets rcParams.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Confirm setup finished and show where relative paths will resolve from
print("✅ All libraries imported successfully!")
print("📁 Current working directory: " + os.getcwd())

## 2. Data Loading & Overview

First, let's load the dataset and get a basic understanding of our data structure.

In [None]:
# Load the Telco Customer Churn dataset.
# The CSV is looked up relative to the working directory, in ../data/.
# If it is missing, download instructions are printed and a synthetic
# stand-in dataset is generated so the rest of the notebook can still run.
data_path = '../data/Telco-Customer-Churn.csv'

try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    for message in (
        "❌ Dataset not found!",
        "Please download the Telco Customer Churn dataset from:",
        "https://www.kaggle.com/datasets/blastchar/telco-customer-churn",
        "And place it in the ../data/ folder as 'Telco-Customer-Churn.csv'",
        "\n📝 Creating sample data for demonstration...",
    ):
        print(message)

    # Re-seed so the synthetic data does not depend on earlier RNG usage
    np.random.seed(42)
    n_samples = 1000
    yes_no = ['Yes', 'No']

    # Every draw advances the same global RNG, so the columns are generated
    # in a fixed order; reordering these lines would change the values.
    sample_columns = {}
    sample_columns['customerID'] = list(map('ID_{:04d}'.format, range(n_samples)))
    sample_columns['gender'] = np.random.choice(['Male', 'Female'], n_samples)
    sample_columns['SeniorCitizen'] = np.random.choice([0, 1], n_samples, p=[0.8, 0.2])
    sample_columns['Partner'] = np.random.choice(yes_no, n_samples)
    sample_columns['Dependents'] = np.random.choice(yes_no, n_samples)
    sample_columns['tenure'] = np.random.randint(1, 73, n_samples)
    sample_columns['PhoneService'] = np.random.choice(yes_no, n_samples, p=[0.9, 0.1])
    sample_columns['InternetService'] = np.random.choice(
        ['DSL', 'Fiber optic', 'No'], n_samples
    )
    sample_columns['Contract'] = np.random.choice(
        ['Month-to-month', 'One year', 'Two year'], n_samples
    )
    sample_columns['PaymentMethod'] = np.random.choice(
        [
            'Electronic check',
            'Mailed check',
            'Bank transfer (automatic)',
            'Credit card (automatic)',
        ],
        n_samples,
    )
    sample_columns['MonthlyCharges'] = np.round(np.random.uniform(18, 120, n_samples), 2)
    sample_columns['TotalCharges'] = np.round(np.random.uniform(18, 8500, n_samples), 2)
    sample_columns['Churn'] = np.random.choice(yes_no, n_samples, p=[0.27, 0.73])

    df = pd.DataFrame(sample_columns)
    print(f"📊 Sample dataset created with shape: {df.shape}")
else:
    print("✅ Dataset loaded successfully!")
    print(f"📊 Dataset shape: {df.shape}")

# Summarize size and in-memory footprint (deep=True counts string contents)
n_rows, n_cols = df.shape
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print("\n📋 Dataset Info:")
print(f"Rows: {n_rows:,}")
print(f"Columns: {n_cols:,}")
print(f"Memory usage: {memory_mb:.2f} MB")

In [None]:
# Show three quick views of the data, each under its own heading:
# a row preview, the per-column dtypes, and summary statistics.
overview_sections = [
    ("🔍 First 5 rows of the dataset:", df.head()),
    ("\n📊 Dataset columns and data types:", df.dtypes.to_frame(name='Data Type')),
    ("\n📈 Basic statistics:", df.describe()),
]

for heading, table in overview_sections:
    print(heading)
    display(table)

## 3. Exploratory Data Analysis (EDA)

Let's explore the data to understand patterns, distributions, and relationships between features.

In [None]:
# --- Missing values ---------------------------------------------------------
# Count NaN/None per column and express it as a share of all rows.
# NOTE(review): isnull() does not count blank-string placeholders; the public
# Telco CSV reportedly has blank TotalCharges entries — verify separately.
print("🔍 Missing Values Analysis:")
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Percentage': missing_percentage
}).sort_values('Missing Count', ascending=False)

# Print the table only when there is something to report, instead of
# emitting an "Empty DataFrame" line ahead of the all-clear message.
columns_with_missing = missing_df[missing_df['Missing Count'] > 0]
if columns_with_missing.empty:
    print("✅ No missing values found!")
else:
    print(columns_with_missing)

# --- Churn distribution -----------------------------------------------------
print("\n📊 Churn Distribution:")
churn_counts = df['Churn'].value_counts()
churn_percentage = df['Churn'].value_counts(normalize=True) * 100

print(f"Total customers: {len(df):,}")
for category, count in churn_counts.items():
    pct = churn_percentage[category]
    print(f"{category}: {count:,} ({pct:.1f}%)")

# --- Plots: bar chart (left) and pie chart (right) ---------------------------
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Pin the bar order to churn_counts so that the i-th label written below
# lands on the i-th bar; without `order`, bars follow first appearance in
# the data, which need not match the count-sorted value_counts() order.
sns.countplot(data=df, x='Churn', order=list(churn_counts.index), ax=ax1)
ax1.set_title('Customer Churn Distribution', fontsize=14, fontweight='bold')
ax1.set_xlabel('Churn Status')
ax1.set_ylabel('Number of Customers')

# Lift each label slightly above its bar; the gap scales with the tallest
# bar so it looks the same for small and large datasets.
label_offset = churn_counts.max() * 0.01
for i, v in enumerate(churn_counts.values):
    ax1.text(i, v + label_offset, str(v), ha='center', va='bottom', fontweight='bold')

ax2.pie(churn_counts.values, labels=churn_counts.index, autopct='%1.1f%%', startangle=90)
ax2.set_title('Churn Rate Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
churn_plot_path = '../outputs/plots/churn_distribution.png'
os.makedirs(os.path.dirname(churn_plot_path), exist_ok=True)
plt.savefig(churn_plot_path, dpi=300, bbox_inches='tight')
plt.show()

## 4. Data Preprocessing

Prepare the data for machine learning by encoding categorical variables and scaling numerical features.

In [None]:
# Work on a copy so the raw DataFrame stays available for reference
df_processed = df.copy()

# The customer ID is an identifier, not a predictive feature
if 'customerID' in df_processed.columns:
    df_processed = df_processed.drop('customerID', axis=1)

# TotalCharges is a monetary amount. If it was read as text it would be
# picked up as a categorical column below and label-encoded into arbitrary
# integer codes, so convert it to a number first.
# NOTE(review): the public Telco CSV reportedly contains blank TotalCharges
# entries, presumably customers not yet billed; they are set to 0 here —
# confirm this is the desired imputation.
if ('TotalCharges' in df_processed.columns
        and not pd.api.types.is_numeric_dtype(df_processed['TotalCharges'])):
    total_charges_numeric = pd.to_numeric(df_processed['TotalCharges'], errors='coerce')
    n_unparseable = int(total_charges_numeric.isna().sum())
    df_processed['TotalCharges'] = total_charges_numeric.fillna(0.0)
    print(f"🔧 Converted TotalCharges to numeric ({n_unparseable} missing/unparseable value(s) set to 0)")

# Split columns by dtype. 'number' covers every integer/float width, so a
# numeric column stored as e.g. int32 is not silently left out of scaling.
categorical_columns = df_processed.select_dtypes(include=['object']).columns.tolist()
numerical_columns = df_processed.select_dtypes(include='number').columns.tolist()

# The target is encoded separately below, so keep it out of the feature list
if 'Churn' in categorical_columns:
    categorical_columns.remove('Churn')

print("📊 Feature Analysis:")
print(f"Categorical features ({len(categorical_columns)}): {categorical_columns}")
print(f"Numerical features ({len(numerical_columns)}): {numerical_columns}")

# Label-encode each categorical feature; the fitted encoders are kept so
# the same mapping can be reused or inverted later.
label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()
    df_processed[column] = le.fit_transform(df_processed[column])
    label_encoders[column] = le
    print(f"✅ Encoded {column}: {list(le.classes_)}")

# Encode the target; LabelEncoder assigns codes in sorted class order
target_encoder = LabelEncoder()
df_processed['Churn'] = target_encoder.fit_transform(df_processed['Churn'])
print(f"✅ Target encoded: {list(target_encoder.classes_)} -> {list(target_encoder.transform(target_encoder.classes_))}")

# Separate the feature matrix from the target vector
X = df_processed.drop('Churn', axis=1)
y = df_processed['Churn']

print("\n📈 Final dataset shape:")
print(f"Features (X): {X.shape}")
print(f"Target (y): {y.shape}")
print(f"Feature names: {list(X.columns)}")

In [None]:
# Hold out 20% of the rows for testing; stratify so both splits keep the
# same churn proportion, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("📊 Data Split:")
print(f"Training set: {X_train.shape[0]:,} samples")
print(f"Test set: {X_test.shape[0]:,} samples")
print(f"Training churn rate: {y_train.mean():.3f}")
print(f"Test churn rate: {y_test.mean():.3f}")

# Standardize numerical features. The scaler is fitted on the training split
# only and then applied to the test split, so no test statistics leak in.
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

columns_to_scale = [col for col in numerical_columns if col in X.columns]
# Positional equivalents, kept in case a later cell indexes by position
numerical_indices = [X.columns.get_loc(col) for col in columns_to_scale]

if columns_to_scale:
    # Assign by column label so each column is replaced with its float
    # result. Writing floats into integer columns through .iloc relies on
    # implicit upcasting, which pandas has deprecated.
    X_train_scaled[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
    X_test_scaled[columns_to_scale] = scaler.transform(X_test[columns_to_scale])
    print(f"✅ Scaled {len(columns_to_scale)} numerical features")
else:
    print("ℹ️ No numerical features to scale")

## 5. Model Training & Evaluation

Train multiple models and compare their performance.

In [None]:
# Candidate classifiers, all seeded for repeatable results
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(eval_metric='logloss', random_state=42),
}

# Per-model metrics/predictions and the fitted estimators, keyed by model name
results, trained_models = {}, {}

print("🤖 Training Models...")
print("=" * 50)

for name, model in models.items():
    print(f"\n🔄 Training {name}...")

    # Fit on the scaled training split
    model.fit(X_train_scaled, y_train)

    # Hard labels plus the predicted probability of class 1
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Accuracy uses the hard labels; ROC-AUC uses the probabilities
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    results[name] = dict(
        accuracy=accuracy,
        roc_auc=roc_auc,
        predictions=y_pred,
        probabilities=y_pred_proba,
    )
    trained_models[name] = model

    print(f"✅ {name} - Accuracy: {accuracy:.4f}, ROC-AUC: {roc_auc:.4f}")

# One row per model, for side-by-side comparison
results_df = pd.DataFrame({
    'Model': list(results),
    'Accuracy': [scores['accuracy'] for scores in results.values()],
    'ROC-AUC': [scores['roc_auc'] for scores in results.values()],
})

print("\n📊 Model Performance Summary:")
display(results_df.sort_values('ROC-AUC', ascending=False))

In [None]:
# Pick the model with the highest test ROC-AUC and evaluate it in detail
best_model_name = results_df.sort_values('ROC-AUC', ascending=False).iloc[0]['Model']
best_model = trained_models[best_model_name]
best_predictions = results[best_model_name]['predictions']

print(f"🏆 Best Model: {best_model_name}")
print("📊 Detailed Classification Report:")
print(classification_report(y_test, best_predictions))

# savefig and joblib.dump do not create missing directories
plots_dir = '../outputs/plots'
models_dir = '../outputs/models'
os.makedirs(plots_dir, exist_ok=True)
os.makedirs(models_dir, exist_ok=True)

# Confusion matrix. labels=[0, 1] pins the matrix to 2x2 in a fixed order
# so it always lines up with the two tick labels below.
cm = confusion_matrix(y_test, best_predictions, labels=[0, 1])
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No Churn', 'Churn'],
            yticklabels=['No Churn', 'Churn'])
plt.title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.savefig(f'{plots_dir}/confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Feature importance: use the model's own importances when it exposes them,
# otherwise fall back to absolute coefficient magnitude.
if hasattr(best_model, 'feature_importances_'):
    importance_values = best_model.feature_importances_
elif hasattr(best_model, 'coef_'):
    importance_values = np.abs(best_model.coef_[0])
else:
    # Fail with a clear message rather than a NameError further down
    raise TypeError(
        f"{best_model_name} exposes neither 'feature_importances_' nor 'coef_'; "
        "cannot compute feature importance"
    )

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': importance_values
}).sort_values('importance', ascending=False)

# Horizontal bar chart of the ten highest-ranked features
plt.figure(figsize=(12, 8))
top_features = feature_importance.head(10)
sns.barplot(data=top_features, y='feature', x='importance', palette='viridis')
plt.title(f'Top 10 Feature Importance - {best_model_name}', fontsize=14, fontweight='bold')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.tight_layout()
plt.savefig(f'{plots_dir}/feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n🎯 Top 5 Most Important Features:")
for row in feature_importance.head(5).itertuples(index=False):
    print(f"{row.feature}: {row.importance:.4f}")

# Persist the best model; the file name embeds the model name in snake_case
model_path = f'{models_dir}/best_churn_model_{best_model_name.lower().replace(" ", "_")}.joblib'
joblib.dump(best_model, model_path)
print(f"\n💾 Best model saved to: {model_path}")

## 6. Business Insights & Recommendations

Based on our analysis, here are the key insights and actionable recommendations for reducing customer churn:

In [None]:
# Headline numbers reused throughout this summary
best_scores = results[best_model_name]
total_customers = len(df)
current_churn_rate = y.mean()  # mean of the encoded target column

# Key figures: best model's scores and the overall churn share
insights = {
    'Model Performance': f'{best_model_name} achieved {best_scores["roc_auc"]:.1%} ROC-AUC score',
    'Churn Rate': f'{current_churn_rate * 100:.1f}% of customers churned',
    'Model Accuracy': f'{best_scores["accuracy"]:.1%} prediction accuracy',
}

print("🎯 KEY BUSINESS INSIGHTS")
print("=" * 40)
for key, value in insights.items():
    print(f"• {key}: {value}")

# The three highest-ranked features from the best model
print("\n📈 TOP CHURN DRIVERS:")
top_drivers = feature_importance.head(3)
for feature_name, importance in zip(top_drivers['feature'], top_drivers['importance']):
    print(f"• {feature_name}: {importance:.3f} importance score")

print("\n💡 RECOMMENDATIONS:")
recommendations = [
    "🎯 Target high-risk customers identified by the model for retention campaigns",
    "📞 Implement proactive customer service for customers with top risk factors",
    "💰 Consider pricing strategies for customers with high monthly charges",
    "📋 Review contract terms to encourage longer-term commitments",
    "🔧 Improve service quality in areas identified as churn drivers",
    "📊 Monitor model performance monthly and retrain with new data",
]
print("\n".join(recommendations))

# Rough estimate: 10% of the customers counted as churned
print("\n📊 BUSINESS IMPACT ESTIMATION:")
customers_saved = int(total_customers * current_churn_rate * 0.1)
potential_savings = f"Reducing churn by 10% could save ~{customers_saved} customers"
print(f"• {potential_savings}")
print("• Model can help prioritize retention efforts for maximum ROI")

summary_lines = [
    "\n🎉 PROJECT SUMMARY:",
    f"✅ Built and validated {len(models)} machine learning models",
    f"✅ Achieved {best_scores['accuracy']:.1%} accuracy with {best_model_name}",
    "✅ Identified key churn drivers for business action",
    "✅ Created interpretable model for stakeholder understanding",
    "✅ Saved best model for future predictions",
]
for line in summary_lines:
    print(line)