# Customer Churn Analysis - Exploratory Data Analysis

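This notebook performs an exploratory data analysis (EDA) of customer churn. To stay self-contained, it generates a synthetic sample that uses the column names of the Kaggle Telco Customer Churn dataset instead of loading the real file, so the numbers below illustrate the workflow rather than real customer behavior.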

**Educational Purpose**: This analysis is for learning and demonstration purposes only.

In [None]:
# Import required libraries
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

warnings.filterwarnings('ignore')

# Set style
# Apply the 'seaborn-v0_8' matplotlib style sheet and make "husl" the default
# seaborn colour palette for the plots that follow.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Import custom utilities
# The parent directory ('..') is appended to sys.path *before* `import utils`
# so the import can be resolved from there; the order of these lines matters.
# NOTE(review): '..' is resolved relative to the kernel's working directory --
# presumably the notebook sits one level below the module; confirm.
import sys
sys.path.append('..')
import utils

## 1. Data Loading and Initial Exploration

In [None]:
# Build the demonstration dataset
# Note: the real Telco Customer Churn dataset can be downloaded from Kaggle:
# https://www.kaggle.com/blastchar/telco-customer-churn
#
# This cell does not read that file. It synthesises a sample with the same
# column names so the rest of the notebook runs without any download.
np.random.seed(42)
n_samples = 2000

# Option lists shared by several columns
yes_no = ['Yes', 'No']
internet_addon_options = ['Yes', 'No', 'No internet service']
internet_addon_weights = [0.3, 0.5, 0.2]

# NOTE: every draw below advances the global NumPy random state, so the order
# of these statements determines the generated values -- do not reorder them.
data = {
    'customerID': [f'C{idx:04d}' for idx in range(n_samples)],
    'gender': np.random.choice(['Male', 'Female'], n_samples),
    'SeniorCitizen': np.random.choice([0, 1], n_samples, p=[0.8, 0.2]),
    'Partner': np.random.choice(yes_no, n_samples),
    'Dependents': np.random.choice(yes_no, n_samples, p=[0.3, 0.7]),
    'tenure': np.random.randint(1, 73, n_samples),
    'PhoneService': np.random.choice(yes_no, n_samples, p=[0.9, 0.1]),
    'MultipleLines': np.random.choice(['Yes', 'No', 'No phone service'], n_samples, p=[0.4, 0.5, 0.1]),
    'InternetService': np.random.choice(['DSL', 'Fiber optic', 'No'], n_samples, p=[0.4, 0.4, 0.2]),
}

# The six internet add-on columns share one option list and one set of weights
for addon in ('OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
              'TechSupport', 'StreamingTV', 'StreamingMovies'):
    data[addon] = np.random.choice(internet_addon_options, n_samples, p=internet_addon_weights)

data['Contract'] = np.random.choice(['Month-to-month', 'One year', 'Two year'], n_samples, p=[0.5, 0.3, 0.2])
data['PaperlessBilling'] = np.random.choice(yes_no, n_samples)
data['PaymentMethod'] = np.random.choice(
    ['Electronic check', 'Mailed check', 'Bank transfer (automatic)', 'Credit card (automatic)'],
    n_samples,
)
data['MonthlyCharges'] = np.random.uniform(18.25, 118.75, n_samples)
data['TotalCharges'] = np.random.uniform(18.8, 8684.8, n_samples)

# Per-customer churn probability: a 10% base plus additive bumps, applied in a
# fixed order (contract, tenure, monthly charges, internet service, payment method).
contract_type = data['Contract']
tenure_months = data['tenure']

churn_prob = np.full(n_samples, 0.1)
# np.select takes the first matching condition, mirroring an if/elif chain
churn_prob += np.select(
    [contract_type == 'Month-to-month', contract_type == 'One year'],
    [0.3, 0.1],
    default=0.0,
)
churn_prob += np.select([tenure_months <= 12, tenure_months <= 24], [0.2, 0.1], default=0.0)
churn_prob += np.where(data['MonthlyCharges'] > 80, 0.15, 0.0)
churn_prob += np.where(data['InternetService'] == 'Fiber optic', 0.1, 0.0)
churn_prob += np.where(data['PaymentMethod'] == 'Electronic check', 0.1, 0.0)
churn_prob = np.minimum(churn_prob, 0.8)  # Cap at 80%

# One Bernoulli draw per customer, stored as 'Yes'/'No' labels
churn_draws = np.random.binomial(1, churn_prob, n_samples)
data['Churn'] = ['Yes' if drawn == 1 else 'No' for drawn in churn_draws]

df = pd.DataFrame(data)

print(f"Dataset shape: {df.shape}")
print("\nDataset info:")
df.info()

In [None]:
# Preview the first five rows of the generated frame
df.head(n=5)

In [None]:
# Summary statistics using describe()'s default column selection
df.describe(include=None)

## 2. Data Cleaning and Preprocessing

In [None]:
# Count null entries in every column
print("Missing values:")
null_counts = df.isna().sum()
print(null_counts)

# Show the dtype pandas assigned to each column
print("\nData types:")
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# Clean the data with the project utility when it exists; otherwise work on a
# copy of the raw frame.
# NOTE(review): load_and_clean_data is defined in utils and is not visible here;
# its name suggests it might expect a file path rather than a DataFrame -- confirm.
df_clean = utils.load_and_clean_data(df) if hasattr(utils, 'load_and_clean_data') else df.copy()

# Convert Churn to 0/1 unless it is already numeric.
# Testing "not numeric" instead of `dtype == 'object'` also covers frames where
# pandas stores text in a dedicated string dtype, which would otherwise skip the
# mapping and leave labels that break the `.mean()` calls later on.
if not pd.api.types.is_numeric_dtype(df_clean['Churn']):
    # Fail loudly on labels the mapping does not know, rather than letting
    # .map() turn them into NaN silently.
    unexpected_labels = set(df_clean['Churn'].dropna().unique()) - {'Yes', 'No'}
    if unexpected_labels:
        raise ValueError(
            f"Unexpected Churn labels: {sorted(map(str, unexpected_labels))}; expected 'Yes' or 'No'"
        )
    df_clean['Churn'] = df_clean['Churn'].map({'Yes': 1, 'No': 0})

print(f"Cleaned dataset shape: {df_clean.shape}")
print(f"Churn distribution: {df_clean['Churn'].value_counts()}")

## 3. Exploratory Data Analysis

### 3.1 Churn Distribution

In [None]:
# Overall churn rate (assumes Churn was encoded as 0/1 by the cleaning cell,
# so the mean is the share of churned customers)
churn_rate = df_clean['Churn'].mean()
print(f"Overall churn rate: {churn_rate:.2%}")

# value_counts() orders its result by frequency, not by label. Pin the order to
# [0, 1] so the fixed labels and colours below always line up with the right
# class, even if churners ever outnumber non-churners (or one class is absent).
churn_labels = ['No Churn', 'Churn']
churn_counts = df_clean['Churn'].value_counts().reindex([0, 1], fill_value=0)

# Visualize churn distribution
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Pie chart
ax1.pie(churn_counts.values, labels=churn_labels, autopct='%1.1f%%', startangle=90)
ax1.set_title('Churn Distribution')

# Bar chart (skyblue = no churn, salmon = churn, matching the pinned order)
churn_counts.plot(kind='bar', ax=ax2, color=['skyblue', 'salmon'])
ax2.set_title('Churn Count')
ax2.set_xlabel('Churn')
ax2.set_ylabel('Count')
ax2.tick_params(axis='x', rotation=0)

plt.tight_layout()
plt.show()

### 3.2 Churn by Categorical Features

In [None]:
# Categorical columns to compare; each gets its own churn-rate bar chart
categorical_features = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
    'InternetService', 'Contract', 'PaperlessBilling', 'PaymentMethod',
]

# 3x3 grid of panels, flattened so they can be addressed by position
fig, axes = plt.subplots(3, 3, figsize=(18, 15))
axes = axes.ravel()

for i, feature in enumerate(categorical_features):
    if feature not in df_clean.columns:
        continue  # column absent: leave this panel empty

    panel = axes[i]

    # Churn rate per category = churned customers / customers in the category
    group_stats = df_clean.groupby(feature)['Churn'].agg(['count', 'sum']).reset_index()
    group_stats['churn_rate'] = group_stats['sum'] / group_stats['count']

    group_stats.plot.bar(x=feature, y='churn_rate', ax=panel, color='coral', legend=False)
    panel.set_title(f'Churn Rate by {feature}')
    panel.set_ylabel('Churn Rate')
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

### 3.3 Churn by Numerical Features

In [None]:
# Analyze numerical features: top row = overlaid histograms per churn class,
# bottom row = box plots per churn class
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

fig, axes = plt.subplots(2, 3, figsize=(18, 10))

for i, feature in enumerate(numerical_features):
    if feature in df_clean.columns:
        # Use one set of bin edges for both classes so the overlaid bars are
        # directly comparable (separate bins=30 calls would each bin over their
        # own min/max). NaNs are dropped because edge computation needs finite data.
        bin_edges = np.histogram_bin_edges(df_clean[feature].dropna(), bins=30)

        # Distribution by churn
        df_clean.loc[df_clean['Churn'] == 0, feature].hist(alpha=0.7, label='No Churn',
                                                           bins=bin_edges, ax=axes[0, i])
        df_clean.loc[df_clean['Churn'] == 1, feature].hist(alpha=0.7, label='Churn',
                                                           bins=bin_edges, ax=axes[0, i])
        axes[0, i].set_title(f'{feature} Distribution by Churn')
        axes[0, i].set_xlabel(feature)
        axes[0, i].set_ylabel('Count')
        axes[0, i].legend()

        # Box plot
        df_clean.boxplot(column=feature, by='Churn', ax=axes[1, i])
        axes[1, i].set_title(f'{feature} by Churn')

# DataFrame.boxplot(by=...) sets a figure-level "Boxplot grouped by Churn"
# suptitle. This figure also holds the histograms, so that title is misleading
# and crowds the top row -- clear it.
fig.suptitle('')

plt.tight_layout()
plt.show()

### 3.4 Correlation Analysis

In [None]:
# Encode categorical variables so they can take part in the correlation matrix.
# NOTE(review): encode_categorical_features is defined in utils and not visible
# here; if it also encodes identifier columns such as customerID, those will show
# up in the matrix below -- confirm.
df_encoded, encoders = utils.encode_categorical_features(df_clean)

# Pairwise correlation between the numeric columns only
numeric_columns = df_encoded.select_dtypes(include=[np.number])
correlation_matrix = numeric_columns.corr()

# Annotated heatmap, colour scale centred on zero
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    ax=ax,
)
ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

In [None]:
# Features most correlated with churn, ranked by absolute correlation.
# Churn's correlation with itself is always 1.0, so it is dropped first;
# otherwise it would take the top spot in a list that is meant to rank features.
churn_correlations = (
    correlation_matrix['Churn']
    .drop(labels='Churn')
    .abs()
    .sort_values(ascending=False)
)
print("Features most correlated with churn:")
print(churn_correlations.head(10))

### 3.5 Advanced Analysis

In [None]:
# Scatter of tenure against monthly charges, one point per customer,
# coloured by the 0/1 churn flag
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    df_clean['tenure'],
    df_clean['MonthlyCharges'],
    c=df_clean['Churn'],
    cmap='coolwarm',
    alpha=0.6,
)
fig.colorbar(scatter, ax=ax, label='Churn (0=No, 1=Yes)')
ax.set_xlabel('Tenure (months)')
ax.set_ylabel('Monthly Charges ($)')
ax.set_title('Tenure vs Monthly Charges by Churn Status')
plt.show()

In [None]:
# Churn rate by tenure groups
# pd.cut uses right-closed intervals, so without include_lowest=True the first
# bin would be (0, 12] and a tenure of exactly 0 would become NaN even though
# the label reads '0-12'.
# NOTE(review): tenures above 72 still fall outside the bins and become NaN --
# confirm 72 is the maximum in the data being analysed.
df_clean['tenure_group'] = pd.cut(
    df_clean['tenure'],
    bins=[0, 12, 24, 36, 48, 72],
    labels=['0-12', '13-24', '25-36', '37-48', '49-72'],
    include_lowest=True,
)

# tenure_group is categorical; pass `observed` explicitly so every bin is kept
# (empty bins included) regardless of the pandas version's default.
tenure_churn = (
    df_clean.groupby('tenure_group', observed=False)['Churn']
    .agg(['count', 'sum'])
    .reset_index()
)
tenure_churn['churn_rate'] = tenure_churn['sum'] / tenure_churn['count']

plt.figure(figsize=(10, 6))
# Plain string labels keep the x axis categorical and in bin order
plt.bar(tenure_churn['tenure_group'].astype(str), tenure_churn['churn_rate'], color='lightcoral')
plt.xlabel('Tenure Group (months)')
plt.ylabel('Churn Rate')
plt.title('Churn Rate by Tenure Groups')
plt.xticks(rotation=45)
plt.show()

## 4. Key Insights and Findings

In [None]:
# Generate business insights: print headline churn figures computed from df_clean
print("KEY FINDINGS FROM EXPLORATORY DATA ANALYSIS:")
print("=" * 50)

print(f"1. Overall churn rate: {df_clean['Churn'].mean():.1%}")

# Contract analysis: mean churn per contract type
if 'Contract' in df_clean.columns:
    contract_churn = df_clean.groupby('Contract')['Churn'].mean()
    print("\n2. Churn by Contract Type:")
    for contract, rate in contract_churn.items():
        print(f"   - {contract}: {rate:.1%}")

# Tenure analysis: customers in their first 12 months
if 'tenure' in df_clean.columns:
    new_customers_churn = df_clean.loc[df_clean['tenure'] <= 12, 'Churn'].mean()
    print(f"\n3. New customers (≤12 months) churn rate: {new_customers_churn:.1%}")

# Payment method analysis: sorted descending so the first entry is the riskiest
if 'PaymentMethod' in df_clean.columns:
    payment_churn = df_clean.groupby('PaymentMethod')['Churn'].mean().sort_values(ascending=False)
    print(f"\n4. Highest risk payment method: {payment_churn.index[0]} ({payment_churn.iloc[0]:.1%})")

# Monthly charges analysis: customers strictly above the 75th percentile
if 'MonthlyCharges' in df_clean.columns:
    top_quartile_threshold = df_clean['MonthlyCharges'].quantile(0.75)
    high_charges_churn = df_clean.loc[df_clean['MonthlyCharges'] > top_quartile_threshold, 'Churn'].mean()
    print(f"\n5. High monthly charges (top 25%) churn rate: {high_charges_churn:.1%}")

# The recommendations are fixed text; they are not derived from the figures above.
print("\nRECOMMENDATIONS:")
print("=" * 20)
print("- Focus retention efforts on month-to-month contract customers")
print("- Implement early intervention programs for new customers")
print("- Review pricing strategy for high-charge customers")
print("- Encourage automatic payment methods")
print("- Develop loyalty programs for long-term customers")

## 5. Data Export for Modeling

In [None]:
# Make sure the output directory exists first: to_csv does not create missing
# parent directories and would fail on a fresh checkout without ../data.
output_dir = Path('../data')
output_dir.mkdir(parents=True, exist_ok=True)

# Save cleaned data for modeling
cleaned_path = output_dir / 'cleaned_churn_data.csv'
df_clean.to_csv(cleaned_path, index=False)
print(f"Cleaned data saved to '{cleaned_path.as_posix()}'")

# Save encoded data for modeling
encoded_path = output_dir / 'encoded_churn_data.csv'
df_encoded.to_csv(encoded_path, index=False)
print(f"Encoded data saved to '{encoded_path.as_posix()}'")