# Customer Survival Analysis

This notebook performs survival analysis on customer churn data using Kaplan-Meier estimators and Cox Proportional Hazards models to understand customer lifetime and identify risk factors for churn.

## 1. Import Required Libraries

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.statistics import logrank_test, multivariate_logrank_test
import warnings
import pickle

# Silence only deprecation-style noise. A blanket filterwarnings('ignore') would
# also hide warnings raised while fitting the survival models (e.g. convergence
# or collinearity problems), which are needed to judge whether the fits below
# can be trusted.
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Plot defaults shared by every figure in the notebook
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("Libraries imported successfully!")

## 2. Load and Prepare Data

In [None]:
# Load dataset
df = pd.read_csv('../data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Data preparation
# Entries of TotalCharges that cannot be parsed as numbers become NaN
# (errors='coerce') and are then replaced by the same row's MonthlyCharges.
# The filled Series is assigned back explicitly: calling fillna(inplace=True)
# on a column selection operates on a temporary object under pandas
# copy-on-write and would leave `df` unchanged.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['MonthlyCharges'])

# For survival analysis, we need:
# - duration: tenure (how long the customer has been with the company)
# - event: whether churn occurred (1 = churned, 0 = still customer)
df['duration'] = df['tenure']
df['event'] = (df['Churn'] == 'Yes').astype(int)

print(f"Dataset Shape: {df.shape}")
print(f"\nSurvival Analysis Setup:")
print(f"  - Duration (tenure): {df['duration'].min()}-{df['duration'].max()} months")
print(f"  - Events (churns): {df['event'].sum():,} ({df['event'].mean()*100:.1f}%)")
print(f"  - Censored (active): {(1-df['event']).sum():,} ({(1-df['event'].mean())*100:.1f}%)")

df.head()

In [None]:
# Load dataset
# NOTE(review): this cell repeats the load/preparation done in the previous
# cell and rebuilds `df` from scratch; only the printed summary differs.
df = pd.read_csv('../data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Data preparation
# Unparseable TotalCharges entries become NaN and are filled with the row's
# MonthlyCharges. The result is assigned back instead of using
# fillna(inplace=True) on a column selection, which does not modify `df`
# under pandas copy-on-write.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['MonthlyCharges'])

# Create survival analysis variables
# Duration = tenure (in months)
# Event = 1 if churned, 0 if still customer
df['event'] = (df['Churn'] == 'Yes').astype(int)
df['duration'] = df['tenure']

print(f"Dataset loaded: {df.shape[0]:,} customers")
print(f"Churned customers: {df['event'].sum():,} ({df['event'].mean()*100:.2f}%)")
print(f"Active customers: {(1-df['event']).sum():,} ({(1-df['event'].mean())*100:.2f}%)")
df.head()

## 3. Kaplan-Meier Survival Analysis

The Kaplan-Meier estimator is a non-parametric statistic used to estimate the survival function from lifetime data. It provides the probability of a customer surviving (not churning) beyond a given time point.

In [None]:
# Overall Kaplan-Meier survival curve
kmf = KaplanMeierFitter()
kmf.fit(df['duration'], event_observed=df['event'], label='All Customers')

fig, ax = plt.subplots(figsize=(12, 6))
kmf.plot_survival_function(ax=ax, ci_show=True, color='steelblue', linewidth=2.5)
ax.set_title('Overall Customer Survival Curve (Kaplan-Meier)', fontsize=14, fontweight='bold')
ax.set_xlabel('Tenure (Months)', fontsize=12, fontweight='bold')
ax.set_ylabel('Survival Probability', fontsize=12, fontweight='bold')
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('../static/images/km_overall.png', dpi=150, bbox_inches='tight')
plt.show()

# Print survival statistics
print("Survival Statistics:")
print("=" * 80)

# The median is infinite when the estimated survival curve never drops to 50%;
# report that explicitly instead of printing "inf months".
median_survival = kmf.median_survival_time_
if np.isfinite(median_survival):
    print(f"Median survival time: {median_survival:.2f} months")
else:
    print("Median survival time: not reached (survival stays above 50% over the observed tenure range)")

print(f"\nSurvival probabilities at key timepoints:")
max_duration = df['duration'].max()  # loop-invariant, computed once
for months in [6, 12, 24, 36, 48, 60]:
    # Only report timepoints that lie inside the observed tenure range
    if months <= max_duration:
        surv_prob = kmf.predict(months)
        print(f"  {months:2d} months: {surv_prob:.2%}")

## 4. Survival Analysis by Contract Type

In [None]:
# Survival curves by contract type
fig, ax = plt.subplots(figsize=(14, 7))

# Categories in order of first appearance; missing labels are dropped because
# an equality mask against NaN selects no rows and the fit would fail.
contract_types = df['Contract'].dropna().unique()
colors = ['#e74c3c', '#f39c12', '#2ecc71']

for idx, contract in enumerate(contract_types):
    segment = df.loc[df['Contract'] == contract, ['duration', 'event']]
    kmf_contract = KaplanMeierFitter()
    kmf_contract.fit(segment['duration'], event_observed=segment['event'], label=contract)
    # Wrap around the palette so more categories than colors cannot raise IndexError
    kmf_contract.plot_survival_function(ax=ax, ci_show=True,
                                        color=colors[idx % len(colors)], linewidth=2.5)

ax.set_title('Customer Survival by Contract Type', fontsize=14, fontweight='bold')
ax.set_xlabel('Tenure (Months)', fontsize=12, fontweight='bold')
ax.set_ylabel('Survival Probability', fontsize=12, fontweight='bold')
ax.legend(title='Contract Type', fontsize=11)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('../static/images/km_by_contract.png', dpi=150, bbox_inches='tight')
plt.show()

# Log-rank test for contract types: tests whether the survival curves of the
# contract groups differ.
print("\nLog-Rank Test: Contract Type")
print("=" * 80)
results = multivariate_logrank_test(df['duration'], df['Contract'], df['event'])
print(f"Test statistic: {results.test_statistic:.4f}")
print(f"P-value: {results.p_value:.6f}")
print(f"Interpretation: {'Statistically significant difference' if results.p_value < 0.05 else 'No significant difference'}")

## 5. Survival Analysis by Internet Service

In [None]:
# Survival curves by internet service
fig, ax = plt.subplots(figsize=(14, 7))

# Categories in order of first appearance; missing labels are dropped because
# an equality mask against NaN selects no rows and the fit would fail.
internet_services = df['InternetService'].dropna().unique()
colors_internet = ['#3498db', '#9b59b6', '#95a5a6']

for idx, service in enumerate(internet_services):
    segment = df.loc[df['InternetService'] == service, ['duration', 'event']]
    kmf_service = KaplanMeierFitter()
    kmf_service.fit(segment['duration'], event_observed=segment['event'], label=service)
    # Wrap around the palette so more categories than colors cannot raise IndexError
    kmf_service.plot_survival_function(ax=ax, ci_show=True,
                                       color=colors_internet[idx % len(colors_internet)],
                                       linewidth=2.5)

ax.set_title('Customer Survival by Internet Service Type', fontsize=14, fontweight='bold')
ax.set_xlabel('Tenure (Months)', fontsize=12, fontweight='bold')
ax.set_ylabel('Survival Probability', fontsize=12, fontweight='bold')
ax.legend(title='Internet Service', fontsize=11)
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig('../static/images/km_by_internet.png', dpi=150, bbox_inches='tight')
plt.show()

# Log-rank test for internet service: tests whether the survival curves of the
# internet-service groups differ.
print("\nLog-Rank Test: Internet Service Type")
print("=" * 80)
results = multivariate_logrank_test(df['duration'], df['InternetService'], df['event'])
print(f"Test statistic: {results.test_statistic:.4f}")
print(f"P-value: {results.p_value:.6f}")
print(f"Interpretation: {'Statistically significant difference' if results.p_value < 0.05 else 'No significant difference'}")

## 6. Cox Proportional Hazards Model

The Cox PH model is a regression model that allows us to examine the relationship between survival time and multiple predictor variables simultaneously. It estimates hazard ratios which indicate how different factors affect the risk of churn.

In [None]:
# Prepare data for Cox model
df_cox = df.copy()

# Encode categorical variables
categorical_features = ['gender', 'Partner', 'Dependents', 'PhoneService',
                       'MultipleLines', 'InternetService', 'OnlineSecurity',
                       'OnlineBackup', 'DeviceProtection', 'TechSupport',
                       'StreamingTV', 'StreamingMovies', 'Contract',
                       'PaperlessBilling', 'PaymentMethod']

# Binary encoding for binary variables.
# Each variable maps to the label that is encoded as 1. Encoding against the
# first value seen in the column would make the meaning of 1 depend on row
# order (e.g. 1 could mean "No"), which silently flips how the hazard ratio
# for that column reads. Any code that scores new customers with the fitted
# model must apply this same mapping.
# NOTE(review): labels assumed to be 'Yes'/'No' and 'Male'/'Female' -
# verified at runtime below.
binary_positive_label = {
    'gender': 'Male',
    'Partner': 'Yes',
    'Dependents': 'Yes',
    'PhoneService': 'Yes',
    'PaperlessBilling': 'Yes',
}
binary_vars = list(binary_positive_label)
for var, positive in binary_positive_label.items():
    observed = set(df_cox[var].dropna().unique())
    # Fail loudly rather than produce an all-zero or ambiguous indicator
    if positive not in observed or len(observed) > 2:
        raise ValueError(
            f"Column {var!r} is not binary with positive label {positive!r}: found {sorted(map(str, observed))}"
        )
    df_cox[var] = (df_cox[var] == positive).astype(int)

# One-hot encoding for multi-category variables (first level dropped as the
# reference category; integer dtype so every model input is numeric).
multi_category_vars = [col for col in categorical_features if col not in binary_vars]
df_cox = pd.get_dummies(df_cox, columns=multi_category_vars, drop_first=True, dtype=int)

# Select features for Cox model: everything except identifiers, the raw churn
# label and the duration/event columns themselves.
# NOTE(review): TotalCharges presumably accumulates with tenure, so it may leak
# the duration into the covariates and inflate concordance - confirm whether it
# should stay in the model.
non_feature_cols = {'customerID', 'Churn', 'duration', 'event', 'tenure'}
feature_cols = [col for col in df_cox.columns if col not in non_feature_cols]
cox_df = df_cox[feature_cols + ['duration', 'event']].copy()

print(f"Cox model features: {len(feature_cols)}")
print(f"Dataset shape: {cox_df.shape}")

In [None]:
# Fit Cox Proportional Hazards model (penalizer=0.1 regularizes the coefficients)
cph = CoxPHFitter(penalizer=0.1)
cph.fit(cox_df, duration_col='duration', event_col='event')

print("Cox Proportional Hazards Model Summary:")
print("=" * 80)
print(cph.summary)

# Plot all coefficients with their confidence intervals.
# cph.plot() draws the coefficients, i.e. log hazard ratios, unless
# hazard_ratios=True is passed, so the title and axis are labelled accordingly.
fig, ax = plt.subplots(figsize=(12, 10))
cph.plot(ax=ax)
ax.set_title('Cox Model: Log Hazard Ratios with 95% CI', fontsize=14, fontweight='bold')
ax.set_xlabel('log(Hazard Ratio)', fontsize=12, fontweight='bold')
fig.tight_layout()
fig.savefig('../static/images/cox_hazard_ratios.png', dpi=150, bbox_inches='tight')
plt.show()

# Model performance
print(f"\n{'=' * 80}")
print(f"Model Concordance Index: {cph.concordance_index_:.4f}")
print(f"Log-likelihood: {cph.log_likelihood_:.2f}")
# The Cox likelihood is a partial likelihood; lifelines exposes the matching
# criterion as `AIC_partial_`. Fall back to `AIC_` where only that is available.
partial_aic = getattr(cph, 'AIC_partial_', None)
if partial_aic is None:
    partial_aic = cph.AIC_
print(f"Partial AIC: {partial_aic:.2f}")

In [None]:
# Rank covariates by hazard ratio (largest first) and keep both ends of the ranking
hazard_ratios = cph.summary.sort_values('exp(coef)', ascending=False)
top_increasing_risk = hazard_ratios.iloc[:10]
top_decreasing_risk = hazard_ratios.iloc[-10:]

banner = "=" * 80

# Covariates whose hazard ratio exceeds 1 raise the churn hazard
print("\n" + banner)
print("TOP 10 FACTORS INCREASING CHURN RISK (Hazard Ratio > 1):")
print(banner)
for covariate, ratio in top_increasing_risk['exp(coef)'].items():
    if ratio > 1:
        print(f"{covariate:40s}: HR = {ratio:.3f} (increases risk by {(ratio-1)*100:.1f}%)")

# Covariates whose hazard ratio is below 1 lower the churn hazard
print("\n" + banner)
print("TOP 10 FACTORS DECREASING CHURN RISK (Hazard Ratio < 1):")
print(banner)
for covariate, ratio in top_decreasing_risk['exp(coef)'].items():
    if ratio < 1:
        print(f"{covariate:40s}: HR = {ratio:.3f} (decreases risk by {(1-ratio)*100:.1f}%)")

## 7. Save Cox Model for Deployment

In [None]:
# Persist the fitted Cox model as a pickle one directory above the notebook
model_path = '../survivemodel.pkl'
serialized_model = pickle.dumps(cph)
with open(model_path, 'wb') as model_file:
    model_file.write(serialized_model)

n_features = len(feature_cols)
print("✓ Cox Proportional Hazards model saved to '../survivemodel.pkl'")
print(f"✓ Model ready for deployment with {n_features} features")

## 8. Key Survival Analysis Findings

### Summary of Insights:

**Kaplan-Meier Analysis:**
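- Overall median survival time indicates when 50% of customers are expected to have churned; if the survival curve never falls below 50% within the observed tenure range, the median is not reached and is reported as infinite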
- Survival probability decreases significantly in the first 12 months
- Different customer segments show distinct survival patterns

**Contract Type Impact:**
- Month-to-month contracts show steepest decline in survival
- Two-year contracts demonstrate highest retention and longest survival
- One-year contracts fall in between
- Statistical significance confirmed by log-rank test (p < 0.05)

**Internet Service Impact:**
- Fiber optic customers show lower survival rates despite premium service
- DSL customers have better retention
- Customers without internet service show mixed patterns
- Significant differences confirmed by log-rank test

**Cox Proportional Hazards Model:**
- Identifies multiple risk factors simultaneously
- Hazard ratios > 1 indicate increased churn risk
- Hazard ratios < 1 indicate protective factors
- Key high-risk factors likely include: month-to-month contracts, electronic check payments, lack of support services
- Key protective factors likely include: long-term contracts, automatic payments, bundled services

**Model Performance:**
- Concordance Index measures model's predictive discrimination
- Values closer to 1.0 indicate better prediction; a value of 0.5 corresponds to random ranking
- Model can be used for individual customer risk assessment

### Business Implications:
1. Target intervention programs at month-to-month contract customers
2. Investigate fiber optic service quality issues
3. Promote longer-term contract incentives
4. Bundle support services to reduce churn risk
5. Use Cox model for personalized retention strategies