# Customer Churn Prediction with Clustering and ML Models

This notebook performs an end-to-end analysis of customer churn, predicting churn with Logistic Regression and Random Forest classifiers. It covers exploratory data analysis (EDA), preprocessing and feature encoding, KMeans customer segmentation, model evaluation, and strategic recommendations.

In [None]:
# Importing necessary libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

## Step 1: Load the Dataset

In [None]:
# Resolve the dataset location. Setting the CHURN_DATA_PATH environment
# variable lets the notebook run on another machine without editing this cell;
# when it is unset, the original local path is used as the fallback.
DATA_PATH = os.environ.get(
    "CHURN_DATA_PATH",
    r"C:\Users\veera\Downloads\Bank Customer Churn Prediction.csv",
)
data = pd.read_csv(DATA_PATH)

# Strip stray whitespace from the header names so that later column lookups
# (e.g. 'churn') match exactly.
data.columns = data.columns.str.strip()
print(data.columns)

## Step 2: Exploratory Data Analysis

In [None]:
# Data-quality overview: per-column null counts, then summary statistics.
null_counts = data.isnull().sum()
print(null_counts)
summary_stats = data.describe()
print(summary_stats)

# Class balance of the target as a bar chart; report when the target column
# is absent instead of raising.
if 'churn' not in data.columns:
    print("Churn column is missing!")
else:
    churn_counts = data['churn'].value_counts()
    churn_counts.plot(kind='bar')
    plt.title('Churn Distribution')
    plt.xlabel('Churn Status')
    plt.ylabel('Count')
    plt.show()

In [None]:
# Boxplot: age vs churn.
# The age column is resolved case-insensitively: this cell originally asked
# for 'Age', while every other column the notebook references is lower-case
# ('churn', 'tenure'), so an exact-match lookup is likely to raise KeyError.
# NOTE(review): confirm the actual spelling against the header printed in Step 1.
age_col = next((col for col in data.columns if str(col).lower() == 'age'), None)

if age_col is None or 'churn' not in data.columns:
    # Same report-and-continue convention as the churn distribution plot.
    print("Age and/or churn column is missing; skipping the age boxplot.")
else:
    sns.boxplot(x='churn', y=age_col, data=data)
    plt.title('Age Distribution by Churn Status')
    plt.show()

## Step 3: Data Preprocessing

In [None]:
# Drop identifier columns if present; an ID carries no predictive signal and
# would otherwise be scaled, clustered on, and fed to the models.
# 'customer_id' is the lower-case spelling consistent with the other column
# names this notebook uses -- presumably the one in the bank CSV; verify
# against the header printed in Step 1.
data = data.drop(columns=['customerID', 'customer_id'], errors='ignore')

# Convert a text-labelled churn column ('Yes'/'No') to 1/0. Labels outside the
# mapping become NaN and are removed together with missing targets below.
if not pd.api.types.is_numeric_dtype(data['churn']):
    data['churn'] = data['churn'].map({'Yes': 1, 'No': 0})

# Rows without a target cannot be used for supervised training. Mean-filling
# them (as a blanket fillna would) produces fractional labels that a
# classifier rejects, so drop them and keep the target as integers.
data = (
    data
    .dropna(subset=['churn'])
    .assign(churn=lambda frame: frame['churn'].astype(int))
)

# Fill missing numeric values with the column mean. numeric_only=True is
# required: on pandas >= 2.0 a plain data.mean() raises TypeError while string
# columns are still present (they are only encoded in the next step).
numeric_means = data.mean(numeric_only=True)
data = data.fillna(numeric_means)

# One-hot encode categorical columns; drop_first avoids a redundant dummy
# column per category.
data = pd.get_dummies(data, drop_first=True)

## Step 4: Customer Segmentation using KMeans

In [None]:
# Segment customers with KMeans on standardised features.
# The target is excluded so that clusters are not built from the label, and a
# 'Cluster' column left over from a previous run of this cell is excluded too,
# so that re-running the cell gives the same segmentation.
scaler = StandardScaler()
features = data.drop(columns=['churn']).drop(columns=['Cluster'], errors='ignore')
scaled_features = scaler.fit_transform(features)

# n_init is set explicitly: scikit-learn 1.4 changed the default from 10 to
# 'auto', so relying on it gives version-dependent clusters (and a
# FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
data['Cluster'] = kmeans.fit_predict(scaled_features)

# Visualise the segments on two features. Guard the lookup so a dataset
# without these columns reports the problem instead of raising KeyError,
# matching the convention used for the churn distribution plot.
x_col, y_col = 'tenure', 'MonthlyCharges'
missing_cols = [col for col in (x_col, y_col) if col not in data.columns]
if missing_cols:
    print(f"Cannot plot customer segments; missing columns: {missing_cols}")
else:
    sns.scatterplot(data=data, x=x_col, y=y_col, hue='Cluster', palette='viridis')
    plt.title('Customer Segments')
    plt.show()

## Step 5: Train-Test Split

In [None]:
# Separate predictors from the target. X keeps the 'Cluster' segment label
# when the segmentation step has added it to `data`.
X = data.drop('churn', axis=1)
y = data['churn']

# Hold out 30% for evaluation. stratify=y keeps the churn/non-churn ratio the
# same in both splits, so metrics on an imbalanced target are not distorted by
# an unlucky draw; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

## Step 6: Logistic Regression Model

In [None]:
# Logistic regression on standardised features.
# The scaler lives inside the pipeline so it is fitted on the training split
# only (no test-set leakage) and applied automatically at prediction time.
# Scaling matters here: the default L2 penalty depends on feature scale, and
# the solver converges poorly on unscaled columns.
lr_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
lr_model.fit(X_train, y_train)

# Hard class predictions for accuracy; positive-class probabilities (column 1
# of predict_proba) for ROC-AUC.
y_pred_lr = lr_model.predict(X_test)
lr_acc = accuracy_score(y_test, y_pred_lr)
lr_auc = roc_auc_score(y_test, lr_model.predict_proba(X_test)[:, 1])

## Step 7: Random Forest Model

In [None]:
# Random forest baseline: 100 trees, seeded for reproducibility.
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard class predictions feed accuracy; the positive-class probability
# (column 1 of predict_proba) feeds ROC-AUC.
y_pred_rf = rf_model.predict(X_test)
rf_positive_proba = rf_model.predict_proba(X_test)[:, 1]

rf_acc = accuracy_score(y_test, y_pred_rf)
rf_auc = roc_auc_score(y_test, rf_positive_proba)

## Step 8: Model Evaluation & Comparison

In [None]:
# Side-by-side summary of both models' test-set accuracy and ROC-AUC,
# rounded to two decimals.
lr_summary = f"Logistic Regression - Accuracy: {lr_acc:.2f}, ROC-AUC: {lr_auc:.2f}"
rf_summary = f"Random Forest - Accuracy: {rf_acc:.2f}, ROC-AUC: {rf_auc:.2f}"
print(lr_summary, rf_summary, sep="\n")

In [None]:
# Confusion matrix for the random forest on the test split, drawn as an
# annotated heatmap with integer counts. Rows are labelled as actual classes
# and columns as predicted classes.
cm_rf = confusion_matrix(y_test, y_pred_rf)

predicted_labels = ['Pred No', 'Pred Yes']
actual_labels = ['Actual No', 'Actual Yes']

cm_ax = sns.heatmap(
    cm_rf,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=predicted_labels,
    yticklabels=actual_labels,
)
cm_ax.set_title('Random Forest Confusion Matrix')
plt.show()

In [None]:
# ROC curve for the random forest, built from its positive-class
# probabilities on the test split.
fpr, tpr, _ = roc_curve(y_test, rf_model.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)

# Legend label fixed: the original string was missing its closing parenthesis
# and rendered as "Random Forest (AUC = 0.xx".
plt.plot(fpr, tpr, label=f'Random Forest (AUC = {roc_auc:.2f})')

# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

## Step 9: Feature Importance & Strategy Suggestion

In [None]:
# Rank predictors by the random forest's feature importances and chart the
# five largest as a horizontal bar plot.
importances = pd.Series(data=rf_model.feature_importances_, index=X.columns)
top_features = importances.nlargest(5)
top_features.plot.barh()
plt.title('Top 5 Churn Predictors')
plt.show()

# The recommendation below is fixed text; it is not derived from the
# importances computed above.
strategy_lines = (
    "\nStrategy Suggestion:",
    "Focus on high-risk customers with low tenure and high monthly charges.",
    "Introduce loyalty rewards or personalized offers to retain these customers and reduce churn by an estimated 20%.",
)
for line in strategy_lines:
    print(line)