# 📊 Customer Churn Prediction Using Machine Learning
This project aims to predict customer churn using the Telco Customer Churn dataset. We will go through data preprocessing, exploratory data analysis (EDA), model training, and evaluation.


In [None]:
import pandas as pd
import numpy as np

# Read the Telco churn CSV from the current working directory.
df = pd.read_csv('Telco-Customer-Churn.csv')

# Header names may carry leading/trailing whitespace; strip it from each one.
df = df.rename(columns=str.strip)

# Preview the first rows (rendered as the cell output in a notebook).
df.head()

In [None]:
# Clean the raw columns before modelling. Every step is written so that
# re-running this cell leaves an already-cleaned frame unchanged.

# Remove the customerID column; errors='ignore' makes this a no-op when the
# column is absent.
df.drop(columns='customerID', errors='ignore', inplace=True)

# Convert TotalCharges to numeric: entries that cannot be parsed become NaN
# (errors='coerce') and are then filled with the column median.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Encode the target as 1 ('Yes') / 0 ('No'). Skip the mapping when Churn is
# already numeric: mapping an encoded column a second time would turn every
# value into NaN, because 1 and 0 are not keys of the lookup.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many rows fall into each Churn class.
churn_ax = sns.countplot(data=df, x='Churn')
churn_ax.set_title('Churn Distribution')
plt.show()

# Heatmap of pairwise correlations between the numeric columns only.
numeric_df = df.select_dtypes(include='number')
correlations = numeric_df.corr()
_, heatmap_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Feature Correlation')
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each object-dtype column with integer codes produced by a
# LabelEncoder that is fit on that column alone.
object_columns = df.select_dtypes(include='object').columns
for column_name in object_columns:
    encoder = LabelEncoder()
    df[column_name] = encoder.fit_transform(df[column_name])

In [None]:
from sklearn.model_selection import train_test_split

# Separate the Churn target from the remaining feature columns.
y = df['Churn']
X = df.drop(columns='Churn')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Fit a 100-tree random forest on the training split (seeded for repeatability).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the model on the held-out rows.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

# Report the confusion matrix, the per-class metrics and the overall accuracy.
for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)
print(f'Accuracy: {test_accuracy:.2f}')

In [None]:
# Pair each feature column with the importance the fitted forest gave it,
# ordered from most to least important.
features, importances = X.columns, model.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)

# Horizontal bar chart of the ten highest-ranked features.
top_ten = feature_importance_df.head(10)
importance_ax = sns.barplot(data=top_ten, x='Importance', y='Feature')
importance_ax.set_title('Top 10 Feature Importances')
plt.show()