In [3]:
import os
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

warnings.filterwarnings('ignore')

## 1. Data Loading & Preprocessing

In [2]:
# Load the dataset.
# The CSV location can be overridden through the TELCO_CHURN_CSV environment
# variable so the notebook can run on another machine without editing code.
# When the variable is unset, the original path is used unchanged.
DATA_PATH = os.environ.get(
    'TELCO_CHURN_CSV',
    r'D:\Data Science Projects\Customer Churn Risk Scoring System\NoteBook\Data\Telco-Customer-Churn.csv',
)
raw_data = pd.read_csv(DATA_PATH)

# Build the cleaned frame without mutating raw_data, so re-running this cell
# (or inspecting the raw input later) always starts from the same data:
#   1. coerce TotalCharges to numeric; entries that cannot be parsed become NaN,
#   2. drop the rows where TotalCharges is NaN,
#   3. drop customerID, which is not listed among the model features below.
data = (
    raw_data
    .assign(TotalCharges=pd.to_numeric(raw_data['TotalCharges'], errors='coerce'))
    .dropna(subset=['TotalCharges'])
    .drop(columns=['customerID'])
)

# Column groups consumed by the preprocessing pipelines.
numerical_cols = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
categorical_cols = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod'
]

In [5]:
# Preprocessing for numeric features: median imputation, then standardisation.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Preprocessing for categorical features: fill gaps with the most frequent
# value, then one-hot encode. Categories unseen during fit are ignored at
# transform time instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group through its own transformer; any column not named
# in either group is passed through untouched.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
)

# Feature matrix: every column except the target.
X = data.drop('Churn', axis=1)

# Binary target: 1 where Churn is 'Yes', 0 for anything else.
y = data['Churn'].apply(lambda label: int(label == 'Yes'))

## 2. Model Training & Cross Validation

In [6]:
# Split and Train
# One seed shared by the split and by every estimator that draws random
# numbers, so the cross-validation scores below are reproducible after a
# kernel restart.
SEED = 42

# Hold out 20% of the rows as a test set (same partition as before: seed 42).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Candidate classifiers, all compared with their default hyperparameters.
# NOTE(review): use_label_encoder is reported as deprecated by recent XGBoost
# releases — confirm against the installed version before removing it.
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(random_state=SEED),
    'Gradient Boost': GradientBoostingClassifier(random_state=SEED),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=SEED),
    'LightGBM': LGBMClassifier(verbose=-1, random_state=SEED),
    'CatBoost': CatBoostClassifier(verbose=0, random_seed=SEED)
}

# Maps model name -> mean 5-fold cross-validated accuracy on the training split.
results = {}
for name, model in models.items():
    # Wrap preprocessing and the classifier in one pipeline so the
    # preprocessing is re-fitted inside every CV fold rather than on the
    # whole training set.
    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    # Perform 5-fold Cross-Validation
    cv_scores = cross_val_score(full_pipeline, X_train, y_train, cv=5, scoring='accuracy')
    results[name] = cv_scores.mean()

    print(f'{name} Average CV Accuracy: {cv_scores.mean():.4f}')

Logistic Regression Average CV Accuracy: 0.8068
SVM Average CV Accuracy: 0.8005
Random Forest Average CV Accuracy: 0.7956
Gradient Boost Average CV Accuracy: 0.8027
XGBoost Average CV Accuracy: 0.7879
LightGBM Average CV Accuracy: 0.8004
CatBoost Average CV Accuracy: 0.8009


## 3. Hyperparameter Tuning
We will tune the top 3 models to squeeze out better performance.