# Imports

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, RandomForestRegressor
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error, r2_score
from imblearn.combine import SMOTEENN
from scipy.stats import uniform, randint
import plotly.graph_objs as go
import matplotlib as mpl
import matplotlib.patches as mpatches
from plotly import tools
from plotly.subplots import make_subplots
from plotly.offline import iplot
import kagglehub
import shutil
import os
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings('ignore')

# Downloading Dataset

In [2]:
path = kagglehub.dataset_download("blastchar/telco-customer-churn")
content_dir = '/content'

shutil.move(path, os.path.join(content_dir, os.path.basename(path)))
print("Dataset moved to content directory:", os.path.join(content_dir, os.path.basename(path)))

Downloading from https://www.kaggle.com/api/v1/datasets/download/blastchar/telco-customer-churn?dataset_version_number=1...


100%|██████████| 172k/172k [00:00<00:00, 520kB/s]

Extracting files...
Dataset moved to content directory: /content/1





In [3]:
df = pd.read_csv("/content/1/WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


# Data Preprocessing

In [4]:
def object_to_int(dataframe_series):
    if dataframe_series.dtype=='object':
        dataframe_series = LabelEncoder().fit_transform(dataframe_series)
    return dataframe_series

In [5]:
df = df.apply(lambda x: object_to_int(x))

In [6]:
plt.figure(figsize=(14,7))

<Figure size 1400x700 with 0 Axes>

<Figure size 1400x700 with 0 Axes>

In [7]:
df = df.drop(['customerID'], axis = 1)

In [8]:
df['TotalCharges'] = pd.to_numeric(df.TotalCharges, errors='coerce')

In [9]:
df[np.isnan(df['TotalCharges'])]

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn


In [10]:
df[df['tenure'] == 0].index

Index([488, 753, 936, 1082, 1340, 3331, 3826, 4380, 5218, 6670, 6754], dtype='int64')

In [11]:
df.drop(labels=df[df['tenure'] == 0].index, axis=0, inplace=True)
df[df['tenure'] == 0].index

Index([], dtype='int64')

In [12]:
df.fillna(df["TotalCharges"].mean())

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,2505,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1466,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,157,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.30,1400,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.70,925,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,1,0,1,1,24,1,2,0,2,0,2,2,2,2,1,1,3,84.80,1597,0
7039,0,0,1,1,72,1,2,1,0,2,2,0,2,2,1,1,1,103.20,5698,0
7040,0,0,1,1,11,0,1,0,2,0,0,0,0,0,0,1,2,29.60,2994,0
7041,1,1,1,0,4,1,2,1,0,0,0,0,0,0,0,1,3,74.40,2660,1


In [13]:
df.isnull().sum()

Unnamed: 0,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0
OnlineBackup,0


In [14]:
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
df[numerical_cols].describe()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
count,7032.0,7032.0,7032.0
mean,32.421786,64.798208,3262.890216
std,24.54526,30.085974,1885.766048
min,1.0,18.25,1.0
25%,9.0,35.5875,1610.0
50%,29.0,70.35,3254.5
75%,55.0,89.8625,4904.25
max,72.0,118.75,6530.0


In [15]:
X = df.drop(columns = ['Churn'])
y = df['Churn'].values

In [16]:
num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']

# Train/Test Split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.30, random_state = 40, stratify=y)

# Training Models Sequentially

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.30, random_state = 40, stratify=y)

model_scores = []

models = [
    ('Random Forest', RandomForestClassifier(random_state=42),
        {'model__n_estimators': [50, 100, 200],
         'model__max_depth': [None, 10, 20]}),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42),
        {'model__n_estimators': [50, 100, 200],
         'model__learning_rate': [0.05, 0.1, 0.5]}),
    ('Support Vector Machine', SVC(random_state=42, class_weight='balanced'),
        {'model__C': [0.1, 1, 10],
         'model__gamma': ['scale', 'auto']}),
    ('Logistic Regression', LogisticRegression(random_state=42, class_weight='balanced'),
        {'model__C': [0.1, 1, 10],
         'model__penalty': ['l1', 'l2']}),
    ('K-Nearest Neighbors', KNeighborsClassifier(),
        {'model__n_neighbors': [3, 5, 7],
         'model__weights': ['uniform', 'distance']}),
    ('Decision Tree', DecisionTreeClassifier(random_state=42),
        {'model__max_depth': [None, 10, 20],
         'model__min_samples_split': [2, 5, 10]}),
    ('AdaBoost', AdaBoostClassifier(random_state=42),
        {'model__n_estimators': [50, 100, 200],
         'model__learning_rate': [0.05, 0.1, 0.5]}),
    ('XG Boost', XGBClassifier(random_state=42),
        {'model__n_estimators': [50, 100, 200],
         'model__learning_rate': [0.05, 0.1, 0.5]}),
    ('Naive Bayes', GaussianNB(), {})
]

best_model = None
best_accuracy = 0.0

for name, model, param_grid in models:
    pipeline = Pipeline([
        ('scaler', MinMaxScaler()),
        ('model', model)
    ])

    if param_grid:
        grid_search = GridSearchCV(pipeline, param_grid, cv=2)
        grid_search.fit(X_train, y_train)
        pipeline = grid_search.best_estimator_

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    model_scores.append({'Model': name, 'Accuracy': accuracy})

    print("Model:", name)
    print("Test Accuracy:", round(accuracy, 3), "%")
    print()


    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline

print("Best Model:")
print("Test Accuracy:", best_accuracy)
print("Model Pipeline:", best_model, "with accuracy", round(best_accuracy, 2), "%")

Model: Random Forest
Test Accuracy: 0.809 %

Model: Gradient Boosting
Test Accuracy: 0.803 %

Model: Support Vector Machine
Test Accuracy: 0.746 %

Model: Logistic Regression
Test Accuracy: 0.75 %

Model: K-Nearest Neighbors
Test Accuracy: 0.755 %

Model: Decision Tree
Test Accuracy: 0.763 %

Model: AdaBoost
Test Accuracy: 0.809 %

Model: XG Boost
Test Accuracy: 0.803 %

Model: Naive Bayes
Test Accuracy: 0.76 %

Best Model:
Test Accuracy: 0.809478672985782
Model Pipeline: Pipeline(steps=[('scaler', MinMaxScaler()),
                ('model',
                 RandomForestClassifier(max_depth=10, random_state=42))]) with accuracy 0.81 %


# Creating a Voting classifier

In [19]:
models = {
    'rf': RandomForestClassifier(random_state=42),
    'gb': GradientBoostingClassifier(random_state=42),
    'ab': AdaBoostClassifier(random_state=42),
    'xgb': XGBClassifier(random_state=42)
}

param_grids = {
    'rf': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]},
    'gb': {'n_estimators': [50, 100, 200], 'learning_rate': [0.05, 0.1, 0.5]},
    'ab': {'n_estimators': [50, 100, 200], 'learning_rate': [0.05, 0.1, 0.5]},
    'xgb': {'n_estimators': [50, 100, 200], 'learning_rate': [0.05, 0.1, 0.5]}
}

pipelines = {}
for name, model in models.items():
    if param_grids[name]:
        pipelines[name] = GridSearchCV(model, param_grids[name], cv=2)
    else:
        pipelines[name] = model


for name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)


voting_clf = VotingClassifier(estimators=[
    ('rf', pipelines['rf'].best_estimator_),
    ('gb', pipelines['gb'].best_estimator_),
    ('ab', pipelines['ab'].best_estimator_),
    ('xgb', pipelines['xgb'].best_estimator_),
], voting='soft')

voting_clf.fit(X_train, y_train)

y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Voting Classifier Test Accuracy: {round(accuracy, 3) * 100}%")

Voting Classifier Test Accuracy: 80.9%


#### Using SMOTEENN (Synthetic Minority Over-sampling Technique with Edited Nearest Neighbors) to cater class imbalance

In [20]:
sm = SMOTEENN()
X_res, y_res = sm.fit_resample(X, y)

Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_res, y_res, test_size=0.2)

In [21]:
model_scores = []

models = [
    ('Random Forest', RandomForestClassifier(random_state=42),
        {'n_estimators': [50, 100, 200],
         'max_depth': [None, 10, 20]}),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42),
        {'n_estimators': [50, 100, 200],
         'learning_rate': [0.05, 0.1, 0.5]}),
    ('Support Vector Machine', SVC(random_state=42, class_weight='balanced'),
        {'C': [0.1, 1, 10],
         'gamma': ['scale', 'auto']}),
    ('Logistic Regression', LogisticRegression(random_state=42, class_weight='balanced'),
        {'C': [0.1, 1, 10],
         'penalty': ['l1', 'l2']}),
    ('K-Nearest Neighbors', KNeighborsClassifier(),
        {'n_neighbors': [3, 5, 7],
         'weights': ['uniform', 'distance']}),
    ('Decision Tree', DecisionTreeClassifier(random_state=42),
        {'max_depth': [None, 10, 20],
         'min_samples_split': [2, 5, 10]}),
    ('Ada Boost', AdaBoostClassifier(random_state=42),
        {'n_estimators': [50, 100, 200],
         'learning_rate': [0.05, 0.1, 0.5]}),
    ('XG Boost', XGBClassifier(random_state=42),
        {'max_depth': randint(3, 6),
         'learning_rate': uniform(0.01, 0.2),
         'n_estimators': randint(100, 300),
         'subsample': uniform(0.8, 0.2)}),
    ('Naive Bayes', GaussianNB(), {})
]

best_model = None
best_accuracy = 0.0

for name, model, param_grid in models:
    pipeline = Pipeline([
        ('scaler', MinMaxScaler()),
        ('model', model)
    ])
    if name == 'XG Boost':
        random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                                           n_iter=100, cv=3, verbose=0, random_state=42, n_jobs=-1)
        random_search.fit(Xr_train, yr_train)
        pipeline = random_search.best_estimator_

    elif param_grid:
        grid_search = GridSearchCV(model, param_grid, cv=2, verbose=0)
        grid_search.fit(Xr_train, yr_train)
        pipeline = grid_search.best_estimator_

    pipeline.fit(Xr_train, yr_train)

    y_pred = pipeline.predict(Xr_test)
    accuracy = accuracy_score(yr_test, y_pred)

    model_scores.append({'Model': name, 'Accuracy': accuracy})

    scores_df = pd.DataFrame(model_scores)

    print("Model:", name)
    print("Test Accuracy:", round(accuracy, 3),"%")
    print()

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline

print("Best Model:")
print("Test Accuracy:", best_accuracy)
print("Model Pipeline:", best_model, "with accuracy",  round(best_accuracy, 2), "%")

Model: Random Forest
Test Accuracy: 0.947 %

Model: Gradient Boosting
Test Accuracy: 0.943 %

Model: Support Vector Machine
Test Accuracy: 0.852 %

Model: Logistic Regression
Test Accuracy: 0.915 %

Model: K-Nearest Neighbors
Test Accuracy: 0.98 %

Model: Decision Tree
Test Accuracy: 0.915 %

Model: Ada Boost
Test Accuracy: 0.927 %

Model: XG Boost
Test Accuracy: 0.954 %

Model: Naive Bayes
Test Accuracy: 0.893 %

Best Model:
Test Accuracy: 0.9796954314720813
Model Pipeline: KNeighborsClassifier(n_neighbors=3, weights='distance') with accuracy 0.98 %
