Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

from imblearn.over_sampling import SMOTE

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

Load Preprocessed dataset

In [2]:
df = pd.read_csv("/content/Preprocessed_classification.csv")
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0,1,0,0,0,1,0,...,1,0,0,1,0,0,0,0,1,0
1,1,0,0,0,34,1,0,1,0,1,...,1,0,0,0,1,0,0,0,0,1
2,1,0,0,0,2,1,0,1,1,0,...,1,0,0,1,0,0,0,0,0,1
3,1,0,0,0,45,0,0,1,0,1,...,1,0,0,0,1,0,1,0,0,0
4,0,0,0,0,2,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 27 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   gender                                   7043 non-null   int64  
 1   SeniorCitizen                            7043 non-null   int64  
 2   Partner                                  7043 non-null   int64  
 3   Dependents                               7043 non-null   int64  
 4   tenure                                   7043 non-null   int64  
 5   PhoneService                             7043 non-null   int64  
 6   MultipleLines                            7043 non-null   int64  
 7   OnlineSecurity                           7043 non-null   int64  
 8   OnlineBackup                             7043 non-null   int64  
 9   DeviceProtection                         7043 non-null   int64  
 10  TechSupport                              7043 no

Checking for Imbalaces

In [4]:
print(df['Churn'].value_counts(normalize=True))

Churn
0    0.73463
1    0.26537
Name: proportion, dtype: float64


Assign Target Column

In [5]:
# Features and target
X = df.drop('Churn', axis=1)
y = df['Churn']

Split dataset

In [None]:
# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

Applying SMOTE

In [12]:
# Apply SMOTE
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Check the new class distribution
print("Original training set class distribution:\n", y_train.value_counts())
print("Resampled training set class distribution:\n", y_train_res.value_counts())


Original training set class distribution:
 Churn
0    4139
1    1495
Name: count, dtype: int64
Resampled training set class distribution:
 Churn
0    4139
1    4139
Name: count, dtype: int64


Decision Tree

In [13]:
# Initialize model
dt_model = DecisionTreeClassifier(random_state=42)

# Train
dt_model.fit(X_train_res, y_train_res)

# Predict
y_pred_dt = dt_model.predict(X_test)

# Evaluation
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Decision Tree Classification Report:\n", classification_report(y_test, y_pred_dt))


Decision Tree Accuracy: 0.7253371185237757
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.79      0.81      1035
           1       0.48      0.54      0.51       374

    accuracy                           0.73      1409
   macro avg       0.66      0.67      0.66      1409
weighted avg       0.74      0.73      0.73      1409



MLP Classifier

In [16]:
# Initialize MLP
mlp_model = MLPClassifier(hidden_layer_sizes=(64,32), max_iter=500, random_state=42)

# Train
mlp_model.fit(X_train_res_scaled, y_train_res)

# Predict
y_pred_mlp = mlp_model.predict(X_test_scaled)

# Evaluation
print("MLP Accuracy:", accuracy_score(y_test, y_pred_mlp))
print("MLP Classification Report:\n", classification_report(y_test, y_pred_mlp))


MLP Accuracy: 0.7849538679914834
MLP Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86      1035
           1       0.61      0.53      0.57       374

    accuracy                           0.78      1409
   macro avg       0.72      0.70      0.71      1409
weighted avg       0.78      0.78      0.78      1409



Random Forest

In [14]:
# Initialize model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train
rf_model.fit(X_train_res, y_train_res)

# Predict
y_pred_rf = rf_model.predict(X_test)

# Evaluation
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.7821149751596878
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      1035
           1       0.59      0.58      0.59       374

    accuracy                           0.78      1409
   macro avg       0.72      0.72      0.72      1409
weighted avg       0.78      0.78      0.78      1409



Logistic Regression

In [15]:
# Scale numeric columns
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
X_train_res_scaled = X_train_res.copy()
X_test_scaled = X_test.copy()
X_train_res_scaled[numeric_cols] = scaler.fit_transform(X_train_res[numeric_cols])
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Initialize Logistic Regression
lr_model = LogisticRegression(random_state=42, max_iter=1000)

# Train
lr_model.fit(X_train_res_scaled, y_train_res)

# Predict
y_pred_lr = lr_model.predict(X_test_scaled)

# Evaluation
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.7892122072391767
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.84      0.85      1035
           1       0.60      0.64      0.62       374

    accuracy                           0.79      1409
   macro avg       0.73      0.74      0.74      1409
weighted avg       0.79      0.79      0.79      1409



Model Evaluation Report

In [17]:
# Calculate metrics for each model
models = {
    "Decision Tree": y_pred_dt,
    "Random Forest": y_pred_rf,
    "Logistic Regression": y_pred_lr,
    "MLP": y_pred_mlp
}

results = []

for name, y_pred in models.items():
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({
        "Model": name,
        "Accuracy": round(acc, 3),
        "Precision": round(prec, 3),
        "Recall": round(rec, 3),
        "F1-Score": round(f1, 3)
    })

# Create a DataFrame for easy visualization
results_df = pd.DataFrame(results)
print("Model Performance Comparison:")
print(results_df)


Model Performance Comparison:
                 Model  Accuracy  Precision  Recall  F1-Score
0        Decision Tree     0.725      0.484   0.540     0.511
1        Random Forest     0.782      0.591   0.583     0.587
2  Logistic Regression     0.789      0.596   0.636     0.616
3                  MLP     0.785      0.610   0.527     0.565


In [6]:
# Scale numeric columns for Logistic Regression and MLP
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler()
X_train_res_scaled = X_train_res.copy()
X_test_scaled = X_test.copy()
X_train_res_scaled[numeric_cols] = scaler.fit_transform(X_train_res[numeric_cols])
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])


Training Decision Tree with Hyperparameter tuning

In [7]:
# Define parameter grid
dt_params = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search
dt_grid = GridSearchCV(DecisionTreeClassifier(random_state=42),
                       dt_params, cv=5, scoring='f1', n_jobs=-1)
dt_grid.fit(X_train_res, y_train_res)

# Best model
best_dt = dt_grid.best_estimator_
y_pred_dt = best_dt.predict(X_test)

print("Best Decision Tree Params:", dt_grid.best_params_)


Best Decision Tree Params: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5}


Training Neural Network with Hyperparameter tuning

In [10]:
mlp_params = {
    'hidden_layer_sizes': [(64,32), (128,64), (64,64,32)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant', 'adaptive']
}

mlp_grid = GridSearchCV(MLPClassifier(max_iter=500, random_state=42),
                        mlp_params, cv=3, scoring='f1', n_jobs=-1)
mlp_grid.fit(X_train_res_scaled, y_train_res)

best_mlp = mlp_grid.best_estimator_
y_pred_mlp = best_mlp.predict(X_test_scaled)

print("Best MLP Params:", mlp_grid.best_params_)


Best MLP Params: {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (64, 64, 32), 'learning_rate': 'constant'}


Training Random Forest with Hyperparameter tuning

In [8]:
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf_grid = GridSearchCV(RandomForestClassifier(random_state=42),
                       rf_params, cv=3, scoring='f1', n_jobs=-1)
rf_grid.fit(X_train_res, y_train_res)

best_rf = rf_grid.best_estimator_
y_pred_rf = best_rf.predict(X_test)

print("Best Random Forest Params:", rf_grid.best_params_)


Best Random Forest Params: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


Training Logistic Regression with Hyperparameter tuning

In [9]:
lr_params = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],  # 'l1' can be used with solver='liblinear'
    'solver': ['lbfgs']
}

lr_grid = GridSearchCV(LogisticRegression(max_iter=1000, random_state=42),
                       lr_params, cv=5, scoring='f1', n_jobs=-1)
lr_grid.fit(X_train_res_scaled, y_train_res)

best_lr = lr_grid.best_estimator_
y_pred_lr = best_lr.predict(X_test_scaled)

print("Best Logistic Regression Params:", lr_grid.best_params_)


Best Logistic Regression Params: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}


Model evaluation with Hyperparameter tuning

In [11]:
models = {
    "Decision Tree": y_pred_dt,
    "Random Forest": y_pred_rf,
    "Logistic Regression": y_pred_lr,
    "MLP": y_pred_mlp
}

results = []

for name, y_pred in models.items():
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({
        "Model": name,
        "Accuracy": round(acc,3),
        "Precision": round(prec,3),
        "Recall": round(rec,3),
        "F1-Score": round(f1,3)
    })

results_df = pd.DataFrame(results)
print("Model Performance Comparison (after Hyperparameter Tuning):")
print(results_df)


Model Performance Comparison (after Hyperparameter Tuning):
                 Model  Accuracy  Precision  Recall  F1-Score
0        Decision Tree     0.749      0.521   0.658     0.582
1        Random Forest     0.764      0.544   0.676     0.603
2  Logistic Regression     0.776      0.568   0.655     0.609
3                  MLP     0.752      0.532   0.537     0.535


Comparisons

In [12]:
# Find best model based on F1-score
best_f1_model = results_df.loc[results_df['F1-Score'].idxmax()]
print("Best model based on F1-Score:")
print(best_f1_model)

# Find best model based on Recall
best_recall_model = results_df.loc[results_df['Recall'].idxmax()]
print("\nBest model based on Recall:")
print(best_recall_model)


Best model based on F1-Score:
Model        Logistic Regression
Accuracy                   0.776
Precision                  0.568
Recall                     0.655
F1-Score                   0.609
Name: 2, dtype: object

Best model based on Recall:
Model        Random Forest
Accuracy             0.764
Precision            0.544
Recall               0.676
F1-Score             0.603
Name: 1, dtype: object
