In [74]:
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import pickle

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [2]:
data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
df = data.copy()

In [4]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [5]:
df.isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [7]:
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

In [8]:
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [9]:
df.nunique()

customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64

In [10]:
categorical_fts = []
for col in df.columns:
    if df[col].dtype == 'object':
        categorical_fts.append(col)

In [11]:
categorical_fts

['customerID',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn']

In [12]:
numerical_fts = [col for col in df.columns if col not in categorical_fts]

In [13]:
numerical_fts

['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

In [14]:
df.drop(columns=['customerID'], inplace=True, axis=1)

In [15]:
categorical_fts.remove('customerID')

In [16]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [18]:
encoder = LabelEncoder()

In [76]:
# Fit encoder on categorical features
encoders = {}
for col in categorical_fts:
    encoders[col] = LabelEncoder()  # Change to OneHotEncoder() if used
    df[col] = encoders[col].fit_transform(df[col])

# Save encoders with feature names and classes
encoder_info = {"encoders": encoders, "features": categorical_fts}

with open("encoder.pkl", "wb") as file:
    pickle.dump(encoder_info, file)

In [21]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.5,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1840.75,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,151.65,1


In [22]:
numerical_fts.remove('SeniorCitizen')

In [75]:
scaler = StandardScaler()

df[numerical_fts] = scaler.fit_transform(df[numerical_fts])  
# Save scaler along with feature names
scaler_info = {"scaler": scaler, "features": numerical_fts}

with open("scaler.pkl", "wb") as file:
    pickle.dump(scaler_info, file)

In [25]:
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,-1.277445,0,1,0,0,2,0,0,0,0,0,1,2,-1.160323,-0.994242,0
1,1,0,0,0,0.066327,1,0,0,2,0,2,0,0,0,1,0,3,-0.259629,-0.173244,0
2,1,0,0,0,-1.236724,1,0,0,2,2,0,0,0,0,0,1,3,-0.36266,-0.959674,1
3,1,0,0,0,0.514251,0,1,0,2,0,2,2,0,0,1,0,0,-0.746535,-0.194766,0
4,0,0,0,0,-1.236724,1,0,1,0,0,0,0,0,0,0,1,2,0.197365,-0.94047,1


Model training 

In [27]:
X = df.drop(columns=['Churn'], axis=1)
y = df['Churn']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
model = LogisticRegression(max_iter=5000)  
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8176011355571328

Confusion Matrix:
 [[935 101]
 [156 217]]

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.90      0.88      1036
           1       0.68      0.58      0.63       373

    accuracy                           0.82      1409
   macro avg       0.77      0.74      0.75      1409
weighted avg       0.81      0.82      0.81      1409



In [32]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

print("Random Forest Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nRandom Forest Classification Report:\n", classification_report(y_test, y_pred_rf))

Random Forest Confusion Matrix:
 [[949  87]
 [197 176]]

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.92      0.87      1036
           1       0.67      0.47      0.55       373

    accuracy                           0.80      1409
   macro avg       0.75      0.69      0.71      1409
weighted avg       0.79      0.80      0.79      1409



In [46]:
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
xgb_model = XGBClassifier(scale_pos_weight=scale_pos_weight, eval_metric="logloss")

# Train the model
xgb_model.fit(X_train, y_train)

# Predict on test data
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("\nXGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))

Accuracy: 0.7615330021291696
XGBoost Confusion Matrix:
 [[824 212]
 [124 249]]

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.80      0.83      1036
           1       0.54      0.67      0.60       373

    accuracy                           0.76      1409
   macro avg       0.70      0.73      0.71      1409
weighted avg       0.78      0.76      0.77      1409



In [36]:
# Ensure the model is trained before this step
explainer = shap.TreeExplainer(xgb_model)  # Replace 'xgb_model' with your trained XGBoost model
shap_values = explainer.shap_values(X_train)  # Extract SHAP values

# Convert SHAP values to feature importance
shap_importance = pd.DataFrame({
    "Feature": X_train.columns, 
    "SHAP Value": np.abs(shap_values).mean(axis=0)  # Take absolute mean SHAP values
}).sort_values(by="SHAP Value", ascending=True)

print(shap_importance.head(15))  # View least important features

# Set threshold (e.g., remove bottom 5 features)
num_features_to_remove = 10
low_importance_features = shap_importance["Feature"].iloc[:num_features_to_remove].tolist()

# Drop from dataset
X_train_2 = X_train.drop(columns=low_importance_features)
X_test_2 = X_test.drop(columns=low_importance_features)

print(f"Removed features: {low_importance_features}")

             Feature  SHAP Value
5       PhoneService    0.023323
10  DeviceProtection    0.050359
1      SeniorCitizen    0.066225
3         Dependents    0.072560
2            Partner    0.073873
12       StreamingTV    0.080528
0             gender    0.082669
13   StreamingMovies    0.113290
6      MultipleLines    0.129245
9       OnlineBackup    0.136952
7    InternetService    0.156553
15  PaperlessBilling    0.184305
11       TechSupport    0.216565
16     PaymentMethod    0.235858
8     OnlineSecurity    0.319130
Removed features: ['PhoneService', 'DeviceProtection', 'SeniorCitizen', 'Dependents', 'Partner', 'StreamingTV', 'gender', 'StreamingMovies', 'MultipleLines', 'OnlineBackup']


In [47]:
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
xgb_model = XGBClassifier(scale_pos_weight=scale_pos_weight, eval_metric="logloss")

# Train the model
xgb_model.fit(X_train_2, y_train)

# Predict on test data
y_pred_xgb = xgb_model.predict(X_test_2)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("XGBoost Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("\nXGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))

Accuracy: 0.7629524485450674
XGBoost Confusion Matrix:
 [[810 226]
 [108 265]]

XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.78      0.83      1036
           1       0.54      0.71      0.61       373

    accuracy                           0.76      1409
   macro avg       0.71      0.75      0.72      1409
weighted avg       0.79      0.76      0.77      1409



In [48]:
X_train_2.columns

Index(['tenure', 'InternetService', 'OnlineSecurity', 'TechSupport',
       'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges',
       'TotalCharges'],
      dtype='object')

better with default params

LOGISTIC REGG IS BETTER THAN RF AND XGB

NEURAL NETWORK

In [55]:
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid')
])


# Compile Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train Model
history = model.fit(X_train, y_train, epochs=50, batch_size=40, class_weight=class_weight_dict, validation_data=(X_test, y_test))

# Evaluate Model
y_pred = (model.predict(X_test) > 0.5).astype(int)
f1 = f1_score(y_test, y_pred)

print(f"F1 Score: {f1:.4f}")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6389 - loss: 0.6930 - val_accuracy: 0.7977 - val_loss: 0.4541
Epoch 2/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 814us/step - accuracy: 0.7018 - loss: 0.5805 - val_accuracy: 0.7715 - val_loss: 0.4538
Epoch 3/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 800us/step - accuracy: 0.7154 - loss: 0.5495 - val_accuracy: 0.7644 - val_loss: 0.4631
Epoch 4/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 805us/step - accuracy: 0.7451 - loss: 0.5121 - val_accuracy: 0.7544 - val_loss: 0.4780
Epoch 5/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 792us/step - accuracy: 0.7256 - loss: 0.5178 - val_accuracy: 0.7630 - val_loss: 0.4782
Epoch 6/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 794us/step - accuracy: 0.7348 - loss: 0.5031 - val_accuracy: 0.7530 - val_loss: 0.4844
Epoch 7/50
[1m141/141[0m [

applying smote and then passing

In [59]:
smote = SMOTE(sampling_strategy=0.5, random_state=42)  # Make minority class 50% of majority
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


In [60]:
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_smote.shape[1],)),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid')
])


# Compile Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train Model
history = model.fit(X_train_smote, y_train_smote, epochs=50, batch_size=40, validation_data=(X_test, y_test))

# Evaluate Model
y_pred = (model.predict(X_test) > 0.5).astype(int)
f1 = f1_score(y_test, y_pred)

print(f"F1 Score: {f1:.4f}")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6752 - loss: 0.6591 - val_accuracy: 0.7828 - val_loss: 0.4604
Epoch 2/50
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 764us/step - accuracy: 0.7538 - loss: 0.5079 - val_accuracy: 0.8013 - val_loss: 0.4231
Epoch 3/50
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 763us/step - accuracy: 0.7556 - loss: 0.4925 - val_accuracy: 0.8006 - val_loss: 0.4218
Epoch 4/50
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 814us/step - accuracy: 0.7658 - loss: 0.4805 - val_accuracy: 0.8055 - val_loss: 0.4214
Epoch 5/50
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 760us/step - accuracy: 0.7615 - loss: 0.4679 - val_accuracy: 0.7984 - val_loss: 0.4235
Epoch 6/50
[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 801us/step - accuracy: 0.7705 - loss: 0.4628 - val_accuracy: 0.8055 - val_loss: 0.4111
Epoch 7/50
[1m156/156[0m [

In [82]:
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid')
])

lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)
# Compile Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

# Train Model
history = model.fit(X_train, y_train, epochs=50, batch_size=40, class_weight=class_weight_dict, validation_data=(X_test, y_test))

# Evaluate Model
y_pred = (model.predict(X_test) > 0.5).astype(int)
f1 = f1_score(y_test, y_pred)

print(f"F1 Score: {f1:.4f}")
model.save("models/nn_model.h5")  # Save Neural Network

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - AUC: 0.6977 - loss: 0.7346 - val_AUC: 0.8508 - val_loss: 0.4983
Epoch 2/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 841us/step - AUC: 0.8033 - loss: 0.5657 - val_AUC: 0.8486 - val_loss: 0.4925
Epoch 3/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 823us/step - AUC: 0.7945 - loss: 0.5547 - val_AUC: 0.8501 - val_loss: 0.4977
Epoch 4/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 821us/step - AUC: 0.8210 - loss: 0.5167 - val_AUC: 0.8503 - val_loss: 0.5016
Epoch 5/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 834us/step - AUC: 0.8278 - loss: 0.5087 - val_AUC: 0.8555 - val_loss: 0.4909
Epoch 6/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 827us/step - AUC: 0.8327 - loss: 0.5041 - val_AUC: 0.8536 - val_loss: 0.4900
Epoch 7/50
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 807us/step -



F1 Score: 0.6362


FINAL MODEL

In [83]:
from sklearn.ensemble import VotingClassifier
import xgboost as xgb

# Train XGBoost Model
xgb_clf = xgb.XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, scale_pos_weight=5, random_state=42)
xgb_clf.fit(X_train, y_train)

# Convert Neural Network Predictions to Binary
y_pred_nn = (model.predict(X_test).ravel() > 0.5).astype(int)  # Convert to 1D array

# Convert XGBoost Predictions to Binary
y_pred_xgb = xgb_clf.predict(X_test).astype(int)  # XGBoost outputs already 1D

# Ensemble (Average of Predictions)
y_pred_ensemble = (y_pred_nn + y_pred_xgb) / 2
y_pred_ensemble = (y_pred_ensemble > 0.5).astype(int)  # Final binary conversion

# Evaluate F1-Score
f1 = f1_score(y_test, y_pred_ensemble)
print(f"F1 Score (Ensemble): {f1:.4f}")
xgb_clf.save_model("models/xgb_model.json")

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 497us/step
F1 Score (Ensemble): 0.6522


In [73]:
accuracy_score(y_test, y_pred_ensemble)

0.7615330021291696

In [84]:
import shutil

shutil.move("encoder.pkl", "models/encoder.pkl")
shutil.move("scaler.pkl", "models/scaler.pkl")

'models/scaler.pkl'