What's new:
1. Use of SMOTE to synthetically produce new samples of the minority class.
2. Learnt about the stratify parameter to ensure equal distribution of classes in the training and test sets.
3. Use of a more complex model (more neurons per layer, more layers in the model and a different learning rate) to try and achieve better accuracy.

In [79]:
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

In [80]:
data = pd.read_csv('customer_churn.csv')

In [81]:
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [82]:
data.TotalCharges.isnull().sum()

0

In [83]:
data.TotalCharges.iloc[488]

' '

In [84]:
data['TotalCharges'].dtype

dtype('O')

In [85]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce').astype('float64')

In [86]:
data.TotalCharges.isnull().sum()

11

In [87]:
data['TotalCharges'].fillna(data['TotalCharges'].mean(), inplace=True)

In [88]:
data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692,2283.300441
std,0.368612,24.559481,30.090047,2265.000258
min,0.0,0.0,18.25,18.8
25%,0.0,9.0,35.5,402.225
50%,0.0,29.0,70.35,1400.55
75%,0.0,55.0,89.85,3786.6
max,1.0,72.0,118.75,8684.8


In [89]:
columns_to_encode = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                     'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                     'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
                     'PaperlessBilling', 'PaymentMethod', 'Churn']

label_encoders = {}

for column in columns_to_encode:
    label_encoder = tf.keras.layers.StringLookup(
        vocabulary=data[column].unique(), mask_token=None, num_oov_indices=0
    )
    data[column] = label_encoder(data[column])
    label_encoders[column] = label_encoder

print(data.head())

   customerID  gender  SeniorCitizen  Partner  Dependents  tenure  \
0  7590-VHVEG       0              0        0           0       1   
1  5575-GNVDE       1              0        1           0      34   
2  3668-QPYBK       1              0        1           0       2   
3  7795-CFOCW       1              0        1           0      45   
4  9237-HQITU       0              0        1           0       2   

   PhoneService  MultipleLines  InternetService  OnlineSecurity  ...  \
0             0              0                0               0  ...   
1             1              1                0               1  ...   
2             1              1                0               1  ...   
3             0              0                0               1  ...   
4             1              1                1               0  ...   

   DeviceProtection  TechSupport  StreamingTV  StreamingMovies  Contract  \
0                 0            0            0                0         0   


In [90]:
data = data.drop(columns = ['customerID'])

In [91]:
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,29.85,29.85,0
1,1,0,1,0,34,1,1,0,1,1,1,0,0,0,1,1,1,56.95,1889.5,0
2,1,0,1,0,2,1,1,0,1,0,0,0,0,0,0,0,1,53.85,108.15,1
3,1,0,1,0,45,0,0,0,1,1,1,1,0,0,1,1,2,42.3,1840.75,0
4,0,0,1,0,2,1,1,1,0,1,0,0,0,0,0,0,0,70.7,151.65,1


In [92]:
data.shape

(7043, 20)

In [93]:
import numpy as np

x = data.iloc[:, :-1]
y = data.iloc[:, -1]

x.shape, y.shape

((7043, 19), (7043,))

In [94]:
from imblearn.over_sampling import SMOTE
!pip install --upgrade scikit-learn imbalanced-learn

smote = SMOTE(sampling_strategy = 'minority')
x_sm, y_sm = smote.fit_resample(x,y)
y_sm.value_counts()



0    5174
1    5174
Name: Churn, dtype: int64

In [95]:
x_sm.shape, y_sm.shape

((10348, 19), (10348,))

In [96]:
y_sm.value_counts()

0    5174
1    5174
Name: Churn, dtype: int64

In [97]:
y_sm.unique()

array([0, 1])

Test-Train Split

In [98]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_sm, y_sm, test_size=0.2, random_state=42, stratify = y_sm)

In [99]:
y_train.value_counts(),
y_test.value_counts()

0    1035
1    1035
Name: Churn, dtype: int64

Model Building

In [100]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

model = Sequential ([
    Dense(19),
    Dense(38, activation = 'relu'),
    Dense(25, activation = 'relu'),
    Dense(19, activation = 'relu'),
    Dense(10, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
])

In [105]:
import keras

model.compile(
    loss = keras.losses.BinaryCrossentropy(),
    optimizer = keras.optimizers.Adam(lr = 0.0001),
    metrics = 'accuracy',
)



In [102]:
x_train.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
dtype: int64

In [106]:
model.fit(x_train, y_train, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x781431e74460>

In [107]:
model.fit(x_test, y_test, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7814316258d0>