In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
import pickle

In [None]:
data = pd.read_csv("Churn_Modelling.csv")
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15615795,Johnson,416,France,Female,29,4,225721.23,3,0,1,125534.39,0
1,2,15600860,Rodriguez,353,Spain,Female,40,10,62202.42,2,0,0,91167.84,0
2,3,15676820,Smith,573,Spain,Male,32,9,234936.34,4,0,1,97267.6,0
3,4,15654886,Miller,350,Germany,Female,29,5,160467.05,1,0,1,158691.64,0
4,5,15606265,Brown,639,Germany,Female,21,7,112608.52,4,0,0,52289.16,0


In [None]:
# preprocess the data
# Drop irrelevant columns
data = data.drop(['RowNumber','CustomerId','Surname'],axis=1)
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,416,France,Female,29,4,225721.23,3,0,1,125534.39,0
1,353,Spain,Female,40,10,62202.42,2,0,0,91167.84,0
2,573,Spain,Male,32,9,234936.34,4,0,1,97267.60,0
3,350,Germany,Female,29,5,160467.05,1,0,1,158691.64,0
4,639,Germany,Female,21,7,112608.52,4,0,0,52289.16,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,611,Spain,Male,36,8,162316.73,1,0,1,105025.67,0
9996,613,France,Male,91,2,241200.46,2,1,1,56559.80,1
9997,366,France,Male,22,4,33346.79,4,0,0,170028.93,0
9998,425,Germany,Female,87,6,7741.70,2,1,0,88058.42,0


In [50]:
# Encode categorical variables
label_encoder_gender=LabelEncoder()
data['Gender']=label_encoder_gender.fit_transform(data['Gender'])
data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,416,France,0,29,4,225721.23,3,0,1,125534.39,0
1,353,Spain,0,40,10,62202.42,2,0,0,91167.84,0
2,573,Spain,1,32,9,234936.34,4,0,1,97267.60,0
3,350,Germany,0,29,5,160467.05,1,0,1,158691.64,0
4,639,Germany,0,21,7,112608.52,4,0,0,52289.16,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,611,Spain,1,36,8,162316.73,1,0,1,105025.67,0
9996,613,France,1,91,2,241200.46,2,1,1,56559.80,1
9997,366,France,1,22,4,33346.79,4,0,0,170028.93,0
9998,425,Germany,0,87,6,7741.70,2,1,0,88058.42,0


In [51]:
# one hot encode 'Geography'
from sklearn.preprocessing import OneHotEncoder
onehot_encoder_geo=OneHotEncoder(handle_unknown='ignore',sparse_output=False)
geo_encoder=onehot_encoder_geo.fit_transform(data[['Geography']])
geo_encoder

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [52]:
onehot_encoder_geo.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [53]:
print(type(geo_encoder))
print(geo_encoder.shape)

<class 'numpy.ndarray'>
(10000, 3)


In [54]:
geo_encoded_df = pd.DataFrame(
    geo_encoder,
    columns=onehot_encoder_geo.get_feature_names_out(['Geography']),
    index=data.index 
)

In [55]:
data = pd.concat([data, geo_encoded_df], axis=1)
data = data.drop('Geography', axis=1)


In [56]:
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,416,0,29,4,225721.23,3,0,1,125534.39,0,1.0,0.0,0.0
1,353,0,40,10,62202.42,2,0,0,91167.84,0,0.0,0.0,1.0
2,573,1,32,9,234936.34,4,0,1,97267.6,0,0.0,0.0,1.0
3,350,0,29,5,160467.05,1,0,1,158691.64,0,0.0,1.0,0.0
4,639,0,21,7,112608.52,4,0,0,52289.16,0,0.0,1.0,0.0


In [57]:
# save the encoders and sscaler
with open('label_encoder_gender.pkl','wb') as file:
    pickle.dump(label_encoder_gender,file)

with open('onehot_encoder_geo.pkl','wb') as file:
    pickle.dump(onehot_encoder_geo,file)

In [58]:
# divide the dataset into dependent and independent feature
X=data.drop('Exited',axis=1)
y=data['Exited']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

# scale these feature
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [59]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1]),
    y=y_train
)
class_weight_dict = {
    0: class_weights[0],
    1: class_weights[1]
}

In [60]:
with open('scaler.pkl','wb') as file:
    pickle.dump(scaler,file)

ANN implementation

In [61]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
import datetime

In [62]:
X_train.shape[1]

12

In [63]:
import tensorflow as tf
print(tf.__version__)


2.16.1


In [64]:
model = Sequential([
    Dense(128,activation='relu',input_shape=(X_train.shape[1],)),    
    Dense(64,activation='relu'),
    Dense(32,activation='relu'),
    Dense(1,activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [65]:
model.summary()

In [66]:
# compile the model
model.compile(optimizer="Adam",loss="binary_crossentropy",metrics=['accuracy'])

In [67]:
# set up the tensorboard
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
log_dir="logs/fit" + datetime.datetime.now().strftime("%Y%m%d.%H%M%S")
tensorflow_callbacks = TensorBoard(log_dir=log_dir,histogram_freq=1)

In [68]:
# setup early stopping
early_stopping_callback=EarlyStopping(monitor='val_loss',patience=10,restore_best_weights=True)

In [69]:
model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    class_weight=class_weight_dict
)


Epoch 1/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.5700 - loss: 0.6933 - val_accuracy: 0.4006 - val_loss: 0.7085
Epoch 2/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5302 - loss: 0.6867 - val_accuracy: 0.6506 - val_loss: 0.6697
Epoch 3/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5973 - loss: 0.6831 - val_accuracy: 0.5619 - val_loss: 0.6883
Epoch 4/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6172 - loss: 0.6792 - val_accuracy: 0.4831 - val_loss: 0.7045
Epoch 5/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5927 - loss: 0.6769 - val_accuracy: 0.6094 - val_loss: 0.6762
Epoch 6/50
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6111 - loss: 0.6740 - val_accuracy: 0.6350 - val_loss: 0.6625
Epoch 7/50
[1m200/200[0m 

<keras.src.callbacks.history.History at 0x16d8c9db2f0>

In [87]:
from sklearn.metrics import confusion_matrix, classification_report

y_pred = (model.predict(X_test) > 0.3).astype(int)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[[1208  417]
 [ 280   95]]
              precision    recall  f1-score   support

           0       0.81      0.74      0.78      1625
           1       0.19      0.25      0.21       375

    accuracy                           0.65      2000
   macro avg       0.50      0.50      0.50      2000
weighted avg       0.69      0.65      0.67      2000



In [70]:
# train the model
history=model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=100,
                  callbacks=[tensorflow_callbacks,early_stopping_callback])

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.8571 - loss: 0.3780 - val_accuracy: 0.7260 - val_loss: 0.8463
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8676 - loss: 0.3397 - val_accuracy: 0.7530 - val_loss: 0.7948
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.8729 - loss: 0.3239 - val_accuracy: 0.7280 - val_loss: 0.8167
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8760 - loss: 0.3130 - val_accuracy: 0.7490 - val_loss: 0.7990
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8761 - loss: 0.3037 - val_accuracy: 0.7250 - val_loss: 0.7891
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8811 - loss: 0.2963 - val_accuracy: 0.7135 - val_loss: 0.8131
Epoch 7/100
[1m250/2

In [71]:
model.save('model.h5')



In [72]:
# Load TensorBoard Extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [73]:
%tensorboard --logdir logs/fit20260128.112547

Reusing TensorBoard on port 6007 (pid 18464), started 7:24:51 ago. (Use '!kill 18464' to kill it.)

In [74]:
import tensorflow as tf
from tensorflow.keras.models import load_model
import pickle
import pandas as pd
import numpy as np

In [75]:
# Load the trained model,scaler pickle,onehot
model=load_model('model.h5')

# load the encoder and scaler
with open('onehot_encoder_geo.pkl','rb') as file:
    label_encoder_geo= pickle.load(file)
with open('label_encoder_gender.pkl','rb') as file:
    label_encoder_gender= pickle.load(file)
with open('scaler.pkl','rb') as file:
    scaler= pickle.load(file)



In [76]:
# Example input data
input_data = {
    'CreditScore':600,
    'Geography':'France',
    'Gender':'Male',
    'Age':40,
    'Tenure':3,
    'Balance':60000,
    'NumOfProducts':2,
    'HasCrCard':1,
    'IsActiveMember':1,
    'EstimatedSalary':50000
}

In [77]:
# one-hot encode geography
geo_encoded = label_encoder_geo.transform([[input_data['Geography']]])
geo_encoded_df = pd.DataFrame(geo_encoded, columns=label_encoder_geo.get_feature_names_out(['Geography']))
geo_encoded_df



Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0


In [78]:
input_df = pd.DataFrame([input_data])
input_df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,600,France,Male,40,3,60000,2,1,1,50000


In [79]:
input_df['Gender'] = label_encoder_gender.transform(input_df['Gender'])
input_df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,600,France,1,40,3,60000,2,1,1,50000


In [80]:
# concatination one hot encoded
input_df = pd.concat([input_df.drop("Geography",axis=1),geo_encoded_df],axis=1)
input_df

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
0,600,1,40,3,60000,2,1,1,50000,1.0,0.0,0.0


In [81]:
# scaling the input data
input_scaled = scaler.transform(input_df)
input_scaled

array([[-0.00182487,  1.0307217 , -0.66015228, -0.62734571, -0.91832792,
        -0.42734674,  1.01435299,  1.01080841, -1.00723068,  1.00853643,
        -0.59140118, -0.5698444 ]])

In [82]:
# predict churn
prediction=model.predict(input_scaled)
prediction

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 198ms/step


array([[0.55506337]], dtype=float32)

In [83]:
prediction_proba = prediction[0][0]
prediction_proba

0.55506337

In [84]:
if prediction_proba > 0.5:
    print('The customer is likely to churn')
else:
    print('The customer is not likely to churn')

The customer is likely to churn
