In [1]:
pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install jupyter-tensorboard

Collecting jupyter-tensorboard
  Using cached jupyter_tensorboard-0.2.0.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting notebook>=5.0 (from jupyter-tensorboard)
  Using cached notebook-7.3.2-py3-none-any.whl.metadata (10 kB)
Collecting jupyter-server<3,>=2.4.0 (from notebook>=5.0->jupyter-tensorboard)
  Using cached jupyter_server-2.15.0-py3-none-any.whl.metadata (8.4 kB)
Collecting jupyterlab-server<3,>=2.27.1 (from notebook>=5.0->jupyter-tensorboard)
  Using cached jupyterlab_server-2.27.3-py3-none-any.whl.metadata (5.9 kB)
Collecting jupyterlab<4.4,>=4.3.4 (from notebook>=5.0->jupyter-tensorboard)
  Using cached jupyterlab-4.3.5-py3-none-any.whl.metadata (16 kB)
Collecting notebook-shim<0.3,>=0.2 (from notebook>=5.0->jupyter-tensorboard)
  Using cached notebook_shim-0.2.4-py3-none-any.whl.metadata (4.0 kB)
Collecting anyio>=3.1.0 (from jupyter-server<3,>=2.4.0->notebook>=5.0->jupyter-tensorboard)
  Using cached anyio-4.8.0-py3-none-any.whl.metadata (4.

In [3]:
pip install plotly

Note: you may need to restart the kernel to use updated packages.


In [4]:
# import necessary libraries:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score

In [5]:
# Read the churn dataset from CSV, then work on a copy so the frame as loaded
# (`data`) is left untouched by later preprocessing.
csv_path = 'Bank Customer Churn Prediction.csv'
data = pd.read_csv(csv_path)
df = data.copy(deep=True)
df.head(5)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# Basic data details: shape, schema, column names, missing values, duplicates.
print('dimension: \n', df.shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# as its own statement; wrapping it in print() would emit the report *before*
# the label and then print a stray "None".
print('data integrity: ')
df.info()
print('columns of df are: \n', df.columns.to_list())
print('number of null values: \n', df.isna().sum())
print('number of duplicate records: \n', df.duplicated().sum())

dimension: 
 (10000, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       10000 non-null  int64  
 1   credit_score      10000 non-null  int64  
 2   country           10000 non-null  object 
 3   gender            10000 non-null  object 
 4   age               10000 non-null  int64  
 5   tenure            10000 non-null  int64  
 6   balance           10000 non-null  float64
 7   products_number   10000 non-null  int64  
 8   credit_card       10000 non-null  int64  
 9   active_member     10000 non-null  int64  
 10  estimated_salary  10000 non-null  float64
 11  churn             10000 non-null  int64  
dtypes: float64(2), int64(8), object(2)
memory usage: 937.6+ KB
data integrity: 
 None
columns of df are: 
 ['customer_id', 'credit_score', 'country', 'gender', 'age', 'tenure', 'balance', 'products_number', '

# Data Preprocessing and Feature Engineering

In [7]:
# customer_id plays no role in model training, so it is removed from the frame.
df = df.drop(columns='customer_id')

In [8]:
# Sanity check: show the frame's shape and the columns that remain.
remaining_columns = df.columns.to_list()
print(df.shape, remaining_columns, sep='\n')

(10000, 11)
['credit_score', 'country', 'gender', 'age', 'tenure', 'balance', 'products_number', 'credit_card', 'active_member', 'estimated_salary', 'churn']


In [9]:
# Find the object-dtype (categorical) columns that need encoding and report
# how many distinct values each one holds.
cat_cols = df.select_dtypes(include='object').columns
distinct_counts = df[cat_cols].nunique()
print(distinct_counts)

country    3
gender     2
dtype: int64


In [10]:
# Gender is label-encoded into a single integer column; country is handled
# separately with a one-hot encoder.
label_encoder_gender = LabelEncoder()
label_encoder_gender.fit(df['gender'])
df['gender'] = label_encoder_gender.transform(df['gender'])
df

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,619,France,0,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.80,3,1,0,113931.57,1
3,699,France,0,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,1,39,5,0.00,2,1,0,96270.64,0
9996,516,France,1,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,0,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,1,42,3,75075.31,2,1,0,92888.52,1


In [11]:
# One-hot encode country into one indicator column per category.
onehot_encoder_country = OneHotEncoder(sparse_output=False)
encoded_country = onehot_encoder_country.fit_transform(df[['country']])
# Build the encoded frame on df's own index: pd.concat(axis=1) aligns rows by
# index label, so a fresh RangeIndex would produce NaN-padded, misaligned rows
# if df's index were ever non-default (e.g. after rows had been filtered).
encoded_df = pd.DataFrame(
    encoded_country,
    columns=onehot_encoder_country.get_feature_names_out(['country']),
    index=df.index,
)
# `columns=` already names the axis, so no additional `axis=1` is passed to drop().
df = pd.concat([df.drop(columns=['country']), encoded_df], axis=1)
df

Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,country_France,country_Germany,country_Spain
0,619,0,42,2,0.00,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.80,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.00,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.10,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,0,1.0,0.0,0.0
9996,516,1,35,10,57369.61,1,1,1,101699.77,0,1.0,0.0,0.0
9997,709,0,36,7,0.00,1,0,1,42085.58,1,1.0,0.0,0.0
9998,772,1,42,3,75075.31,2,1,0,92888.52,1,0.0,1.0,0.0


In [12]:
# Persist both fitted encoders to disk for future use.
import pickle

encoders_to_save = {
    'label_encoder_gender.pkl': label_encoder_gender,
    'onehot_encoder_country.pkl': onehot_encoder_country,
}
for pkl_name, fitted_encoder in encoders_to_save.items():
    with open(pkl_name, 'wb') as file:
        pickle.dump(fitted_encoder, file)

In [13]:
# Split the frame into the target vector (churn) and the feature matrix
# (every other column).
y = df['churn']
X = df.drop(columns=['churn'])

In [14]:
# Sanity check: print the shapes of the feature matrix and the target vector.
for part in (X, y):
    print(part.shape)

(10000, 12)
(10000,)


In [15]:
# Divide into training and testing data: 20% of the rows are held out, and the
# fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [16]:
# Standardise the features. The scaler learns its mean/variance from the
# training split only; the test split is then transformed with those same
# statistics. Refitting on the test split would leak test information, scale
# the two splits inconsistently, and leave `scaler` fitted on the wrong data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
# Cross check: display the scaled training features. After scaling, X_train is
# a NumPy array rather than a DataFrame.
X_train

array([[ 0.35649971,  0.91324755, -0.6557859 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [-0.20389777,  0.91324755,  0.29493847, ..., -0.99850112,
         1.72572313, -0.57638802],
       [-0.96147213,  0.91324755, -1.41636539, ..., -0.99850112,
        -0.57946723,  1.73494238],
       ...,
       [ 0.86500853, -1.09499335, -0.08535128, ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.15932282,  0.91324755,  0.3900109 , ...,  1.00150113,
        -0.57946723, -0.57638802],
       [ 0.47065475,  0.91324755,  1.15059039, ..., -0.99850112,
         1.72572313, -0.57638802]])

In [18]:
# Persist the fitted scaler to disk as a pickle file.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# ANN Implementation

In [19]:
# importing necessary libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import datetime


2025-02-20 19:48:15.566466: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-20 19:48:15.609767: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-20 19:48:15.609814: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-20 19:48:15.610932: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-20 19:48:15.624996: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-20 19:48:15.628860: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [20]:
# Build the ANN layer by layer: two ReLU hidden layers followed by a single
# sigmoid output unit. The input width is taken from the training matrix.
n_features = X_train.shape[1]
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_features,)))  # hidden layer 1, fed by the inputs
model.add(Dense(32, activation='relu'))  # hidden layer 2
model.add(Dense(1, activation='sigmoid'))  # output layer

In [21]:
# Cross check: print the layer-by-layer architecture and parameter counts of
# the model built above.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                832       
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 2945 (11.50 KB)
Trainable params: 2945 (11.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [22]:
# Prepare the compilation settings: an Adam optimizer with a 0.01 learning rate
# and a binary cross-entropy loss that expects probabilities (not logits) and
# applies no label smoothing.
import tensorflow

keras_api = tensorflow.keras
opt = keras_api.optimizers.Adam(learning_rate=0.01)
loss = keras_api.losses.BinaryCrossentropy(label_smoothing=0.0, from_logits=False)

In [30]:
# The network has 2945 parameters in total to train (weights and biases).
# Compile with the optimizer and the loss object configured earlier. Passing
# `loss` instead of the "binary_crossentropy" string means any change to that
# object's settings (e.g. label_smoothing) actually takes effect here.
model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])

In [31]:
# Set up TensorBoard logging: each run writes into its own timestamped
# sub-directory of logs/fit/, with histograms recorded every epoch.
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_stamp
tensorflow_callback = TensorBoard(histogram_freq=1, log_dir=log_dir)


In [32]:
# Set up early stopping: watch val_loss with a patience of 5 epochs, and
# restore the best weights seen when training is stopped.
early_stopping_callback = EarlyStopping(
    restore_best_weights=True,
    patience=5,
    monitor='val_loss',
)

In [26]:
# Train the model for at most 100 epochs with the TensorBoard and
# early-stopping callbacks attached.
# NOTE(review): the test split is passed as validation data, so early stopping
# is tuned on it and no untouched hold-out set remains — confirm this is intended.
training_callbacks = [tensorflow_callback, early_stopping_callback]
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    callbacks=training_callbacks,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100


In [33]:
# Save the trained model to disk.
# NOTE(review): the cell output shows a warning raised from Keras' saving API
# on this call — presumably the legacy-HDF5 format notice; confirm.
model_path = 'model.h5'
model.save(model_path)

  saving_api.save_model(


In [28]:
# Load the TensorBoard notebook extension, which registers the %tensorboard
# line magic used to launch the dashboard.
%load_ext tensorboard

In [34]:
%tensorboard --logdir logs/fit20250220-130304

Reusing TensorBoard on port 6006 (pid 8595), started 0:03:58 ago. (Use '!kill 8595' to kill it.)