In [57]:
import tensorflow

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pickle


In [14]:
##Load the dataset
data= pd.read_csv('Churn_Modelling.csv')
data.head() 

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [35]:
## Preprocessing the Data
## Dropping the columns that are irrelevent Eg: Rownumber, customerID, Surname

preprocessed_data= data.drop(['RowNumber','CustomerId','Surname'], axis=1) #axis= 1 means remove columns
preprocessed_data

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [36]:
##Encoding Categorical data
## We are encoding the 'Gender' column

gender_encoding= LabelEncoder()
preprocessed_data['Gender']= gender_encoding.fit_transform(preprocessed_data['Gender']) # Male and female become '0' and '1'

preprocessed_data


Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.80,3,1,0,113931.57,1
3,699,France,0,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,1,39,5,0.00,2,1,0,96270.64,0
9996,516,France,1,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,0,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,1,42,3,75075.31,2,1,0,92888.52,1


In [37]:
# Onehot encoding
# We need to encode different values in the Geography column 
# Since we have 3 different values like France, Spain, Germany the encoding cannot be like 
# France= 0, Spain=1, Germany=2 because germany will have higher degree of influence than Spain or France
# because of higher exponent value so we use onehot encoding.

from sklearn.preprocessing import OneHotEncoder

geo_encoder= OneHotEncoder()

geo_column_output= geo_encoder.fit_transform(preprocessed_data[['Geography']])

geo_column_output





<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10000 stored elements and shape (10000, 3)>

In [40]:
geo_out= geo_encoder.get_feature_names_out(['Geography'])
geo_out


AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

In [42]:
new_geo_columns=pd.DataFrame(geo_column_output.toarray(), columns=geo_out)
new_geo_columns

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


In [43]:
## Combine all the one hot encoded colums

preprocessed_data_final= pd.concat([preprocessed_data.drop('Geography', axis=1),new_geo_columns], axis=1)
preprocessed_data_final



Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.00,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.80,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.00,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.10,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,0,1.0,0.0,0.0
9996,516,1,35,10,57369.61,1,1,1,101699.77,0,1.0,0.0,0.0
9997,709,0,36,7,0.00,1,0,1,42085.58,1,1.0,0.0,0.0
9998,772,1,42,3,75075.31,2,1,0,92888.52,1,0.0,1.0,0.0


In [45]:
#Save the scalers and encoders as pickle file

with open('label_encoder_gender.pkl', 'wb') as file:
    pickle.dump(gender_encoding,file)

with open('onehot_encoder_geography.pkl', 'wb') as file:
    pickle.dump(geo_encoder,file)

In [46]:
#Divide the data set into dependent and independent features

X= preprocessed_data_final.drop('Exited', axis=1)
y= preprocessed_data_final['Exited']

##Split the data set into testing and training set 

x_train, x_test, y_train, y_test= train_test_split(X,y, test_size= 0.2, random_state=42)

#Scale the features

scalar= StandardScaler()

x_train= scalar.fit_transform(x_train)
x_test= scalar.transform(x_test)

In [48]:
with open('scalar.pkl','wb') as file:
    pickle.dump(scalar, file)

# ANN Implementation


In [50]:
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

import datetime


In [52]:
x_train.shape[1]

12

In [55]:
#Build an ANN model

model= Sequential([
        Dense(64, activation='relu', input_shape= (x_train.shape[1],)), ## Hidden layer 1- 64 neurons, with activation fuction 'relu', 
        #and 1 diamentional input represented as a tuple with the entry after the comma as blank

        Dense(32, activation='relu'), ## Hidden layer 2- 32 neurons, with activation function 'relu', since the model is a 
        #sequential model we don't have to bother about the input shape. 

        Dense(1, activation='sigmoid'), ## Output layer- 1 neuron, with activation fuction 'sigmoid' 
        #sigmoid activation funtion gives an output between 0 and 1. Since we want to predict a 'yes' or 'No' condition we are using sigmoid 

    ])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [56]:
model.summary()

In [58]:

# model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

# You can compile the model by passing the arguments like the above line or we can define 
# optimizers (with variable learning rate) and loss

opt= tensorflow.keras.optimizers.Adam(learning_rate=0.01)
loss= tensorflow.keras.losses.BinaryCrossentropy()

model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])

In [71]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

log_dir= "logs/fit_" + str(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

tensorflow_callback= TensorBoard(log_dir=log_dir, histogram_freq=1)

In [74]:
#Early stopping
early_stopping_callback= EarlyStopping(monitor='val_loss', patience= 10, restore_best_weights= True)

# While training the model after 'n' number of epoches the quality/accuracy of our model starts to degrade
# to stop training further we use early stopping.



In [75]:
## Train the model

history= model.fit(
    x_train, y_train, validation_data=(x_test, y_test), epochs= 100,
    callbacks= [tensorflow_callback, early_stopping_callback]

)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8617 - loss: 0.3306 - val_accuracy: 0.8580 - val_loss: 0.3476
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8564 - loss: 0.3446 - val_accuracy: 0.8635 - val_loss: 0.3442
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8657 - loss: 0.3227 - val_accuracy: 0.8580 - val_loss: 0.3406
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8596 - loss: 0.3353 - val_accuracy: 0.8585 - val_loss: 0.3452
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8591 - loss: 0.3406 - val_accuracy: 0.8575 - val_loss: 0.3416
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8744 - loss: 0.3153 - val_accuracy: 0.8575 - val_loss: 0.3536
Epoch 7/100
[1m250/25

In [77]:
model.save('model.h5')



In [83]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [91]:
%tensorboard --logdir logs/fit_20250626-231030

ERROR: Failed to launch TensorBoard (exited with -9).