In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
import pickle

In [2]:
## Load the dataset
data=pd.read_csv("Churn_Modelling.csv")
print(data.shape)
data.head(3)

(10000, 14)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [3]:
## Preprocess the data
### Drop irrelevant columns
cust_ids = data['CustomerId']
data=data.drop(['RowNumber','CustomerId','Surname'],axis=1)
data.head(3)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1


In [4]:
data['Gender'].value_counts()

Gender
Male      5457
Female    4543
Name: count, dtype: int64

In [5]:
## Encode categorical variables
label_encoder_gender=LabelEncoder()
data['gender_male']= (data['Gender'] == 'Male').astype(int) #label_encoder_gender.fit_transform(data['Gender'])
del data['Gender']
data.head(3)

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,gender_male
0,619,France,42,2,0.0,1,1,1,101348.88,1,0
1,608,Spain,41,1,83807.86,1,0,1,112542.58,0,0
2,502,France,42,8,159660.8,3,1,0,113931.57,1,0


In [6]:
## DiVide the dataset into indepent and dependent features
X=data.drop('Exited',axis=1)
y=data['Exited']

## Split the data in training and tetsing sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [7]:
## Onehot encode 'Geography
from sklearn.preprocessing import OneHotEncoder
onehot_encoder_geo=OneHotEncoder()
geo_encoder_tr=onehot_encoder_geo.fit_transform(X_train[['Geography']]).toarray()
geo_encoder_te =onehot_encoder_geo.transform(X_test[['Geography']]).toarray()

In [8]:
onehot_encoder_geo.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [9]:
geo_encoded_tr=pd.DataFrame(geo_encoder_tr,
columns=onehot_encoder_geo.get_feature_names_out(['Geography']), index=X_train.index)

geo_encoded_te=pd.DataFrame(geo_encoder_te,
columns=onehot_encoder_geo.get_feature_names_out(['Geography']), index=X_test.index)

In [10]:
data[(data['CreditScore'] == 686)&(data['Age'] == 32)]

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,gender_male
9254,686,France,32,6,0.0,2,1,1,179093.26,0,1


In [11]:
## Combine one hot encoder columns with the original data
X_train=pd.concat([X_train.drop('Geography',axis=1),geo_encoded_tr],axis=1)
# data.drop('Gender',axis=1, inplace=True)

## Combine one hot encoder columns with the original data
X_test=pd.concat([X_test.drop('Geography',axis=1),geo_encoded_te],axis=1)
# X_test.drop('Gender',axis=1, inplace=True)

display(X_train.head())
display(X_test.head())

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,gender_male,Geography_France,Geography_Germany,Geography_Spain
9254,686,32,6,0.0,2,1,1,179093.26,1,1.0,0.0,0.0
1561,632,42,4,119624.6,2,1,1,195978.86,1,0.0,1.0,0.0
1670,559,24,3,114739.92,1,1,0,85891.02,1,0.0,0.0,1.0
6087,561,27,9,135637.0,1,1,0,153080.4,0,1.0,0.0,0.0
6669,517,56,9,142147.32,1,0,0,39488.04,1,1.0,0.0,0.0


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,gender_male,Geography_France,Geography_Germany,Geography_Spain
6252,596,32,3,96709.07,2,0,0,41788.37,1,0.0,1.0,0.0
4684,623,43,1,0.0,2,1,1,146379.3,1,1.0,0.0,0.0
1731,601,44,4,0.0,2,1,0,58561.31,0,0.0,0.0,1.0
4742,506,59,8,119152.1,2,1,1,170679.74,1,0.0,1.0,0.0
4521,560,27,7,124995.98,1,1,1,114669.79,0,0.0,0.0,1.0


In [12]:
## Save the encoders and sscaler
# with open('label_encoder_gender.pkl','wb') as file:
#     pickle.dump(label_encoder_gender,file)

with open('onehot_encoder_geo.pkl','wb') as file:
    pickle.dump(onehot_encoder_geo,file)


In [13]:
data.head(2)

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,gender_male
0,619,France,42,2,0.0,1,1,1,101348.88,1,0
1,608,Spain,41,1,83807.86,1,0,1,112542.58,0,0


In [14]:
## Scale these features
scaler=StandardScaler()
X_train=pd.DataFrame( scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test=pd.DataFrame( scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
# X_test=scaler.transform(X_test)

In [16]:
X_train.iloc[0]

CreditScore          0.356500
Age                 -0.655786
Tenure               0.345680
Balance             -1.218471
NumOfProducts        0.808436
HasCrCard            0.649203
IsActiveMember       0.974817
EstimatedSalary      1.367670
gender_male          0.913248
Geography_France     1.001501
Geography_Germany   -0.579467
Geography_Spain     -0.576388
Name: 9254, dtype: float64

In [17]:
with open('scaler.pkl','wb') as file:
    pickle.dump(scaler,file)

In [18]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8000, 12), (2000, 12), (8000,), (2000,))

In [22]:
display(X_train.head(3))
display(X_test.head(3))

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,gender_male,Geography_France,Geography_Germany,Geography_Spain
9254,0.3565,-0.655786,0.34568,-1.218471,0.808436,0.649203,0.974817,1.36767,0.913248,1.001501,-0.579467,-0.576388
1561,-0.203898,0.294938,-0.348369,0.696838,0.808436,0.649203,0.974817,1.661254,0.913248,-0.998501,1.725723,-0.576388
1670,-0.961472,-1.416365,-0.695393,0.618629,-0.916688,0.649203,-1.025834,-0.252807,0.913248,-0.998501,-0.579467,1.734942


Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,gender_male,Geography_France,Geography_Germany,Geography_Spain
6252,-0.577496,-0.655786,-0.695393,0.329937,0.808436,-1.540351,-1.025834,-1.019605,0.913248,-0.998501,1.725723,-0.576388
4684,-0.297297,0.390011,-1.389442,-1.218471,0.808436,0.649203,0.974817,0.798883,0.913248,1.001501,-0.579467,-0.576388
1731,-0.525607,0.485083,-0.348369,-1.218471,0.808436,0.649203,-1.025834,-0.72798,-1.094993,-0.998501,-0.579467,1.734942


### ANN Implementation

In [23]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
import datetime

In [24]:
(X_train.shape[1],)

(12,)

In [25]:
## Build Our ANN Model
model=Sequential([
    Dense(64,activation='relu',input_shape=(X_train.shape[1],)), ## HL1 Connected wwith input layer
    Dense(32,activation='relu'), ## HL2
    Dense(1,activation='sigmoid')  ## output layer
]

)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [26]:
model.summary()

In [27]:
import tensorflow
opt=tensorflow.keras.optimizers.Adam(learning_rate=0.01)
loss=tensorflow.keras.losses.BinaryCrossentropy()
loss

<LossFunctionWrapper(<function binary_crossentropy at 0x000002DC04A20EE0>, kwargs={'from_logits': False, 'label_smoothing': 0.0, 'axis': -1})>

In [28]:
## compile the model
model.compile(optimizer=opt,loss="binary_crossentropy",metrics=['accuracy'])

In [29]:
## Set up the Tensorboard
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard

log_dir="logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorflow_callback=TensorBoard(log_dir=log_dir,histogram_freq=1)

In [30]:
## Set up Early Stopping
early_stopping_callback=EarlyStopping(monitor='val_loss',patience=10,restore_best_weights=True)

In [31]:
### Train the model
history=model.fit(
    X_train,y_train,validation_data=(X_test,y_test),epochs=100,
    callbacks=[tensorflow_callback,early_stopping_callback]
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8010 - loss: 0.4429 - val_accuracy: 0.8550 - val_loss: 0.3558
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8581 - loss: 0.3497 - val_accuracy: 0.8625 - val_loss: 0.3436
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8558 - loss: 0.3460 - val_accuracy: 0.8610 - val_loss: 0.3412
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8622 - loss: 0.3327 - val_accuracy: 0.8595 - val_loss: 0.3454
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8598 - loss: 0.3396 - val_accuracy: 0.8590 - val_loss: 0.3579
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8623 - loss: 0.3282 - val_accuracy: 0.8620 - val_loss: 0.3424
Epoch 7/100
[1m250/25

In [32]:
model.save('model.h5')



In [33]:
## Load Tensorboard Extension
%load_ext tensorboard

In [34]:
%tensorboard --logdir logs/fit

In [None]:
### Load the pickle file
