In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
import pickle

In [38]:
data = pd.read_csv(r"/Users/vinaykiran/Documents/personal_projects/Gen AI/Deep Learning for NLP/ANN_Classification/Churn_Modelling.csv")

In [39]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [40]:
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


In [41]:
data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

In [42]:
label_encoder_gender = LabelEncoder()
data["Gender"] = label_encoder_gender.fit_transform(data["Gender"])

In [43]:
data.head(10)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0
5,645,Spain,1,44,8,113755.78,2,1,0,149756.71,1
6,822,France,1,50,7,0.0,2,1,1,10062.8,0
7,376,Germany,0,29,4,115046.74,4,1,0,119346.88,1
8,501,France,1,44,4,142051.07,2,0,1,74940.5,0
9,684,France,1,27,2,134603.88,1,1,1,71725.73,0


In [44]:
from sklearn.preprocessing import OneHotEncoder
onehot_encoder_geography = OneHotEncoder(sparse_output=False)
geo_encoder = onehot_encoder_geography.fit_transform(data[["Geography"]])

In [45]:
onehot_encoder_geography.get_feature_names_out()

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [46]:
geo_df = pd.DataFrame(geo_encoder, columns=onehot_encoder_geography.get_feature_names_out())
data = data.drop(['Geography'], axis=1)
data = pd.concat([data, geo_df], axis=1)

In [47]:
with open('label_encoder_gender.pkl', 'wb') as file:
    pickle.dump(label_encoder_gender, file)

with open('onehot_encoder_geography.pkl', 'wb') as file:
    pickle.dump(onehot_encoder_geography, file)

In [48]:
## Split the data into features and target variables
X = data.drop('Exited', axis=1)
y = data['Exited']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [49]:
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

## ANN Implementation

In [50]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
import datetime

In [51]:
## Build our ANN model
model = Sequential([
    Dense(64,activation='relu',input_shape=(X_train.shape[1],)), ## HL1 connected with input layer
    Dense(32,activation='relu'), ## HL2
    Dense(1,activation='sigmoid') ## Output Layer
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [52]:
model.summary()

In [53]:
## Compile the model
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])


In [54]:
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [55]:
## set up Early Stopping
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [56]:
## Training the model
history = model.fit(X_train, y_train, 
                    validation_data=(X_test, y_test), 
                    epochs=100,
                    callbacks=[early_stopping_callback, tensorboard_callback])

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7835 - loss: 0.5536 - val_accuracy: 0.6695 - val_loss: 1.1577
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7480 - loss: 1.3143 - val_accuracy: 0.7520 - val_loss: 1.6628
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7210 - loss: 3.4169 - val_accuracy: 0.8220 - val_loss: 1.4176
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7441 - loss: 3.5868 - val_accuracy: 0.7695 - val_loss: 3.1895
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7349 - loss: 5.6076 - val_accuracy: 0.7540 - val_loss: 4.6312
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7136 - loss: 10.8485 - val_accuracy: 0.8180 - val_loss: 7.2332
Epoch 7/100
[1m250/2

In [57]:
test_df = pd.DataFrame(X_test, columns=X.columns)
# test_df['Actual_Exited'] = y_test.values

In [58]:
test_df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
0,-0.577496,0.913248,-0.655786,-0.695393,0.329937,0.808436,-1.540351,-1.025834,-1.019605,-0.998501,1.725723,-0.576388
1,-0.297297,0.913248,0.390011,-1.389442,-1.218471,0.808436,0.649203,0.974817,0.798883,1.001501,-0.579467,-0.576388
2,-0.525607,-1.094993,0.485083,-0.348369,-1.218471,0.808436,0.649203,-1.025834,-0.72798,-0.998501,-0.579467,1.734942
3,-1.511492,0.913248,1.91117,1.039728,0.689272,0.808436,0.649203,0.974817,1.221387,-0.998501,1.725723,-0.576388
4,-0.951094,-1.094993,-1.131148,0.692704,0.782839,-0.916688,0.649203,0.974817,0.24756,-0.998501,-0.579467,1.734942


In [59]:
test_df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CreditScore        2000 non-null   float64
 1   Gender             2000 non-null   float64
 2   Age                2000 non-null   float64
 3   Tenure             2000 non-null   float64
 4   Balance            2000 non-null   float64
 5   NumOfProducts      2000 non-null   float64
 6   HasCrCard          2000 non-null   float64
 7   IsActiveMember     2000 non-null   float64
 8   EstimatedSalary    2000 non-null   float64
 9   Geography_France   2000 non-null   float64
 10  Geography_Germany  2000 non-null   float64
 11  Geography_Spain    2000 non-null   float64
dtypes: float64(12)
memory usage: 187.6 KB


In [60]:
test_df.to_csv('test_data.csv',index=False)

In [61]:
model.save('model.h5')



In [62]:
## Load Tensorboard Exctension
%load_ext tensorboard
%tensorboard --logdir logs/fit/ 

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 1396), started 0:14:20 ago. (Use '!kill 1396' to kill it.)

In [63]:
prediction = model.predict(X_test)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [64]:
prediction

array([[0.4029246 ],
       [0.01131347],
       [0.29118595],
       ...,
       [0.9965444 ],
       [0.95589757],
       [0.9380169 ]], dtype=float32)

In [65]:
out_df = pd.DataFrame(X_test, columns=X.columns)
out_df['Predicted_Exited_proba'] = prediction
out_df['Predicted_Exited'] = [1 if val>0.5 else 0 for val in prediction]
out_df['summary'] = out_df.apply(lambda x: 'Will Exit' if x['Predicted_Exited']==1 else 'Will Not Exit', axis=1)

In [66]:
out_df.head(15)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Predicted_Exited_proba,Predicted_Exited,summary
0,-0.577496,0.913248,-0.655786,-0.695393,0.329937,0.808436,-1.540351,-1.025834,-1.019605,-0.998501,1.725723,-0.576388,0.402925,0,Will Not Exit
1,-0.297297,0.913248,0.390011,-1.389442,-1.218471,0.808436,0.649203,0.974817,0.798883,1.001501,-0.579467,-0.576388,0.011313,0,Will Not Exit
2,-0.525607,-1.094993,0.485083,-0.348369,-1.218471,0.808436,0.649203,-1.025834,-0.72798,-0.998501,-0.579467,1.734942,0.291186,0,Will Not Exit
3,-1.511492,0.913248,1.91117,1.039728,0.689272,0.808436,0.649203,0.974817,1.221387,-0.998501,1.725723,-0.576388,0.201726,0,Will Not Exit
4,-0.951094,-1.094993,-1.131148,0.692704,0.782839,-0.916688,0.649203,0.974817,0.24756,-0.998501,-0.579467,1.734942,0.143155,0,Will Not Exit
5,1.435784,0.913248,-0.180424,1.039728,-1.218471,0.808436,0.649203,0.974817,0.851723,-0.998501,-0.579467,1.734942,0.00657,0,Will Not Exit
6,-2.2068,-1.094993,-0.655786,-0.695393,1.005482,-0.916688,0.649203,-1.025834,-0.430239,-0.998501,-0.579467,1.734942,0.711479,1,Will Exit
7,-0.567118,-1.094993,-1.60651,0.34568,0.407103,-0.916688,0.649203,-1.025834,-0.519901,-0.998501,1.725723,-0.576388,0.996611,1,Will Exit
8,0.273478,-1.094993,0.104794,-0.348369,0.603488,-0.916688,0.649203,-1.025834,-1.457221,-0.998501,-0.579467,1.734942,0.899681,1,Will Exit
9,-1.947357,-1.094993,0.294938,-0.695393,0.153339,-0.916688,0.649203,0.974817,1.107069,-0.998501,1.725723,-0.576388,0.99091,1,Will Exit


In [67]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, out_df['Predicted_Exited']))
print(confusion_matrix(y_test, out_df['Predicted_Exited']))

              precision    recall  f1-score   support

           0       0.89      0.67      0.77      1607
           1       0.33      0.65      0.44       393

    accuracy                           0.67      2000
   macro avg       0.61      0.66      0.60      2000
weighted avg       0.78      0.67      0.70      2000

[[1083  524]
 [ 137  256]]


In [68]:
from sklearn.metrics import accuracy_score

# y_test: true labels
# out_df['Predicted_Exited']: predicted labels
accuracy = accuracy_score(y_test, out_df['Predicted_Exited'])
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.6695


In [69]:
data["Exited"].value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

In [70]:
y_test.value_counts()

Exited
0    1607
1     393
Name: count, dtype: int64

In [71]:
out_df['Predicted_Exited'].value_counts()

Predicted_Exited
0    1220
1     780
Name: count, dtype: int64