## Churn Classification

##### This dataset contains details of a bank's customers. The target variable is binary and indicates whether the customer left the bank (closed their account) or remains a customer.
###### url: https://www.kaggle.com/datasets/shrutimechlearn/churn-modelling

In [1]:
import pandas as pd

# Load the churn export; the path is resolved relative to the notebook's working directory.
churn_csv_path = './resources/Churn_Modelling.csv'
df = pd.read_csv(churn_csv_path)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [2]:
## Drop the identifier columns, which are not used as model features

identifier_columns = ['RowNumber', 'CustomerId', 'Surname']
df = df.drop(columns=identifier_columns)

df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
## Count the missing values in every column
null_counts = df.isna().sum()
null_counts

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

No null values in df

##### Convert categorical data into Numerical data

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encoder for the Gender column. It is fitted in the next cell and pickled further
# down so the same string-to-integer mapping can be reused.
gender_le = LabelEncoder()

In [5]:
# Fit the label encoder on the Gender strings and overwrite the column with the codes.
# NOTE(review): the column is replaced in place, so re-running this cell alone would
# refit the encoder on the integer codes instead of the original strings.
gender_codes = gender_le.fit_transform(df['Gender'])
df['Gender'] = gender_codes

df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [6]:
## Geography has three distinct values
geography_cardinality = df['Geography'].nunique()
geography_cardinality

3

In [7]:
# One-hot encode Geography. The double brackets pass a one-column DataFrame
# (2-D input) to the encoder rather than a Series.
geo_ohe = OneHotEncoder()
geography_frame = df[['Geography']]
geo_encoder_data = geo_ohe.fit_transform(geography_frame)

In [8]:
# Column labels the fitted encoder generates for the Geography categories.
geo_feature_names = geo_ohe.get_feature_names_out(['Geography'])
geo_feature_names

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

Convert the encoder output into a DataFrame

In [9]:
# geo_encoder_data is not a dense array (the working cell below has to call
# .toarray() on it), so pandas cannot spread it across the three one-hot column
# labels and raises ValueError. The failure is kept as a demonstration, but it is
# caught and printed so that "Restart & Run All" no longer stops at this cell.
try:
    geo_pd = pd.DataFrame(geo_encoder_data, columns=geo_ohe.get_feature_names_out(['Geography']))
except ValueError as shape_error:
    print(f"ValueError: {shape_error}")

ValueError: Shape of passed values is (10000, 1), indices imply (10000, 3)

In [10]:
# Densify the encoder output so pandas can build one column per category.
geo_dense = geo_encoder_data.toarray()
geo_columns = geo_ohe.get_feature_names_out(['Geography'])
geo_pd = pd.DataFrame(data=geo_dense, columns=geo_columns)
geo_pd.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0


In [11]:
# Append the one-hot Geography columns to the right of the existing features.
# NOTE(review): df is reassigned from itself, so running this cell twice adds the
# one-hot columns twice.
frames_to_join = [df, geo_pd]
df = pd.concat(frames_to_join, axis='columns')
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,France,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,France,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [12]:
# The original text column is redundant now that its one-hot columns are present.
df = df.drop(columns=['Geography'])
df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


The categorical columns have now been converted to numeric data.
For future use we need to save these encoders, i.e.:
- label encoder for gender - `gender_le`
- one-hot encoder for geography - `geo_ohe`

In [13]:
import pickle

# Persist both fitted encoders, one pickle file each, in the working directory.
encoders_to_save = {
  'encoder_label.pkl': gender_le,
  'encoder_one_hot_geography.pkl': geo_ohe,
}

for pickle_path, encoder in encoders_to_save.items():
  with open(pickle_path, 'wb') as file:
    pickle.dump(encoder, file)

In [14]:
## Separate the independent features (X) from the dependent target (y)
from sklearn.model_selection import train_test_split

target_column = 'Exited'
y = df[target_column]
X = df.drop(columns=[target_column])


In [15]:
## Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [16]:
## Scale the features
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the training split in the next cell and pickled afterwards.
standardScaler = StandardScaler()

In [17]:
# Fit the scaler on the training split only, then apply that same fitted scaler
# to both splits so the test rows do not influence the fit.
X_train = standardScaler.fit(X_train).transform(X_train)
X_test = standardScaler.transform(X_test)

In [18]:
## Persist the fitted scaler

scaler_path = 'scaler_standard.pkl'
with open(scaler_path, mode='wb') as file:
  pickle.dump(standardScaler, file)

## ANN Implementation

In [19]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from datetime import datetime

##### Build ANN model

In [20]:
# The column count of the scaled training matrix is the number of inputs to the network.
feature_axis = 1
no_of_features = X_train.shape[feature_axis]
print(no_of_features)

12


In [21]:
## Build the ANN model

# Seed Python, NumPy and TensorFlow so weight initialisation (and later batch
# shuffling) is repeatable, in line with the fixed random_state of the data split.
tf.keras.utils.set_random_seed(42)

model = Sequential([
  # Declare the input explicitly: passing `input_shape=` to the first Dense layer
  # is what produced the Keras warning previously shown under this cell.
  tf.keras.Input(shape=(no_of_features,)),
  Dense(64, activation='relu'),   ## hidden layer 1
  Dense(32, activation='relu'),   ## hidden layer 2
  Dense(1, activation='sigmoid')  ## output layer: one sigmoid unit for the binary target
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [22]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [26]:
## Optimizer and loss objects handed to model.compile
initial_learning_rate = 0.01
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=initial_learning_rate)
binary_crossentropy_loss = tf.keras.losses.BinaryCrossentropy()

In [27]:

## Compile the model

# A binary target uses binary cross-entropy; a multiclass target would instead use
# a categorical loss such as sparse_categorical_crossentropy.
model.compile(
  optimizer=adam_optimizer,
  loss=binary_crossentropy_loss,
  metrics=['accuracy'],
)

##### setup the tensorboard

In [44]:
# Give every run its own timestamped directory under logs/fit.
run_stamp = datetime.now().strftime("%d%m%Y-%H%M%S")
log_dir = "logs/fit/" + run_stamp

# histogram_freq=1 asks the callback to record histograms every epoch.
tensorflow_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [45]:
## Set up early stopping
# If the model has effectively converged after ~20 epochs and the metrics then only
# move by 1-2%, training for the full 100 epochs is unnecessary. Stop once val_loss
# has not improved for 10 epochs and restore the best weights seen.
early_stopping_callback = EarlyStopping(
  monitor='val_loss',
  patience=10,
  restore_best_weights=True,
)

##### Train model

In [46]:
# Train for at most 100 epochs; the callbacks log to TensorBoard and may stop early.
# NOTE(review): the test split is also passed as validation data, so early stopping
# selects its best weights using the test set.
training_callbacks = [tensorflow_callback, early_stopping_callback]
history = model.fit(
  X_train,
  y_train,
  validation_data=(X_test, y_test),
  epochs=100,
  callbacks=training_callbacks,
)

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 803us/step - accuracy: 0.8694 - loss: 0.3121 - val_accuracy: 0.8545 - val_loss: 0.3479
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 770us/step - accuracy: 0.8668 - loss: 0.3171 - val_accuracy: 0.8635 - val_loss: 0.3473
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 700us/step - accuracy: 0.8734 - loss: 0.3040 - val_accuracy: 0.8550 - val_loss: 0.3495
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 691us/step - accuracy: 0.8709 - loss: 0.3106 - val_accuracy: 0.8590 - val_loss: 0.3531
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 688us/step - accuracy: 0.8673 - loss: 0.3134 - val_accuracy: 0.8645 - val_loss: 0.3458
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 691us/step - accuracy: 0.8730 - loss: 0.3042 - val_accuracy: 0.8570 - val_loss: 0.3476
Epoch 7/10

In [47]:
## Save the trained model
# NOTE(review): the .h5 extension selects Keras's HDF5 format; the filename is kept
# as-is because other code may load 'model.h5' — confirm before switching formats.
model.save('model.h5') ## h5 file is compatible with Keras



In [48]:
# Register the TensorBoard notebook extension, which provides the %tensorboard magic.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [49]:
# Open TensorBoard inline on logs/fit, the parent directory of every timestamped run.
%tensorboard --logdir logs/fit

Reusing TensorBoard on port 6006 (pid 3588), started 0:02:19 ago. (Use '!kill 3588' to kill it.)