In [1]:
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
tf.reset_default_graph()
tf.set_random_seed(42)

# 1. Read the dataset

In [3]:
import pandas as pd

bank_df = pd.read_csv('bank.csv')

# 2. Drop the columns which are unique for all users like IDs

In [4]:
bank_df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
bank_df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)

In [6]:
bank_df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder
bank_df['Gender'] = le.fit_transform(bank_df['Gender'], bank_df['Gender'])

In [8]:
bank_df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,0,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,0,41,1,83807.86,1,0,1,112542.58,0
2,502,France,0,42,8,159660.8,3,1,0,113931.57,1
3,699,France,0,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,0,43,2,125510.82,1,1,1,79084.1,0


In [9]:
bank_df = pd.concat([bank_df,pd.get_dummies(bank_df['Geography'], prefix='geography')],axis=1)
bank_df.drop(['Geography'],axis=1, inplace=True)
bank_df.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,geography_France,geography_Germany,geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1,0,0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1,0,0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1,0,0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0,0,1


# 3. Distinguish the feature and target set

In [10]:
X = bank_df.drop('Exited', axis=1)
y = bank_df['Exited']

In [11]:
X.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,geography_France,geography_Germany,geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,0,0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0,1
2,502,0,42,8,159660.8,3,1,0,113931.57,1,0,0
3,699,0,39,1,0.0,2,0,0,93826.63,1,0,0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0,1


# 4. Divide the data set into Train and test sets

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 5. Normalize the train and test data (2.5 points)

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

In [14]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [15]:
print(X_train)
print(X_train.shape)

[[-0.34459497 -1.09823226 -0.65674999 ... -1.00171576 -0.57559072
   1.73073215]
 [-0.09518109 -1.09823226 -0.46637979 ... -1.00171576  1.73734559
  -0.57779016]
 [-0.94734518  0.91055421 -0.56156489 ...  0.99828718 -0.57559072
  -0.57779016]
 ...
 [ 0.86090545 -1.09823226 -0.08563939 ...  0.99828718 -0.57559072
  -0.57779016]
 [ 0.15423279  0.91055421  0.39028611 ...  0.99828718 -0.57559072
  -0.57779016]
 [ 0.46600014  0.91055421  1.1517669  ... -1.00171576  1.73734559
  -0.57779016]]
(7000, 12)


# 6. Initialize & build the model (10 points)

In [16]:
model = tf.keras.models.Sequential()

model.add(tf.keras.layers.BatchNormalization(input_shape=(12,)))
model.add(tf.keras.layers.Dense(activation = 'relu', input_dim = 13, units=6))
model.add(tf.keras.layers.Dense(activation = 'sigmoid', units=1))
sgd_optimizer = tf.keras.optimizers.SGD(lr=0.03)
model.compile(optimizer=sgd_optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras._impl.keras.callbacks.History at 0x7f0645719550>

In [17]:
prediction = model.predict(X_test)
print(prediction)
prediction = (prediction > 0.5)
print(prediction)

[[0.06524675]
 [0.06862003]
 [0.13487844]
 ...
 [0.05650419]
 [0.01478031]
 [0.16426452]]
[[False]
 [False]
 [False]
 ...
 [False]
 [False]
 [False]]


# 7. Optimize the model (5 points)

In [18]:
model1 = tf.keras.models.Sequential()

model1.add(tf.keras.layers.BatchNormalization(input_shape=(12,)))
model1.add(tf.keras.layers.Dense(50, activation='relu'))
model1.add(tf.keras.layers.Dense(100, activation='relu'))
model1.add(tf.keras.layers.Dense(150, activation='relu'))
model1.add(tf.keras.layers.Dense(1, activation='sigmoid'))

sgd_optimizer = tf.keras.optimizers.SGD(lr=0.03)
model1.compile(optimizer=sgd_optimizer, loss='binary_crossentropy', metrics=['accuracy'])
model1.fit(X_train, y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras._impl.keras.callbacks.History at 0x7f063c7b3908>

# 8. Predict the results using 0.5 as a threshold (5 points)

In [19]:
prediction_opt = model1.predict(X_test)
print(prediction_opt)
prediction_opt = (prediction_opt > 0.5)
print(prediction_opt)

[[0.07237083]
 [0.01274317]
 [0.08000956]
 ...
 [0.06727649]
 [0.01307457]
 [0.22016107]]
[[False]
 [False]
 [False]
 ...
 [False]
 [False]
 [False]]


# 9. Print the Accuracy score and confusion matrix (2.5 points)

In [20]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [21]:
print(accuracy_score(y_test, prediction))
print(confusion_matrix(y_test, prediction))

0.869
[[2341   75]
 [ 318  266]]


In [22]:
print("After optimization: ")
print(accuracy_score(y_test, prediction_opt))
print(confusion_matrix(y_test, prediction_opt))

After optimization: 
0.864
[[2291  125]
 [ 283  301]]
