# Artificial Neural Network First try
# Classification problem
# Convolusional Neural Networks are used in CV.

# Theano - Numerical Computions library (based on Numpy). Runs on GPU too.
# GPU is better for NN. As parallel processing could be handled by GPU

# Tensorflow - Runs on CPU/GPU. Built by Google Brain
# We build Deep Neural Network from Scratch using Tensorflow and Tensorflow

# Keras - Automates the creation of Deep Neural Network with few lines of code
# Built at Google.

# Part 1 - Data Preprocessing

In [157]:
# importing all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [158]:
# importing the dataset - Bank Churn analysis - Classification problem.
dataset = pd.read_csv('Churn_Modelling.csv')

In [159]:
dataset

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [160]:
# Impact of Independent Variables on Dependent Churn
# CreditScore, Geography, Gender, Age, Tenure, Balance, NumOfProducts, HasCrCard, IsActiveMember, EstimatedSalary
X = dataset.iloc[:,3:13].values

In [161]:
X.shape

(10000, 10)

In [162]:
# independent variable (Exited)
y = dataset.iloc[:, 13].values

In [163]:
y.shape

(10000,)

In [164]:
# We will be doing Onehot Encoding as we have Categorical variables
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# labelencoder_Geography = LabelEncoder()
# labelencoder_Gender = LabelEncoder()
# # We will need to encode only Geography and Gender
# # for Geography
# X[:,1] = labelencoder_Gender.fit_transform(X[:,1])
# # for Gender
# X[:,2] = labelencoder_Gender.fit_transform(X[:,2])

# As Geography and Gender aren't ordinal (Categorical variables aren't ordinal)
# To remove dummy variable trap we are creating the onehot encoding only for Country.
#onehotencoder = OneHotEncoder(categories=[])

# NOTE - Categorical_features as previously used is deprecated and the new version supports direct usage 
# of onehotencoder instead of labelencoder.

# Link - https://stackoverflow.com/questions/54345667/onehotencoder-categorical-features-depreciated-how-to-transform-specific-column

# We can do either this
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import make_column_transformer
# A = make_column_transformer(
#     (OneHotEncoder(categories='auto'), [0]), 
#     remainder="passthrough")

# x=A.fit_transform(x)

# or

# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer
# ct = ColumnTransformer([("Country", OneHotEncoder(), [0])], remainder = 'passthrough')
# X = ct.fit_transform(X)

In [165]:
X

array([[619, 'France', 'Female', ..., 1, 1, 101348.88],
       [608, 'Spain', 'Female', ..., 0, 1, 112542.58],
       [502, 'France', 'Female', ..., 1, 0, 113931.57],
       ...,
       [709, 'France', 'Female', ..., 0, 1, 42085.58],
       [772, 'Germany', 'Male', ..., 1, 0, 92888.52],
       [792, 'France', 'Female', ..., 1, 0, 38190.78]], dtype=object)

In [166]:
# Initially label encoding
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
labelencoder_Geography = LabelEncoder()
labelencoder_Gender = LabelEncoder()
# We will need to encode only Geography and Gender
# for Geography
X[:,1] = labelencoder_Gender.fit_transform(X[:,1])
# for Gender
X[:,2] = labelencoder_Gender.fit_transform(X[:,2])



In [167]:
X.shape

(10000, 10)

In [168]:
X

array([[619, 0, 0, ..., 1, 1, 101348.88],
       [608, 2, 0, ..., 0, 1, 112542.58],
       [502, 0, 0, ..., 1, 0, 113931.57],
       ...,
       [709, 0, 0, ..., 0, 1, 42085.58],
       [772, 1, 1, ..., 1, 0, 92888.52],
       [792, 0, 0, ..., 1, 0, 38190.78]], dtype=object)

In [169]:
# then one hot encoding all the independent variables that are not ordinal
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer([("Country", OneHotEncoder(), [1])], remainder = 'passthrough')
X = ct.fit_transform(X)

In [170]:
X.shape

(10000, 12)

In [171]:
X[0]

array([1.0, 0.0, 0.0, 619, 0, 42, 2, 0.0, 1, 1, 1, 101348.88],
      dtype=object)

In [172]:
# Here in the X[0] we can see the Country and Gender being modified using one-hot-encoder
# Now lets drop one of the columns in Country to avoid Dummy variable Trap, for Gender it is anyway 0,1 in the same columsn


In [173]:
X.shape

(10000, 12)

In [174]:
type(X)

numpy.ndarray

In [175]:
# 1 is the axis for column here.
# A 2-dimensional array has two corresponding axes: the first running vertically downwards across rows (axis 0), and the second running horizontally across columns (axis 1).
#X = np.delete(X,[0,3],1)
X = X[:, 1:]

In [176]:
X.shape

(10000, 11)

In [177]:
# Final X
X

array([[0.0, 0.0, 619, ..., 1, 1, 101348.88],
       [0.0, 1.0, 608, ..., 0, 1, 112542.58],
       [0.0, 0.0, 502, ..., 1, 0, 113931.57],
       ...,
       [0.0, 0.0, 709, ..., 0, 1, 42085.58],
       [1.0, 0.0, 772, ..., 1, 0, 92888.52],
       [0.0, 0.0, 792, ..., 1, 0, 38190.78]], dtype=object)

In [156]:
# Final X.shape
X.shape

(10000, 11)

In [178]:
X[0]

array([0.0, 0.0, 619, 0, 42, 2, 0.0, 1, 1, 1, 101348.88], dtype=object)

# Part 2 - Train Test Split

In [180]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [182]:
X_train.shape

(8000, 11)

In [183]:
X_test.shape

(2000, 11)

In [184]:
y_train.shape

(8000,)

In [185]:
y_test.shape

(2000,)

# Part 3 - Feature Scaling is very important as ANN is computation intensive.

In [186]:
from sklearn.preprocessing import StandardScaler

In [187]:
sc_X = StandardScaler()

In [189]:
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

In [191]:
X_test

array([[ 1.62776996, -0.57427105, -0.56129438, ...,  0.66011376,
         0.97628121,  1.62185911],
       [-0.61433742, -0.57427105, -1.33847768, ...,  0.66011376,
        -1.02429504,  0.504204  ],
       [-0.61433742,  1.74133801,  0.58347561, ...,  0.66011376,
         0.97628121, -0.41865644],
       ...,
       [-0.61433742,  1.74133801, -0.76084144, ...,  0.66011376,
        -1.02429504,  0.72775202],
       [ 1.62776996, -0.57427105, -0.0046631 , ...,  0.66011376,
         0.97628121, -1.54162886],
       [ 1.62776996, -0.57427105, -0.81335383, ...,  0.66011376,
        -1.02429504,  1.62356528]])

In [194]:
X_train[0]

array([-0.5698444 ,  1.74309049,  0.16958176, -1.09168714, -0.46460796,
        0.00666099, -1.21571749,  0.8095029 ,  0.64259497, -1.03227043,
        1.10643166])

In [195]:
X_test[0]

array([ 1.62776996, -0.57427105, -0.56129438, -1.11339196, -0.39401698,
        0.9869706 ,  0.85962168, -0.8733766 ,  0.66011376,  0.97628121,
        1.62185911])

# Most Important
# Part 4 - Keras based Artificial Neural Network

In [196]:
import keras

Using TensorFlow backend.


In [198]:
# importing requried modules sequential (initialize a Neural Net) and dense (To add the Layers)
from keras.models import Sequential
from keras.layers import Dense

In [199]:
# Initializing the Artificial Neural Network
# Object getting created here is itself the ANN
classifier = Sequential()

In [200]:
classifier

<keras.engine.sequential.Sequential at 0x1a4d1399b0>

# Steps in ANN - Stochastic Gradient Descent Method
* Step 1: Randomly initialize the weights to close to 0 values but not 0
* Step 2: Input the first observation in the input layer. Each feature into a input node. Input nodes thus will equal with the number of independent variables.(in our case it is 11).
* Step 3: Forward-Propagation.
* Step 4: Compare the predicted result with the Actual result. Measure the generated error.
* Step 5: Back-Propagation (Update the weights according to how much they are responsible for the error).
* Step 6: Repeat 1 to 5 and update the weights after each observation (Reinforcement or SGD method).Repeat 1 to 5 and update the weights only after a batch of observations (Batch Learning).
* Step 7: When all the training data is passed through the ANN , it is an epoch. Do, more epochs.

In [204]:
# Also we need to select the Activation function in the hidden layers.
# We are selecting Rectifier for the hidden layers and Sigmoid for the output layers.
# More the output of Activation function in the hidden layers, the better it passes the signals to the ANN.
# By using sigmoid Activation function at the output layers, we get the probabilities also (if customer stays at bank or not)
# We can design ranking based models too using that.

In [215]:
# Adding the input layer and hidden layers
# Rule of thumb - no of nodes in hidden layer avg(num of Input nodes + num of Output nodes)
classifier.add(Dense(units=6, activation='relu', input_shape=(11,)))

In [216]:
# Adding another layer.
classifier.add(Dense(units=6, activation='relu'))

In [217]:
# Adding the output layer.
classifier.add(Dense(units=1, activation='sigmoid'))

In [220]:
# Compiling the ANN.
# adam is nothing but SGD, for loss function logarithmic_cross_entropy.
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [221]:
# Fitting the ANN (Classifier) to the training set.
# batch_size - after this size weights get updated. 1 is for SGD (Online), n for GD.
classifier.fit(X_train, y_train, batch_size=10, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x1a4da82710>

# Result
# After 100 epochs it reached an accuracy of 85.66 %

# Testing

In [225]:
# this returns the probabilities but not T or F (whether the customer stays or leaves the bank.)
y_pred = classifier.predict(X_test)

In [227]:
y_pred.shape
# 0.19 % of leaving the bank

(2000, 1)

In [228]:
# returns True for y_pred > 0.50 lese False
y_pred = (y_pred > 0.5)

In [229]:
y_pred

array([[False],
       [False],
       [False],
       ...,
       [False],
       [False],
       [False]])

# Confusion Matrix

In [230]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [231]:
cm

array([[1544,   51],
       [ 243,  162]])

In [232]:
# Accuracy - 85.3 % Accurate
(1544 + 162) / 2000

0.853

In [None]:
# for initializing weights - kernel_initializer='uniform'