###### 
<img src="images/DNN_flow.png" style="width:800px;height:500px;">
<caption><center> Figure 1 : Flow for Constructing a Deep Neural Network</center></caption><br>

In [None]:
# Import libraries
import numpy as np
import torch
import tensorflow as tf
from matplotlib import pyplot as plt

# Build and visualize a synthetic two-class dataset
m = 1000     # m = total number of samples (keep it even: class 1 reuses amp_X0 of class 0)
n = 2        # n = number of input features, i.e. the data is 2 dimensional (2D)

n_class0 = m // 2          # first half of the samples gets label Y = 0
n_class1 = m - n_class0    # the remaining samples get label Y = 1

# Class 0: points at evenly spaced angles with a standard-normal (signed) radius
angles = np.arange(0, 360, 360 / n_class0)
amp_X0 = np.random.randn(n_class0)
theta = angles * np.pi/180
X0 = np.vstack((np.sin(theta) * amp_X0,      # feature 0
                np.cos(theta) * amp_X0,      # feature 1
                np.zeros(n_class0)))         # label row
# print('X0 shape = {}, X0 = \n{}\n'.format(X0.shape, X0))

# Class 1: same construction with a 3x wider radius; both features are additionally
# shifted by amp_X0*3 + 10
angles = np.arange(0, 360, 360 / n_class1)
amp_X1 = np.random.randn(n_class1)*3
theta = angles * np.pi/180
offset = amp_X0*3
X1 = np.vstack((np.sin(theta) * amp_X1 + offset + 10,
                np.cos(theta) * amp_X1 + offset + 10,
                np.ones(n_class1)))
# print('X1 shape = {}, X1 = \n{}\n'.format(X1.shape, X1))

# Stack both classes and shuffle the samples.
# np.random.shuffle only permutes along axis 0, so shuffle the (samples, rows) view.
samples = np.concatenate((X0, X1), axis=1).T
np.random.shuffle(samples)
Y = samples.T[2:]      # label row, shape (1, m)
X = samples.T[0:2]     # feature rows, shape (2, m)

# Scatter plot of the two classes
plt.scatter(X0[0],X0[1])
plt.scatter(X1[0],X1[1])
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.legend(['Label 0', 'Label 1'])

# Hold out the last 20% of the shuffled samples for testing
split = int(np.floor(0.8*m))
X_test, Y_test = X[:, split:], Y[:, split:]
X, Y = X[:, :split], Y[:, :split]
m = Y.shape[1]     # from here on m is the number of training samples
print('Value of m = {}, Shape X_test = {}, Shape X = {}\n'.format(m, X_test.shape, X.shape))

# Step 1: Initialize the Parameters
<img src="images/initialization.png" style="width:800px;height:500px;">
<caption><center> Figure 2 : Initialize the Parameters</center></caption><br>

In [None]:
# Network hyper-parameters
L = 4                    # Total number of layers (hidden layers + output layer)
# nl = [5, 4, 3, 1]      # Smaller alternative for the layer sizes
nl = [56, 128, 56, 1]    # Number of nodes for layer 1..L

# Layer sizes with the input layer at index 0
n_l = [n] + nl

# Numpy initialization of weights and bias.
# Index 0 of every list is a placeholder so that index l refers to layer l.
# Weights: small random values (x0.01) to keep z small, which speeds up learning (esp for sigmoid activation)
W_numpy = [0] + [np.random.randn(n_l[k], n_l[k-1]) * 0.01 for k in range(1, len(n_l))]
# Bias: small constant value
b_numpy = [0] + [np.ones((n_l[k], 1)) * 0.001 for k in range(1, len(n_l))]

print('Shapes of W_numpy are: {}, {}, {}, {}'.format(*(w.shape for w in W_numpy[1:5])))
print('Shapes of b_numpy are: {}, {}, {}, {}\n'.format(*(b.shape for b in b_numpy[1:5])))


## Pytorch initialization: the numpy values converted to torch tensors
W_torch = [0] + [torch.tensor(w) for w in W_numpy[1:]]
b_torch = [0] + [torch.tensor(b) for b in b_numpy[1:]]

print('Shapes of W_torch are: {}, {}, {}, {}'.format(*(w.shape for w in W_torch[1:5])))
print('Shapes of b_torch are: {}, {}, {}, {}\n'.format(*(b.shape for b in b_torch[1:5])))


## Tensorflow initialization: the numpy values converted to tensorflow tensors
W_tf = [0] + [tf.convert_to_tensor(w) for w in W_numpy[1:]]
b_tf = [0] + [tf.convert_to_tensor(b) for b in b_numpy[1:]]

print('Shapes of W_tf are: {}, {}, {}, {}'.format(*(w.shape for w in W_tf[1:5])))
print('Shapes of b_tf are: {}, {}, {}, {}\n'.format(*(b.shape for b in b_tf[1:5])))

# Step 2: Perform Forward Pass
<img src="images/forward_pass.png" style="width:800px;height:500px;">
<caption><center> Figure 3 : Perform forward pass </center></caption><br>

Note:
- For all hidden layers, ReLU activation function is used
- For last layer (i.e. output layer), Sigmoid activation function is used
- Binary cross-entropy loss is used as the cost function

In [None]:
# Numpy forward pass
# Definition of sigmoid and relu function for numpy
def sigmoid(Z):
    """Element-wise logistic function 1 / (1 + exp(-Z)) for numpy arrays."""
    return 1 / (1 + np.exp(-Z))

def relu(Z):
    """Element-wise max(Z, 0) for numpy arrays."""
    return np.maximum(Z,0)

# Index of the output layer. The layer loops below run over range(1, len(n_l)), so the
# loop index never reaches len(n_l) itself; the output layer must be detected with
# len(n_l) - 1, otherwise ReLU would be applied to every layer including the output.
last_layer = len(n_l) - 1

Z_numpy = [0]      # index 0 is a placeholder so that index l refers to layer l
A_numpy = [X]      # A[0] is the input
Y_numpy = Y
for i in range(1, len(n_l)):
#     print(W_numpy[i].shape, A_numpy[i-1].shape, np.matmul(W_numpy[i],A_numpy[i-1]).shape)
    Z_temp = np.matmul(W_numpy[i],A_numpy[i-1]) + b_numpy[i]
    Z_numpy.append(Z_temp)
    if i != last_layer:
        A_temp = relu(Z_temp)     # ReLU activation for all hidden layers
    else:
        A_temp = sigmoid(Z_temp)  # Sigmoid activation for last (output) layer. Binary classification
    A_numpy.append(A_temp)
# Cross entropy cost averaged over the m samples
J_numpy = (-1)*np.sum(Y_numpy*np.log(A_numpy[L]) + (1-Y_numpy)*np.log(1-A_numpy[L]))/m

# Pytorch forward pass
Z_torch = [0]
A_torch = [torch.tensor(X)]
Y_torch = torch.tensor(Y)
# Activations from the pytorch nn library, kept under their own names so that the
# numpy helpers defined above are not overwritten
sigmoid_torch = torch.nn.Sigmoid()
relu_torch = torch.nn.ReLU()
for i in range(1, len(n_l)):
#     print(W_torch[i].shape, A_torch[i-1].shape, torch.matmul(W_torch[i],A_torch[i-1]).shape)
    Z_temp = torch.matmul(W_torch[i],A_torch[i-1]) + b_torch[i]
    Z_torch.append(Z_temp)
    if i != last_layer:
        A_temp = relu_torch(Z_temp)     # ReLU activation for all hidden layers
    else:
        A_temp = sigmoid_torch(Z_temp)  # Sigmoid activation for last (output) layer. Binary classification
    A_torch.append(A_temp)
J_torch = (-1)*torch.sum(Y_torch*torch.log(A_torch[L]) +  (1-Y_torch)*torch.log(1-A_torch[L]))/m


# Tensorflow forward pass
Z_tf = [0]
A_tf = [tf.convert_to_tensor(X)]
Y_tf = tf.convert_to_tensor(Y)
for i in range(1, len(n_l)):
#     print(W_tf[i].shape, A_tf[i-1].shape, tf.matmul(W_tf[i],A_tf[i-1]).shape)
    Z_temp = tf.matmul(W_tf[i],A_tf[i-1]) + b_tf[i]
    Z_tf.append(Z_temp)
    if i != last_layer:
        A_temp = tf.keras.activations.relu(Z_temp)     # ReLU activation for all hidden layers
    else:
        A_temp = tf.keras.activations.sigmoid(Z_temp)  # Sigmoid activation for last (output) layer. Binary classification
    A_tf.append(A_temp)
J_tf = (-1)*tf.math.reduce_sum(Y_tf*tf.math.log(A_tf[L]) +  (1-Y_tf)*tf.math.log(1-A_tf[L]))/m

print('\nLoss function calculated from numpy, pytorch and tensorflow are {}, {} and {}'.format(J_numpy, J_torch, J_tf))

# Step 3: Perform Backward Pass
<img src="images/backward_pass.png" style="width:800px;height:500px;">
<caption><center> Figure 4 : Perform backward pass </center></caption><br>

In [None]:
# Backward pass for the three frameworks.
# Gradients are collected from the output layer backwards and the lists are reversed
# at the end, so that index l of every list refers to layer l (index 0 is a placeholder).

# Numpy backward pass
dA_numpy = []
dZ_numpy = []
dW_numpy = []
db_numpy = []

# For last (output) layer
dAL = - (np.divide(Y_numpy, A_numpy[L]) - np.divide(1 - Y_numpy, 1 - A_numpy[L]))   # Find grad of last activation layer due to CEL cost function
dZ_temp = dAL * A_numpy[L] * (1 - A_numpy[L])    # Backward function for sigmoid activation
dW_temp = np.matmul(dZ_temp, A_numpy[L-1].T) / m
db_temp = np.sum(dZ_temp, axis=1, keepdims=True)/m
dA_numpy.append(dAL)
dZ_numpy.append(dZ_temp)
dW_numpy.append(dW_temp)
db_numpy.append(db_temp)

# For all other layers (L-1 down to 1); dZ_numpy[-1] is dZ of layer i+1
for i in reversed(range(1, L)):
    dA_temp = np.matmul(W_numpy[i+1].T,dZ_numpy[-1])
    dZ_temp = np.array(dA_temp, copy=True)         # Backward pass for ReLU, step 1: copy array
    dZ_temp[Z_numpy[i] <= 0] = 0                   # Backward pass for ReLU, step 2: Set all dZ=0 where Z <= 0
    dW_temp = np.matmul(dZ_temp, A_numpy[i-1].T) / m
    db_temp = np.sum(dZ_temp, axis=1, keepdims=True)/m
    dA_numpy.append(dA_temp)
    dZ_numpy.append(dZ_temp)
    dW_numpy.append(dW_temp)
    db_numpy.append(db_temp)

# Append a placeholder 0 and reverse, so that list index == layer number
dA_numpy.append(0)
dZ_numpy.append(0)
dW_numpy.append(0)
db_numpy.append(0)
dA_numpy = list(reversed(dA_numpy))
dZ_numpy = list(reversed(dZ_numpy))
dW_numpy = list(reversed(dW_numpy))
db_numpy = list(reversed(db_numpy))


# Pytorch backward pass
dA_torch = []
dZ_torch = []
dW_torch = []
db_torch = []

# For last (output) layer
dAL = - (torch.div(Y_torch, A_torch[L]) - torch.div(1 - Y_torch, 1 - A_torch[L]))   # Find grad of last activation layer due to CEL cost function
dZ_temp = dAL * A_torch[L] * (1 - A_torch[L])    # Backward function for sigmoid activation
dW_temp = torch.matmul(dZ_temp, A_torch[L-1].T) / m
db_temp = torch.sum(dZ_temp, dim=1, keepdim=True)/m
dA_torch.append(dAL)
dZ_torch.append(dZ_temp)
dW_torch.append(dW_temp)
db_torch.append(db_temp)

# For all other layers (L-1 down to 1)
for i in reversed(range(1, L)):
    dA_temp = torch.matmul(W_torch[i+1].T,dZ_torch[-1])
    dZ_temp = dA_temp.clone()                      # Backward pass for ReLU, step 1: copy tensor
    dZ_temp[Z_torch[i] <= 0] = 0                   # Backward pass for ReLU, step 2: Set all dZ=0 where Z <= 0
    dW_temp = torch.matmul(dZ_temp, A_torch[i-1].T) / m
    db_temp = torch.sum(dZ_temp, dim=1, keepdim=True)/m
    dA_torch.append(dA_temp)
    dZ_torch.append(dZ_temp)
    dW_torch.append(dW_temp)
    db_torch.append(db_temp)

# Append a placeholder 0 and reverse, so that list index == layer number
dA_torch.append(0)
dZ_torch.append(0)
dW_torch.append(0)
db_torch.append(0)
dA_torch = list(reversed(dA_torch))
dZ_torch = list(reversed(dZ_torch))
dW_torch = list(reversed(dW_torch))
db_torch = list(reversed(db_torch))


# Tensorflow backward pass
dA_tf = []
dZ_tf = []
dW_tf = []
db_tf = []

# For last (output) layer
dAL = - (tf.math.divide(Y_tf, A_tf[L]) - tf.math.divide(1 - Y_tf, 1 - A_tf[L]))   # Find grad of last activation layer due to CEL cost function
dZ_temp = dAL * A_tf[L] * (1 - A_tf[L])    # Backward function for sigmoid activation
dW_temp = tf.matmul(dZ_temp, tf.transpose(A_tf[L-1])) / m
db_temp = tf.math.reduce_sum(dZ_temp, axis=1, keepdims=True)/m
dA_tf.append(dAL)
dZ_tf.append(dZ_temp)
dW_tf.append(dW_temp)
db_tf.append(db_temp)

# For all other layers (L-1 down to 1)
for i in reversed(range(1, L)):
    dA_temp = tf.matmul(tf.transpose(W_tf[i+1]),dZ_tf[-1])
    # Backward pass for ReLU: keep dA where Z > 0, zero elsewhere. tf.zeros_like gives a
    # zero tensor with the dtype and shape of dA instead of relying on converting [0].
    dZ_temp = tf.where(Z_tf[i] > 0., dA_temp, tf.zeros_like(dA_temp))
    dW_temp = tf.matmul(dZ_temp, tf.transpose(A_tf[i-1])) / m
    db_temp = tf.math.reduce_sum(dZ_temp, axis=1, keepdims=True)/m
    dA_tf.append(dA_temp)
    dZ_tf.append(dZ_temp)
    dW_tf.append(dW_temp)
    db_tf.append(db_temp)

# Append a placeholder 0 and reverse, so that list index == layer number
dA_tf.append(0)
dZ_tf.append(0)
dW_tf.append(0)
db_tf.append(0)
dA_tf = list(reversed(dA_tf))
dZ_tf = list(reversed(dZ_tf))
dW_tf = list(reversed(dW_tf))
db_tf = list(reversed(db_tf))

# Side-by-side check that the three frameworks agree
print(dW_tf[2], '\n', dW_torch[2], '\n', dW_numpy[2])
print(A_tf[0], '\n', A_torch[0], '\n', A_numpy[0])    # print the torch tensor itself, not its name

# Step 4: Update Parameters
<img src="images/update_params.png" style="width:800px;height:500px;">
<caption><center> Figure 5 : Update Parameters </center></caption><br>

In [None]:
# Gradient descent step: every weight / bias of layers 1..L moves against its gradient
learn_rate = 0.01

# (parameters, gradients) pairs for numpy, pytorch and tensorflow; all use the same rule
for params, grads in ((W_numpy, dW_numpy), (b_numpy, db_numpy),
                      (W_torch, dW_torch), (b_torch, db_torch),
                      (W_tf, dW_tf), (b_tf, db_tf)):
    for i in range(1,L+1):
        params[i] = params[i] - learn_rate*grads[i]

# Step 5: Perform prediction on model and calculate accuracy
This step repeats the forward pass of Step 2 on the test data, with the following changes:
1. A_numpy = [X_test]
2. Y_numpy = Y_test
3. A_torch = [torch.tensor(X_test)]
4. Y_torch = torch.tensor(Y_test)
5. A_tf = [tf.convert_to_tensor(X_test)]
6. Y_tf = tf.convert_to_tensor(Y_test)
7. The prediction of the output is computed in the cell that follows the forward pass

In [None]:
# Numpy forward pass on the test data
# Definition of sigmoid and relu function for numpy
def sigmoid(Z):
    """Element-wise logistic function 1 / (1 + exp(-Z)) for numpy arrays."""
    return 1 / (1 + np.exp(-Z))

def relu(Z):
    """Element-wise max(Z, 0) for numpy arrays."""
    return np.maximum(Z,0)

# Index of the output layer. The layer loops below run over range(1, len(n_l)), so the
# loop index never reaches len(n_l) itself; the output layer must be detected with
# len(n_l) - 1, otherwise ReLU would be applied to every layer including the output.
last_layer = len(n_l) - 1

Z_numpy = [0]
A_numpy = [X_test]
Y_numpy = Y_test
# The cost is averaged over the samples that are actually evaluated here (the test
# samples), not over the training sample count m
m_test = Y_numpy.shape[1]
for i in range(1, len(n_l)):
#     print(W_numpy[i].shape, A_numpy[i-1].shape, np.matmul(W_numpy[i],A_numpy[i-1]).shape)
    Z_temp = np.matmul(W_numpy[i],A_numpy[i-1]) + b_numpy[i]
    Z_numpy.append(Z_temp)
    if i != last_layer:
        A_temp = relu(Z_temp)     # ReLU activation for all hidden layers
    else:
        A_temp = sigmoid(Z_temp)  # Sigmoid activation for last (output) layer. Binary classification
    A_numpy.append(A_temp)
J_numpy = (-1)*np.sum(Y_numpy*np.log(A_numpy[L]) + (1-Y_numpy)*np.log(1-A_numpy[L]))/m_test

# Pytorch forward pass
Z_torch = [0]
A_torch = [torch.tensor(X_test)]
Y_torch = torch.tensor(Y_test)
# Activations from the pytorch nn library, kept under their own names so that the
# numpy helpers defined above are not overwritten
sigmoid_torch = torch.nn.Sigmoid()
relu_torch = torch.nn.ReLU()
for i in range(1, len(n_l)):
#     print(W_torch[i].shape, A_torch[i-1].shape, torch.matmul(W_torch[i],A_torch[i-1]).shape)
    Z_temp = torch.matmul(W_torch[i],A_torch[i-1]) + b_torch[i]
    Z_torch.append(Z_temp)
    if i != last_layer:
        A_temp = relu_torch(Z_temp)     # ReLU activation for all hidden layers
    else:
        A_temp = sigmoid_torch(Z_temp)  # Sigmoid activation for last (output) layer. Binary classification
    A_torch.append(A_temp)
J_torch = (-1)*torch.sum(Y_torch*torch.log(A_torch[L]) +  (1-Y_torch)*torch.log(1-A_torch[L]))/m_test


# Tensorflow forward pass
Z_tf = [0]
A_tf = [tf.convert_to_tensor(X_test)]
Y_tf = tf.convert_to_tensor(Y_test)
for i in range(1, len(n_l)):
#     print(W_tf[i].shape, A_tf[i-1].shape, tf.matmul(W_tf[i],A_tf[i-1]).shape)
    Z_temp = tf.matmul(W_tf[i],A_tf[i-1]) + b_tf[i]
    Z_tf.append(Z_temp)
    if i != last_layer:
        A_temp = tf.keras.activations.relu(Z_temp)     # ReLU activation for all hidden layers
    else:
        A_temp = tf.keras.activations.sigmoid(Z_temp)  # Sigmoid activation for last (output) layer. Binary classification
    A_tf.append(A_temp)
J_tf = (-1)*tf.math.reduce_sum(Y_tf*tf.math.log(A_tf[L]) +  (1-Y_tf)*tf.math.log(1-A_tf[L]))/m_test

print('\nLoss function calculated from numpy, pytorch and tensorflow are {}, {} and {}'.format(J_numpy, J_torch, J_tf))

In [None]:
# Predict output using the output of the last activation, rounded to the nearest integer.
# Accuracies are converted to plain Python numbers so that they print as values rather
# than as tensor representations.

# Numpy prediction
predict_numpy = A_numpy[L].round()
accuracy_numpy = np.count_nonzero(predict_numpy == Y_numpy) / Y_numpy.shape[1] * 100

# Pytorch prediction
predict_torch = A_torch[L].round()
accuracy_torch = (torch.sum(predict_torch == Y_torch) / float(Y_torch.shape[1]) * 100).item()

# Tensorflow prediction
predict_tf = tf.math.round(A_tf[L])
accuracy_tf = float(tf.reduce_sum(tf.cast(predict_tf == Y_tf, tf.float32)) / Y_tf.shape[1] * 100)

print('Accuracy of numpy, pytorch and tensorflow prediction are {}%, {}% and {}%respectively\n'.format(accuracy_numpy, accuracy_torch, accuracy_tf))

# Plot test data and prediction: one panel for the ground truth and one per framework.
# Labels are converted to numpy arrays explicitly before they are used as masks on the
# numpy array X_test.
panels = [
    ('Ground Truth', Y_test),
    ('Numpy Prediction', predict_numpy),
    ('Pytorch Prediction', predict_torch.numpy()),
    ('Tensorflow Prediction', predict_tf.numpy()),
]

plt.figure(figsize=(10, 10))
for position, (title, labels) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    for label_value in (0, 1):
        points = X_test[:, labels[0, :] == label_value]   # test samples with this label
        plt.scatter(points[0], points[1])
    plt.legend(['0', '1'])
    plt.title(title)

# Step 6: Repeat steps 2 to 5 for a number of epochs to learn the relation

In [None]:
# Import libraries
import numpy as np
import torch
import tensorflow as tf
from matplotlib import pyplot as plt

# Build and visualize a synthetic two-class dataset (class 1 is centred on class 0 here)
m = 1000     # m = total number of samples (keep it even: class 1 reuses amp_X0 of class 0)
n = 2        # n = number of input features, i.e. the data is 2 dimensional (2D)

n_class0 = m // 2          # first half of the samples gets label Y = 0
n_class1 = m - n_class0    # the remaining samples get label Y = 1

# Class 0: points at evenly spaced angles with a standard-normal (signed) radius
angles = np.arange(0, 360, 360 / n_class0)
amp_X0 = np.random.randn(n_class0)
theta = angles * np.pi/180
X0 = np.vstack((np.sin(theta) * amp_X0,      # feature 0
                np.cos(theta) * amp_X0,      # feature 1
                np.zeros(n_class0)))         # label row
# print('X0 shape = {}, X0 = \n{}\n'.format(X0.shape, X0))

# Class 1: same construction with a 3x wider radius; both features are additionally
# shifted by amp_X0*3
angles = np.arange(0, 360, 360 / n_class1)
amp_X1 = np.random.randn(n_class1)*3
theta = angles * np.pi/180
offset = amp_X0*3
X1 = np.vstack((np.sin(theta) * amp_X1 + offset,
                np.cos(theta) * amp_X1 + offset,
                np.ones(n_class1)))
# print('X1 shape = {}, X1 = \n{}\n'.format(X1.shape, X1))

# Stack both classes and shuffle the samples.
# np.random.shuffle only permutes along axis 0, so shuffle the (samples, rows) view.
samples = np.concatenate((X0, X1), axis=1).T
np.random.shuffle(samples)
Y = samples.T[2:]      # label row, shape (1, m)
X = samples.T[0:2]     # feature rows, shape (2, m)

# Scatter plot of the two classes
plt.scatter(X0[0],X0[1])
plt.scatter(X1[0],X1[1])
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')
plt.legend(['Label 0', 'Label 1'])

# Hold out the last 20% of the shuffled samples for testing
split = int(np.floor(0.8*m))
X_test, Y_test = X[:, split:], Y[:, split:]
X, Y = X[:, :split], Y[:, :split]
m = Y.shape[1]     # from here on m is the number of training samples
print('Value of m = {}, Shape X_test = {}, Shape X = {}\n'.format(m, X_test.shape, X.shape))

In [None]:
# Network hyper-parameters
L = 4                  # Total number of layers (hidden layers + output layer)
nl = [5, 4, 3, 1]      # Number of nodes for layer 1..L

# Layer sizes with the input layer at index 0
n_l = [n] + nl

# Numpy initialization of weights and bias.
# Index 0 of every list is a placeholder so that index l refers to layer l.
# Weights: small random values (x0.01) to keep z small, which speeds up learning (esp for sigmoid activation)
W_numpy = [0] + [np.random.randn(n_l[k], n_l[k-1]) * 0.01 for k in range(1, len(n_l))]
# Bias: small constant value
b_numpy = [0] + [np.ones((n_l[k], 1)) * 0.001 for k in range(1, len(n_l))]

print('Shapes of W_numpy are: {}, {}, {}, {}'.format(*(w.shape for w in W_numpy[1:5])))
print('Shapes of b_numpy are: {}, {}, {}, {}\n'.format(*(b.shape for b in b_numpy[1:5])))


## Pytorch initialization: the numpy values converted to torch tensors
W_torch = [0] + [torch.tensor(w) for w in W_numpy[1:]]
b_torch = [0] + [torch.tensor(b) for b in b_numpy[1:]]

print('Shapes of W_torch are: {}, {}, {}, {}'.format(*(w.shape for w in W_torch[1:5])))
print('Shapes of b_torch are: {}, {}, {}, {}\n'.format(*(b.shape for b in b_torch[1:5])))


## Tensorflow initialization: the numpy values converted to tensorflow tensors
W_tf = [0] + [tf.convert_to_tensor(w) for w in W_numpy[1:]]
b_tf = [0] + [tf.convert_to_tensor(b) for b in b_numpy[1:]]

print('Shapes of W_tf are: {}, {}, {}, {}'.format(*(w.shape for w in W_tf[1:5])))
print('Shapes of b_tf are: {}, {}, {}, {}\n'.format(*(b.shape for b in b_tf[1:5])))

In [None]:
def forward_pass(X, W_numpy, b_numpy, W_torch, b_torch, W_tf, b_tf, Y, n_l, L, m):
    """Run one forward pass of the network in numpy, pytorch and tensorflow.

    Hidden layers use ReLU, the output layer uses sigmoid. The cross entropy cost of
    each framework is printed.

    Parameters:
        X: numpy array with the input; it is multiplied as W[1] @ X.
        W_numpy, b_numpy: per-layer numpy weights / biases, indexed by layer number
            (index 0 is not used).
        W_torch, b_torch: the same parameters as torch tensors.
        W_tf, b_tf: the same parameters as tensorflow tensors.
        Y: numpy array with the labels, broadcast against the output activation.
        n_l: list of layer sizes including the input layer at index 0.
        L: index of the output layer, used to pick the activation fed to the cost.
        m: number of samples the cost is averaged over.

    Returns:
        (Z_numpy, A_numpy, Z_torch, A_torch, Z_tf, A_tf): lists indexed by layer
        number. Z[0] is the placeholder 0 and A[0] is the input.
    """
    # Definition of sigmoid and relu function for numpy
    def sigmoid_np(Z):
        return 1 / (1 + np.exp(-Z))

    def relu_np(Z):
        return np.maximum(Z,0)

    # The layer loops run over range(1, len(n_l)), so the loop index never reaches
    # len(n_l) itself; the output layer must be detected with len(n_l) - 1, otherwise
    # ReLU would be applied to every layer including the output.
    last_layer = len(n_l) - 1

    # Numpy forward pass
    Z_numpy = [0]
    A_numpy = [X]
    Y_numpy = Y
    for i in range(1, len(n_l)):
        Z_temp = np.matmul(W_numpy[i],A_numpy[i-1]) + b_numpy[i]
        Z_numpy.append(Z_temp)
        if i != last_layer:
            A_temp = relu_np(Z_temp)     # ReLU activation for all hidden layers
        else:
            A_temp = sigmoid_np(Z_temp)  # Sigmoid activation for last (output) layer. Binary classification
        A_numpy.append(A_temp)
    J_numpy = (-1)*np.sum(Y_numpy*np.log(A_numpy[L]) + (1-Y_numpy)*np.log(1-A_numpy[L]))/m

    # Pytorch forward pass
    Z_torch = [0]
    A_torch = [torch.tensor(X)]
    Y_torch = torch.tensor(Y)
    sigmoid_torch = torch.nn.Sigmoid()    # Use sigmoid definition from pytorch nn library
    relu_torch = torch.nn.ReLU()          # Use ReLU definition from pytorch nn library
    for i in range(1, len(n_l)):
        Z_temp = torch.matmul(W_torch[i],A_torch[i-1]) + b_torch[i]
        Z_torch.append(Z_temp)
        if i != last_layer:
            A_temp = relu_torch(Z_temp)     # ReLU activation for all hidden layers
        else:
            A_temp = sigmoid_torch(Z_temp)  # Sigmoid activation for last (output) layer. Binary classification
        A_torch.append(A_temp)
    J_torch = (-1)*torch.sum(Y_torch*torch.log(A_torch[L]) +  (1-Y_torch)*torch.log(1-A_torch[L]))/m

    # Tensorflow forward pass
    Z_tf = [0]
    A_tf = [tf.convert_to_tensor(X)]
    Y_tf = tf.convert_to_tensor(Y)
    for i in range(1, len(n_l)):
        Z_temp = tf.matmul(W_tf[i],A_tf[i-1]) + b_tf[i]
        Z_tf.append(Z_temp)
        if i != last_layer:
            A_temp = tf.keras.activations.relu(Z_temp)     # ReLU activation for all hidden layers
        else:
            A_temp = tf.keras.activations.sigmoid(Z_temp)  # Sigmoid activation for last (output) layer. Binary classification
        A_tf.append(A_temp)
    J_tf = (-1)*tf.math.reduce_sum(Y_tf*tf.math.log(A_tf[L]) +  (1-Y_tf)*tf.math.log(1-A_tf[L]))/m

    print('\nLoss function calculated from numpy, pytorch and tensorflow are {}, {} and {}'.format(J_numpy, J_torch, J_tf))

    return Z_numpy, A_numpy, Z_torch, A_torch, Z_tf, A_tf

In [None]:
def backward_pass(Z_numpy, A_numpy, Z_torch, A_torch, Z_tf, A_tf, Y_numpy, Y_torch, Y_tf, L, m, verbose=True):
    """Compute the gradients of the cost in numpy, pytorch and tensorflow.

    The output layer is treated as sigmoid + cross entropy, all other layers as ReLU.
    Gradients are collected from the output layer backwards and the lists are reversed
    before returning, so that index l refers to layer l (index 0 is a placeholder 0).

    NOTE: the weight lists W_numpy, W_torch and W_tf are not parameters; they are read
    from the enclosing (notebook-global) scope and must exist when this is called.

    Parameters:
        Z_numpy, A_numpy, Z_torch, A_torch, Z_tf, A_tf: per-layer pre-activations and
            activations, indexed by layer number (as returned by the forward pass).
        Y_numpy, Y_torch, Y_tf: labels in the native type of each framework; they are
            combined element-wise with the output activation A[L].
        L: index of the output layer.
        m: number of samples the gradients are averaged over.
        verbose: when True (default, the previous behaviour) print dW of layer 2 and
            the input activation of all three frameworks for a side-by-side check.

    Returns:
        (dA_numpy, dZ_numpy, dW_numpy, db_numpy,
         dA_torch, dZ_torch, dW_torch, db_torch,
         dA_tf, dZ_tf, dW_tf, db_tf): lists indexed by layer number.
    """
    # Numpy backward pass
    dA_numpy = []
    dZ_numpy = []
    dW_numpy = []
    db_numpy = []

    # For last (output) layer
    dAL = - (np.divide(Y_numpy, A_numpy[L]) - np.divide(1 - Y_numpy, 1 - A_numpy[L]))   # Find grad of last activation layer due to CEL cost function
    dZ_temp = dAL * A_numpy[L] * (1 - A_numpy[L])    # Backward function for sigmoid activation
    dW_temp = np.matmul(dZ_temp, A_numpy[L-1].T) / m
    db_temp = np.sum(dZ_temp, axis=1, keepdims=True)/m
    dA_numpy.append(dAL)
    dZ_numpy.append(dZ_temp)
    dW_numpy.append(dW_temp)
    db_numpy.append(db_temp)

    # For all other layers (L-1 down to 1); dZ_numpy[-1] is dZ of layer i+1
    for i in reversed(range(1, L)):
        dA_temp = np.matmul(W_numpy[i+1].T,dZ_numpy[-1])
        dZ_temp = np.array(dA_temp, copy=True)         # Backward pass for ReLU, step 1: copy array
        dZ_temp[Z_numpy[i] <= 0] = 0                   # Backward pass for ReLU, step 2: Set all dZ=0 where Z <= 0
        dW_temp = np.matmul(dZ_temp, A_numpy[i-1].T) / m
        db_temp = np.sum(dZ_temp, axis=1, keepdims=True)/m
        dA_numpy.append(dA_temp)
        dZ_numpy.append(dZ_temp)
        dW_numpy.append(dW_temp)
        db_numpy.append(db_temp)

    # Append a placeholder 0 and reverse, so that list index == layer number
    dA_numpy.append(0)
    dZ_numpy.append(0)
    dW_numpy.append(0)
    db_numpy.append(0)
    dA_numpy = list(reversed(dA_numpy))
    dZ_numpy = list(reversed(dZ_numpy))
    dW_numpy = list(reversed(dW_numpy))
    db_numpy = list(reversed(db_numpy))

    # Pytorch backward pass
    dA_torch = []
    dZ_torch = []
    dW_torch = []
    db_torch = []

    # For last (output) layer
    dAL = - (torch.div(Y_torch, A_torch[L]) - torch.div(1 - Y_torch, 1 - A_torch[L]))   # Find grad of last activation layer due to CEL cost function
    dZ_temp = dAL * A_torch[L] * (1 - A_torch[L])    # Backward function for sigmoid activation
    dW_temp = torch.matmul(dZ_temp, A_torch[L-1].T) / m
    db_temp = torch.sum(dZ_temp, dim=1, keepdim=True)/m
    dA_torch.append(dAL)
    dZ_torch.append(dZ_temp)
    dW_torch.append(dW_temp)
    db_torch.append(db_temp)

    # For all other layers (L-1 down to 1)
    for i in reversed(range(1, L)):
        dA_temp = torch.matmul(W_torch[i+1].T,dZ_torch[-1])
        dZ_temp = dA_temp.clone()                      # Backward pass for ReLU, step 1: copy tensor
        dZ_temp[Z_torch[i] <= 0] = 0                   # Backward pass for ReLU, step 2: Set all dZ=0 where Z <= 0
        dW_temp = torch.matmul(dZ_temp, A_torch[i-1].T) / m
        db_temp = torch.sum(dZ_temp, dim=1, keepdim=True)/m
        dA_torch.append(dA_temp)
        dZ_torch.append(dZ_temp)
        dW_torch.append(dW_temp)
        db_torch.append(db_temp)

    # Append a placeholder 0 and reverse, so that list index == layer number
    dA_torch.append(0)
    dZ_torch.append(0)
    dW_torch.append(0)
    db_torch.append(0)
    dA_torch = list(reversed(dA_torch))
    dZ_torch = list(reversed(dZ_torch))
    dW_torch = list(reversed(dW_torch))
    db_torch = list(reversed(db_torch))

    # Tensorflow backward pass
    dA_tf = []
    dZ_tf = []
    dW_tf = []
    db_tf = []

    # For last (output) layer
    dAL = - (tf.math.divide(Y_tf, A_tf[L]) - tf.math.divide(1 - Y_tf, 1 - A_tf[L]))   # Find grad of last activation layer due to CEL cost function
    dZ_temp = dAL * A_tf[L] * (1 - A_tf[L])    # Backward function for sigmoid activation
    dW_temp = tf.matmul(dZ_temp, tf.transpose(A_tf[L-1])) / m
    db_temp = tf.math.reduce_sum(dZ_temp, axis=1, keepdims=True)/m
    dA_tf.append(dAL)
    dZ_tf.append(dZ_temp)
    dW_tf.append(dW_temp)
    db_tf.append(db_temp)

    # For all other layers (L-1 down to 1)
    for i in reversed(range(1, L)):
        dA_temp = tf.matmul(tf.transpose(W_tf[i+1]),dZ_tf[-1])
        # Backward pass for ReLU: keep dA where Z > 0, zero elsewhere. tf.zeros_like gives
        # a zero tensor with the dtype and shape of dA instead of relying on converting [0].
        dZ_temp = tf.where(Z_tf[i] > 0, dA_temp, tf.zeros_like(dA_temp))
        dW_temp = tf.matmul(dZ_temp, tf.transpose(A_tf[i-1])) / m
        db_temp = tf.math.reduce_sum(dZ_temp, axis=1, keepdims=True)/m
        dA_tf.append(dA_temp)
        dZ_tf.append(dZ_temp)
        dW_tf.append(dW_temp)
        db_tf.append(db_temp)

    # Append a placeholder 0 and reverse, so that list index == layer number
    dA_tf.append(0)
    dZ_tf.append(0)
    dW_tf.append(0)
    db_tf.append(0)
    dA_tf = list(reversed(dA_tf))
    dZ_tf = list(reversed(dZ_tf))
    dW_tf = list(reversed(dW_tf))
    db_tf = list(reversed(db_tf))

    if verbose:
        # Side-by-side check that the three frameworks agree
        print(dW_tf[2], '\n', dW_torch[2], '\n', dW_numpy[2])
        print(A_tf[0], '\n', A_torch[0], '\n', A_numpy[0])    # print the torch tensor itself, not its name

    return dA_numpy, dZ_numpy, dW_numpy, db_numpy, \
            dA_torch, dZ_torch, dW_torch, db_torch, \
            dA_tf, dZ_tf, dW_tf, db_tf

In [None]:
def update_params(dW_numpy, db_numpy, dW_torch, db_torch, dW_tf, db_tf, W_numpy, b_numpy, W_torch, b_torch, W_tf, b_tf, learn_rate=0.05):
    """Apply one gradient descent step to the parameters of all three frameworks.

    Every entry from index 1 onwards is replaced by
    ``param - learn_rate * gradient``; index 0 of each list is a placeholder and is
    left untouched. The number of layers is taken from the length of each parameter
    list, so the function does not depend on a global layer count.

    Parameters:
        dW_numpy, db_numpy, dW_torch, db_torch, dW_tf, db_tf: gradient lists indexed
            by layer number.
        W_numpy, b_numpy, W_torch, b_torch, W_tf, b_tf: parameter lists indexed by
            layer number. The lists are updated in place.
        learn_rate: step size of the update (default 0.05).

    Returns:
        (W_numpy, b_numpy, W_torch, b_torch, W_tf, b_tf): the same list objects that
        were passed in, holding the updated parameters.
    """
    pairs = (
        (W_numpy, dW_numpy), (b_numpy, db_numpy),    # Numpy update
        (W_torch, dW_torch), (b_torch, db_torch),    # Pytorch update
        (W_tf, dW_tf), (b_tf, db_tf),                # Tensorflow update
    )
    for params, grads in pairs:
        for i in range(1, len(params)):
            params[i] = params[i] - learn_rate*grads[i]

    return W_numpy, b_numpy, W_torch, b_torch, W_tf, b_tf

In [None]:
def predict_accuracy(A_numpy, A_torch, A_tf, Y_numpy, Y_torch, Y_tf):
    """Print and return the prediction accuracy (in percent) of the three frameworks.

    The prediction is the activation of the last layer rounded to the nearest integer.
    The last entry of each activation list is used, so the function does not depend on
    a global layer count.

    Parameters:
        A_numpy, A_torch, A_tf: per-layer activation lists; the last entry is taken as
            the network output.
        Y_numpy, Y_torch, Y_tf: labels in the native type of each framework; the
            accuracy is divided by their size along axis 1.

    Returns:
        (accuracy_numpy, accuracy_torch, accuracy_tf) as plain Python floats.
    """
    # Numpy prediction
    predict_numpy = A_numpy[-1].round()
    accuracy_numpy = np.count_nonzero(predict_numpy == Y_numpy) / Y_numpy.shape[1] * 100

    # Pytorch prediction
    predict_torch = A_torch[-1].round()
    accuracy_torch = (torch.sum(predict_torch == Y_torch) / float(Y_torch.shape[1]) * 100).item()

    # Tensorflow prediction
    predict_tf = tf.math.round(A_tf[-1])
    accuracy_tf = float(tf.reduce_sum(tf.cast(predict_tf == Y_tf, tf.float32)) / Y_tf.shape[1] * 100)

    print('Accuracy of numpy, pytorch and tensorflow prediction are {}%, {}% and {}%respectively\n'.format(accuracy_numpy, accuracy_torch, accuracy_tf))

    return accuracy_numpy, accuracy_torch, accuracy_tf

In [None]:
EPOCH = 200

# Training labels in the native type of each framework. They are defined here from the
# current training split Y so that the loop does not pick up label variables left over
# from earlier cells (which hold the test labels of a different dataset).
Y_numpy = Y
Y_torch = torch.tensor(Y)
Y_tf = tf.convert_to_tensor(Y)

for epoch in range(1, EPOCH+1):
    print('Epoch {}'.format(epoch))
    # Step 2: forward pass on the training data
    Z_numpy, A_numpy, Z_torch, A_torch, Z_tf, A_tf = forward_pass(X, W_numpy, b_numpy, W_torch, b_torch, W_tf, b_tf, Y, n_l, L, m)
    # Step 3: backward pass
    dA_numpy, dZ_numpy, dW_numpy, db_numpy, dA_torch, dZ_torch, dW_torch, db_torch, dA_tf, dZ_tf, dW_tf, db_tf = backward_pass(Z_numpy, A_numpy, Z_torch, A_torch, Z_tf, A_tf, Y_numpy, Y_torch, Y_tf, L, m)
    # Step 4: parameter update
    W_numpy, b_numpy, W_torch, b_torch, W_tf, b_tf = update_params(dW_numpy, db_numpy, dW_torch, db_torch, dW_tf, db_tf, W_numpy, b_numpy, W_torch, b_torch, W_tf, b_tf)
    # Step 5: accuracy on the training data, from the activations computed before this epoch's update
    predict_accuracy(A_numpy, A_torch, A_tf, Y_numpy, Y_torch, Y_tf)