A simple implementation of a five-layer, fully connected neural network.

In [71]:
import numpy as np
import random

# Initialization of the network parameters, the input X, and the labels y.
# The global NumPy seed is fixed so that every run draws the same data and weights.
np.random.seed(231)
H_dims = [100, 100, 100, 100]
N, D, C = 50, 3 * 32 * 32, 10
X = np.random.randn(N, D)
y = np.random.randint(C, size=(N,))

weight_scale = 5e-2
num_layers = len(H_dims) + 1


def init_model_params(input_dim, hidden_dims, num_classes, weight_scale):
    """Create the weights and biases of a fully connected network.

    Layer ``i`` (1-based) maps the width of layer ``i - 1`` to its own width,
    so the hidden widths may all be different and ``hidden_dims`` may be empty.

    Args:
        input_dim: Number of input features.
        hidden_dims: Sequence with the width of each hidden layer, in order.
        num_classes: Width of the final (output) layer.
        weight_scale: Factor applied to the standard-normal weight samples.

    Returns:
        Dict with keys ``'W1', 'b1', 'W2', 'b2', ...`` inserted in that order.
        ``W{i}`` has shape ``(fan_in, fan_out)`` and ``b{i}`` is a zero vector
        of shape ``(fan_out,)``.
    """
    layer_dims = [input_dim, *hidden_dims, num_classes]
    params = {}
    # Pair each layer width with the next one so that every weight matrix is
    # (previous width, current width) rather than a square matrix.
    for layer, (fan_in, fan_out) in enumerate(zip(layer_dims[:-1], layer_dims[1:]), start=1):
        params[f'W{layer}'] = weight_scale * np.random.randn(fan_in, fan_out)
        params[f'b{layer}'] = np.zeros(fan_out)
    return params


# Weight and bias initialization. Handles any number of hidden layers of any width.
model_params = init_model_params(D, H_dims, C, weight_scale)

for p in model_params.keys():
    print(p, model_params[p].shape)

W1 (3072, 100)
b1 (100,)
W2 (100, 100)
b2 (100,)
W3 (100, 100)
b3 (100,)
W4 (100, 100)
b4 (100,)
W5 (100, 10)
b5 (10,)


In [72]:
# Network architecture (in this case a five-layer net):
# (affine-relu) -> (affine-relu) -> (affine-relu) -> (affine-relu) -> (affine-softmax).

class NeuralNet:
    """Stateless building blocks (affine, ReLU, softmax loss) for a fully connected net.

    Every ``*_forward`` method returns its output together with a cache that
    the matching ``*_backward`` method uses to compute gradients.
    """

    def __init__(self):
        # Nothing is stored on the instance; parameters are passed to each method.
        pass

    def affine_forward(self, x, w, b):
        """Compute ``x_flat @ w + b`` where ``x_flat`` is ``x`` reshaped to ``(N, -1)``.

        Args:
            x: Input array whose first axis indexes the samples; any trailing
                axes are flattened into a single feature axis.
            w: Weight matrix of shape ``(features, outputs)``.
            b: Bias added to every row of the result.

        Returns:
            Tuple ``(out, cache)`` where ``cache`` is ``(x, w, b)`` with ``x``
            kept in its original (unflattened) shape.
        """
        x_flat = x.reshape(x.shape[0], -1)
        out = x_flat.dot(w) + b
        cache = (x, w, b)
        return out, cache

    def affine_backward(self, dout, cache):
        """Backpropagate through the affine layer.

        Args:
            dout: Upstream gradient with the same shape as the forward output.
            cache: The ``(x, w, b)`` tuple returned by ``affine_forward``.

        Returns:
            Tuple ``(dx, dw, db)`` with the same shapes as ``x``, ``w`` and the
            per-output bias respectively.
        """
        x, w, b = cache
        # The forward pass flattened x, so flatten it the same way here and
        # give dx the original shape back; for 2-D inputs both are no-ops.
        x_flat = x.reshape(x.shape[0], -1)
        dx = dout.dot(w.T).reshape(x.shape)
        dw = x_flat.T.dot(dout)
        db = np.sum(dout, axis=0)
        return dx, dw, db

    def relu_forward(self, x):
        """Apply ``max(0, x)`` element-wise and cache the input for the backward pass."""
        out = np.maximum(0, x)
        cache = x
        return out, cache

    def relu_backward(self, dout, cache):
        """Pass the upstream gradient through only where the forward input was positive."""
        x = cache
        return dout * (x > 0)

    def softmax(self, x, y):
        """Softmax cross-entropy loss and its gradient with respect to the scores.

        Args:
            x: Score matrix with one row per sample and one column per class.
            y: Integer class index of the correct class for each row of ``x``.

        Returns:
            Tuple ``(loss, dL)``: the mean negative log-probability of the
            correct classes, and the gradient of that mean with respect to
            ``x`` (same shape as ``x``).
        """
        num_samples = x.shape[0]
        rows = np.arange(num_samples)

        # Subtracting the row maximum keeps exp() from overflowing. Working with
        # log-probabilities avoids log(0) when a probability underflows to zero.
        shifted = x - np.max(x, axis=1, keepdims=True)
        log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

        loss = -np.mean(log_probs[rows, y])

        # Gradient of the mean loss: (softmax - one_hot(y)) / N.
        dL = np.exp(log_probs)
        dL[rows, y] -= 1
        dL /= num_samples
        return loss, dL

In [73]:
# Forward pass: four (affine -> relu) stages followed by a final affine layer
# whose scores are fed to the softmax loss.
(W1, b1,
 W2, b2,
 W3, b3,
 W4, b4,
 W5, b5) = model_params.values()

nn = NeuralNet()

fwd_layers = ((W1, b1), (W2, b2), (W3, b3), (W4, b4), (W5, b5))
fwd_scores, fwd_score_caches = [], []
fwd_activations, fwd_activation_caches = [], []

layer_input = X
for depth, (weight, bias) in enumerate(fwd_layers, start=1):
    scores, score_cache = nn.affine_forward(layer_input, weight, bias)
    fwd_scores.append(scores)
    fwd_score_caches.append(score_cache)
    # The last layer has no ReLU: its raw scores go straight into the loss.
    if depth < len(fwd_layers):
        layer_input, activation_cache = nn.relu_forward(scores)
        fwd_activations.append(layer_input)
        fwd_activation_caches.append(activation_cache)

# Expose the per-layer results under the names used by the backward pass.
H1, H2, H3, H4, H5 = fwd_scores
cache_H1, cache_H2, cache_H3, cache_H4, cache_H5 = fwd_score_caches
R1, R2, R3, R4 = fwd_activations
cache_R1, cache_R2, cache_R3, cache_R4 = fwd_activation_caches

loss, dL = nn.softmax(H5, y)
print(f'loss : {loss}')
print(f'the shape of the gradients of the loss wrt. the scores (H5): {dL.shape}')

loss : 2.301195972715804
the shape of the gradients of the loss wrt. the scores (H5): (50, 10)


In [74]:
# Backward pass:
#   Walk the layers from last to first. For each affine layer we compute the
#   gradients of its weights and biases, plus the gradient with respect to its
#   input (dout). Applying the chain rule, dout is multiplied by the derivative
#   of the ReLU activation, and that product is backpropagated to the preceding
#   layer. At the first layer the input is the original X, so its dout is not
#   propagated any further: this is the end of the network.

bwd_affine_caches = (cache_H1, cache_H2, cache_H3, cache_H4, cache_H5)
bwd_relu_caches = (cache_R1, cache_R2, cache_R3, cache_R4)
hidden_note = 'which is used to backprop through relu/previous layer:'
input_note = 'which is just the gradients of the original input X:'

grads = {}
douts, drelus = {}, {}
upstream = dL
for layer in range(len(bwd_affine_caches), 0, -1):
    w_key, b_key = f'W{layer}', f'b{layer}'

    # Gradients of this layer: its weights, its biases, and its input
    # (the previous layer's ReLU output, or X for layer 1).
    douts[layer], grads[w_key], grads[b_key] = nn.affine_backward(upstream, bwd_affine_caches[layer - 1])
    print(f'the shape of the gradients of {w_key} and {b_key} respectively: ', grads[w_key].shape, grads[b_key].shape)
    note = hidden_note if layer > 1 else input_note
    print(f'the shape of the gradients of dout{layer}, {note}', douts[layer].shape)

    if layer > 1:
        # Gradient through the ReLU that feeds this layer (ReLU number layer - 1).
        drelus[layer - 1] = nn.relu_backward(douts[layer], bwd_relu_caches[layer - 2])
        upstream = drelus[layer - 1]

# Expose the intermediate gradients under their per-layer names.
dout1, dout2, dout3, dout4, dout5 = (douts[i] for i in range(1, 6))
drelu1, drelu2, drelu3, drelu4 = (drelus[i] for i in range(1, 5))


the shape of the gradients of W5 and b5 respectively:  (100, 10) (10,)
the shape of the gradients of dout5, which is used to backprop through relu/previous layer: (50, 100)
the shape of the gradients of W4 and b4 respectively:  (100, 100) (100,)
the shape of the gradients of dout4, which is used to backprop through relu/previous layer: (50, 100)
the shape of the gradients of W3 and b3 respectively:  (100, 100) (100,)
the shape of the gradients of dout3, which is used to backprop through relu/previous layer: (50, 100)
the shape of the gradients of W2 and b2 respectively:  (100, 100) (100,)
the shape of the gradients of dout2, which is used to backprop through relu/previous layer: (50, 100)
the shape of the gradients of W1 and b1 respectively:  (3072, 100) (100,)
the shape of the gradients of dout1, which is just the gradients of the original input X: (50, 3072)
