A simple implementation of a three-layer neural network.

In [32]:
import numpy as np
import random

# Set up the network hyperparameters, a random input batch X, and labels y.
# The draw order below (X, y, W1, W2, W3) determines the values produced by the
# fixed seed, so it must not be rearranged.
np.random.seed(231)
N, D, H1, H2, C = 50, 3072, 100, 100, 10
X = np.random.standard_normal((N, D))
y = np.random.randint(0, C, size=N)

# Layer parameters: weights are small Gaussian draws, biases start at zero.
weight_scale = 5e-2
W1 = np.random.standard_normal((D, H1)) * weight_scale
b1 = np.zeros(H1)
W2 = np.random.standard_normal((H1, H2)) * weight_scale
b2 = np.zeros(H2)
W3 = np.random.standard_normal((H2, C)) * weight_scale
b3 = np.zeros(C)

# Network architecture: (affine - relu) - (affine - relu) - (affine - softmax)
class ThreeLayerNet:
    """Layer primitives for a small fully connected classifier.

    The class holds no parameters or state: each method is a pure forward or
    backward computation. Callers chain them as
    (affine - relu) - (affine - relu) - (affine - softmax) and pass the
    weights, biases and caches explicitly.
    """

    def __init__(self):
        """Create the helper object; there is nothing to initialize."""

    def affine_forward(self, x, w, b):
        """Compute the forward pass of an affine (fully connected) layer.

        Args:
            x: Input array whose first axis is the batch axis. All trailing
                axes are flattened into one feature axis before the product.
            w: Weight matrix of shape (features, outputs).
            b: Bias, broadcast-added to every row of the output.

        Returns:
            A tuple (out, cache): ``out`` has shape (batch, outputs) and
            ``cache`` is ``(x, w, b)`` for use by ``affine_backward``.
        """
        out = x.reshape(x.shape[0], -1).dot(w) + b
        cache = (x, w, b)
        return out, cache

    def affine_backward(self, dout, cache):
        """Compute the backward pass of an affine layer.

        Args:
            dout: Upstream gradient of shape (batch, outputs).
            cache: The ``(x, w, b)`` tuple returned by ``affine_forward``.

        Returns:
            A tuple (dx, dw, db) with the shapes of ``x``, ``w`` and ``b``.
        """
        x, w, _ = cache
        # Mirror the flattening done in affine_forward so inputs with more
        # than two dimensions get a correctly shaped weight gradient.
        x_flat = x.reshape(x.shape[0], -1)
        # Hand the input gradient back in the caller's original layout.
        dx = dout.dot(w.T).reshape(x.shape)
        dw = x_flat.T.dot(dout)
        db = np.sum(dout, axis=0)
        return dx, dw, db

    def relu_forward(self, x):
        """Apply the element-wise rectifier max(0, x).

        Returns:
            A tuple (out, cache) where ``cache`` is the untouched input ``x``.
        """
        out = np.maximum(0, x)
        cache = x
        return out, cache

    def relu_backward(self, dout, cache):
        """Pass the upstream gradient only where the forward input was > 0.

        Args:
            dout: Upstream gradient, same shape as the forward input.
            cache: The input ``x`` saved by ``relu_forward``.

        Returns:
            The gradient with respect to ``x``.
        """
        x = cache
        return dout * (x > 0)

    def softmax(self, x, y):
        """Compute the mean softmax cross-entropy loss and its gradient.

        Args:
            x: Scores of shape (batch, classes).
            y: Integer class labels, one per row of ``x``.

        Returns:
            A tuple (loss, dL): ``loss`` is the cross-entropy averaged over
            the batch and ``dL`` is its gradient with respect to ``x``
            (same shape as ``x``).
        """
        num_samples = x.shape[0]
        rows = np.arange(num_samples)
        # Subtracting the row maximum keeps exp() from overflowing.
        shifted = x - np.max(x, axis=1, keepdims=True)
        # Stay in log space: taking log() of a probability that underflowed
        # to exactly zero would turn the loss into inf.
        log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        loss = -np.mean(log_probs[rows, y])
        # Gradient of the mean loss: (softmax - one_hot(y)) / batch size.
        dL = np.exp(log_probs)
        dL[rows, y] -= 1
        dL /= num_samples
        return loss, dL

In [33]:
# Forward pass.
# Chains the three affine layers (with a ReLU after the first two) and ends with
# the softmax loss. Every cache_* value is kept because the backward-pass cell
# feeds it to the matching *_backward call.
nn = ThreeLayerNet()
# NOTE(review): H1 and H2 are rebound below from the hidden-layer sizes set in
# the initialization cell to the affine-layer outputs; re-run the initialization
# cell before using them as sizes again.
H1, cache_H1 = nn.affine_forward(X, W1, b1)
R1, cache_R1 = nn.relu_forward(H1)
H2, cache_H2 = nn.affine_forward(R1, W2, b2)
R2, cache_R2 = nn.relu_forward(H2)
# H3 holds the class scores that go into the softmax loss.
H3, cache_H3 = nn.affine_forward(R2, W3, b3)
# dL is the gradient of the loss with respect to H3; it seeds the backward pass.
loss, dL = nn.softmax(H3, y)
print(f'loss: {loss}')
print(f'the shape of the derivative of the loss wrt. the scores (H3): {dL.shape}')

loss: 2.421977830511722
the shape of the derivative of the loss wrt. the scores (H3): (50, 10)


In [34]:
# Backward pass.
# Walk the layers in reverse order, starting from dL (the gradient of the loss
# with respect to the scores). Each affine_backward call returns the gradient
# flowing to the previous layer plus the gradients of that layer's weights and
# biases, which are collected in `grads` under the parameter's name.
grads = {}
dout3, grads['W3'], grads['b3'] = nn.affine_backward(dL, cache_H3)
print('the shape of the gradients of W3:', grads['W3'].shape)
print('the shape of the gradients of b3:', grads['b3'].shape)

drelu2 = nn.relu_backward(dout3, cache_R2)
print('the shape of the gradients of the second relu function:', drelu2.shape)

dout2, grads['W2'], grads['b2'] = nn.affine_backward(drelu2, cache_H2)
print('the shape of the gradients of W2:', grads['W2'].shape)
print('the shape of the gradients of b2:', grads['b2'].shape)

drelu1 = nn.relu_backward(dout2, cache_R1)
print('the shape of the gradients of the first relu function:', drelu1.shape)

dout1, grads['W1'], grads['b1'] = nn.affine_backward(drelu1, cache_H1)
print('the shape of the gradients of W1:', grads['W1'].shape)
# This line reports the first layer's bias gradient, so it is labelled b1.
print('the shape of the gradients of b1:', grads['b1'].shape)


the shape of the gradients of W3: (100, 10)
the shape of the gradients of b3: (10,)
the shape of the gradients of the second relu function: (50, 100)
the shape of the gradients of W2: (100, 100)
the shape of the gradients of b2: (100,)
the shape of the gradients of the first relu function: (50, 100)
the shape of the gradients of W1: (3072, 100)
the shape of the gradients of b2: (100,)
