In this notebook, we will implement forward and backward propogation functions for a multi layered neural from scratch in Pytorch.

In [1]:
import torch

### Forward propagation
Forward Propagation, for arbitrary layer $l \in \left\lbrace 0, L \right\rbrace$: 
$$\vec{z}^{\left(l\right)} = W^{\left(l\right)}  \vec{a}^{\left(l-1\right)} + \vec{b}^{\left(l\right)}$$ 
$$\vec{a}^{\left(l\right)} = \sigma\left( \vec{z}^{\left(l\right)} \right)$$

In [2]:
def Z(x, W_l, b_l):
    """
    Args
        x: 1-d vector. Activation of layer l-1 
        Wl: Weight matrix of layer l
        bl: Bias of layer l
    """
    return torch.matmul(W_l, x) + b_l

def A(z_l):
    """
    Sigmoid activation function (non-linear layer)
    """
    return torch.sigmoid(z_l)

def forward(x, W, b):
    """
    In the forward pass, we loop over every single layer, and perform forward propagation as
    defined by the equation above
    Args
        x: 1-d input vector. Represents a single training data instance
        W: List of weight matrices. From 0 to L
        b: List of bias vectors. From 0 to L
    """
    L = len(W) - 1
    a_l = x
    for l in range(0, L + 1):
        z_l = Z(a_l, W[l], b[l])
        a_l = A(z_l)
    return a_l

### Loss
Here we are working with a single training data instance, $x_{i}$ whose GT output is $\bar{y}_{i}$. 

$\mathbb{L} = \frac{1}{2} \left( a^{ \left( L \right) } - \bar{y}_{i} \right)^{2}$


In [3]:
def mse_loss(a_L, y):
    """
    Args
        a_L: Activation of the last layer
        y: Ground Truth
    """
    return 1./ 2 * torch.pow((a_L - y), 2)

### Backpropagation

Backpropagation, for last layer $L$ 

$$\vec{\delta}^{\left( L \right)} = \left( \vec{a}^{ \left( L \right) }  - \bar{y} \right) \circ \vec{a}^{\left( L \right)} \circ \left( \vec{1} - \vec{a}^{\left( L \right)}  \right)$$

$$\nabla_{ W^{ \left( L \right) } } \mathbb{L} = \vec{ \delta }^{ \left( L \right) } \left( \vec{ a }^{ \left( L - 1 \right) } \right)^{T}$$

$$\nabla_{ b^{ \left( L \right) } } \mathbb{L} = \vec{ \delta }^{ \left( L \right) }$$


Backpropagation, for arbitrary layer $l \in \left\lbrace 0, L-1 \right\rbrace$: 

$$\vec{\delta}^{ \left( l \right) } = \left(\left(  W^{ \left( l+1 \right) } \right)^{T}  \vec{ \delta }^{ \left( l+1 \right) }\right) \circ \vec{a}^{ \left( l \right) } \circ \left( \vec{1} -  \vec{a}^{ \left( l \right) } \right)$$

$$\nabla_{ W^{ \left( l \right) } } \mathbb{L} = \vec{ \delta }^{ \left( l \right) } \left( \vec{ a }^{ \left( l - 1 \right) } \right)^{T}$$

$$\nabla_{ b^{ \left( l \right) } } \mathbb{L} = \vec{ \delta }^{ \left( l \right) }$$

In [4]:
def forward_backward(x, y, W, b):
    L = len(W) - 1
    a = []
    # We are caching the output of the forward propagation of the intermediate layers
    # to help with the calcuation of the gradients during backward propagation.
    for l in range(0, L+1):
        a_prev = x if l == 0 else a[l-1]
        z_l = Z(a_prev, W[l], b[l])
        a_l = A(z_l)
        a.append(a_l)

    print('Final activation', a[L])
    loss = mse_loss(a[L], y)
    print('Loss', loss)
    
    deltas = [None for _ in range(L + 1)]
    W_grads = [None for _ in range(L + 1)]
    b_grads = [None for _ in range(L + 1)]
    
    # Compute for the last layer
    a_L = a[L]
    deltas[L] = (a_L - y) * a_L * (1 - a_L)
    W_grads[L] = torch.matmul(deltas[L], a[L - 1].T)
    b_grads[L] = deltas[L]

    for l in range(L-1, -1, -1):
        a_l = a[l]
        deltas[l] =  torch.matmul(W[l + 1].T, deltas[l + 1]) * a_l * (1 - a_l)
        W_grads[l] = torch.matmul(deltas[l], a[l - 1].T)
        b_grads[l] = deltas[l]
        
    for l in range(0, L + 1):
        print('Layer: {}, Shapes - W: {}, W_grad: {}, b: {}, b_grad: {}, delta: {}'.format(
                l, list(W[l].shape), list(W_grads[l].shape), 
                list(b[l].shape), list(b_grads[l].shape), list(deltas[l].shape)
        ))
    return loss, W_grads, b_grads

In [5]:
x = torch.tensor([1, 2, 3, 4, 5, 6, 7], dtype=torch.float32).unsqueeze(dim=1)
y = torch.tensor(7.9, dtype=torch.float32)

In [6]:
x = torch.randn(7, 1)
y = torch.randn(1, 1)
w0 = torch.randn(5,7)
b0 = torch.randn(5,1)
w1 = torch.randn(3,5)
b1 = torch.randn(3,1)
w2 = torch.randn(1,3)
b2 = torch.randn(1,1)

loss, W_grads, b_grads = forward_backward(x, y, [w0, w1, w2], [b0, b1, b2])

Final activation tensor([[0.0739]])
Loss tensor([[0.1933]])
Layer: 0, Shapes - W: [5, 7], W_grad: [5, 1], b: [5, 1], b_grad: [5, 1], delta: [5, 1]
Layer: 1, Shapes - W: [3, 5], W_grad: [3, 5], b: [3, 1], b_grad: [3, 1], delta: [3, 1]
Layer: 2, Shapes - W: [1, 3], W_grad: [1, 3], b: [1, 1], b_grad: [1, 1], delta: [1, 1]
