In [None]:
import numpy as np
import scipy.stats

In [1]:
## ReLU

def ReLU(X):
    """Rectified linear unit: element-wise maximum of `X` and 0.0.

    Values above zero pass through unchanged; everything else becomes 0.0.
    The comparison is delegated to `np.maximum`, so `X` may be anything that
    function accepts (array or scalar).
    """
    floor = 0.0  # float threshold, compared element-wise against X
    return np.maximum(X, floor)

def dReLU(X):
    """Derivative of ReLU evaluated element-wise at `X`.

    Returns 1.0 where `X > 0` and 0.0 everywhere else (including exactly 0).
    The result is always cast to float32, whatever the dtype of `X`.
    `X` must produce an object with `.astype` when compared with `>`
    (for example a NumPy array).
    """
    is_positive = X > 0  # boolean mask of the active units
    return is_positive.astype(np.float32)

In [None]:
## Feed Forward with loop

def feed_forward(Ws, X, key=None, keep_prob=1):
    """Run a forward pass and record every intermediate layer.

    Every weight matrix except the last is treated as a hidden layer: the
    current activation is multiplied by `W`, passed through `dropout`, then
    through `ReLU`. The last matrix is the readout layer, whose product is
    passed through `softmax`.

    Parameters
    ----------
    Ws : sequence of weight matrices, first layer first; `Ws[-1]` is the
        readout layer.
    X : network input; it is right-multiplied by each weight matrix in turn.
    key : forwarded unchanged as the second positional argument of every
        `dropout` call.
    keep_prob : forwarded to `dropout` as the `keep_prob` keyword.

    Returns
    -------
    list
        `[X, Z1, A1, ..., Zk, Ak, Z_out, Yhat]`, where each `Z` is the
        product recorded *before* dropout and ReLU, each `A` is the value
        after both, `Z_out` is the readout product and `Yhat` is
        `softmax(Z_out)`.
    """
    # NOTE(review): the same `key` is handed to dropout for every hidden
    # layer. If dropout derives its mask only from the key, all layers would
    # share the same randomness -- confirm against dropout's definition.
    activation = X
    layers = [activation]  # input layer
    hidden_weights = Ws[:-1]
    for W in hidden_weights:
        pre_activation = activation @ W
        layers.append(pre_activation)
        dropped = dropout(pre_activation, key, keep_prob=keep_prob)
        activation = ReLU(dropped)
        layers.append(activation)
    # readout layer
    logits = activation @ Ws[-1]
    layers.append(logits)
    Yhat = softmax(logits)
    layers.append(Yhat)
    return layers

In [None]:
## Back propagation with loop

def back_prop(Ws, X, Y, keep_prob=1, key=None):
    """Compute the gradient for every weight matrix in `Ws`.

    Runs `feed_forward` once, then walks the recorded layers from the readout
    back to the input. The error signal at the readout is `Yhat - Y`
    (presumably the softmax + cross-entropy logit gradient -- confirm against
    the loss used by the caller). For each hidden layer it becomes
    `(δ @ W_next.T) * dReLU(Z)`, and the gradient of a weight matrix is
    `layer_input.T @ δ`. No averaging over the rows of `X` is applied here.

    Parameters
    ----------
    Ws : sequence of weight matrices, first layer first; the last one is the
        readout layer.
    X : network input, passed to `feed_forward`.
    Y : targets; subtracted from the network output, so it must be
        compatible with it in shape.
    keep_prob : forwarded to `feed_forward`.
    key : forwarded to `feed_forward`. The default `None` is the value
        `feed_forward` itself uses when no key is given.

    Returns
    -------
    list
        One gradient per weight matrix, in the same order as `Ws` and with
        matching shapes.

    Raises
    ------
    AssertionError
        If the number or shapes of the gradients do not match `Ws`.
    """
    # layers == [X, Z1, A1, ..., Zk, Ak, Z_out, Yhat]; it is consumed from
    # the end, so every pop() below removes the current last entry.
    layers = feed_forward(Ws, X, key=key, keep_prob=keep_prob)

    # readout layer: error signal is Yhat - Y
    Yhat = layers.pop()
    layers.pop()  # discard the readout product Z_out; it is not needed
    δ = Yhat - Y
    layer_input = layers.pop()
    dJdWs = [layer_input.T @ δ]  # dJ/dW = input.T @ δ

    # hidden layers, from the last one back to the first
    # NOTE(review): dReLU is evaluated at the pre-dropout Z and no dropout
    # mask/scale is applied to δ. If dropout is not the identity for the given
    # keep_prob, these gradients may not match the forward pass -- verify
    # against dropout's definition.
    for i in range(1, len(Ws)):
        Z = layers.pop()
        W_next = Ws[-i]  # weights of the layer that was just processed
        δ = (δ @ W_next.T) * dReLU(Z)  # δ = (δ @ W.T) * ReLU'(Z)
        layer_input = layers.pop()
        dJdWs.append(layer_input.T @ δ)

    # gradients were collected last-to-first; restore the order of Ws
    dJdWs.reverse()
    # sanity checks
    assert len(dJdWs) == len(Ws), (len(dJdWs), len(Ws))
    for dJdW, W in zip(dJdWs, Ws):
        assert dJdW.shape == W.shape, (dJdW.shape, W.shape)
    return dJdWs