In [1]:
import numpy as np
np.random.seed(42)

%run activations.ipynb
%run optimizers.ipynb

In [2]:
def variable_initialization(L, num_vars=13):
    """Build the per-layer containers used throughout the network code.

    Args:
        L: index of the last layer; every container gets the keys ``0..L``.
        num_vars: how many containers to create (13 by default).

    Returns:
        A list of ``num_vars`` independent dicts, each mapping every layer
        index ``0..L`` to the placeholder value ``0``.
    """
    # A fresh dict is built per slot so the containers never alias each other.
    return [dict.fromkeys(range(L + 1), 0) for _ in range(num_vars)]

In [3]:
def weight_initialization(L, W, b, layer_sizes=None):
    """Initialise the weights and biases of layers ``1..L`` in place.

    Args:
        L: index of the last layer.
        W: dict receiving one weight matrix per layer, shape
            ``(layer_sizes[i], layer_sizes[i - 1])``.
        b: dict receiving one zero bias column per layer, shape
            ``(layer_sizes[i], 1)``.
        layer_sizes: sequence of layer widths indexed ``0..L``. When omitted
            the module-level ``n`` is used, which keeps existing
            ``weight_initialization(L, W, b)`` calls working.

    Returns:
        The same ``W`` and ``b`` dicts, now populated for layers ``1..L``.
    """
    if layer_sizes is None:
        # Backward-compatible fallback to the notebook-level global.
        layer_sizes = n
    for i in range(1, L + 1):
        fan_in = layer_sizes[i - 1]
        # He initialization: standard normal scaled by sqrt(2 / fan_in).
        W[i] = np.random.randn(layer_sizes[i], fan_in) * np.sqrt(2 / fan_in)
        b[i] = np.zeros((layer_sizes[i], 1))
    return W, b

In [4]:
def forward_prop(L, W, b, Z, A, D, dropout_prob=1):
    """Run one forward pass through layers ``1..L``.

    Hidden layers use ``relu`` and the last layer uses ``softmax``; both are
    expected to be in scope already (presumably supplied by
    ``activations.ipynb`` - confirm).

    Args:
        L: index of the output layer.
        W, b: per-layer weights and biases, keyed by layer index.
        Z: dict that receives the pre-activations ``W[i] @ A[i - 1] + b[i]``.
        A: dict of activations; ``A[0]`` must already hold the input.
        D: dict that receives the boolean dropout mask of each layer.
        dropout_prob: probability of KEEPING a unit (a mask entry is True
            when a uniform sample is below it). ``1`` keeps every unit.

    Returns:
        ``(Z, A)``, the same dicts that were passed in. ``D`` is filled in
        place and is not returned.
    """
    for i in range(1, L + 1):
        Z[i] = np.dot(W[i], A[i - 1]) + b[i]
        if i == L:
            # Output layer: never drop or rescale the predicted distribution,
            # otherwise the loss and the A[L] - Y gradient are computed on
            # corrupted probabilities. An all-True mask keeps D[L] populated.
            A[i] = softmax(Z[i])
            D[i] = np.ones(A[i].shape, dtype=bool)
            continue
        A[i] = relu(Z[i])
        # Inverted dropout on hidden layers: mask, then rescale so the
        # expected activation is unchanged.
        D[i] = np.random.rand(A[i].shape[0], A[i].shape[1]) < dropout_prob
        A[i] = A[i] * D[i]
        if dropout_prob != 0:
            A[i] /= dropout_prob
    return Z, A

In [5]:
def back_prop(Y, L, m, W, Z, A, dW, db, dZ, dA, D, batch, batch_size, dropout_prob=1):
    """Compute weight and bias gradients for layers ``L..1``.

    Args:
        Y: label matrix; columns ``batch:batch + batch_size`` are compared
            against the output activations ``A[L]``.
        L: index of the output layer.
        m: divisor applied to every gradient. NOTE(review): the caller in
            this file passes the full training-set size, not the mini-batch
            size - confirm that scaling is intended.
        W, Z, A: weights, pre-activations and activations from the forward
            pass, keyed by layer index.
        dW, db, dZ, dA: dicts that receive the gradients, keyed by layer.
        D: dropout masks recorded during the forward pass.
        batch: column offset of the current mini-batch inside ``Y``.
        batch_size: number of columns in the mini-batch.
        dropout_prob: keep probability used in the forward pass.

    Returns:
        ``(dW, db)``, the same dicts that were passed in. ``db[i]`` is a
        column with one entry per row of ``dZ[i]``.
    """
    for i in range(L, 0, -1):
        if i == L:
            # Output-layer error: activations minus labels for this batch.
            dZ[i] = A[i] - Y[:, batch:batch + batch_size]
        else:
            dA[i] = np.dot(W[i + 1].T, dZ[i + 1])
            dA[i] *= D[i]  # route gradient only through units kept by dropout
            if dropout_prob != 0:
                dA[i] /= dropout_prob  # same rescale as the forward pass
            dZ[i] = dA[i] * (Z[i] > 0)  # ReLU derivative
        dW[i] = np.dot(dZ[i], A[i - 1].T) / m
        # Sum over examples only (axis 1). Summing every element would give
        # all units of a layer one shared scalar gradient.
        db[i] = np.sum(dZ[i], axis=1, keepdims=True) / m
    return dW, db

In [6]:
def l2_regularization(L, m, W, dW, lambd):
    """Add the L2 penalty term to each layer's weight gradient.

    For every layer ``1..L`` the value ``(lambd / m) * W[layer]`` is added to
    ``dW[layer]`` using ``+=``.

    Args:
        L: index of the last layer.
        m: divisor applied to ``lambd``.
        W: per-layer weights, keyed by layer index.
        dW: per-layer weight gradients, updated by this call.
        lambd: regularization strength; ``0`` adds a zero term.

    Returns:
        The ``dW`` dict that was passed in.
    """
    for layer in range(1, L + 1):
        grad = dW[layer]
        grad += (lambd / m) * W[layer]
        dW[layer] = grad
    return dW

In [7]:
def neural_network(X, Y, L, n, epochs, batch_size, optimizer="adam", alpha=0.1, beta1=0.9, beta2=0.999, lambd=0, dropout_prob=1):
    """Train the network with mini-batch updates and return its parameters.

    Args:
        X: training inputs; ``X.shape[1]`` is taken as the example count and
            mini-batches are column slices of ``X``.
        Y: training labels, sliced by column inside ``back_prop`` and
            multiplied elementwise with ``log(A[L])`` for the loss.
        L: index of the output layer.
        n: layer sizes. NOTE(review): not used in this body - see the note
            at the ``weight_initialization`` call.
        epochs: number of passes over the training set.
        batch_size: number of columns per mini-batch.
        optimizer: ``"gradient_descent"`` selects ``gradient_descent``; any
            other value selects ``adam``.
        alpha, beta1, beta2: values handed to the optimizer call.
        lambd: L2 strength passed to ``l2_regularization``.
        dropout_prob: value passed to ``forward_prop`` and ``back_prop`` for
            the mini-batch passes. The per-epoch loss pass uses
            ``forward_prop``'s default instead.

    Returns:
        ``(W, b, losses)`` where ``losses`` has one entry per epoch, computed
        on all of ``X`` before that epoch's updates.
    """
    m = X.shape[1]  # number of training examples
    W, dW, b, db, Z, dZ, A, dA, D, vdW, vdb, sdW, sdb = variable_initialization(L)
    # NOTE(review): weight_initialization(L, W, b) as called here reads the
    # module-level `n`; this function's `n` argument is not forwarded.
    W, b = weight_initialization(L, W, b)
    losses = []
    # Report about every 10% of training. The max() guard avoids a modulo by
    # zero when epochs < 10.
    report_every = max(1, epochs // 10)
    for epoch in range(epochs):
        # Loss pass over the data passed in as X (not a notebook global).
        A[0] = X
        Z, A = forward_prop(L, W, b, Z, A, D)
        loss = -np.sum(np.log(A[L] + 1e-8) * Y) / m  # 1e-8 guards log(0)
        losses.append(loss)
        if epoch % report_every == 0:
            print(f"Epoch {epoch}: {loss}")
        for batch in range(0, m, batch_size):
            A[0] = X[:, batch:batch + batch_size]  # current mini-batch
            Z, A = forward_prop(L, W, b, Z, A, D, dropout_prob)
            dW, db = back_prop(Y, L, m, W, Z, A, dW, db, dZ, dA, D, batch, batch_size, dropout_prob)
            dW = l2_regularization(L, m, W, dW, lambd)
            # gradient_descent and adam are not defined in this section;
            # presumably they come from optimizers.ipynb - confirm.
            if optimizer == "gradient_descent":
                W, b = gradient_descent(W, b, dW, db, alpha)
            else:
                W, b = adam(W, b, dW, db, vdW, vdb, sdW, sdb, alpha, beta1, beta2)
    return W, b, losses