In [157]:
import idx2numpy
import numpy as np

""" LOADING DATASET """
train_images_file = 'assets/MNIST/train-images.idx3-ubyte'
train_images_array = idx2numpy.convert_from_file(train_images_file).T

train_labels_file = 'assets/MNIST/train-labels.idx1-ubyte'
train_labels_array = idx2numpy.convert_from_file(train_labels_file).T

test_images_file = 'assets/MNIST/test-images.idx3-ubyte'
test_images_array = idx2numpy.convert_from_file(test_images_file).T

test_labels_file = 'assets/MNIST/test-labels.idx1-ubyte'
test_labels_array = idx2numpy.convert_from_file(test_labels_file).T

print(f"Size of an image in each row: {train_images_array.shape[:2]}")
print(f"Number of training examples: {train_images_array.shape[-1]}")

Size of an image in each row: (28, 28)
Number of training examples: 60000


In [158]:
X = train_images_array.reshape(
    (-1, train_images_array.shape[-1])
)

Y = train_labels_array.reshape(1, -1)

print(f"Training array: {X.shape} \nTraining labels: {Y.shape}")

Training array: (784, 60000) 
Training labels: (1, 60000)


In [159]:
""" UTILS """
def g_h(z):  # for hidden layers
    """
    Try:
    1. sigmoid
    2. tanh
    3. ReLU (using now)
    4. Leaky ReLU
    """
    return np.maximum(0, z)

def g_y(z):
    return 1 / (1 + np.exp(-z))

In [160]:
def initialise_parameters(n_x, n_h1, n_h2, n_y):
    np.random.seed(69)

    W1 = np.random.uniform(-0.01, 0.01, (n_h1, n_x))
    b1 = np.zeros((n_h1, 1))

    W2 = np.random.uniform(-0.01, 0.01, (n_h2, n_h1))
    b2 = np.zeros((n_h2, 1))

    W3 = np.random.uniform(-0.01, 0.01, (n_y, n_h2))
    b3 = np.zeros((n_y, 1))

    parameters = {
        "W1": W1,
        "b1": b1,

        "W2": W2,
        "b2": b2,

        "W3": W3,
        "b3": b3
    }

    return parameters

In [161]:
def layer_sizes(X, Y):
    n_x = X.shape[0]
    n_h1 = 16
    n_h2 = 16
    n_y  = 10
    
    return (n_x, n_h1, n_h2, n_y)

In [162]:
def forward_propagation(X, parameters):
    W1 = parameters['W1']
    b1 = parameters['b1']

    W2 = parameters['W2']
    b2 = parameters['b2']
    
    W3 = parameters['W3']
    b3 = parameters['b3']

    Z1 = np.dot(W1, X) + b1
    A1 = g_h(Z1)
    
    Z2 = np.dot(W2, A1) + b2
    A2 = g_h(Z2)
    
    Z3 = np.dot(W3, A2) + b3
    A3 = g_y(Z3)

    # Caching for backpragation
    cache = {
        "Z1": Z1,
        "A1": A1,
        
        "Z2": Z2,
        "A2": A2,
        
        "Z3": Z3,
        "A3": A3
    }

    return A3, cache

In [163]:
def compute_cost(A3, Y, parameters):
    m = Y.shape[1]
    epsilon = 1e-8  # Small constant to avoid division by zero
    cost = -np.sum(Y * np.log(A3 + epsilon) + (1 - Y) * np.log(1 - A3 + epsilon)) / m

    return cost


In [164]:
def backward_propagation(parameters, cache, X, Y):
    m = X.shape[1]
    
    W1 = parameters["W1"]
    b1 = parameters["b1"]

    W2 = parameters["W2"]
    b2 = parameters["b2"]
    
    W3 = parameters["W3"]
    b3 = parameters["b3"]


    Z1 = cache["Z1"]
    A1 = cache["A1"]

    Z2 = cache["Z2"]
    A2 = cache["A2"]
    
    Z3 = cache["Z3"]
    A3 = cache["A3"]
    

    dZ3 = A3 - Y
    dW3 = (1/m) * np.dot(dZ3, A2.T)
    db3 = (1/m) * np.sum(dZ3, axis=1, keepdims=True)

    dZ2 = np.dot(W3.T, dZ3) * (A2 > 0)
    dW2 = (1/m) * np.dot(dZ2, A1.T)
    db2 = (1/m) * np.sum(dZ2, axis=1, keepdims=True)

    dZ1 = np.dot(W2.T, dZ2) * (A1 > 0)
    dW1 = (1/m) * np.dot(dZ1, X.T)
    db1 = (1/m) * np.sum(dZ1, axis=1, keepdims=True)

    grads = {
        "dW1": dW1,
        "db1": db1,

        "dW2": dW2,
        "db2": db2,
        
        "dW3": dW3,
        "db3": db3
    }
    
    return grads

In [165]:
def update_parameters(parameters, grads, lr):
    W1 = parameters["W1"]
    b1 = parameters["b1"]

    W2 = parameters["W2"]
    b2 = parameters["b2"]
    
    W3 = parameters["W3"]
    b3 = parameters["b3"]
    

    dW1 = grads["dW1"]
    db1 = grads["db1"]

    dW2 = grads["dW2"]
    db2 = grads["db2"]
    
    dW3 = grads["dW3"]
    db3 = grads["db3"]
    
    W1 = W1 - lr * dW1
    b1 = b1 - lr * db1

    W2 = W2 - lr * dW2
    b2 = b2 - lr * db2
    
    W3 = W3 - lr * dW3
    b3 = b3 - lr * db3

    parameters = {
        "W1": W1,
        "b1": b1,

        "W2": W2,
        "b2": b2,

        "W3": W3,
        "b3": b3
    }
    
    return parameters

## Integrating into a `nn_model`

In [166]:
def nn_model(X, Y, n_h1, n_h2, lr, max_iterations=10000, print_cost=False):
    n_x = layer_sizes(X, Y)[0]
    n_y = layer_sizes(X, Y)[-1]
    
    parameters = initialise_parameters(n_x, n_h1, n_h2, n_y)
    
    W1 = parameters['W1']
    b1 = parameters['b1']

    W2 = parameters['W2']
    b2 = parameters['b2']
    
    W3 = parameters['W3']
    b3 = parameters['b3']
    
    # Gradient descent
    for i in range(0, max_iterations):
        # Forward propagation
        A3, cache = forward_propagation(X, parameters)  # cache === {Z1, A1, ...}

        # Cost function
        cost = compute_cost(A3, Y, parameters)

        # Backpropagation
        grads = backward_propagation(parameters, cache, X, Y)

        # Update rule for each parameter
        parameters = update_parameters(parameters, grads, lr)

        if print_cost==True and i % 100 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
            
    return parameters

In [167]:
def predict(parameters, X):
    A3, cache = forward_propagation(X, parameters)
    predictions = A3.copy() #np.where(A3 > 0.5, 1, 0)
    
    return predictions

In [168]:
parameters = nn_model(X, Y, n_h1=16, n_h2=16, lr=0.01, max_iterations=10_000, print_cost=True)
predictions = predict(parameters, X)
accuracy = float((np.dot(Y,predictions.T) + np.dot(1-Y,1-predictions.T))/float(Y.size)*100)
print (f"Accuracy = {accuracy}")

Cost after iteration 0: 1406.201198
