In [31]:
# packages
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt

In [32]:
# dataset import from mnist
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [33]:
# flatten the 28x28 matrix into 784 row for each picture, transpose so each pixel represents a row
x_train_flattened = x_train.reshape(-1, x_train.shape[0]) # -1 to unpivot 28x28, x_train.shape[0] = 60,000
x_test_flattened = x_test.reshape(-1, x_test.shape[0])

x_train_flattened = x_train_flattened / 255.
x_test_flattened = x_test_flattened / 255.

In [34]:
x_train_flattened.shape

(784, 60000)

In [39]:
# helper functions
def init_params():
    """
    Initialize parameters W1, B1, W2, B2
    """
    W1 = np.random.rand(10, 784) - 0.5 # subtract 0.5 to get numbers between -0.5 and 0.5
    b1 = np.random.rand(10, 1)
    W2 = np.random.rand(10, 10) - 0.5
    b2 = np.random.rand(10, 1)

    return W1, b1, W2, b2

def ReLU(Z):
    """
    Reactivation Linear Unit
    If X > 0, X
    If X <= 0, 0
    """
    return np.maximum(0, Z)  # compares 0 to Z elementwise, so compares 0 to each element in matrix Z

def softmax(Z):
    """
    Softmax activation function
    e^[M] / sum(e^[M])
        where M is a matrix and this
        operation is performed elementwise
    """
    A = np.exp(Z - np.max(Z)) / np.sum(np.exp(Z - np.max(Z)), axis=0)
    return A

def forward_prop(W1, b1, W2, b2, X):
    
    # first layer
    Z1 = W1.dot(X) + b1  # first layer calculation -> weights dotproduct input X, add bias
    A1 = ReLU(Z1)  # activation function for first layer
    
    # second layer
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)

    return Z1, A1, Z2, A2

def one_hot(Y):
    one_hot_y = np.zeros((Y.size, Y.max() + 1))  # this creates a matrix of size Y rows by the max label + 1, so if 9 -> 10 columns total because there is 10 labels in 0-9
    one_hot_y[np.arange(Y.size), Y] = 1  # creates an M x Features matrix and assigns 1 to the correct label -> for each row, find the original label and assign 1 to that row, column
    one_hot_y = one_hot_y.T  # transpose so each label is a row

    return one_hot_y

def deriv_ReLU(Z):
    return Z > 0  # derivative of 1 = 1, 0 = 0 lol

def back_prop(Z1, A1, Z2, A2, W1, W2, X, Y):
    
    # encode labels
    m = Y.size
    one_hot_Y = one_hot(Y)

    # from output layer to last hidden layer
    dZ2 = 2*(A2 - one_hot_Y)
    dW2 = 1 / m * (dZ2.dot(A1.T))
    db2 = 1 / m * np.sum(dZ2, 1)

    # from last hidden layer to first hidden layer
    dZ1 = W2.T.dot(dZ2) * deriv_ReLU(Z1)
    dW1 = 1 / m * (dZ1.dot(X.T))
    db1 = 1 / m * np.sum(dZ1, 1)

    return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    """
    Take original values, subtract some learning rate * derivatives to arrive at a new value
    """
    W1 -= alpha * dW1
    b1 -= alpha * np.reshape(db1, (10, 1))
    W2 -= alpha * dW2
    b2 -= alpha * np.reshape(db2, (10, 1))
    return W1, b1, W2, b2

def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, iterations, alpha):
    
    # initialize parameters
    W1, b1, W2, b2 = init_params()

    # iterate through each epoch
    for i in range(iterations):
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = back_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        if i % 10 == 0:
            print(f"Iteration: {i}")
            predictions = get_predictions(A2)
            print(get_accuracy(predictions, Y))

    return W1, b1, W2, b2

In [40]:
W1, b1, W2, b2 = gradient_descent(x_train_flattened, y_train, 500, 0.1)

Iteration: 0
0.10051666666666667
Iteration: 10
0.10341666666666667
Iteration: 20
0.10535
Iteration: 30
0.10558333333333333
Iteration: 40
0.10695
Iteration: 50
0.10653333333333333
Iteration: 60
0.10636666666666666
Iteration: 70
0.10716666666666666
Iteration: 80
0.10788333333333333
Iteration: 90
0.10913333333333333
Iteration: 100
0.11


KeyboardInterrupt: 