## Handwritten Digit Recognition ##

In [348]:
import numpy as np
import pandas as pd

### Data ###

In [349]:
df = pd.read_csv('train.csv') # Data is "MNIST Digit Recognizer" by ANIMATRONBOT from Kaggle
df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [350]:
df = np.array(df)
m, n = df.shape
np.random.shuffle(df)

test_cv_split = 0.1
test, cv = (m*test_cv_split, 2*m*test_cv_split)

df_test = df[:int(test)].T
y_test = df_test[0]
X_test = df_test[1:] / 255.

df_cv = df[int(test):int(cv)].T
y_cv = df_cv[0]
X_cv = df_cv[1:] / 255.

df_train = df[int(cv):].T
y_train = df_train[0]
X_train = df_train[1:] / 255.

### Activation Functions ###

In [351]:
def softmax(Z):
    A = np.exp(Z) / sum(np.exp(Z))
    return A

def ReLU(Z):
    return np.maximum(0, Z)

def ReLU_derivative(Z):
    return Z > 0

### Generate Parameters W, b ###

In [352]:
def generate(a, b, threshold=0.5):
    arr = np.random.randn(a, b)
    arr[arr > threshold] = threshold
    arr[arr < -threshold] = -threshold
    return arr

In [353]:
def params(X):
    W1 = generate(25, n-1) # First hidden layer has 25 neurons
    b1 = generate(25, 1)
    W2 = generate(15, 25) # Second hidden layer has 15 neurons
    b2 = generate(15, 1)
    W3 = generate(10, 15) # Output layer
    b3 = generate(10, 1)
    return W1, b1, W2, b2, W3, b3

### One Hot ###

In [354]:
def one_hot(y):
    one_hot = np.zeros((y.size, y.max() + 1))
    one_hot[np.arange(y.size), y] = 1
    return one_hot.T

### Forward Backward Propagation ###

In [355]:
def forward(X, W1, b1, W2, b2, W3, b3, output=False):
    Z1 = np.dot(W1, X) + b1
    A1 = ReLU(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = ReLU(Z2)
    Z3 = np.dot(W3, A2) + b3
    A3 = softmax(Z3)
    if output:
        return A3
    else:
        return Z1, A1, Z2, A2, Z3, A3

In [356]:
def backward(X, y, Z1, A1, Z2, A2, Z3, A3, W1, b1, W2, b2, W3, b3):
    one_hot_y = one_hot(y)
    m = X.shape[1]
    
    dZ3 = A3 - one_hot_y
    dW3 = 1 / m * np.dot(dZ3, A2.T)
    db3 = 1 / m * np.sum(dZ3)
    
    dA2 = np.dot(W3.T, dZ3)
    dZ2 = dA2 * ReLU_derivative(Z2)
    dW2 = 1 / m * np.dot(dZ2, A1.T)
    db2 = 1 / m * np.sum(dZ2)
    
    dA1 = np.dot(W2.T, dZ2)
    dZ1 = dA1 * ReLU_derivative(Z1)
    dW1 = 1 / m * np.dot(dZ1, X.T)
    db1 = 1 / m * np.sum(dZ1)
    
    return dW1, db1, dW2, db2, dW3, db3

### Gradient Descent ###

In [366]:
def gradient_descent(X, y, alpha, iterations):
    W1, b1, W2, b2, W3, b3 = params(X)
    for i in range(iterations):
        Z1, A1, Z2, A2, Z3, A3 = forward(X, W1, b1, W2, b2, W3, b3)
        dW1, db1, dW2, db2, dW3, db3 = backward(X, y, Z1, A1, Z2, A2, Z3, A3, W1, b1, W2, b2, W3, b3)
        W1, W2, W3 = (W1 - alpha * dW1, W2 - alpha * dW2, W3 - alpha * dW3)
        b1, b2, b3 = (b1 - alpha * db1, b2 - alpha * db2, b3 - alpha * db3)
        if i % (iterations/10) == 0:
            predictions = np.argmax(A3, 0)
            accuracy = np.sum(predictions == y) / y.size
            print("Iteration: ", i)
            print(f"Accuracy: {accuracy}")
    print("Iteration: ", iterations)
    print(f"Accuracy: {accuracy}")
    return W1, b1, W2, b2, W3, b3

In [370]:
iterations = 2000
alpha = 0.1

W1, b1, W2, b2, W3, b3 = gradient_descent(X_train, y_train, alpha, iterations)

Iteration:  0
Accuracy: 0.10357142857142858
Iteration:  200
Accuracy: 0.59125
Iteration:  400
Accuracy: 0.7619642857142858
Iteration:  600
Accuracy: 0.8148511904761905
Iteration:  800
Accuracy: 0.8433630952380953
Iteration:  1000
Accuracy: 0.8631547619047619
Iteration:  1200
Accuracy: 0.876547619047619
Iteration:  1400
Accuracy: 0.88625
Iteration:  1600
Accuracy: 0.894077380952381
Iteration:  1800
Accuracy: 0.9000297619047619
Iteration:  2000
Accuracy: 0.9000297619047619


In [371]:
def accuracy(X, y, W1, b1, W2, b2, W3, b3):
    A3 = forward(X, W1, b1, W2, b2, W3, b3, output=True)
    predictions = np.argmax(A3, 0)
    accuracy = np.sum(predictions == y) / y.size
    return accuracy

In [372]:
accuracy(X_test, y_test, W1, b1, W2, b2, W3, b3)

0.8942857142857142

In [373]:
accuracy(X_cv, y_cv, W1, b1, W2, b2, W3, b3)

0.8964285714285715