In [3]:
import idx2numpy
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

# Images: after the transpose + reshape every column is one flattened image
# (see the printed shapes below). Dividing by 255.0 converts the raw values to
# floats -- presumably bytes in 0..255, so the result lies in [0, 1].
x_train = idx2numpy.convert_from_file('data/train-images-idx3-ubyte').T.reshape(-1, 60000) / 255.0
x_test = idx2numpy.convert_from_file('data/t10k-images-idx3-ubyte').T.reshape(-1, 10000) / 255.0

# Labels: read as a single 1 x m row, then one-hot encoded over 10 classes and
# reshaped so that each column is the one-hot vector of one example.
y_train = tf.one_hot(
    idx2numpy.convert_from_file('data/train-labels-idx1-ubyte').reshape(-1, 1).T,
    10,
    axis=1,
).numpy().reshape(10, -1)
y_test = tf.one_hot(
    idx2numpy.convert_from_file('data/t10k-labels-idx1-ubyte').reshape(-1, 1).T,
    10,
    axis=1,
).numpy().reshape(10, -1)

# Sanity check: features x examples for the images, classes x examples for the labels.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(784, 60000) (784, 10000) (10, 60000) (10, 10000)


In [4]:
# Width of every layer, listed from the input layer to the output layer.
layer_dims = [
    x_train.shape[0],  # input layer: one unit per row (feature) of x_train
    512,               # hidden layer (forward_prop applies ReLU here)
    10,                # output layer (forward_prop applies softmax here)
]

def init_params(dims=None, seed=None):
    """Create the initial weights and biases of a fully connected network.

    Args:
        dims: Sequence of layer widths, input layer first. Defaults to the
            notebook-level ``layer_dims`` so existing ``init_params()`` calls
            behave as before.
        seed: Optional integer seed. When given, the weights are drawn from a
            private ``np.random.RandomState(seed)`` so the initialisation is
            repeatable; when ``None`` the global NumPy random state is used,
            exactly as before.

    Returns:
        ``(weights, biases)``: two dicts keyed by layer index
        ``1 .. len(dims) - 1``. ``weights[l]`` has shape
        ``(dims[l], dims[l-1])`` and holds standard-normal draws scaled by
        0.01; ``biases[l]`` is a zero column vector of shape ``(dims[l], 1)``.
    """
    if dims is None:
        dims = layer_dims
    # Both the np.random module and a RandomState instance expose randn().
    rng = np.random if seed is None else np.random.RandomState(seed)

    weights = {}
    biases = {}
    for l in range(1, len(dims)):
        # Small random weights break the symmetry between units of a layer.
        weights[l] = rng.randn(dims[l], dims[l - 1]) * 0.01
        biases[l] = np.zeros((dims[l], 1))
    return weights, biases

In [37]:
def relu(z):
    """Rectified linear unit: element-wise maximum of 0 and ``z``."""
    rectified = np.maximum(0, z)
    return rectified

# 10 x m input
def softmax(z):
    """Softmax along axis 0, so every column of the result sums to 1.

    The largest entry along axis 0 is subtracted before exponentiating.
    This leaves the result mathematically unchanged but keeps ``np.exp``
    from overflowing to ``inf`` (which would produce NaN) for large logits,
    and from underflowing to an all-zero column for very negative ones.

    Args:
        z: Array of logits; the callers in this notebook pass a
            classes x examples matrix.

    Returns:
        Array of the same shape as ``z`` holding the normalised exponentials.
    """
    shifted = z - np.max(z, axis=0, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=0, keepdims=True)

# 10 x m vs 10 x m
def compute_cost(Y_expected, Y_actual):
    """Average cross-entropy between target and predicted distributions.

    Args:
        Y_expected: Target matrix with one column per example (one-hot in
            this notebook).
        Y_actual: Predicted probabilities, same shape as ``Y_expected``.

    Returns:
        ``-1/m * sum(Y_expected * log(Y_actual))`` where ``m`` is the number
        of columns.

    Predicted probabilities are floored at a tiny epsilon before taking the
    log. Without this, a probability that underflowed to exactly 0 gives
    ``log(0) = -inf``, and ``0 * -inf`` is NaN, so a single such entry would
    turn the whole cost into NaN (or inf when the target there is 1).
    """
    m = Y_expected.shape[1]
    eps = 1e-12
    safe_probs = np.maximum(Y_actual, eps)
    return -1/m * np.sum(np.multiply(Y_expected, np.log(safe_probs)))

def forward_prop(X, W, B):
    """Run a forward pass through the network.

    Hidden layers use ReLU and the last layer uses softmax. The number of
    layers is taken from ``W`` itself (one weight matrix per layer, keyed
    ``1 .. len(W)``, as produced by ``init_params``) instead of the
    notebook-level ``layer_dims``, so the pass always matches the parameters
    it is given even if the architecture cell was re-run in between.

    Args:
        X: Input matrix with one example per column.
        W: Dict of weight matrices keyed by layer index starting at 1.
        B: Dict of bias column vectors with the same keys as ``W``.

    Returns:
        ``(Z, A)``: ``Z[l]`` is the pre-activation of layer ``l``; ``A[l]`` is
        its activation, with ``A[0]`` being ``X`` and ``A[len(W)]`` the
        softmax output.
    """
    Z = {}
    A = {0: X}
    output_layer = len(W)
    for l in range(1, output_layer + 1):
        Z[l] = np.dot(W[l], A[l - 1]) + B[l]
        if l == output_layer:
            A[l] = softmax(Z[l])
        else:
            A[l] = relu(Z[l])
    return Z, A

def calc_accuracy(Y_expected, Y_actual):
    """Count matching 1-entries and divide by the number of columns.

    Both inputs are truncated to integers, combined with a bitwise AND and
    summed; the total is divided by ``Y_expected.shape[1]``. With one-hot
    targets and 0/1 predictions this is the fraction of columns whose
    predicted 1 sits in the same row as the target's 1.
    """
    truth = Y_expected.astype(int)
    guess = Y_actual.astype(int)
    n_examples = truth.shape[1]
    hits = np.sum(truth & guess)
    return 1.0 * hits / n_examples

def backward_prop_update_weights(Z, A, Y, W, B, learning_rate):
    """Back-propagate the softmax cross-entropy loss and take one gradient step.

    Args:
        Z: Dict of pre-activations per layer, as returned by ``forward_prop``.
        A: Dict of activations per layer (``A[0]`` is the input batch).
        Y: Target matrix with one column per example.
        W: Dict of weight matrices keyed ``1 .. len(W)``. Updated in place.
        B: Dict of bias column vectors, same keys as ``W``. Updated in place.
        learning_rate: Step size of the gradient-descent update.

    Returns:
        The same ``(W, B)`` dicts after the update.

    The gradient handed to the layer below is computed from the weights that
    were used in the forward pass, i.e. before this layer's update is
    applied. Updating ``W[l]`` first would back-propagate through weights
    that never produced ``Z``/``A`` and give the lower layers a wrong
    gradient.
    """
    m = Y.shape[1]
    output_layer = len(W)
    # Gradient of the loss w.r.t. the activations of the layer being
    # processed; set by the layer above, so unused for the output layer.
    propagated_derv = None

    for l in range(output_layer, 0, -1):
        if l == output_layer:
            # Softmax combined with cross-entropy: dL/dZ = Y_actual - Y_expected.
            dZ = A[l] - Y
        else:
            # ReLU passes the gradient through only where its input was positive.
            dZ = np.multiply(propagated_derv, (Z[l] > 0) * 1)

        dW = 1/m * np.dot(dZ, A[l-1].T)
        dB = 1/m * (np.sum(dZ, axis=1, keepdims=True))

        if l > 1:
            # Must happen before W[l] is modified below (see docstring).
            # Nothing consumes the gradient below layer 1, so it is skipped there.
            propagated_derv = np.dot(W[l].T, dZ)

        W[l] = W[l] - learning_rate * dW
        B[l] = B[l] - learning_rate * dB
    return W, B

def train(X, Y, numitrs, learning_rate, batch_size):
    """Train the network with mini-batch gradient descent.

    Args:
        X: Input matrix with one example per column.
        Y: Target matrix with one column per example, aligned with ``X``.
        numitrs: Number of passes over the data.
        learning_rate: Step size handed to ``backward_prop_update_weights``.
        batch_size: Number of columns per mini-batch; must be at least 1.

    Returns:
        ``(W, B)``: the trained weight and bias dicts.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Batches are taken in order, without shuffling. A trailing batch smaller
    than ``batch_size`` is still trained on rather than silently dropped, so
    a ``batch_size`` larger than the data set yields one batch holding
    everything.

    After every pass the pass index is printed, followed by cost and
    accuracy. Both are measured on the last mini-batch only, using the
    forward pass made just before that batch's update. A sample counts as
    correct only when the probability of its true class exceeds 0.5.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    W, B = init_params()
    output_layer = len(W)
    num_samples = X.shape[1]
    for itrs in range(numitrs):
        print(itrs)
        by = probs = None
        for start in range(0, num_samples, batch_size):
            bx = X[:, start:start + batch_size]
            by = Y[:, start:start + batch_size]
            Z, A = forward_prop(bx, W, B)
            probs = A[output_layer]
            W, B = backward_prop_update_weights(Z, A, by, W, B, learning_rate)
        if probs is None:
            # No data at all: nothing was trained on, so nothing to report.
            continue
        cost = compute_cost(by, probs)
        accuracy = calc_accuracy(by, (probs > 0.5) * 1)
        print("Cost :" + str(cost) + " Accuracy: " + str(accuracy))
    return W, B

def predict(X, W, B):
    """Return the output-layer activations of a forward pass over ``X``."""
    _, activations = forward_prop(X, W, B)
    output_layer = len(layer_dims) - 1
    return activations[output_layer]

In [39]:
# Fit the network on the full training split; hyper-parameters are spelled out
# by name so each one can be tuned without counting positional arguments.
W, B = train(
    X=x_train,
    Y=y_train,
    numitrs=100,
    learning_rate=0.08,
    batch_size=100,
)

0
0
Cost :0.3836247840975446 Accuracy: 0.9
1
1
Cost :0.33441928975969987 Accuracy: 0.92
2
2
Cost :0.30041717680611063 Accuracy: 0.93
3
3
Cost :0.27547542077399684 Accuracy: 0.94
4
4
Cost :0.25878449484155414 Accuracy: 0.95
5
5
Cost :0.2478302805992173 Accuracy: 0.95
6
6
Cost :0.24072400845588893 Accuracy: 0.96
7
7
Cost :0.23639207379240418 Accuracy: 0.96
8
8
Cost :0.23201701299436273 Accuracy: 0.95
9
9
Cost :0.22984127531076864 Accuracy: 0.95
10
10
Cost :0.2286130292087084 Accuracy: 0.95
11
11
Cost :0.22789958329537974 Accuracy: 0.95
12
12
Cost :0.2268211767864858 Accuracy: 0.95
13
13
Cost :0.22578260282167675 Accuracy: 0.95
14
14
Cost :0.2244612042517427 Accuracy: 0.95
15
15
Cost :0.2228200367531245 Accuracy: 0.95
16
16
Cost :0.2210046265052963 Accuracy: 0.95
17
17
Cost :0.2194166512206504 Accuracy: 0.96
18
18
Cost :0.21716223721655886 Accuracy: 0.97
19
19
Cost :0.21524361819021542 Accuracy: 0.97
20
20
Cost :0.21250224543993718 Accuracy: 0.97
21
21
Cost :0.2100880598503882 Accuracy: 0

In [41]:
# Score the held-out split. A prediction is counted only where the class
# probability exceeds 0.5, which is stricter than picking the arg-max class.
output = predict(x_test, W, B)
test_predictions = (output > 0.5) * 1.0
test_accuracy = calc_accuracy(y_test, test_predictions)
print("Accuracy on test set is " + str(test_accuracy))

Accuracy on test set is 0.9799
