In [116]:
import idx2numpy
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

x_train = idx2numpy.convert_from_file('data/train-images-idx3-ubyte').T.reshape(-1,60000)
y_train = idx2numpy.convert_from_file('data/train-labels-idx1-ubyte').reshape(-1,1).T

x_test = idx2numpy.convert_from_file('data/t10k-images-idx3-ubyte').T.reshape(-1,10000)
y_test = idx2numpy.convert_from_file('data/t10k-labels-idx1-ubyte').reshape(-1,1).T

y_train = tf.one_hot(y_train, 10, axis=1).numpy().reshape(10,-1)
y_test = tf.one_hot(y_test, 10, axis=1).numpy().reshape(10,-1)

x_train, x_test = x_train / 255.0, x_test/255.0

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(784, 60000) (784, 10000) (10, 60000) (10, 10000)


In [189]:
layer_dims = [x_train.shape[0], 512, 10]

def init_params():
    weights = {}
    biases = {}
    for l in range(1, len(layer_dims)):
        weights[l] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 0.01;
        biases[l] = np.zeros((layer_dims[l],1))
    return weights,biases

In [190]:
def relu(z):
    return np.maximum(0,z)

# 10 x m input
def softmax(z):
    exp = np.exp(z)
    return exp/np.sum(exp, axis=0)

# 10 x m vs 10 x m
def compute_cost(Y_expected, Y_actual):
    m = Y_expected.shape[1]
    return -1/m * np.sum(np.multiply(Y_expected, np.log(Y_actual)))

def forward_prop(X, W, B):
    Z = {}
    A = {0: X}
    cost = 0
    for l in range (1, len(layer_dims)):
        Z[l] = np.dot(W[l] , A[l-1]) + B[l]
        if l == len(layer_dims) -1:
            A[l] = softmax(Z[l])
        else:
            A[l] = relu(Z[l])
    return Z, A

def calc_accuracy(Y_expected, Y_actual):
    #print(Y_actual.shape)
    #print(Y_actual[:,:30])
    Y_actual = Y_actual.astype(int)
    Y_expected = Y_expected.astype(int)
    return 1.0 * np.sum(np.bitwise_and(Y_expected, Y_actual))/Y_expected.shape[1]

def backward_prop_update_weights(Z, A, Y, W, B, learning_rate):
    m = Y.shape[1]
    #derivation flowing into the activation function (starts with 1 because dL/dL = 1)
    propogated_derv = 1
    activation_derv = 0
    
    for l in range(len(layer_dims) -1, 0 , -1):
        if l == len(layer_dims)-1:
            ##softmax derivative (Y_actual - Y_expected)
            activation_derv = A[l] - Y
        else:
            ##relu derivative
            activation_derv = (Z[l] > 0) * 1
            
        dZ = np.multiply(propogated_derv, activation_derv)
        dW = 1/m * np.dot(dZ, A[l-1].T)
        dB = 1/m * (np.sum(dZ,axis=1, keepdims=True))
        W[l] = W[l] - learning_rate * dW
        B[l] = B[l] - learning_rate * dB
        propogated_derv = np.dot(W[l].T, dZ)
    return W, B

def train(X, Y, numitrs, learning_rate):
    W, B = init_params()
    for itrs in range(numitrs):
        Z, A = forward_prop(X, W, B)
        W, B = backward_prop_update_weights(Z , A, Y, W, B, learning_rate)
        #print(itrs)
        if itrs % 50 == 0:
            print(itrs)
            l = len(layer_dims) -1
            cost = compute_cost(Y, A[l])
            #print("output : ", A[l][:,:10])
            accuracy = calc_accuracy(Y, (A[l] > 0.5) * 1)
            print("Cost :" + str(cost) + " Accuracy: " + str(accuracy))
    return W, B

def predict(X, W, B):
    Z, A = forward_prop(X, W, B)
    return A[len(layer_dims)-1]

In [198]:
W , B = train(x_train, y_train, 3000, 0.08)

0
Cost :2.302595236436559 Accuracy: 0.0
50
Cost :2.3016482461140035 Accuracy: 0.0
100
Cost :2.3010941713232587 Accuracy: 0.0
150
Cost :2.3006226441237008 Accuracy: 0.0
200
Cost :2.2999716138544333 Accuracy: 0.0
250
Cost :2.298621231221195 Accuracy: 0.0
300
Cost :2.2946044235920398 Accuracy: 0.0
350
Cost :2.27361432303316 Accuracy: 0.0
400
Cost :2.1355566632863883 Accuracy: 0.009466666666666667
450
Cost :1.9316166446652532 Accuracy: 0.047
500
Cost :1.617798209618591 Accuracy: 0.055483333333333336
550
Cost :1.2861195417544513 Accuracy: 0.18006666666666668
600
Cost :1.0428794681843452 Accuracy: 0.3081333333333333
650
Cost :0.7839691525063048 Accuracy: 0.5739666666666666
700
Cost :0.6814834049717768 Accuracy: 0.687
750
Cost :0.6339939534324646 Accuracy: 0.7329833333333333
800
Cost :0.6033976191098546 Accuracy: 0.7576666666666667
850
Cost :0.5799389874438813 Accuracy: 0.7743
900
Cost :0.5600572789418025 Accuracy: 0.7869833333333334
950
Cost :0.5418749728810147 Accuracy: 0.7967833333333333
1

In [199]:
output = predict(x_test, W, B)
print("Accuracy on test set is " + str(calc_accuracy(y_test, (output > 0.5) * 1.0)))

Accuracy on test set is 0.9422


In [176]:
print(output.shape)
a = (output[:,:5] > 0.5) * 1
print(a)

(10, 10000)
[[0 0 0 1 0]
 [0 0 1 0 0]
 [0 1 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 1]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [1 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]


In [177]:
y_op = idx2numpy.convert_from_file('data/t10k-labels-idx1-ubyte').reshape(-1,1).T

y_op[:,:5]

array([[7, 2, 1, 0, 4]], dtype=uint8)