In [1]:
import numpy as np
import pandas as pd

dataset = pd.read_csv('mnist_train.csv')

In this, I created an additional layer which uses Tanh and I increased the amount of neurons, just to see if the model would be better than the previous one. Things to learn: how to decide how many neurons and how deep to make this neural network? I just chose some random numbers and hoped the accuracy would improve. 

In [10]:
import random 
data = np.array(dataset)
random.shuffle(data)
print(data.shape)

(60000, 785)


In [11]:
test = data[:1000]

test = test.T

y_test = test[0]
x_test = test[1:]

train = data[1000:]
train = train.T
y_train = train[0]
x_train = train[1:]

print(y_train.shape, x_train.shape)


(59000,) (784, 59000)


In [51]:
#Forward prop
def init_params():
    w1 = np.random.rand(20, 784) - 0.5
    b1 = np.random.rand(20, 1) - 0.5
    w2 = np.random.rand(15, 20) - 0.5
    b2 = np.random.rand(15, 1) - 0.5
    w3 = np.random.rand(10, 15) - 0.5
    b3 = np.random.rand(10, 1) - 0.5

    return w1, b1, w2, b2, w3, b3

def ReLu(z):
    return np.maximum(0, z)

def softmax(z):
    val = (np.exp(z)) / (np.sum(np.exp(z), axis = 0))
    return val

def tanh(z):
    return np.tanh(z)

def forward_prop(w1, b1, w2, b2, w3, b3, X):
    z1 = np.dot(w1, X) + b1
    a1 = ReLu(z1)
    z2 = np.dot(w2, a1) + b2
    a2 = tanh(z2)
    z3 = np.dot(w3, a2) + b3
    a3 = softmax(z3)

    return (z1, a1, z2, a2, z3, a3)
def one_hot(Y):
    one_hot_y = np.zeros((10, Y.size))
    one_hot_y[Y, np.arange(Y.size)] = 1
    
    return one_hot_y
def deriv_ReLu(z):
    return z > 0

def deriv_tanh(z):
    return 1 - np.tanh(z) ** 2
def backprop(w1, b1, w2, b2, w3, b3, a1, z1, z2, z3, a2, a3, X, y):
    Y = one_hot(y)
    m = len(y)

    dz3 = a3 - Y
    dw3 = 1 / m * np.dot(dz3, a2.T)
    db3 = 1 / m * np.sum(dz3)
    dz2 = np.dot(w3.T, dz3) * deriv_tanh(z2)
    dw2 = 1 / m * np.dot(dz2, a1.T)
    db2 = 1 / m * np.sum(dz2)
    dz1 = np.dot(w2.T, dz2) * deriv_ReLu(z1)
    dw1 = 1 / m * np.dot(dz1, X.T)
    db1 = 1 / m * np.sum(dz1)

    return dw1, db1, dw2, db2, dw3, db3

def get_predictions(a3):
    return np.argmax(a3, 0)

def get_accuracy(predictions, Y):
    return np.sum(predictions == Y) / Y.size


def update_weights(w1, b1, w2, b2, w3, b3, dw1, db1, dw2, db2, dw3, db3, alpha):
    w1 = w1 - alpha * dw1
    b1 = b1 - alpha * db1
    w2 = w2 - alpha * dw2
    b2 = b2 - alpha * db2
    w3 = w3 - alpha * dw3
    b3 = b3 - alpha * db3

    return w1, b1, w2, b2, w3, b3

def gradient_descent(X, Y, ITER, alpha):
    w1, b1, w2, b2, w3, b3 = init_params()
    for i in range(ITER):
        z1, a1, z2, a2, z3, a3 = forward_prop(w1, b1, w2, b2, w3, b3, X)
        dw1, db1, dw2, db2, dw3, db3 = backprop(w1, b1, w2, b2, w3, b3, a1, z1, z2, z3, a2, a3, X, Y)
        w1, b1, w2, b2, w3, b3 = update_weights(w1, b1, w2, b2, w3, b3, dw1, db1, dw2, db2, dw3, db3, alpha)

        if i % 10 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(a3)
            print(get_accuracy(predictions, Y))
    return w1, b1, w2, b2, w3, b3


In [55]:
w1, b1, w2, b2, w3, b3 = gradient_descent(x_train, y_train, 500, 0.3)

print("yay")



Iteration:  0
0.07659322033898305
Iteration:  10
0.24413559322033898
Iteration:  20
0.4308135593220339
Iteration:  30
0.5178644067796611
Iteration:  40
0.5803728813559322
Iteration:  50
0.6269152542372881
Iteration:  60
0.6607966101694915
Iteration:  70
0.6485762711864407
Iteration:  80
0.7105254237288136
Iteration:  90
0.7097796610169491
Iteration:  100
0.7485084745762712
Iteration:  110
0.7499830508474576
Iteration:  120
0.7612881355932204
Iteration:  130
0.7674237288135594
Iteration:  140
0.7833050847457627
Iteration:  150
0.785593220338983
Iteration:  160
0.7813728813559322
Iteration:  170
0.8035932203389831
Iteration:  180
0.807864406779661
Iteration:  190
0.8184237288135593
Iteration:  200
0.8129152542372882
Iteration:  210
0.8277627118644068
Iteration:  220
0.8319491525423729
Iteration:  230
0.8313050847457627
Iteration:  240
0.8334406779661017
Iteration:  250
0.8419830508474576
Iteration:  260
0.8412881355932204
Iteration:  270
0.8506779661016949
Iteration:  280
0.8342203389830

In [56]:
_, _, _, _, _, a3 = forward_prop(w1, b1, w2, b2, w3, b3, x_test)
pred = get_predictions(a3)
print(get_accuracy(pred, y_test))



0.89
