In [1]:
from mnist import MNIST
import numpy as np
import cupy as cp
import random
from PIL import Image

In [2]:
mndata = MNIST('samples')
images, labels = mndata.load_training()

In [4]:
images = cp.asarray(images).reshape(60000, 784)

In [5]:
images_transpose = images.T

In [6]:
images_transpose.shape

(784, 60000)

In [7]:
labels = cp.asarray(labels).reshape(60000, 1)

In [8]:
#making labels from 60000x1 to 60000x9
labels_transpose = labels.T
labels_onehot = cp.zeros((labels_transpose.size, 10))
labels_onehot[cp.arange(labels_transpose.size),labels_transpose] = 1

In [9]:
labels_onehot_transpose = labels_onehot.T

In [10]:
labels_onehot_transpose.shape

(10, 60000)

In [11]:
#initializing weights 
def initialize_weights():
    w1 = cp.random.rand(784, 128)
    b1 = cp.random.rand(128, 1)
    w2 = cp.random.rand(128, 64)
    b2 = cp.random.rand(64, 1)
    w3 = cp.random.rand(64, 10)
    b3 = cp.random.rand(10, 1)
    return w1, w2, w3, b1, b2, b3

In [12]:
def sigmoid(x):
    return 1/(1 + cp.exp(-1*x))

In [13]:
def relu(x):
    x[x < 0] = 0
    return x

In [14]:
def softmax(x): 
    ans = cp.exp(x-cp.max(x, axis=0, keepdims=True))
    d = cp.sum(ans, axis = 0)
    return ans/d

In [15]:
def cost(Y, outputs, m):
    op = Y*cp.log(outputs)
    loss = -cp.sum(op,axis = 0, keepdims = True)
    cost = (1/m)*cp.sum(loss, axis = 1) 
    return cost

In [16]:
def forward_and_back_prop(inputs, labels, w1, w2, w3, b1, b2, b3, m):
    z1 = cp.dot(w1.T, inputs) + b1
    z1 = normalize_z(z1)
    a1 = relu(z1)
    z2 = cp.dot(w2.T, a1) + b2
    a2 = relu(z2)
    z3 = cp.dot(w3.T, a2) + b3
    a3 = softmax(z3)
    
    dz3 = a3 - labels#dz3 = dL/dz3
    dw3 = (1/64)*cp.dot(a2, dz3.T)
    db3 = (1/m)*(cp.sum(dz3, axis = 1, keepdims = True))
    
    da2 = cp.dot(w3, dz3)
    dz2 = da2
    dz2[dz2 < 0] = 0#to account for da2/dz2 in (da2/dz2)*(dL/da2)
    #dz2[dz2 >=0] = 1
    dw2 = (1/128)*cp.dot(a1, dz2.T)
    db2 = (1/m)*(cp.sum(dz2, axis = 1, keepdims = True))
    da1 = cp.dot(w2, dz2)
    dz1 = da1
    dz1[dz1 < 0] = 0
    #dz1[dz1 >=0] = 1
    dw1 = (1/784)*cp.dot(inputs, dz1.T)
    db1 = (1/m)*(cp.sum(dz1, axis = 1, keepdims = True))
    
    return dw1, dw2, dw3, db1, db2, db3, a3

In [17]:
labels_batches = cp.asarray(cp.split(labels_onehot_transpose, 1000, axis = 1))
labels_batches.shape

(1000, 10, 60)

In [18]:
images_batches = cp.asarray(cp.split(images_transpose, 1000, axis = 1))
images_batches.shape

(1000, 784, 60)

SyntaxError: invalid syntax (<ipython-input-19-02a5b0ae2f15>, line 1)

In [20]:
def normalize(x):
    epsilon = 0.001
    u = cp.sum(x, axis = 1, keepdims = True)
    variance = cp.sum(cp.square(x-u), axis = 1, keepdims = True)
    return (x-u)/(cp.sqrt(variance + epsilon))

In [21]:
def normalize_z(x):
    epsilon = 0.001
    u = cp.sum(x, axis = 0, keepdims = True)
    variance = cp.sum(cp.square(x-u), axis = 0, keepdims = True)
    return (x-u)/(cp.sqrt(variance + epsilon))

In [22]:
images_batches_normalized = normalize(images_batches)
images_batches_normalized.shape

(1000, 784, 60)

In [23]:
#gradient descent
random.seed(10)
w1, w2, w3, b1, b2, b3 = initialize_weights()
X = images_batches_normalized[:,:,0].T
Y = labels_batches[:,:,0].T
m = 1000
alpha = 0.001
iterations = 5000
for i in range(iterations):
    dw1, dw2, dw3, db1, db2, db3, a3 = forward_and_back_prop(X, Y, w1, w2, w3, b1, b2, b3, m)
    w1 = w1 - alpha*dw1
    w2 = w2 - alpha*dw2
    w3 = w3 - alpha*dw3
    b1 = b1 - db1
    b2 = b2 - db2
    b3 = b3 - db3
    if i%500 == 0:
        print("current progress: ", i, "cost: ", cost(Y, a3, m))

current progress:  0 cost:  [17.33933611]
current progress:  500 cost:  [2.29555517]
current progress:  1000 cost:  [2.29555517]
current progress:  1500 cost:  [2.29555517]
current progress:  2000 cost:  [2.29555517]
current progress:  2500 cost:  [2.29555517]
current progress:  3000 cost:  [2.29555517]
current progress:  3500 cost:  [2.29555517]
current progress:  4000 cost:  [2.29555517]
current progress:  4500 cost:  [2.29555517]


In [33]:
def accuracy(inputs, Y, w1, w2, w3, b1, b2, b3, m):
    
    z1 = cp.dot(w1.T, inputs) + b1
    z1 = normalize_z(z1)
    a1 = relu(z1)
    z2 = cp.dot(w2.T, a1) + b2
    a2 = relu(z2)
    z3 = cp.dot(w3.T, a2) + b3
    a3 = softmax(z3)
    
    res = np.zeros_like(a3)
    res[a3.argmax(0), np.arange(len(a3[0]))] = 1
    matched = cp.count_nonzero(cp.sum(abs(Y-res), axis=0))
    return ((matched/m))*100

In [25]:
#w1, w2, w3, b1, b2, b3 = initialize_weights()
m = 1000
alpha = 0.00001
iterations = 70
for i in range(iterations):
    for b in range(12):
        X = images_batches[:,:,b].T
        Y = labels_batches[:,:,b].T
        dw1, dw2, dw3, db1, db2, db3, a3 = forward_and_back_prop(X, Y, w1, w2, w3, b1, b2, b3, 5000)
        w1 = w1 - alpha*dw1
        w2 = w2 - alpha*dw2
        w3 = w3 - alpha*dw3
        b1 = b1 - db1
        b2 = b2 - db2
        b3 = b3 - db3
    print("epoch: ", i, "cost: ", cost(Y, a3, m))

epoch:  0 cost:  [2.3070391]
epoch:  1 cost:  [2.30546889]
epoch:  2 cost:  [2.30445324]
epoch:  3 cost:  [2.30378274]
epoch:  4 cost:  [2.30333112]
epoch:  5 cost:  [2.30302093]
epoch:  6 cost:  [2.30280387]
epoch:  7 cost:  [2.3026493]
epoch:  8 cost:  [2.30253744]
epoch:  9 cost:  [2.30245532]
epoch:  10 cost:  [2.30239426]
epoch:  11 cost:  [2.30234837]
epoch:  12 cost:  [2.30231355]
epoch:  13 cost:  [2.30228694]
epoch:  14 cost:  [2.30226647]
epoch:  15 cost:  [2.30225064]
epoch:  16 cost:  [2.30223835]
epoch:  17 cost:  [2.30222877]
epoch:  18 cost:  [2.30222129]
epoch:  19 cost:  [2.30221543]
epoch:  20 cost:  [2.30221083]
epoch:  21 cost:  [2.30220722]
epoch:  22 cost:  [2.30220438]
epoch:  23 cost:  [2.30220215]
epoch:  24 cost:  [2.30220039]
epoch:  25 cost:  [2.302199]
epoch:  26 cost:  [2.30219791]
epoch:  27 cost:  [2.30219705]
epoch:  28 cost:  [2.30219637]
epoch:  29 cost:  [2.30219583]
epoch:  30 cost:  [2.3021954]
epoch:  31 cost:  [2.30219507]
epoch:  32 cost:  [2.30

In [34]:
X = images_batches_normalized[:,:,3].T
Y = labels_batches[:,:,3].T
print(accuracy(X, Y, w1, w2, w3, b1, b2, b3, m))

89.0
