In [9]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

#reading training data into panda dataframe
data = pd.read_csv('/kaggle/input/mnist-in-csv/mnist_train.csv')

Our NN will have a simple two-layer architecture. Input layer $a^{[0]}$ will have 784 units corresponding to the 784 pixels in each 28x28 input image. A hidden layer $a^{[1]}$ will have 10 units with ReLU activation, and finally our output layer $a^{[2]}$ will have 10 units corresponding to the ten digit classes with softmax activation.

**Forward propagation**

$$Z^{[1]} = W^{[1]} X + b^{[1]}$$
$$A^{[1]} = g_{\text{ReLU}}(Z^{[1]}))$$
$$Z^{[2]} = W^{[2]} A^{[1]} + b^{[2]}$$
$$A^{[2]} = g_{\text{softmax}}(Z^{[2]})$$

**Backward propagation**

$$dZ^{[2]} = A^{[2]} - Y$$
$$dW^{[2]} = \frac{1}{m} dZ^{[2]} A^{[1]T}$$
$$dB^{[2]} = \frac{1}{m} \Sigma {dZ^{[2]}}$$
$$dZ^{[1]} = W^{[2]T} dZ^{[2]} .* g^{[1]\prime} (z^{[1]})$$
$$dW^{[1]} = \frac{1}{m} dZ^{[1]} A^{[0]T}$$
$$dB^{[1]} = \frac{1}{m} \Sigma {dZ^{[1]}}$$

**Parameter updates**

$$W^{[2]} := W^{[2]} - \alpha dW^{[2]}$$
$$b^{[2]} := b^{[2]} - \alpha db^{[2]}$$
$$W^{[1]} := W^{[1]} - \alpha dW^{[1]}$$
$$b^{[1]} := b^{[1]} - \alpha db^{[1]}$$

**Vars and shapes**

Forward prop

- $A^{[0]} = X$: 784 x m
- $Z^{[1]} \sim A^{[1]}$: 10 x m
- $W^{[1]}$: 10 x 784 (as $W^{[1]} A^{[0]} \sim Z^{[1]}$)
- $B^{[1]}$: 10 x 1
- $Z^{[2]} \sim A^{[2]}$: 10 x m
- $W^{[1]}$: 10 x 10 (as $W^{[2]} A^{[1]} \sim Z^{[2]}$)
- $B^{[2]}$: 10 x 1

Backprop

- $dZ^{[2]}$: 10 x m ($~A^{[2]}$)
- $dW^{[2]}$: 10 x 10
- $dB^{[2]}$: 10 x 1
- $dZ^{[1]}$: 10 x m ($~A^{[1]}$)
- $dW^{[1]}$: 10 x 10
- $dB^{[1]}$: 10 x 1

In [10]:
#converting panda dataframe to numpy
data = np.array(data)
m, n = data.shape
np.random.shuffle(data) # shuffle the training set

# the data is of shape 60,000 x 785 
# 60,000 is the number of samples
# out of 785, index 0 is the output label (Y), and index 1 to 785 are the input image pixes (X)
# we need to take a transpose of "data" here so that each column represents a sample of image
data_train = data.T

#Separating label (Y) and input pixes (X)
Y_train = data_train[0]
X_train = data_train[1:n]

#Normalizing the input pixels to be within [0,1],
#this is needed otherwise Softmax layer in the Neural network overflows
X_train = X_train / 255.

In [11]:
#generating random data for Layer 1 and Layer 2
def init_params():
    W1 = np.random.rand(10, 784) - 0.5
    b1 = np.random.rand(10, 1) - 0.5
    W2 = np.random.rand(10, 10) - 0.5
    b2 = np.random.rand(10, 1) - 0.5
    return W1, b1, W2, b2

def ReLU(Z):
    return np.maximum(Z, 0)

def softmax(Z):
    A = np.exp(Z) / sum(np.exp(Z))
    return A
    
def forward_prop(W1, b1, W2, b2, X):
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + b2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

def dReLU(Z):
    return Z > 0

def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y

def backward_prop(Z1, A1, A2, W2, X, Y):
    m = Y.size
    one_hot_Y = one_hot(Y)
    dZ2 = A2 - one_hot_Y
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2)
    dZ1 = W2.T.dot(dZ2) * dReLU(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1)
    return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, lr):
    W1 -= lr * dW1
    b1 -= lr * db1    
    W2 -= lr * dW2  
    b2 -= lr * db2    
    return W1, b1, W2, b2

In [12]:
def get_accuracy(predictions, Y): 
    #print(predictions, Y) 
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, lr, epochs): 
    W1, b1, W2, b2 = init_params()
    _,numsamples = X.shape
    batch = -(-numsamples//1000) #set batch = 1 for Normal Gradient Descent
    Xsplit = np.array(np.hsplit(X, batch))
    Ysplit = np.array(np.hsplit(Y, batch))

    for i in range(epochs): 
        for k in range (batch):
            Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, Xsplit[k]) 
            dW1, db1, dW2, db2 = backward_prop(Z1, A1, A2, W2, Xsplit[k], Ysplit[k]) 
            W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, lr) 
            #if i % 10 == 0: print("Iteration: ", i) 
            predictions = np.argmax(A2, 0)
            print(get_accuracy(predictions, Ysplit[k])) 
    return W1, b1, W2, b2

In [13]:
%time W1, b1, W2, b2 = gradient_descent(X_train, Y_train, lr=0.10, epochs=8)

0.1
0.116
0.126
0.16
0.143
0.158
0.19
0.192
0.222
0.214
0.234
0.236
0.262
0.241
0.262
0.3
0.28
0.297
0.295
0.32
0.324
0.296
0.327
0.317
0.331
0.332
0.336
0.371
0.36
0.368
0.381
0.386
0.394
0.398
0.423
0.432
0.444
0.425
0.419
0.463
0.468
0.497
0.492
0.489
0.459
0.499
0.528
0.54
0.518
0.517
0.55
0.554
0.537
0.528
0.55
0.588
0.547
0.547
0.59
0.575
0.581
0.592
0.619
0.612
0.571
0.597
0.626
0.619
0.626
0.602
0.613
0.628
0.65
0.622
0.65
0.641
0.642
0.654
0.661
0.667
0.652
0.649
0.662
0.642
0.661
0.652
0.655
0.68
0.656
0.693
0.689
0.671
0.678
0.666
0.698
0.697
0.679
0.667
0.658
0.695
0.695
0.697
0.691
0.695
0.671
0.689
0.711
0.725
0.71
0.697
0.7
0.712
0.681
0.712
0.695
0.73
0.702
0.69
0.721
0.719
0.724
0.722
0.737
0.735
0.698
0.717
0.74
0.743
0.728
0.692
0.712
0.724
0.75
0.713
0.739
0.73
0.752
0.732
0.731
0.754
0.748
0.737
0.756
0.735
0.734
0.731
0.736
0.738
0.739
0.766
0.758
0.734
0.76
0.729
0.76
0.767
0.745
0.752
0.722
0.755
0.765
0.75
0.76
0.757
0.721
0.744
0.765
0.778
0.755
0.74
0.749
0.7

Lets test the model.

In [14]:
testdata = pd.read_csv('/kaggle/input/mnist-in-csv/mnist_test.csv')

In [15]:
testdata.shape

(10000, 785)

In [16]:
testdata = np.array(testdata)
testdata = testdata.T
rtest, ctest = testdata.shape

In [17]:
Y_test = testdata[0]
X_test = testdata[1:rtest]
X_test = X_test / 255.

In [18]:
#Lets test a prediction
def show_image(img):
    current_image = img.reshape((28, 28)) * 255
    plt.gray()
    plt.imshow(current_image, interpolation='nearest')
    plt.show()

idx=1 #the index of the image we want to choose from the test set
    
img = X_test[:,idx, None]
print(Y_test[idx]) #this will print the "digit" or "label" with this image


2


In [19]:
_,_,_,A2 = forward_prop(W1, b1, W2, b2, img)
predicted_output = np.argmax(A2,0)
print(predicted_output) #predicted output, matches the label of the test image

[2]
