In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import time

def interpret_output(A2):
    return np.argmax(A2, axis=0)[0]

def encode(y):
    encoded = np.zeros((10, y.size))
    encoded[y, np.arange(y.size)] = 1
    return encoded

def sigmoid(x, deriv=False):
    sig = 1/(1 + np.exp(-x))
    if(deriv):
        return sig*(1-sig)
    return sig


def init_params():
    W1 = np.random.uniform(-0.5, 0.5, (128, 784)).astype('float128')
    b1 = np.zeros((128,1)).astype('float128')
    W2 = np.random.uniform(-0.5, 0.5, (10, 128)).astype('float128')
    b2 = np.zeros((10,1)).astype('float128')
    return W1, b1, W2, b2

def forward_prop(W1, b1, W2, b2, A0):
    Z1=W1.dot(A0) + b1
    A1=sigmoid(Z1)
    Z2=W2.dot(A1) + b2
    A2=sigmoid(Z2)
    return A1, Z1, A2, Z2

def back_prop(A1, Z1, A2, Z2, A0, y):
    #cost function = 1/2 * sum((A2_i - Y_i)^2)
    m=y.size
    dZ2 = (A2 - encode(y))*sigmoid(Z2, deriv=True)
    dW2 = 1/m * dZ2.dot(np.transpose(A1))
    db2 = 1/m * np.sum(dZ2, axis=1)
    db2.shape += (1,)
    dZ1 = np.transpose(dW2).dot(dZ2) * sigmoid(Z1, deriv=True)
    dW1 = 1/m * dZ1.dot(np.transpose(A0))
    db1 = 1/m * np.sum(dZ1, axis=1)
    db1.shape += (1,)
    return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha*dW1
    W2 = W2 - alpha*dW2
    b1 = b1 - alpha*db1
    b2 = b2 - alpha*db2
    return W1, b1, W2, b2

def train(data, epochs, batch_size, l_rate):
    batch_num=int(np.floor(len(data)/batch_size))
    indexes=np.arange(batch_num)
    fill_batches = lambda idx: data[idx*batch_size:(idx+1)*batch_size, 1:785].T
    fill_targets = lambda idx: data[idx*batch_size:(idx+1)*batch_size, 0]
    batches=np.array([fill_batches(idx) for idx in indexes])
    targets=np.array([fill_targets(idx) for idx in indexes])
    W1, b1, W2, b2 = init_params()
    for i in range(epochs):
        for batch, y in zip(batches, targets):
            A1, Z1, A2, Z2 = forward_prop(W1, b1, W2, b2, batch)
            dW1, db1, dW2, db2 = back_prop(A1, Z1, A2, Z2, batch, y)
            W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, l_rate)
    return W1, b1, W2, b2

def test(data, W1, b1, W2, b2):
    results = 0
    for i in range(len(data)):
        A0 = data[i][1:785]
        A0.shape += (1,)
        y = np.array(data[i][0])
        A1, Z1, A2, Z2 = forward_prop(W1, b1, W2, b2, A0)
        res = interpret_output(A2) == y
        if(res):
            results = results + 1
    print('accuracy:' , results/len(data) * 100 , "%")

In [2]:
data = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
data = np.array(data)

In [3]:
st = time.time()
a, b, c, d = train(data[0:20000], 100, 200, 0.1)
et = time.time()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

Execution time: 865.8242585659027 seconds


In [4]:
test(data[20000:40000], a, b, c, d )

accuracy: 79.735 %
