Dependencies and dataset

In [1]:
import numpy as np
import pandas as pd
import zipfile
import os

In [2]:
!kaggle datasets download -d oddrationale/mnist-in-csv -p ./data
with zipfile.ZipFile('./data/mnist-in-csv.zip', 'r') as zip_ref:
    zip_ref.extractall('./data')
os.remove('./data/mnist-in-csv.zip')

Downloading mnist-in-csv.zip to ./data




  0%|          | 0.00/15.2M [00:00<?, ?B/s]
  7%|▋         | 1.00M/15.2M [00:00<00:09, 1.61MB/s]
 20%|█▉        | 3.00M/15.2M [00:00<00:02, 4.60MB/s]
 39%|███▉      | 6.00M/15.2M [00:00<00:01, 9.63MB/s]
 72%|███████▏  | 11.0M/15.2M [00:01<00:00, 18.3MB/s]
 98%|█████████▊| 15.0M/15.2M [00:01<00:00, 23.5MB/s]
100%|██████████| 15.2M/15.2M [00:01<00:00, 14.0MB/s]


Define cost and activation functions

In [3]:
def sigmoid(z):
    """Logistic sigmoid 1 / (1 + exp(-z)), applied element-wise.

    Evaluated as exp(-log(1 + exp(-z))) through np.logaddexp, so that very
    negative inputs do not overflow inside exp() and emit RuntimeWarnings;
    the result still saturates cleanly at 0.0 and 1.0.

    Args:
        z: scalar or NumPy array of pre-activations.

    Returns:
        Value(s) in [0, 1] with the same shape as z.
    """
    # logaddexp(0, -z) == log(exp(0) + exp(-z)) == log(1 + exp(-z)), computed stably.
    return np.exp(-np.logaddexp(0, -z))

def sigmoid_prime(z):
    """Derivative of the logistic sigmoid, s(z) * (1 - s(z)), element-wise.

    Args:
        z: scalar or NumPy array of pre-activations.

    Returns:
        The derivative evaluated at z, same shape as z.
    """
    # Evaluate the sigmoid once and reuse it instead of computing it twice.
    s = sigmoid(z)
    return s*(1-s)

def cost(y,x):
    """Quadratic cost: the sum over all elements of 0.5 * (y - x)**2.

    Args:
        y: first operand of the difference (scalar or array).
        x: second operand of the difference (scalar or array).

    Returns:
        The summed half squared difference as a single number.
    """
    half_squared_error = 0.5*(y-x)**2
    return np.sum(half_squared_error)

def cost_prime(y,x):
    """Gradient of the quadratic cost 0.5 * (y - x)**2 with respect to x.

    Args:
        y: value subtracted from x (scalar or array).
        x: value the derivative is taken with respect to (scalar or array).

    Returns:
        The element-wise difference x - y.
    """
    gradient = x-y
    return gradient

Define Network class

In [184]:
class Network:
    """Fully connected feed-forward network with sigmoid activations.

    Training is mini-batch gradient descent; gradients come from
    backpropagation of `cost_prime` through `sigmoid_prime`.

    Attributes:
        L: list of layer sizes, input layer first.
        W: list of weight matrices; W[k] has shape (L[k+1], L[k]).
        B: list of bias column vectors; B[k] has shape (L[k+1], 1).
    """

    def __init__(self, layers):
        """Create a network with standard-normal random weights and biases.

        Args:
            layers: list of layer sizes, e.g. [784, 30, 10].
        """
        self.L = layers
        self.W = [np.random.randn(n_out, n_in)
                  for n_in, n_out in zip(self.L[:-1], self.L[1:])]
        self.B = [np.random.randn(n_out, 1) for n_out in self.L[1:]]

    def feedforward(self, a):
        """Compute the network output for one input or a batch of inputs.

        Args:
            a: either a single input (any shape; it is flattened into a
                column vector) or a 2-D array of shape (L[0], n_samples)
                holding one input per column.

        Returns:
            Output activations of shape (L[-1], 1) for a single input, or
            (L[-1], n_samples) for a batch.
        """
        a = np.asarray(a)
        # Only a 2-D array whose row count equals the input-layer size is
        # treated as a batch; anything else is flattened to one column.
        if a.ndim != 2 or a.shape[0] != self.L[0]:
            a = a.reshape(-1, 1)
        for w, b in zip(self.W, self.B):
            # b has shape (n, 1) and broadcasts across the sample columns.
            a = sigmoid(np.dot(w, a) + b)
        return a

    def train(self, X_train, Y_train, batch_size, epochs, eta):
        """Train with mini-batch gradient descent.

        Samples are visited in the given order every epoch (no shuffling).
        After each epoch the accuracy on the training data is printed.

        Args:
            X_train: inputs, shape (L[0], n_samples), one sample per column.
            Y_train: targets, shape (L[-1], n_samples), one sample per column.
            batch_size: approximate number of samples per mini-batch; the data
                is split into n_samples // batch_size nearly equal batches
                (a single batch when n_samples < batch_size).
            epochs: number of passes over the training data.
            eta: learning rate.

        Raises:
            ValueError: if X_train and Y_train hold different numbers of
                samples (the batches would pair inputs with wrong targets).
        """
        n_samples = X_train.shape[1]
        if Y_train.shape[1] != n_samples:
            raise ValueError(
                "X_train and Y_train must have the same number of columns "
                f"(got {n_samples} and {Y_train.shape[1]})"
            )

        # The split does not depend on the epoch, so do it once. At least one
        # section is required: np.array_split raises on 0 sections.
        n_batches = max(1, n_samples // batch_size)
        X_batches = np.array_split(X_train, n_batches, axis=1)
        Y_batches = np.array_split(Y_train, n_batches, axis=1)

        for epoch in range(epochs):
            for X_batch, Y_batch in zip(X_batches, Y_batches):
                m = X_batch.shape[1]
                if m == 0:
                    # Nothing to learn from; also avoids dividing by zero below.
                    continue
                nabla_B = [np.zeros(b.shape) for b in self.B]
                nabla_W = [np.zeros(w.shape) for w in self.W]
                for i in range(m):
                    a = X_batch[:, i].reshape(-1, 1)
                    y = Y_batch[:, i].reshape(-1, 1)
                    W_shifts, B_shifts = self.learn(a, y, eta)
                    # Accumulate the per-sample gradients for this batch.
                    for nb, dnb in zip(nabla_B, B_shifts):
                        nb += dnb
                    for nw, dnw in zip(nabla_W, W_shifts):
                        nw += dnw
                # Step against the gradient averaged over the batch.
                self.W = [w - eta*nw/m for w, nw in zip(self.W, nabla_W)]
                self.B = [b - eta*nb/m for b, nb in zip(self.B, nabla_B)]
            # evaluate() returns (correct, accuracy); index 1 is the accuracy.
            print("epoch: ", epoch+1, "accuracy: ", self.evaluate(X_train, Y_train)[1])

    def learn(self, a, y, eta):
        """Run a forward pass for one sample and backpropagate its error.

        Args:
            a: input column vector, shape (L[0], 1).
            y: target column vector, shape (L[-1], 1).
            eta: not used here; only passed through to the backprop step.

        Returns:
            (W_shifts, B_shifts): per-layer gradients for weights and biases.
        """
        A = [a]   # activations of every layer, starting with the input
        Z = []    # weighted inputs (w . a + b) of every non-input layer
        for w, b in zip(self.W, self.B):
            z = np.dot(w, A[-1]) + b
            Z.append(z)
            A.append(sigmoid(z))

        return self.__backprob(y, A, Z, eta)

    def __backprob(self, y, A, Z, eta):
        """Backpropagate the output error to get per-layer gradients.

        Args:
            y: target column vector.
            A: activations of all layers (input first), as built by learn().
            Z: weighted inputs of all non-input layers, as built by learn().
            eta: unused; kept so the signature stays unchanged.

        Returns:
            (W_shifts, B_shifts): lists ordered from first to last layer,
            matching the shapes of self.W and self.B.
        """
        # Error of the output layer.
        delta = cost_prime(y, A[-1]) * sigmoid_prime(Z[-1])
        D = [delta]
        # Walk backwards through the hidden layers, collecting errors last-first.
        for i in range(1, len(Z)):
            delta = np.dot(self.W[-i].T, delta) * sigmoid_prime(Z[-i-1])
            D.append(delta)
        D.reverse()

        B_shifts = D
        # Weight gradient of a layer: its error times the incoming activation.
        W_shifts = [np.dot(d, a.T) for a, d in zip(A[:-1], D)]
        return W_shifts, B_shifts

    def evaluate(self, X_test, Y_test):
        """Count how often the strongest output matches the target class.

        Args:
            X_test: inputs, shape (L[0], n_samples), one sample per column.
            Y_test: one-hot targets, one column per sample.

        Returns:
            (correct_predictions, accuracy): number of correctly classified
            samples as an int and the fraction correct as a float; (0, 0.0)
            when X_test holds no samples.
        """
        n_samples = X_test.shape[1]
        if n_samples == 0:
            return 0, 0.0
        # One batched forward pass instead of a Python loop over samples.
        predictions = np.argmax(self.feedforward(X_test), axis=0)
        targets = np.argmax(Y_test[:, :n_samples], axis=0)
        correct_predictions = int(np.sum(predictions == targets))
        return correct_predictions, correct_predictions/n_samples


`feedforward()` - calculates the output of the network.
<br>
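`train()` - prepares the training data for training (each training input needs to be a 2-D array (x,1)) and calls `learn()` on every input; after each mini-batch it updates the weights and biases with the averaged gradients.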
<br>
`learn()` - calculates the activations in each layer and stores them in the list `A` (similarly for `Z`, where z is the dot product of the weights and the previous activation, plus the bias). Then it calls `backprob()`.
<br>
`backprob()` - performs backpropagation to compute the gradients that `train()` uses to update the weights and biases.

Preprocess data

In [37]:
# Load the CSV files extracted into ./data. Forward slashes work on every
# platform; the previous 'data\mnist_*.csv' form relied on a Windows-only
# separator and on the invalid escape sequence '\m'.
train = pd.read_csv('./data/mnist_train.csv').to_numpy()
test = pd.read_csv('./data/mnist_test.csv').to_numpy()

# Column 0 holds the label; the remaining columns are divided by 255.
X_train, Y_train = train[:,1:] / 255 , train[:,0]
X_test, Y_test = test[:,1:] / 255, test[:,0]

# Transpose so that every sample is a column, which is the layout Network expects.
X_train=X_train.T
X_test=X_test.T

# One-hot encode the labels (row k of the 10x10 identity encodes label k),
# again with one sample per column.
Y_train = np.eye(10)[Y_train].T
Y_test = np.eye(10)[Y_test].T

Set up network

In [190]:
# Seed NumPy's global RNG first: Network.__init__ draws its initial weights and
# biases with np.random.randn, so without a seed every run starts differently.
np.random.seed(0)
net = Network([784,30,10])
# Full-range slices: train on every sample. Narrow the slice (e.g. [:, 0:1000])
# to experiment on a subset.
a=X_train[:,0:]
y=Y_train[:,0:]

In [191]:
# Mini-batch gradient descent over the samples selected above.
net.train(
    X_train=a,
    Y_train=y,
    batch_size=10,
    epochs=30,
    eta=3.0,
)

epoch:  1 cost:  0.90945
epoch:  2 cost:  0.9284
epoch:  3 cost:  0.9386833333333333
epoch:  4 cost:  0.94145
epoch:  5 cost:  0.9448833333333333
epoch:  6 cost:  0.9482166666666667
epoch:  7 cost:  0.95175
epoch:  8 cost:  0.9528
epoch:  9 cost:  0.9570833333333333
epoch:  10 cost:  0.9574833333333334
epoch:  11 cost:  0.95935
epoch:  12 cost:  0.9584166666666667
epoch:  13 cost:  0.9598833333333333
epoch:  14 cost:  0.9615833333333333
epoch:  15 cost:  0.9614833333333334
epoch:  16 cost:  0.9650333333333333
epoch:  17 cost:  0.9651
epoch:  18 cost:  0.96555
epoch:  19 cost:  0.9667166666666667
epoch:  20 cost:  0.96595
epoch:  21 cost:  0.9656666666666667
epoch:  22 cost:  0.9665
epoch:  23 cost:  0.96855
epoch:  24 cost:  0.9689333333333333
epoch:  25 cost:  0.9682333333333333
epoch:  26 cost:  0.9665333333333334
epoch:  27 cost:  0.9693333333333334
epoch:  28 cost:  0.96785
epoch:  29 cost:  0.9699666666666666
epoch:  30 cost:  0.9688666666666667


In [192]:
# Score the trained network on the data loaded from mnist_test.csv; the cell
# displays (number of correct predictions, accuracy).
net.evaluate(X_test=X_test, Y_test=Y_test)

(9519, 0.9519)