Dependencies and dataset

In [5]:
import numpy as np
import pandas as pd
import zipfile
import os

In [2]:
# Download the "oddrationale/mnist-in-csv" dataset archive into ./data via the
# Kaggle command-line tool (IPython shell escape).
# NOTE(review): presumably needs the `kaggle` CLI and API credentials to be
# configured beforehand — confirm. There is no cache guard, so re-running this
# cell downloads the archive again.
!kaggle datasets download -d oddrationale/mnist-in-csv -p ./data
# Unpack everything in the archive into ./data ...
with zipfile.ZipFile('./data/mnist-in-csv.zip', 'r') as zip_ref:
    zip_ref.extractall('./data')
# ... and delete the archive once its contents have been extracted.
os.remove('./data/mnist-in-csv.zip')

Downloading mnist-in-csv.zip to ./data




  0%|          | 0.00/15.2M [00:00<?, ?B/s]
  7%|▋         | 1.00M/15.2M [00:00<00:08, 1.72MB/s]
 20%|█▉        | 3.00M/15.2M [00:00<00:02, 4.80MB/s]
 39%|███▉      | 6.00M/15.2M [00:00<00:00, 9.75MB/s]
 72%|███████▏  | 11.0M/15.2M [00:01<00:00, 18.1MB/s]
100%|██████████| 15.2M/15.2M [00:01<00:00, 14.5MB/s]


Define cost and activation functions

In [39]:
def sigmoid(z):
    """Logistic sigmoid 1 / (1 + exp(-z)), evaluated without overflow.

    The naive formula computes exp(-z), which overflows (RuntimeWarning, or
    FloatingPointError under strict error settings) for large negative z.
    Here only exp(-|z|) is evaluated, which always lies in [0, 1]:

        z >= 0:  1 / (1 + exp(-z))
        z <  0:  exp(z) / (1 + exp(z))      (algebraically the same value)

    z -- scalar or array-like of real numbers.
    Returns a NumPy scalar for scalar input, otherwise an array of z's shape.
    """
    z = np.asarray(z)
    # exp of a non-positive number can underflow to 0 but never overflow.
    e = np.exp(-np.abs(z))
    out = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # higher-dimensional arrays as they are.
    return out[()]

def sigmoid_prime(z):
    """Derivative of the sigmoid at z: sigmoid(z) * (1 - sigmoid(z))."""
    s = sigmoid(z)
    return s * (1 - s)

def cost(y,x):
    """Cross-entropy cost summed over all output units.

    y -- target values.
    x -- network activations, same shape as y.

    Computes sum(-y*log(x) - (1-y)*log(1-x)). When an activation is exactly
    0 or 1 the raw expression contains 0*log(0) = NaN (or an infinity for a
    saturated wrong answer); np.nan_to_num maps NaN to 0 and infinities to
    the largest finite float so the result is always a finite number.
    """
    # Quadratic alternative: np.sum(0.5*(y-x)**2)
    # log(0) and 0*inf are expected here and handled below, so silence the
    # corresponding floating-point warnings for this expression only.
    with np.errstate(divide='ignore', invalid='ignore'):
        per_unit = -y*np.log(x) - (1-y)*np.log(1-x)
    return np.sum(np.nan_to_num(per_unit))

#def cost_prime(y,x):
#    return x-y

def delta(y,x):
    """Output-layer error: activations x minus targets y."""
    error = x - y
    return error

Define Network class

In [51]:
class Network:
    """Fully connected feed-forward network with sigmoid activations.

    Trained with mini-batch gradient descent and L2 weight decay. Data is
    laid out column-wise: inputs are (n_inputs, n_samples) arrays and targets
    are (n_outputs, n_samples) arrays.
    """

    def __init__(self, layers):
        """Create randomly initialised weights and biases.

        layers -- sequence of layer sizes, input layer first
                  (e.g. [784, 30, 10]).
        """
        self.L = layers
        # One (fan_out, fan_in) weight matrix per pair of consecutive layers,
        # drawn from a standard normal and scaled by 1/sqrt(fan_in).
        self.W = [np.random.randn(x,y)/np.sqrt(y) for x,y in zip(self.L[1:],self.L[0:-1])]
        # One (fan_out, 1) bias column per non-input layer.
        self.B = [np.random.randn(x,1) for x in self.L[1:]]

    def feedforward(self,a):
        """Return the network output (a column vector) for one input `a`.

        `a` is reshaped into a single column, so it must hold exactly one
        sample.
        """
        a=a.reshape(-1,1)
        for w,b in zip(self.W,self.B):
            a = sigmoid(np.dot(w,a)+b)
        return a

    def SGD(self, train, batch_size, epochs, eta, lmbda, valid=None, shuffle=False):
        """Train the network with mini-batch gradient descent.

        train      -- pair (X, Y); one training sample per column.
        batch_size -- approximate mini-batch size (positive integer). The data
                      is cut into max(1, n_samples // batch_size) nearly equal
                      batches, so batches may be slightly larger than this.
        epochs     -- number of passes over the training data.
        eta        -- learning rate.
        lmbda      -- L2 regularisation strength (applied to weights only).
        valid      -- optional pair (X, Y); when given, the validation
                      accuracy is printed after every epoch.
        shuffle    -- when True, the sample order is randomly permuted at the
                      start of every epoch. Defaults to False, which keeps
                      the batches identical in every epoch.

        Raises ValueError if batch_size is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        X_train, Y_train = train[0], train[1]
        n_samples = X_train.shape[1]
        # np.array_split needs at least one section; without the max() a
        # batch_size larger than the data set would request zero sections.
        n_batches = max(1, n_samples // batch_size)
        for epoch in range(epochs):
            if shuffle:
                order = np.random.permutation(n_samples)
                X_epoch, Y_epoch = X_train[:, order], Y_train[:, order]
            else:
                X_epoch, Y_epoch = X_train, Y_train
            X_batches = np.array_split(X_epoch, n_batches, axis=1)
            Y_batches = np.array_split(Y_epoch, n_batches, axis=1)
            for X_batch, Y_batch in zip(X_batches, Y_batches):
                m = X_batch.shape[1]
                if m == 0:
                    # Only possible for an empty training set; nothing to learn.
                    continue
                nabla_B = [np.zeros(b.shape) for b in self.B]
                nabla_W = [np.zeros(w.shape) for w in self.W]
                # Accumulate the per-sample gradients over the mini-batch.
                for i in range(m):
                    a = X_batch[:,i].reshape(-1,1)
                    y = Y_batch[:,i].reshape(-1,1)
                    W_shifts, B_shifts = self.train(a, y)
                    for nb, dnb in zip(nabla_B, B_shifts):
                        nb += dnb
                    for nw, dnw in zip(nabla_W, W_shifts):
                        nw += dnw
                # Averaged gradient step; weights additionally decay by
                # eta*lmbda/n_samples, biases are not regularised.
                self.W = [w-eta*nw/m - eta*lmbda*w/n_samples for w, nw in zip(self.W, nabla_W)]
                self.B = [b-eta*nb/m for b, nb in zip(self.B, nabla_B)]
            if valid is not None:
                print("epoch: ", epoch, "ACC: ", self.evaluate(valid[0], valid[1])[1])

    def train(self, a, y):
        """Run one sample through the network and backpropagate its error.

        a -- input column vector, shape (n_inputs, 1).
        y -- target column vector, shape (n_outputs, 1).

        Returns (W_shifts, B_shifts): per-layer gradients for the weights and
        the biases.
        """
        Z = []       # weighted inputs z of every non-input layer
        A = [a]      # activations of every layer, input included
        for w,b in zip(self.W,self.B):
            z = np.dot(w,A[-1])+b
            Z.append(z)
            A.append(sigmoid(z))
        return self.__backprob(y, A, Z)

    def __backprob(self,y,A,Z):
        """Backpropagate the output error and return (W_shifts, B_shifts).

        y -- target column vector.
        A -- activations of every layer (input first).
        Z -- weighted inputs of every non-input layer.
        """
        # Error of the output layer, as given by delta().
        D = [delta(y,A[-1])]
        # Walk backwards through the hidden layers, always prepending so that
        # D ends up ordered from the first hidden layer to the output layer.
        for i in range(1,len(Z)):
            D.insert(0, np.dot(self.W[-i].T,D[0])*sigmoid_prime(Z[-i-1]))
        # Bias gradients are the layer errors themselves; weight gradients are
        # the outer product of a layer's error with its incoming activation.
        B_shifts = D
        W_shifts = [np.dot(d,a.T) for a,d in zip(A[0:-1],D)]
        return W_shifts, B_shifts

    def evaluate(self, X_test, Y_test):
        """Count how many samples the network classifies correctly.

        X_test -- inputs, one sample per column.
        Y_test -- targets, one column per sample; the predicted class is the
                  argmax of the network output, the true class the argmax of
                  the target column.

        Returns (n_correct, accuracy). For an empty data set this is (0, 0.0).
        """
        n_samples = X_test.shape[1]
        if n_samples == 0:
            return 0, 0.0
        correct_predictions = 0
        for i in range(n_samples):
            output = self.feedforward(X_test[:,i])
            if np.argmax(output) == np.argmax(Y_test[:,i]):
                correct_predictions += 1
        return correct_predictions, correct_predictions/n_samples


`feedforward()` - calculates the output of the network for a single input.
<br>
`SGD()` - splits the training data into mini-batches, reshapes each training input into a 2-D column array of shape (x,1), calls `train()` on it, and then updates the weights and biases with the gradients averaged over the mini-batch.
<br>
`train()` - calculates the activations in each layer and stores them in the list `A` (similarly `Z` for z, where z is the dot product of the weights and the previous activation plus the bias). Then it calls `__backprob()`.
<br>
`__backprob()` - performs backpropagation to compute the gradients used to update the weights and biases.

Preprocess data

In [7]:
# Number of leading training samples held out as the validation set.
N_VALID = 10000

# os.path.join keeps the paths portable; a hard-coded backslash separator only
# resolves on Windows and makes '\m' an invalid string escape sequence.
train = pd.read_csv(os.path.join('data', 'mnist_train.csv')).to_numpy()
test = pd.read_csv(os.path.join('data', 'mnist_test.csv')).to_numpy()

# Column 0 holds the label, the remaining columns the features, which are
# divided by 255 (presumably 8-bit pixel intensities, giving values in [0, 1]).
X_train, Y_train = train[:,1:] / 255 , train[:,0]
X_test, Y_test = test[:,1:] / 255, test[:,0]

# Transpose so that every sample is one column, then split off the validation set.
X_train = X_train.T
X_valid = X_train[:,:N_VALID]
X_train = X_train[:,N_VALID:]
X_test = X_test.T

# One-hot encode the labels (10 classes), again one column per sample.
Y_train = np.eye(10)[Y_train].T
Y_valid = Y_train[:,:N_VALID]
Y_train = Y_train[:,N_VALID:]
Y_test = np.eye(10)[Y_test].T

Set up network

In [52]:
# 784 inputs, one hidden layer of 30 units, 10 outputs.
net = Network(layers=[784, 30, 10])

# [inputs, targets] pairs in the form SGD() indexes them.
train, valid = [X_train, Y_train], [X_valid, Y_valid]

In [55]:
# Train for 30 epochs; the validation accuracy is printed after each one.
net.SGD(
    train,
    batch_size=10,
    epochs=30,
    eta=0.5,
    lmbda=5.0,
    valid=valid,
)

epoch:  0 ACC:  0.9471
epoch:  1 ACC:  0.949
epoch:  2 ACC:  0.9509
epoch:  3 ACC:  0.9519
epoch:  4 ACC:  0.9529
epoch:  5 ACC:  0.9533
epoch:  6 ACC:  0.9519
epoch:  7 ACC:  0.9515
epoch:  8 ACC:  0.9519
epoch:  9 ACC:  0.9522
epoch:  10 ACC:  0.952
epoch:  11 ACC:  0.9507
epoch:  12 ACC:  0.9507
epoch:  13 ACC:  0.9479
epoch:  14 ACC:  0.9489
epoch:  15 ACC:  0.9489
epoch:  16 ACC:  0.9483
epoch:  17 ACC:  0.9492
epoch:  18 ACC:  0.948
epoch:  19 ACC:  0.9492
epoch:  20 ACC:  0.95
epoch:  21 ACC:  0.9514
epoch:  22 ACC:  0.9511
epoch:  23 ACC:  0.9512
epoch:  24 ACC:  0.9506
epoch:  25 ACC:  0.9496
epoch:  26 ACC:  0.9488
epoch:  27 ACC:  0.9493
epoch:  28 ACC:  0.9497
epoch:  29 ACC:  0.9503


In [50]:
# Evaluation on the held-out test set, currently disabled:
#net.evaluate(X_test, Y_test)
# Number of training samples (X_train stores one sample per column).
X_train.shape[1]

50000