In [7]:
import numpy as np   
from copy import deepcopy
from torchvision.datasets import MNIST   #Torchvision used only to import the MNIST dataset!
from torchvision.transforms import ToTensor     #it plays no role in the model calculations
import matplotlib.pyplot as plt

In [8]:
def sigmoid(x):
    """Scaled logistic function ``1 / (1 + exp(-x / 10))``.

    The argument is divided by 10 before the logistic is applied, so the curve
    is ten times wider than the standard sigmoid. ``np.exp`` broadcasts, so
    ``x`` may be a scalar or a NumPy array.
    """
    return 1 / (1 + np.exp(-x / 10))

def sig(x):
    """Apply :func:`sigmoid` element-wise and return the result as an ndarray.

    ``sigmoid`` is already array-aware through ``np.exp``, so no
    ``np.vectorize`` wrapper (a per-element Python loop that also raises on
    empty input) is needed.

    Parameters
    ----------
    x : scalar or array-like of numbers

    Returns
    -------
    np.ndarray
        Float array with the same shape as ``x`` (0-d for a scalar input).
    """
    values = np.asarray(x, dtype=float)
    # np.asarray keeps the return type an ndarray even for 0-d input,
    # matching what the former np.vectorize wrapper returned.
    return np.asarray(sigmoid(values))

def sig_diff(x):
    """Derivative of :func:`sig` with respect to ``x``.

    For ``s(x) = 1 / (1 + exp(-x / 10))`` the chain rule gives
    ``s'(x) = s(x) * (1 - s(x)) / 10``. The ``/ 10`` factor comes from the
    inner ``x / 10`` and was previously missing.

    NOTE(review): adding the factor makes every gradient that flows through
    the output layer 10x smaller than before; a learning rate tuned against
    the old behaviour may need to be scaled up accordingly.
    """
    s = sig(x)  # evaluate the sigmoid once and reuse it
    return s * (1 - s) / 10

def diff_ReLU(x):
    """Derivative of ReLU for one scalar: 1 if ``x`` is strictly positive, otherwise 0."""
    if x > 0:
        return 1
    return 0

def dReLU(x):
    """Element-wise ReLU derivative: 1 where ``x > 0``, 0 elsewhere.

    Implemented as a single array comparison instead of ``np.vectorize``,
    which loops in Python and raises on empty input.

    Parameters
    ----------
    x : scalar or array-like of numbers

    Returns
    -------
    np.ndarray
        Integer array of 0/1 with the same shape as ``x`` (0-d for a scalar).
    """
    return np.where(np.asarray(x) > 0, 1, 0)

def ReLU(x):
    """Rectified linear unit: element-wise maximum of ``x`` and 0."""
    floor = 0
    rectified = np.maximum(x, floor)
    return rectified

In [9]:
class Neural():
    """Two-layer fully connected network trained by mini-batch gradient descent.

    Architecture: input -> linear(weights1, bias1) -> ReLU
                        -> linear(weights2, bias2) -> sig (scaled sigmoid).
    The loss is the mean squared error between the output activations and a
    one-hot target.

    Relies on the module-level helpers ``ReLU``, ``dReLU``, ``sig`` and
    ``sig_diff``.
    """

    def __init__(self, LR, input_size=28 * 28, hidden_size=2048, output_size=10):
        """Create the network with uniformly random weights and zero biases.

        Parameters
        ----------
        LR : float
            Learning rate applied to every gradient before it is accumulated.
        input_size, hidden_size, output_size : int, optional
            Layer widths. The defaults (784, 2048, 10) reproduce the sizes
            that used to be hard-coded.
        """
        self.LR = LR
        self.weights1 = np.random.uniform(-0.5, 0.5, size=(hidden_size, input_size))
        self.weights2 = np.random.uniform(-0.5, 0.5, size=(output_size, hidden_size))
        self.bias1 = np.zeros(hidden_size)
        self.bias2 = np.zeros(output_size)
        # Per-batch accumulators of (learning-rate-scaled) parameter updates.
        self.deltaw1, self.deltaw2, self.deltab1, self.deltab2 = 0, 0, 0, 0

    def forward(self, x):
        """Run a forward pass for one flattened input vector ``x``.

        Stores intermediate values for ``backward``:
        ``self.logits``      = [input, layer-1 pre-activation, layer-2 pre-activation]
        ``self.activations`` = [ReLU output of layer 1, sigmoid output of layer 2]
        Returns nothing; read the prediction from ``self.activations[1]``.
        """
        self.logits = [x]
        self.activations = []
        x = np.matmul(self.weights1, x) + self.bias1
        self.logits.append(x)
        x = ReLU(x)
        self.activations.append(x)
        x = np.matmul(self.weights2, x) + self.bias2
        self.logits.append(x)
        x = sig(x)
        self.activations.append(x)

    def loss_d(self, target):
        """Return the NEGATIVE gradient of the MSE loss w.r.t. the output activations.

        Parameters
        ----------
        target : int or array-like
            Either a class index (converted to a one-hot vector whose length
            is the number of outputs) or an already one-hot / dense target
            vector, which is used as is.

        Returns
        -------
        np.ndarray
            ``-2 * (output - target) / n_outputs``. The sign is negative so
            that the accumulated updates can simply be *added* to the
            parameters in ``train``.
        """
        target = np.asarray(target, dtype=float)
        if target.ndim == 0:
            # Scalar label -> one-hot vector. The previous check compared
            # type(target) against a set, which is always unequal, so vector
            # targets were wrongly routed through this branch as well.
            one_hot = np.zeros(len(self.bias2))
            one_hot[int(target)] = 1
            target = one_hot
        return -2 * (self.activations[1] - target) / len(target)

    def grad2(self, target):
        """Back-propagate ``self.dzda`` from the layer-2 to the layer-1 pre-activations.

        Computes ``(weights2.T @ dzda) * dReLU(logits[1])`` in one matrix
        product instead of a Python loop over every hidden unit.
        ``target`` is unused and kept only for interface compatibility.
        """
        back_signal = np.matmul(self.weights2.T, self.dzda)
        return back_signal * dReLU(self.logits[1])

    def backward(self, target):
        """Compute the updates for one sample and add them to the batch accumulators.

        Must be called after ``forward`` for the same sample. ``self.dzda``
        holds the negated loss gradient w.r.t. the pre-activations of the
        layer currently being processed (layer 2 first, then layer 1).
        """
        # Negated dLoss/dlogits for the output layer.
        self.dzda = self.loss_d(target) * sig_diff(self.logits[2])

        # Update direction for weights2 / bias2: outer product with the hidden activations.
        self.dw2 = np.outer(self.dzda, self.activations[0])
        self.db2 = self.dzda.copy()

        # Propagate through weights2 and the ReLU to the hidden layer.
        self.dzda = self.grad2(target)

        # Update direction for weights1 / bias1: outer product with the input vector.
        self.db1 = self.dzda.copy()
        self.dw1 = np.outer(self.dzda, self.logits[0])

        # Accumulate learning-rate-scaled updates; ``train`` averages them over the batch.
        self.deltaw1 += self.dw1 * self.LR
        self.deltaw2 += self.dw2 * self.LR
        self.deltab1 += self.db1 * self.LR
        self.deltab2 += self.db2 * self.LR

    def train(self, data):
        """Perform one gradient-descent step on a mini-batch.

        Parameters
        ----------
        data : sequence of (X, Y) pairs
            ``X`` is an image, either an object exposing ``.numpy()`` (e.g. a
            torch tensor) or anything ``np.asarray`` accepts; it is flattened
            to 1-D. ``Y`` is the label passed on to ``backward``.

        An empty batch is a no-op (previously it raised ZeroDivisionError).
        """
        # Reset the per-batch accumulators.
        self.deltaw1, self.deltaw2, self.deltab1, self.deltab2 = 0, 0, 0, 0
        length = len(data)
        if length == 0:
            return
        for X, Y in data:
            X = X.numpy() if hasattr(X, "numpy") else np.asarray(X)
            self.forward(X.flatten())   # flatten the image matrix into a 1-D vector
            self.backward(Y)
        # Apply the batch-averaged updates (already negated and LR-scaled).
        self.weights1 += self.deltaw1 / length
        self.weights2 += self.deltaw2 / length
        self.bias1 += self.deltab1 / length
        self.bias2 += self.deltab2 / length

In [10]:
# Build the network; the learning rate is the first positional argument.
nn = Neural(0.15)

In [11]:
MNIST_ROOT = 'MNIST'      # directory that holds (or will receive) the dataset files
MNIST_DOWNLOAD = False    # set to True to download

# Setting up train and test data.
train_data = MNIST(root=MNIST_ROOT,
                   train=True,
                   download=MNIST_DOWNLOAD,
                   transform=ToTensor(),
                   target_transform=None)
test_data = MNIST(root=MNIST_ROOT,
                  train=False,
                  download=MNIST_DOWNLOAD,
                  transform=ToTensor(),
                  target_transform=None)

# Materialise the datasets once as lists of (image_tensor, label) pairs so the
# ToTensor transform is not re-applied on every access.
train_data_1 = list(train_data)
test_data_1 = list(test_data)

# Independent list over the same samples, kept for evaluating on the full
# training set. A shallow copy is enough: the list object is distinct, so
# re-slicing or rebinding train_data_1 does not affect it, and the samples
# themselves are only read. A deep copy would duplicate every image tensor.
# NOTE(review): assumes no later cell mutates the sample tensors in place.
train_data_whole = list(train_data_1)

In [12]:
# Progress history for plotting/inspection, one list per tracked series:
# [0] test loss, [1] test accuracy (%), [2] train loss, [3] train accuracy (%).
measurements = [[] for _ in range(4)]

In [None]:
nn.LR = 0.01
i = 0  #batch number
Batch_Size = 16
EVAL_EVERY = 5  #evaluate the metrics every EVAL_EVERY-th batch


def _evaluate(model, dataset):
    """Return ``(summed per-sample MSE, number of correct predictions)`` on ``dataset``.

    ``dataset`` is an iterable of ``(image_tensor, label)`` pairs. Each image
    is flattened and fed through ``model.forward``; the prediction is the
    argmax of ``model.activations[1]``.
    """
    loss_sum, correct = 0, 0
    n_outputs = len(model.bias2)
    for X, Y in dataset:
        y = np.zeros(n_outputs)
        y[Y] = 1                                  # one-hot target
        model.forward(X.numpy().flatten())
        diff = model.activations[1] - y
        loss_sum += np.sum(diff ** 2) / diff.size  # mean squared error of this sample
        if np.argmax(model.activations[1]) == Y:   # prediction matches label
            correct += 1
    return loss_sum, correct


# One pass over the training dataset. Batches are addressed by index, so
# train_data_1 is left intact and the cell can be re-run for another epoch.
for start in range(0, len(train_data_1), Batch_Size):

    if i % EVAL_EVERY == 0:
        loss_total, acc_total = _evaluate(nn, test_data_1)
        loss_train, acc_train = _evaluate(nn, train_data_whole)

        # Normalise by the actual dataset sizes instead of hard-coded 10000 / 60000.
        test_loss = loss_total / len(test_data_1)
        test_acc = 100 * acc_total / len(test_data_1)
        train_loss = loss_train / len(train_data_whole)
        train_acc = 100 * acc_train / len(train_data_whole)

        # Adjacent f-strings are concatenated; unlike a backslash continuation
        # inside one literal, this keeps source indentation out of the output.
        print(f'{i} | Test_set: loss : {round(test_loss, 4)}, '
              f'acc : {round(test_acc, 3)}% '
              f'| Train_set: loss : {round(train_loss, 4)}, '
              f'acc : {round(train_acc, 3)}%')
        measurements[0].append(test_loss)
        measurements[1].append(test_acc)
        measurements[2].append(train_loss)
        measurements[3].append(train_acc)

    nn.train(train_data_1[start:start + Batch_Size])
    #moving onto the next batch

    i += 1