# MNIST Digit Classification with our own Framework

Lab Assignment from [AI for Beginners Curriculum](https://github.com/microsoft/ai-for-beginners).

### Reading the Dataset

This code downloads the dataset from the repository on the internet. You can also manually copy the dataset from the `/data` directory of the AI Curriculum repo.

In [2]:
# !rm *.pkl
# !wget https://raw.githubusercontent.com/microsoft/AI-For-Beginners/main/data/mnist.pkl.gz
# !gzip -d mnist.pkl.gz

import urllib.request
import gzip
import shutil
import os

# Start clean: remove any previously extracted or downloaded copy
# (extracted pickle first, then the archive).
for stale_file in ("mnist.pkl", "mnist.pkl.gz"):
    if os.path.exists(stale_file):
        os.remove(stale_file)

# Fetch the gzip-compressed pickle from the curriculum repository.
url = "https://raw.githubusercontent.com/microsoft/AI-For-Beginners/main/data/mnist.pkl.gz"
urllib.request.urlretrieve(url, "mnist.pkl.gz")

# Stream-decompress the archive into mnist.pkl.
with gzip.open("mnist.pkl.gz", "rb") as f_in, open("mnist.pkl", "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)


In [4]:
import pickle
# NOTE(security): unpickling can execute arbitrary code embedded in the file.
# Only load mnist.pkl when it comes from a source you trust.
# NOTE(review): encoding='latin1' is presumably needed because the file was
# pickled under Python 2 — confirm.
with open('mnist.pkl', mode='rb') as pickle_file:
    MNIST = pickle.load(pickle_file, encoding='latin1')

In [22]:
# Positions inside the unpickled structure, which is indexed as MNIST[split][field].
# NOTE(review): index 0 is assumed to be the training split and field 0 the
# feature matrix — confirm against the dataset layout.
train = 0
features = 0
# The label position has its own name so that `labels` (the array below) never
# doubles as an index; re-running this cell therefore stays valid.
labels_index = 1

data = MNIST[train][features]
labels = MNIST[train][labels_index]

Let's look at the shape of the data we have:

In [18]:
# Display the array dimensions as (number of samples, values per sample).
data.shape

(50000, 784)

### Splitting the Data

We will use scikit-learn to split the data into training and test datasets:

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. A fixed random_state makes the
# shuffle, and every accuracy figure computed from it, repeatable between runs.
features_train, features_test, labels_train, labels_test = train_test_split(
    data, labels, test_size=0.2, random_state=7
)

print(f"Train samples: {len(features_train)}, test samples: {len(features_test)}")

Train samples: 40000, test samples: 10000


### Instructions

1. Take the framework code from the lesson and paste it into this notebook, or (even better) into a separate Python module
1. Define and train a one-layered perceptron, observing training and validation accuracy during training
1. Try to understand if overfitting took place, and adjust layer parameters to improve accuracy
1. Repeat previous steps for 2- and 3-layered perceptrons. Try to experiment with different activation functions between layers.
1. Try to answer the following questions:
    - Does the inter-layer activation function affect network performance?
    - Do we need a 2- or 3-layered network for this task?
    - Did you experience any problems training the network? Especially as the number of layers increased.
    - How do weights of the network behave during training? You may plot max abs value of weights vs. epoch to understand the relation.

In [24]:
import numpy as np
# Seed NumPy's global random generator so that later np.random draws
# (e.g. the weight initialisation in Linear) are repeatable between runs.
np.random.seed(7)

In [31]:
class Linear:
    """Fully connected layer computing ``x . W^T + b``."""

    def __init__(self, n_in, n_out):
        """Create the parameters for a layer mapping n_in inputs to n_out outputs.

        Weights are drawn from a normal distribution with mean 0 and standard
        deviation 1/sqrt(n_in); the bias and both gradient buffers start at 0.
        """
        weight_shape = (n_out, n_in)
        init_std = 1.0 / np.sqrt(n_in)
        self.W = np.random.normal(0, init_std, weight_shape)
        self.b = np.zeros((1, n_out))
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)

    def forward(self, x):
        """Remember the input for backward() and return the affine output."""
        self.x = x
        projected = np.dot(x, self.W.T)
        return projected + self.b

    def backward(self, dz):
        """Store the parameter gradients and return the gradient w.r.t. the input.

        Args:
            dz: Gradient of the loss with respect to this layer's output.
        """
        self.dW = np.dot(dz.T, self.x)
        # Summed over the batch axis; broadcasts against b in update().
        self.db = dz.sum(axis=0)
        return np.dot(dz, self.W)

    def update(self, lr):
        """Take one gradient-descent step of size lr, modifying W and b in place."""
        weight_step = lr * self.dW
        bias_step = lr * self.db
        self.W -= weight_step
        self.b -= bias_step

In [38]:
class Softmax:
    """Row-wise softmax layer turning scores into probabilities."""

    def forward(self, z):
        """Return softmax(z) computed independently for every row.

        The row maximum is subtracted before exponentiating so that large
        scores do not overflow. The result is cached in ``self.p`` so that
        backward() does not have to recompute it.

        Args:
            z: 2-D array of scores, one row per sample.
        """
        self.z = z
        shifted = z - z.max(axis=1, keepdims=True)
        expz = np.exp(shifted)
        self.p = expz / expz.sum(axis=1, keepdims=True)
        return self.p

    def backward(self, dp):
        """Return the gradient w.r.t. z given dp, the gradient w.r.t. the output.

        Uses the probabilities cached by the most recent forward() call, so
        forward() must have been called first.
        """
        p = self.p
        pdp = p * dp
        return pdp - p * pdp.sum(axis=1, keepdims=True)

In [41]:
class CrossEntropyLoss:
    """Mean negative log-probability of the true class.

    Args:
        eps: Lower bound applied to the true-class probability before taking
            the log or dividing by it, so a probability that underflowed to 0
            gives a large finite loss/gradient instead of inf or NaN.
    """

    def __init__(self, eps=1e-12):
        self.eps = eps

    def forward(self, p, y):
        """Return the batch average of ``-log p[i, y[i]]``.

        Args:
            p: 2-D array of class probabilities, one row per sample.
            y: 1-D integer array of true class indices, one per row of p.
        """
        self.p = p
        self.y = y
        p_of_y = np.maximum(p[np.arange(len(y)), y], self.eps)
        return -np.log(p_of_y).mean()

    def backward(self, loss):
        """Return the gradient of the mean loss with respect to p.

        Only the true-class entries are non-zero. They are written directly
        instead of dividing the whole array by p, which would evaluate 0/0
        (NaN) wherever a non-target probability is exactly 0.

        Args:
            loss: Unused; kept so the call signature stays unchanged.
        """
        rows = np.arange(len(self.y))
        p_of_y = np.maximum(self.p[rows, self.y], self.eps)
        grad = np.zeros_like(self.p)
        grad[rows, self.y] = (-1.0 / len(self.y)) / p_of_y
        return grad

In [48]:
class ReLU:
    """Rectified linear unit activation."""

    def forward(self, x):
        """Remember x for backward() and return the element-wise max(0, x)."""
        self.x = x
        activated = np.maximum(0, x)
        return activated

    def backward(self, dy):
        """Multiply dy by a mask that is True where the stored input was > 0."""
        positive_mask = self.x > 0
        return dy * positive_mask

In [49]:
class LeakyReLU:
    """Leaky ReLU: identity for positive inputs, slope ``alpha`` otherwise."""

    def __init__(self, alpha=0.01):
        """Store alpha, the slope applied to inputs that are not positive."""
        self.alpha = alpha

    def forward(self, x):
        """Remember x for backward() and return x where x > 0, else alpha * x."""
        self.x = x
        return np.where(x > 0, x, self.alpha * x)

    def backward(self, dy):
        """Scale dy by the slope forward() applied to each element.

        The same ``x > 0`` test as forward() is used, so both passes agree at
        x == 0. Selecting between dy and alpha * dy (rather than filling an
        array shaped like x) keeps alpha from being truncated when x has an
        integer dtype.
        """
        return np.where(self.x > 0, dy, self.alpha * dy)

In [47]:
class Tanh:
    """Hyperbolic tangent activation."""

    def forward(self, x):
        """Return tanh(x), keeping the result in self.y for backward()."""
        self.y = np.tanh(x)
        return self.y

    def backward(self, dy):
        """Scale dy by 1 - y^2, computed from the stored output y."""
        local_grad = 1.0 - self.y * self.y
        return local_grad * dy

In [34]:
class Net:
    """Ordered stack of layers that are run one after another."""

    def __init__(self):
        """Start with an empty list of layers."""
        self.layers = []

    def add(self, l):
        """Append layer ``l`` to the end of the stack."""
        self.layers.append(l)

    def forward(self, x):
        """Feed x through every layer in insertion order and return the result."""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, z):
        """Propagate gradient z through the layers from last to first."""
        for layer in reversed(self.layers):
            z = layer.backward(z)
        return z

    def update(self, lr):
        """Call update(lr) on every layer that has one.

        Layers without an ``update`` attribute (the activation layers, for
        example) are skipped.
        """
        for layer in self.layers:
            if hasattr(layer, 'update'):
                layer.update(lr)

In [63]:
def get_loss_acc(net, x, y, loss=None):
    """Evaluate ``net`` on (x, y) and return ``(loss_value, accuracy)``.

    Args:
        net: Object whose forward(x) returns a 2-D array of per-class scores.
        x: Input samples passed to net.forward.
        y: True class indices, compared against the arg-max prediction.
        loss: Object with forward(p, y). A fresh CrossEntropyLoss is created
            when omitted, so no stateful instance is shared between calls.

    Returns:
        Tuple of the loss value and the fraction of correct predictions.
    """
    if loss is None:
        loss = CrossEntropyLoss()
    p = net.forward(x)
    loss_value = loss.forward(p, y)
    pred = np.argmax(p, axis=1)
    acc = (pred == y).mean()
    return loss_value, acc
def initialLoss(net, features_train, labels_train):
    """Print the loss and accuracy of ``net`` on the training data."""
    loss_value, accuracy = get_loss_acc(net, features_train, labels_train)
    print("Initial loss={}, accuracy={}: ".format(loss_value, accuracy))


# If the model isn't learning, lower the learning rate.
# If it's very unstable or oscillating wildly, try smaller batches or a lower LR.
def train_epoch(net, train_x, train_labels, loss=None, batch_size=32, lr=0.1):
    """Run one pass of mini-batch gradient descent over the training data.

    Batches are consecutive slices taken in the stored order (no shuffling);
    the last batch may be shorter than batch_size.

    Args:
        net: Object providing forward(x), backward(grad) and update(lr).
        train_x: Training inputs, sliceable along the first axis.
        train_labels: Labels aligned with train_x.
        loss: Object with forward(p, y) and backward(loss_value). A fresh
            CrossEntropyLoss is created when omitted, so no stateful instance
            is shared between calls.
        batch_size: Number of samples per parameter update.
        lr: Learning rate passed to net.update.
    """
    if loss is None:
        loss = CrossEntropyLoss()
    for start in range(0, len(train_x), batch_size):
        stop = start + batch_size
        xb = train_x[start:stop]
        yb = train_labels[start:stop]

        p = net.forward(xb)
        batch_loss = loss.forward(p, yb)
        dp = loss.backward(batch_loss)
        # The gradient w.r.t. the input batch is not needed, so it is discarded.
        net.backward(dp)
        net.update(lr)

def finalLoss(net, features_train, labels_train, features_test, labels_test):
    """Print the loss and accuracy of ``net`` on the training data, then on the test data."""
    train_metrics = get_loss_acc(net, features_train, labels_train)
    print("Final loss={}, accuracy={}: ".format(*train_metrics))
    test_metrics = get_loss_acc(net, features_test, labels_test)
    print("Test loss={}, accuracy={}: ".format(*test_metrics))

### 1 layered perceptron

In [65]:
# One Linear layer (784 inputs -> 10 outputs) followed by Softmax.
net1 = Net()
net1.add(Linear(784, 10))
net1.add(Softmax())
loss = CrossEntropyLoss()

initialLoss(net1, features_train, labels_train)
# Pass the loss object explicitly; otherwise `loss` above is never used.
train_epoch(net1, features_train, labels_train, loss=loss)
finalLoss(net1, features_train, labels_train, features_test, labels_test)


Initial loss=2.4311445390307678, accuracy=0.102625: 
Final loss=0.341167945007791, accuracy=0.90535: 
Test loss=0.35382201750259085, accuracy=0.9015: 


### 2 layered perceptron (relu, leakyrelu, tanh)

In [66]:
# Two Linear layers with a ReLU in between; the hidden layer has 10 units.
net2relu = Net()
net2relu.add(Linear(784, 10))
net2relu.add(ReLU())
# This layer's input width is 10 (the previous layer's output size), not 784.
net2relu.add(Linear(10, 10))
net2relu.add(Softmax())
loss = CrossEntropyLoss()

initialLoss(net2relu, features_train, labels_train)
# Pass the loss object explicitly; otherwise `loss` above is never used.
train_epoch(net2relu, features_train, labels_train, loss=loss)
finalLoss(net2relu, features_train, labels_train, features_test, labels_test)

Initial loss=2.310911715212786, accuracy=0.08835: 
Final loss=0.32665982160303914, accuracy=0.907175: 
Test loss=0.34228270062080535, accuracy=0.8994: 


In [67]:
# Two Linear layers with a LeakyReLU in between; the hidden layer has 10 units.
net2leakyrelu = Net()
net2leakyrelu.add(Linear(784, 10))
net2leakyrelu.add(LeakyReLU())
net2leakyrelu.add(Linear(10, 10))
net2leakyrelu.add(Softmax())
loss = CrossEntropyLoss()

initialLoss(net2leakyrelu, features_train, labels_train)
# Pass the loss object explicitly; otherwise `loss` above is never used.
train_epoch(net2leakyrelu, features_train, labels_train, loss=loss)
finalLoss(net2leakyrelu, features_train, labels_train, features_test, labels_test)

Initial loss=2.296934612477449, accuracy=0.10695: 
Final loss=0.2793492259703594, accuracy=0.921875: 
Test loss=0.29432482968541596, accuracy=0.9187: 


In [69]:
# Two Linear layers with a Tanh in between; the hidden layer has 10 units.
net2tanh = Net()
net2tanh.add(Linear(784, 10))
net2tanh.add(Tanh())
net2tanh.add(Linear(10, 10))
net2tanh.add(Softmax())
loss = CrossEntropyLoss()

initialLoss(net2tanh, features_train, labels_train)
# Pass the loss object explicitly; otherwise `loss` above is never used.
train_epoch(net2tanh, features_train, labels_train, loss=loss)
finalLoss(net2tanh, features_train, labels_train, features_test, labels_test)

Initial loss=2.4187332967435298, accuracy=0.035575: 
Final loss=0.31791654024581595, accuracy=0.911475: 
Test loss=0.33970428208762987, accuracy=0.9039: 


### 3 layered perceptron

In [70]:
# Three Linear layers separated by ReLU activations; both hidden layers have 10 units.
net3relu = Net()
net3relu.add(Linear(784, 10))
net3relu.add(ReLU())
net3relu.add(Linear(10, 10))
net3relu.add(ReLU())
net3relu.add(Linear(10, 10))
net3relu.add(Softmax())
loss = CrossEntropyLoss()

initialLoss(net3relu, features_train, labels_train)
# Pass the loss object explicitly; otherwise `loss` above is never used.
train_epoch(net3relu, features_train, labels_train, loss=loss)
finalLoss(net3relu, features_train, labels_train, features_test, labels_test)

Initial loss=2.3206753725149887, accuracy=0.10395: 
Final loss=0.31911663857978356, accuracy=0.9065: 
Test loss=0.3488052702104129, accuracy=0.9014: 


In [71]:
# Three Linear layers separated by LeakyReLU activations; both hidden layers have 10 units.
net3leakyrelu = Net()
net3leakyrelu.add(Linear(784, 10))
net3leakyrelu.add(LeakyReLU())
net3leakyrelu.add(Linear(10, 10))
net3leakyrelu.add(LeakyReLU())
net3leakyrelu.add(Linear(10, 10))
net3leakyrelu.add(Softmax())
loss = CrossEntropyLoss()

initialLoss(net3leakyrelu, features_train, labels_train)
# Pass the loss object explicitly; otherwise `loss` above is never used.
train_epoch(net3leakyrelu, features_train, labels_train, loss=loss)
finalLoss(net3leakyrelu, features_train, labels_train, features_test, labels_test)

Initial loss=2.3105634037803227, accuracy=0.1427: 
Final loss=0.3088372990306781, accuracy=0.911425: 
Test loss=0.3286176962788192, accuracy=0.9053: 


In [72]:
# Three Linear layers separated by Tanh activations; both hidden layers have 10 units.
net3tanh = Net()
net3tanh.add(Linear(784, 10))
net3tanh.add(Tanh())
net3tanh.add(Linear(10, 10))
net3tanh.add(Tanh())
net3tanh.add(Linear(10, 10))
net3tanh.add(Softmax())
loss = CrossEntropyLoss()

initialLoss(net3tanh, features_train, labels_train)
# Pass the loss object explicitly; otherwise `loss` above is never used.
train_epoch(net3tanh, features_train, labels_train, loss=loss)
finalLoss(net3tanh, features_train, labels_train, features_test, labels_test)

Initial loss=2.3556883787943903, accuracy=0.102675: 
Final loss=0.3596431074976036, accuracy=0.898475: 
Test loss=0.37605853686010343, accuracy=0.8914: 
