In [1]:
import numpy as np
# Use torchvision to load MNIST dataset
import torchvision
def _to_numpy(split):
    """Convert a torchvision MNIST split into NumPy arrays.

    Returns a ``(features, labels)`` pair: the image tensor is flattened to
    rows of 784 values and divided by 255.0; the targets are returned as-is.
    """
    features = split.data.numpy().reshape(-1, 784) / 255.0
    labels = split.targets.numpy()
    return features, labels


# Training split (downloaded into ./data on first use)
train = torchvision.datasets.MNIST('./data', train=True, download=True)
X_train, y_train = _to_numpy(train)
# Held-out test split
test = torchvision.datasets.MNIST('./data', train=False, download=True)
X_test, y_test = _to_numpy(test)

In [2]:
class MLP:
    """Three-layer fully connected classifier trained by minibatch gradient descent.

    Architecture: affine -> ReLU -> affine -> ReLU -> affine -> softmax.

    ``forward`` caches the input and every intermediate activation on the
    instance; ``backward`` consumes that cache, so it must be called after a
    ``forward`` pass on the same batch.
    """

    def __init__(self, n_in=784, n_hidden1=64, n_hidden2=16, n_out=10,
                 weight_scale=0.01):
        """Initialise weights with scaled Gaussian noise and biases with zeros.

        Args:
            n_in: Number of input features per sample.
            n_hidden1: Width of the first hidden layer.
            n_hidden2: Width of the second hidden layer.
            n_out: Number of output classes.
            weight_scale: Multiplier applied to the standard-normal draws.
        """
        self.w1 = weight_scale * np.random.randn(n_in, n_hidden1)
        self.b1 = np.zeros((1, n_hidden1))
        self.w2 = weight_scale * np.random.randn(n_hidden1, n_hidden2)
        self.b2 = np.zeros((1, n_hidden2))
        self.w3 = weight_scale * np.random.randn(n_hidden2, n_out)
        self.b3 = np.zeros((1, n_out))

    def forward(self, X):
        """Run a forward pass and cache the activations for ``backward``.

        Args:
            X: Array of shape (batch, n_in).

        Returns:
            Array of shape (batch, n_out) whose rows are softmax probabilities.
        """
        self.X = X
        # First hidden layer: affine + ReLU
        self.X1 = self.X @ self.w1 + self.b1
        self.X1a = np.maximum(self.X1, 0)
        # Second hidden layer: affine + ReLU
        self.X2 = self.X1a @ self.w2 + self.b2
        self.X2a = np.maximum(self.X2, 0)
        # Output layer: affine + softmax
        self.X3 = self.X2a @ self.w3 + self.b3
        # Subtracting the row-wise max leaves softmax mathematically unchanged
        # but prevents exp() from overflowing to inf (and the ratio to NaN).
        shifted = self.X3 - np.max(self.X3, axis=-1, keepdims=True)
        eX = np.exp(shifted)
        self.output = eX / np.sum(eX, axis=-1, keepdims=True)
        return self.output

    def backward(self, gt, step_size=0.5, lambda_=0.001):
        """Backpropagate the mean cross-entropy loss and take one gradient step.

        Uses the activations cached by the most recent ``forward`` call.

        Args:
            gt: Integer class labels, one per row of the last forward batch.
            step_size: Learning rate of the gradient-descent update.
            lambda_: Weight-decay coefficient; ``lambda_ * w`` is added to each
                weight gradient (biases are not regularised).

        Returns:
            None. Weights and biases are updated in place.
        """
        batch_size = len(self.output)
        # Gradient of the mean cross-entropy w.r.t. the logits is
        # (softmax - one_hot) / batch_size. Work on a copy: self.output is the
        # very array forward() handed back to the caller and must not be altered.
        diff = self.output.copy()
        diff[np.arange(batch_size), gt] -= 1
        diff /= batch_size
        dw3 = self.X2a.T @ diff
        db3 = np.sum(diff, axis=0, keepdims=True)
        # Propagate through the second ReLU: no gradient where it was inactive.
        diff = diff @ self.w3.T
        diff[self.X2 <= 0] = 0
        dw2 = self.X1a.T @ diff
        db2 = np.sum(diff, axis=0, keepdims=True)
        # Propagate through the first ReLU.
        diff = diff @ self.w2.T
        diff[self.X1 <= 0] = 0
        dw1 = self.X.T @ diff
        db1 = np.sum(diff, axis=0, keepdims=True)
        # regularization (weights only)
        dw3 += lambda_ * self.w3
        dw2 += lambda_ * self.w2
        dw1 += lambda_ * self.w1
        # gradient descent; all gradients above were computed with the
        # pre-update weights, so the update order here does not matter.
        self.w3 -= step_size * dw3
        self.b3 -= step_size * db3
        self.w2 -= step_size * dw2
        self.b2 -= step_size * db2
        self.w1 -= step_size * dw1
        self.b1 -= step_size * db1

In [3]:
# Build the network. MLP.__init__ draws its weights from np.random.randn, and no
# seed is set in the cells shown here, so each fresh run starts from different weights.
model = MLP()

In [4]:
# train
NUM_STEPS = 1000    # number of gradient-descent updates
BATCH_SIZE = 1000   # samples drawn per update
num_train = len(X_train)
for step in range(NUM_STEPS):
    # Draw minibatch indices uniformly at random (np.random.choice samples
    # with replacement by default). Passing the population size as an int
    # avoids rebuilding an index array from a range on every iteration.
    sample = np.random.choice(num_train, BATCH_SIZE)
    # forward() caches the activations that backward() needs for this batch.
    model.forward(X_train[sample])
    model.backward(y_train[sample])

In [5]:
# training set accuracy: fraction of samples whose highest-probability class
# matches the label (the mean of the boolean match mask, so no dataset size is hard-coded)
np.mean(np.argmax(model.forward(X_train), axis=1) == y_train)

0.9690666666666666

In [6]:
# test set accuracy: fraction of samples whose highest-probability class
# matches the label (the mean of the boolean match mask, so no dataset size is hard-coded)
np.mean(np.argmax(model.forward(X_test), axis=1) == y_test)

0.964