In [4]:
import numpy as np
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import torch
import time


class NeuralModel():
    """Fully connected classifier with two sigmoid hidden layers and a softmax output.

    The network has no bias terms: every layer is a plain matrix product
    followed by its activation. Training is minibatch gradient descent on the
    cross-entropy loss, with the gradients derived by hand in ``backward``.

    Batches may be torch tensors (as produced by a ``DataLoader``) or NumPy
    arrays; everything is converted to NumPy internally.

    Parameters
    ----------
    sizes : sequence of 4 ints
        ``[input, hidden_1, hidden_2, output]`` layer widths. Only indices
        0-3 are read.
    epochs : int
        Number of passes over the training loader performed by ``train``.
    l_rate : float
        Step size used by ``update_weights``.
    seed : int or None
        When given, weights are drawn from a private ``RandomState(seed)`` so
        initialisation is reproducible. When ``None`` the global NumPy random
        state is used.

    Attributes
    ----------
    w1, w2, w3 : np.ndarray
        Weight matrices of the three layers.
    val_losses, val_accuracies : list
        One entry per completed epoch, appended by ``train``.
    """

    # Floor applied to predicted probabilities before taking the log, so that
    # a probability that underflowed to 0 does not turn the loss into NaN.
    _LOG_EPS = 1e-12

    def __init__(self, sizes, epochs=20, l_rate=0.01, seed=None):
        self.sizes = sizes
        self.epochs = epochs
        self.l_rate = l_rate
        self.seed = seed
        self.val_losses = []
        self.val_accuracies = []
        self.init_params()

    @staticmethod
    def _to_numpy(array_like):
        """Return ``array_like`` as a NumPy array (calls ``.numpy()`` when available)."""
        if hasattr(array_like, 'numpy'):
            return array_like.numpy()
        return np.asarray(array_like)

    def _prepare_batch(self, x, y):
        """Flatten a batch of inputs and one-hot encode its integer labels.

        Returns ``(features, labels, targets)`` as NumPy arrays, where
        ``features`` has one row per sample and ``w1.shape[0]`` columns, and
        ``targets`` has ``w3.shape[1]`` columns.
        """
        features = self._to_numpy(x).reshape(-1, self.w1.shape[0])
        labels = self._to_numpy(y).astype(np.int64).reshape(-1)
        targets = np.zeros((labels.shape[0], self.w3.shape[1]))
        targets[np.arange(labels.shape[0]), labels] = 1
        return features, labels, targets

    def sigmoid(self, x):
        """Element-wise logistic function ``1 / (1 + exp(-x))``.

        Written as ``exp(-log(1 + exp(-x)))`` via ``np.logaddexp`` so that
        large-magnitude inputs do not overflow inside ``exp``.
        """
        return np.exp(-np.logaddexp(0, -x))

    def softmax(self, x):
        """Softmax over the last axis of ``x``.

        The maximum along that axis is subtracted first; this leaves the
        result unchanged mathematically but keeps ``exp`` from overflowing.
        """
        x = np.asarray(x)
        exps = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exps / np.sum(exps, axis=-1, keepdims=True)

    def init_params(self):
        """Draw all three weight matrices uniformly from [-1, 1)."""
        input_layer = int(self.sizes[0])
        hidden_1 = int(self.sizes[1])
        hidden_2 = int(self.sizes[2])
        output_layer = int(self.sizes[3])

        # The np.random module and a RandomState instance expose the same
        # ``uniform`` signature, so either can serve as the generator here.
        rng = np.random if self.seed is None else np.random.RandomState(self.seed)
        self.w1 = rng.uniform(low=-1, high=1, size=(input_layer, hidden_1))
        self.w2 = rng.uniform(low=-1, high=1, size=(hidden_1, hidden_2))
        self.w3 = rng.uniform(low=-1, high=1, size=(hidden_2, output_layer))

    def forward(self, inputs):
        """Run a batch through the network and return the class probabilities.

        The pre-activations and activations of every layer are cached on the
        instance (``linear_1``, ``out1``, ``linear_2``, ``out2``, ``linear3``,
        ``out3``) because ``backward`` reads them.
        """
        inputs = self._to_numpy(inputs)

        # Input layer to hidden layer 1
        self.linear_1 = inputs.dot(self.w1)
        self.out1 = self.sigmoid(self.linear_1)

        # Hidden layer 1 to hidden layer 2
        self.linear_2 = self.out1.dot(self.w2)
        self.out2 = self.sigmoid(self.linear_2)

        # Hidden layer 2 to softmax layer
        self.linear3 = self.out2.dot(self.w3)
        self.out3 = self.softmax(self.linear3)

        return self.out3

    def backward(self, x_train, y_train, output):
        """Compute batch-averaged loss gradients for ``w1``, ``w2`` and ``w3``.

        Must be called right after ``forward`` on the same batch, since it
        relies on the activations cached there.

        Parameters
        ----------
        x_train : tensor or array
            The flattened inputs that were passed to ``forward``.
        y_train : tensor or array
            One-hot targets with the same shape as ``output``.
        output : np.ndarray
            Softmax probabilities returned by ``forward``.

        Returns
        -------
        tuple of np.ndarray
            ``(change_w1, change_w2, change_w3)``.
        """
        x_train = self._to_numpy(x_train)
        y_train = self._to_numpy(y_train)
        scale = 1. / y_train.shape[0]

        # Gradient of softmax + cross-entropy w.r.t. the output pre-activation
        d_loss = output - y_train
        change_w3 = scale * np.matmul(self.out2.T, d_loss)

        # Back through hidden layer 2; sigmoid'(z) = s * (1 - s) with the
        # cached activation s, so no sigmoid needs to be re-evaluated.
        d_linear_2 = np.matmul(d_loss, self.w3.T) * self.out2 * (1 - self.out2)
        change_w2 = scale * np.matmul(self.out1.T, d_linear_2)

        # Back through hidden layer 1
        d_linear_1 = np.matmul(d_linear_2, self.w2.T) * self.out1 * (1 - self.out1)
        change_w1 = scale * np.matmul(x_train.T, d_linear_1)

        return change_w1, change_w2, change_w3

    def update_weights(self, w1_update, w2_update, w3_update):
        """Take one gradient-descent step of size ``l_rate`` on every weight matrix."""
        self.w1 -= self.l_rate * w1_update
        self.w2 -= self.l_rate * w2_update
        self.w3 -= self.l_rate * w3_update

    def compute_loss(self, y, y_hat):
        """Mean cross-entropy between one-hot targets ``y`` and probabilities ``y_hat``.

        Probabilities are floored at ``_LOG_EPS`` before the log so that an
        exact zero yields a large finite penalty (or no contribution where the
        target is 0) instead of ``inf`` / ``nan``.
        """
        y = self._to_numpy(y)
        batch_size = y.shape[0]
        log_probs = np.log(np.maximum(y_hat, self._LOG_EPS))
        return -(1. / batch_size) * np.sum(np.multiply(y, log_probs))

    def compute_metrics(self, val_loader):
        """Evaluate accuracy and mean cross-entropy over every batch of ``val_loader``.

        The loss is weighted by batch size, so a short final batch does not
        skew the average.

        Returns
        -------
        tuple
            ``(accuracy, loss)`` with accuracy as a fraction in [0, 1].

        Raises
        ------
        ValueError
            If ``val_loader`` yields no samples.
        """
        correct = 0
        total = 0
        loss_sum = 0.0
        for x, y in val_loader:
            features, labels, targets = self._prepare_batch(x, y)
            output = self.forward(features)
            predicted = np.argmax(output, axis=1)
            n_samples = labels.shape[0]
            correct += int(np.sum(predicted == labels))
            total += n_samples
            # compute_loss returns a per-sample mean; undo that to accumulate a sum
            loss_sum += self.compute_loss(targets, output) * n_samples
        if total == 0:
            raise ValueError('val_loader yielded no samples')
        return (correct / total), loss_sum / total

    def train(self, train_loader, val_loader):
        """Train for ``self.epochs`` epochs, evaluating on ``val_loader`` after each.

        After every epoch the validation accuracy and loss are printed and
        appended to ``self.val_accuracies`` / ``self.val_losses``. They are
        also appended to the module-level lists ``accuracies`` and ``losses``
        (created if they do not exist yet), which existing notebook cells read
        for plotting.
        """
        start_time = time.time()
        module_scope = globals()
        global_losses = module_scope.setdefault('losses', [])
        global_accuracies = module_scope.setdefault('accuracies', [])

        for iteration in range(self.epochs):
            for x, y in train_loader:
                # Flatten the inputs and one-hot encode the labels to match
                # the softmax output.
                features, _, targets = self._prepare_batch(x, y)
                output = self.forward(features)
                gradients = self.backward(features, targets, output)
                self.update_weights(*gradients)

            # Evaluate on the held-out loader once the epoch is complete
            accuracy, loss = self.compute_metrics(val_loader)
            self.val_losses.append(loss)
            self.val_accuracies.append(accuracy)
            global_losses.append(loss)
            global_accuracies.append(accuracy)
            print('Epoch: {0}, Time Spent: {1:.2f}s, Accuracy: {2:.2f}%, Loss: {3:.2f}'.format(
                iteration + 1, time.time() - start_time, accuracy * 100, loss
            ))


In [None]:
# --- Configuration -----------------------------------------------------------
SEED = 0
BATCH_SIZE = 64
EPOCHS = 25
DATA_DIR = './dataset/MNIST/'

# Layer widths handed to the model. hidden_layer2_size is 200 to match the
# network this cell has always trained (the constant previously said 300 but
# was never used).
input_size = 784
hidden_layer1_size = 300
hidden_layer2_size = 200
output_size = 10

# Seed both libraries: NumPy drives the weight initialisation, torch drives
# the DataLoader shuffling.
np.random.seed(SEED)
torch.manual_seed(SEED)

# NeuralModel.train appends one entry per epoch to these module-level lists.
losses = []
accuracies = []

# --- Data --------------------------------------------------------------------
# ToTensor yields values in [0, 1]; Normalize((0.5,), (0.5,)) maps them to [-1, 1].
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

trainset = datasets.MNIST(DATA_DIR, download=True, train=True, transform=transform)
valset = datasets.MNIST(DATA_DIR, download=True, train=False, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
# Evaluation does not depend on sample order, so the validation set is not shuffled.
valloader = torch.utils.data.DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)

# --- Training ----------------------------------------------------------------
model = NeuralModel(
    sizes=[input_size, hidden_layer1_size, hidden_layer2_size, output_size],
    epochs=EPOCHS,
)
# Training the model over the MNIST dataset
model.train(train_loader=trainloader, val_loader=valloader)

# --- Results -----------------------------------------------------------------
fig, ax = plt.subplots()
ax.plot(losses)
ax.set_xlabel('Epochs')
ax.set_ylabel('Test Loss')
plt.show()

In [None]:
# Sometimes this will cause a kernel crash. Using Google Colab helps.