In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import numpy as np
print("Importing done!")

Importing done!


In [2]:
batch_size = 32 # Batch size
input_dim = 784 # Input dimension (For MNIST dataset each image is of size 28 x 28 = 784)
num_of_hidden_nodes = 100 # number of hidden nodes in hidden layer
output_dim = 10 # Number of output nodes = no of classes in th dataset. In this case it is 10

learning_rate = 0.1
num_epochs = 5

In [3]:
train_loader = torch.utils.data.DataLoader(datasets.MNIST('../../data', train=True, download=False,
                                                          transform=transforms.Compose([
                                                              transforms.ToTensor(),
                                                              transforms.Normalize((0.1307,), (0.3081,))])),
                                           batch_size= batch_size, shuffle=True)

In [4]:
def sigmoid(x):
    return 1/torch.exp(x.mul(-1)).add(1)
    

def sigmoid_diff(x):
    return torch.mul(sigmoid(x), sigmoid(x).mul(-1).add(1))

# tensor = torch.FloatTensor([[1,2,3],[1,2,3]])
# print(sigmoid(tensor)) # You can use it for debugging
# torch.sigmoid(tensor)

In [5]:
# Initiliaze the weights
W_1 = torch.randn(input_dim, num_of_hidden_nodes).type(torch.FloatTensor) # Weights between input and hidden layer
W_2 = torch.randn(num_of_hidden_nodes, output_dim).type(torch.FloatTensor) # Weights between hidden layer and output

In [6]:
for epoch in range(0, num_epochs):
    correct = 0
    loss = 0
    y_batch_onehot = torch.FloatTensor(batch_size, output_dim)
    for batch_idx, (x_batch, y_batch) in enumerate(train_loader):
        # Forward Pass
        x_batch = x_batch.view(-1, 784)
        hidden_state_output = sigmoid(torch.mm(x_batch, W_1))
        output = sigmoid(torch.mm(hidden_state_output, W_2))
        
        # Convert the labels to one hot encoded format
        y_batch_onehot.zero_()
        y_batch_onehot.scatter_(1, y_batch[:, None], 1)
        
        # Loss (Mean-Squared error)     
        loss += (output - y_batch_onehot).pow(2).sum()*0.5
        _, predicted_class = output.max(1)
        correct += predicted_class.eq(y_batch).sum()       
        
        #Backward Pass (Back-Propagation)
        # Derivative of MSE Loss        
        diff = (output - y_batch_onehot)
        grad_w2 = torch.mm(hidden_state_output.t(),torch.mul(diff, sigmoid_diff(output))) # 100 x 10 dimensional
        grad_w1 =  torch.mm(x_batch.t(),torch.mul(torch.mm(torch.mul(diff, sigmoid_diff(output)), W_2.t())
                             ,sigmoid_diff(hidden_state_output))) # 784 x 100
        
        # Perform gradient descent        
        W_1 -= learning_rate*grad_w1
        W_2 -= learning_rate*grad_w2
        
        
    print("Epoch: {0} | loss: {1} | accuracy: {2}".format(epoch, loss/len(train_loader)
                                                          , correct/len(train_loader.dataset)))              

Epoch: 0 | loss: 3.5814903906020006 | accuracy: 0.8759833333333333
Epoch: 1 | loss: 2.579263446009155 | accuracy: 0.9038666666666667
Epoch: 2 | loss: 2.472711461259164 | accuracy: 0.9091333333333333
Epoch: 3 | loss: 2.4186381172526987 | accuracy: 0.9101833333333333
Epoch: 4 | loss: 2.3487378365596916 | accuracy: 0.9146166666666666


In [7]:
from torch.autograd import Variable
# import pdb
learning_rate = 0.1

W_1 = Variable(torch.randn(input_dim, num_of_hidden_nodes).type(torch.FloatTensor).cuda(), requires_grad=True)
W_2 = Variable(torch.randn(num_of_hidden_nodes, output_dim).type(torch.FloatTensor).cuda(), requires_grad=True)
y_batch_onehot = Variable(torch.FloatTensor(batch_size, output_dim).cuda(), requires_grad=True)

for epoch in range(0, num_epochs):
    
    correct = 0
    total_loss = 0
    for batch_idx, (x_batch, y_batch) in enumerate(train_loader):
        
        x_batch = Variable(x_batch.view(-1,784).cuda(), requires_grad=False)
        y_batch = Variable(y_batch.cuda(), requires_grad=False)       
        
        # Forward Pass
        hidden_state_output = torch.sigmoid(torch.mm(x_batch, W_1))
        output = torch.sigmoid(torch.mm(hidden_state_output, W_2))
        
        
        
        
        # Convert the labels to one hot encoded format
        y_batch_onehot.data.zero_()
        y_batch_onehot.data.scatter_(1, y_batch[:, None].data, 1)

        
        # Loss (Mean-Squared error)  
#         pdb.set_trace()
        loss = (output - y_batch_onehot).pow(2).sum().mul(0.5)
        total_loss += loss.data[0]
        loss.backward()

        # Calculate no of correct classifications
        _, predicted_class = output.max(1)
        correct += predicted_class.data.eq(y_batch.data).sum()              
        
       
        
        
        
        W_1.data -= learning_rate * W_1.grad.data
        W_2.data -= learning_rate * W_2.grad.data
                 # Manually zero the gradients before running the backward pass         
        W_1.grad.data.zero_()
        W_2.grad.data.zero_()

    print("Epoch: {0} | loss: {1} | accuracy: {2}".format(epoch, total_loss/len(train_loader)
                                                          , correct/len(train_loader.dataset)))              
        
        

Epoch: 0 | loss: 9.341431047058105 | accuracy: 0.5224333333333333
Epoch: 1 | loss: 6.628281626129151 | accuracy: 0.6561666666666667
Epoch: 2 | loss: 6.187631152788798 | accuracy: 0.6687666666666666
Epoch: 3 | loss: 5.620929303741455 | accuracy: 0.7129333333333333
Epoch: 4 | loss: 3.653312045923869 | accuracy: 0.8355
