In [1]:
import torch
import torch.nn as nn
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.autograd import Variable

In [2]:
input_size = 784 # 28x28 number of pixels (image resolution)
hidden_size = 400 # number of nodes at hidden layer
out_size = 10 # number of output classes (0-9)
epochs = 10 # how many times we pass our entire dataset into our network
batch_size = 100 # Input size of the data during one iteration
learning_rate = 0.001 # how fast we are learning

In [3]:
train_dataset = datasets.MNIST(root='./data', train=True, transform=transforms.ToTensor(), download=True)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transforms.ToTensor())

In [4]:
# Make data iterable by loading it to a loader. Shuffle data to make it independent of batches.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

![nn mnist](https://user-images.githubusercontent.com/30661597/61593615-5eb8bf00-ac14-11e9-8087-f880971b3543.png)


In [5]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, out_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) # First fully connected layer
        self.relu = nn.ReLU() # First layer Activation function
        self.fc2 = nn.Linear(hidden_size, hidden_size) # Second fully connected layer
        self.fc3 = nn.Linear(hidden_size, out_size) # Output layer
        
    def forward(self, x): # Forward propagate
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out

In [6]:
# Create an object of the class Net
net = Net(input_size, hidden_size, out_size)
CUDA = torch.cuda.is_available()
if CUDA:
    net = net.cuda()
# Loss function. Cross entropy loss is used for classification problems and comes with softmax. Therefore, no need to specify softmax.
critertion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

In [7]:
net.parameters

<bound method Module.parameters of Net(
  (fc1): Linear(in_features=784, out_features=400, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=400, out_features=400, bias=True)
  (fc3): Linear(in_features=400, out_features=10, bias=True)
)>

In [8]:
#Visualize the train loader
for i, (images, labels) in enumerate(train_loader):
    print(images.size())
    images = images.view(-1, 28*28)
    print(images.size())


torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([100, 1, 28, 28])
torch.Size([100, 784])
torch.Size([

In [9]:
# Train the model
correct_train = 0
total_train = 0
for epoch in range(epochs):
    for i, (images, labels) in enumerate(train_loader):
        #Flatten the images from size (batch_size, 1, 28, 28) to (100, 1, 28, 28) where 1 represents the number of channels (grayscale to 1), to size (100, 784) and wrap them in a Variable object
        images = Variable(images.view(-1, 28*28))
        labels = Variable(labels)
        if CUDA:
            images = images.cuda()
            labels = labels.cuda()
        # Clear the parameter gradients in the optimizer, so it won't accumulate gradients from previous epochs
        optimizer.zero_grad()
        outputs = net(images) # Forward pass
        _, predicted = torch.max(outputs.data, 1) # Return the second argument of torch.max which is the index of the maximum value
        total_train += labels.size(0) # Increment the total count
        if CUDA:
            correct_train += (predicted.cpu() == labels.cpu()).sum() # Increment the correct count
        else:
            correct_train += (predicted == labels).sum()

        loss = critertion(outputs, labels) # Difference between the predicted and actual values (loss function)
        loss.backward() # Backpropagate the loss
        optimizer.step() # Update the weights
        if (i+1) % 100 == 0:
            print('Epoch: {} Iteration: {} Loss: {:.4f} Accuracy: {:.2f}%'.format(epoch+1, i+1, loss.item(), (correct_train/total_train)*100))
print('Finished Training')

Epoch: 1 Iteration: 100 Loss: 0.3323 Accuracy: 82.35%
Epoch: 1 Iteration: 200 Loss: 0.2707 Accuracy: 87.21%
Epoch: 1 Iteration: 300 Loss: 0.1175 Accuracy: 89.30%
Epoch: 1 Iteration: 400 Loss: 0.1452 Accuracy: 90.64%
Epoch: 1 Iteration: 500 Loss: 0.1470 Accuracy: 91.65%
Epoch: 1 Iteration: 600 Loss: 0.0959 Accuracy: 92.36%
Epoch: 2 Iteration: 100 Loss: 0.1028 Accuracy: 92.99%
Epoch: 2 Iteration: 200 Loss: 0.1069 Accuracy: 93.47%
Epoch: 2 Iteration: 300 Loss: 0.0470 Accuracy: 93.86%
Epoch: 2 Iteration: 400 Loss: 0.0988 Accuracy: 94.21%
Epoch: 2 Iteration: 500 Loss: 0.0662 Accuracy: 94.47%
Epoch: 2 Iteration: 600 Loss: 0.0781 Accuracy: 94.69%
Epoch: 3 Iteration: 100 Loss: 0.0603 Accuracy: 94.94%
Epoch: 3 Iteration: 200 Loss: 0.0499 Accuracy: 95.17%
Epoch: 3 Iteration: 300 Loss: 0.0734 Accuracy: 95.36%
Epoch: 3 Iteration: 400 Loss: 0.0094 Accuracy: 95.53%
Epoch: 3 Iteration: 500 Loss: 0.0560 Accuracy: 95.67%
Epoch: 3 Iteration: 600 Loss: 0.0615 Accuracy: 95.79%
Epoch: 4 Iteration: 100 Loss

In [10]:
#Test the model (No loss and weight calculation, no weight update)
correct = 0
total = 0
for images, labels in test_loader:
    images = Variable(images.view(-1, 28*28))
    if CUDA:
        images = images.cuda()
        # For each input sample/row in the batch, the output will contain 10 elements with each element indicating the probability of that image belonging to a particular class
        labels = labels.cuda()
    outputs = net(images)
    # We could also write: predicted = outputs.data.max(1)[1]
    _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0) # Increment the total count (100 in each iteration)
    # We can also write: correct += (predicted.eq(labels).sum())
    if CUDA:
        correct += (predicted.cpu() == labels.cpu()).sum()
    else:
        correct += (predicted == labels).sum()
print('Accuracy of the network on the 10000 test images: {:.2f}%'.format((correct/total)*100))

Accuracy of the network on the 10000 test images: 98.11%
