# Imports

In [1]:
import torch
import torch.nn as nn # all nn modules
import torch.optim as optim # optimization algorithms
import torch.nn.functional as F # activation functions like relu, tanh (all functions with no parameters)
from torch.utils.data import DataLoader # helps with data loading and batching
import torchvision.datasets as datasets # has many data sets
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

# Create a fully connected NN

In [2]:
# NN inherits from nn.Module.
# The first linear layer maps input_size (in this case 784) features to 512 hidden units,
# and the second linear layer maps those 512 units to num_classes (in this case 10).
class NN(nn.Module):
    """Fully connected classifier with a single hidden layer.

    Architecture: Linear(input_size -> hidden_size) -> ReLU ->
    Linear(hidden_size -> num_classes). The output is the raw result of the
    last linear layer; no softmax is applied here.

    Args:
        input_size: Number of features per (flattened) example.
        num_classes: Number of output scores per example.
        hidden_size: Width of the hidden layer. Defaults to 512, the value
            that used to be hard-coded.
    """

    def __init__(self, input_size, num_classes, hidden_size=512):
        super().__init__()
        # The attribute names fc1/fc3 are kept as-is so that state_dict keys
        # saved from earlier versions of this class still load.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Map x of shape (..., input_size) to scores of shape (..., num_classes)."""
        hidden = F.relu(self.fc1(x))
        return self.fc3(hidden)

In [3]:
# Quick shape check before training.
# Each 28*28 image is passed in flattened, as 784 features; there are 10 digit classes.
model = NN(input_size=784, num_classes=10)
# A random mini-batch of 64 flattened examples stands in for real images.
x = torch.rand(64, 784)
model(x).shape

torch.Size([64, 10])

In [4]:
# Select the compute device: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

# Hyperparameters

In [5]:
input_size = 28 * 28  # one flattened 28x28 image = 784 features
num_classes = 10  # number of digit classes
learning_rate = 1e-3
batch_size = 64  # number of examples passed through the network per iteration
epochs = 3  # number of full passes over the training set

# Load dataset

In [6]:
# download=True fetches MNIST into data/ when it is not already there, so this
# cell also works on a fresh checkout; when the files exist nothing is re-downloaded.
train_data = datasets.MNIST(
    root="data/",
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
# The Dataset is wrapped in a DataLoader, which provides an iterable over it with
# automatic batching and shuffling. Each element of the iterable is one batch of
# `batch_size` (here 64) features and labels.
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [7]:
# download=True fetches the MNIST test split into data/ when it is missing,
# so the notebook runs from a clean environment.
test_data = datasets.MNIST(
    root="data/",
    train=False,
    transform=transforms.ToTensor(),
    download=True,
)
# Evaluation does not benefit from shuffling; a fixed order keeps the
# per-batch results reproducible from run to run.
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)

In [8]:
# Look at a single batch from the test loader to confirm tensor shapes and label dtype.
# A default of None keeps this a no-op when the loader yields nothing.
_first_batch = next(iter(test_loader), None)
if _first_batch is not None:
    X, y = _first_batch
    print("Shape of X [N, C, H, W]: ", X.shape)
    print("Shape of y: ", y.shape, y.dtype)

Shape of X [N, C, H, W]:  torch.Size([64, 1, 28, 28])
Shape of y:  torch.Size([64]) torch.int64


# Initialize the network

In [9]:
# Build the network from the configured sizes, then move it to the selected device.
model = NN(input_size=input_size, num_classes=num_classes)
model = model.to(device)

# Loss & optimizer

In [10]:
# Cross-entropy loss between the model's class scores and the integer labels.
loss_fn = nn.CrossEntropyLoss()
# Adam over every parameter of the model, using the configured learning rate.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Train the network

In [11]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Every batch is moved to the device the model's parameters live on,
    flattened to (batch, features), passed through the model, and used for
    one optimizer step. The current loss is printed every 100 batches.

    Args:
        dataloader: Iterable of ``(X, y)`` batches that exposes a sized
            ``.dataset`` (used only for the progress print-out).
        model: ``nn.Module`` to optimise. It is switched to training mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer built over ``model``'s parameters.
    """
    size = len(dataloader.dataset)

    # Take the target device from the model itself instead of relying on a
    # module-level `device` variable defined in some other cell.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    # test() puts the model in eval mode; switch back explicitly so every
    # epoch after the first really trains in training mode.
    model.train()

    seen = 0  # samples consumed before the current batch
    for batch, (X, y) in enumerate(dataloader):
        if target_device is not None:
            X, y = X.to(target_device), y.to(target_device)

        # Flatten everything after the batch dimension,
        # e.g. (64, 1, 28, 28) -> (64, 784).
        X = X.reshape(X.shape[0], -1)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation: clear the gradients left over from the previous
        # batch, then compute fresh ones.
        optimizer.zero_grad()
        loss.backward()

        # Gradient step
        optimizer.step()

        if batch % 100 == 0:
            # `seen` stays correct even when a batch is shorter than the others.
            print(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")
        seen += len(X)

In [12]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    
    num_batches = len(dataloader)
    
    model.eval()
    test_loss, correct = 0, 0
    
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            X = X.reshape(X.shape[0],-1)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [13]:
# Run `epochs` full passes: each one trains on train_loader and then
# reports accuracy and average loss on test_loader.
for epoch in range(epochs):
    # epoch is zero-based; print it one-based for readability.
    print(f"Epoch {epoch+1}\n-------------------------------")
    train(train_loader, model, loss_fn, optimizer)
    test(test_loader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 2.289479  [    0/60000]
loss: 0.478678  [ 6400/60000]
loss: 0.266408  [12800/60000]
loss: 0.251128  [19200/60000]
loss: 0.238137  [25600/60000]
loss: 0.169743  [32000/60000]
loss: 0.076256  [38400/60000]
loss: 0.158224  [44800/60000]
loss: 0.194065  [51200/60000]
loss: 0.122357  [57600/60000]
Test Error: 
 Accuracy: 96.1%, Avg loss: 0.130064 

Epoch 2
-------------------------------
loss: 0.064512  [    0/60000]
loss: 0.228716  [ 6400/60000]
loss: 0.024169  [12800/60000]
loss: 0.099922  [19200/60000]
loss: 0.153369  [25600/60000]
loss: 0.082167  [32000/60000]
loss: 0.172135  [38400/60000]
loss: 0.179974  [44800/60000]
loss: 0.048316  [51200/60000]
loss: 0.038613  [57600/60000]
Test Error: 
 Accuracy: 97.3%, Avg loss: 0.084861 

Epoch 3
-------------------------------
loss: 0.094776  [    0/60000]
loss: 0.031319  [ 6400/60000]
loss: 0.041012  [12800/60000]
loss: 0.124402  [19200/60000]
loss: 0.036856  [25600/60000]
loss: 0.070971  [32000/600