In [1]:
from __future__ import print_function
import argparse
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [2]:
# ---- Hyper-parameters -------------------------------------------------------
batch_size = 64          # mini-batch size for the training loader
test_batch_size = 128    # mini-batch size for the evaluation loader
epochs = 5               # number of full passes over the training set
lr = 0.01                # optimizer learning rate
momentum = 0.0           # momentum term handed to SGD
weight_decay = 1e-4      # L2 penalty handed to the optimizer
optimizer_type = 'SGD'   # optimizer selector; accepted values are listed in the optimizer setup cell

# ---- Runtime options --------------------------------------------------------
seed = 42
cuda = False
# Use the GPU only when it was requested AND one is actually available.
use_cuda = cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# ---- Reproducibility --------------------------------------------------------
# Seed every random number generator this notebook relies on.
np.random.seed(seed)
torch.manual_seed(seed)
if use_cuda:
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Handle GPU stochasticity: cuDNN is enabled only when running on CUDA, and its
# auto-tuner is switched off in favour of deterministic algorithms.
torch.backends.cudnn.enabled = use_cuda
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
class Net(nn.Module):
    """Small fully connected classifier: Linear -> BatchNorm1d -> ReLU -> Linear.

    Args:
        image_size: number of input features per sample (784 by default,
            i.e. a flattened 28x28 image).
        hidden_size: width of the hidden layer (defaults to 100).
        num_classes: number of output logits (defaults to 10).

    The defaults reproduce the original fixed 784-100-10 architecture, and the
    layer attribute names are unchanged so existing checkpoints still load.
    """

    def __init__(self, image_size=784, hidden_size=100, num_classes=10):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(image_size, hidden_size, bias=True)
        self.BN = nn.BatchNorm1d(hidden_size)
        self.ReLU = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes, bias=True)

    def forward(self, x):
        """Map a batch of flattened inputs to raw (un-normalised) class logits.

        ``x`` is fed straight into ``fc1``, so its last dimension must equal
        ``image_size``; callers are responsible for flattening images first.
        """
        x = self.fc1(x)
        x = self.BN(x)
        x = self.ReLU(x)
        logits = self.fc2(x)
        return logits

In [4]:
# Preprocessing applied to every image (scaling, augmentation, ...)
transform = transforms.Compose([
    # PIL Image / numpy.ndarray -> tensor
    transforms.ToTensor(),
    # standardise: subtract 0.1307, then divide by 0.3081
    transforms.Normalize((0.1307,), (0.3081,)),
])

# torchvision's datasets module provides predefined datasets that are easy to
# download and manipulate. The training split is fetched into ./data if missing;
# the test split is read from the same directory.
dataset1 = datasets.MNIST('./data', train=True, transform=transform, download=True)
dataset2 = datasets.MNIST('./data', train=False, transform=transform)

# A DataLoader pairs a dataset with a sampler and exposes an iterable of batches.
# num_workers is kept at 1 here; raise it (e.g. to 4) on machines with spare cores.
train_loader = torch.utils.data.DataLoader(
    dataset1,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
)
test_loader = torch.utils.data.DataLoader(
    dataset2,
    batch_size=test_batch_size,
    num_workers=1,
)

# Create an instance of our model and move it to the selected device
model = Net().to(device)
# Loss criterion
criterion = nn.CrossEntropyLoss()

# Build the optimizer selected by `optimizer_type`
if optimizer_type == 'SGD':
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)
elif optimizer_type == 'Adam':
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=weight_decay)
elif optimizer_type in ('RMSprop', 'RMSProp'):
    # Both spellings are accepted: 'RMSprop' matches the torch class name and
    # 'RMSProp' is the value this branch originally tested for.
    # `alpha` is an RMSprop argument; Adam does not accept it, so this branch
    # must construct optim.RMSprop.
    optimizer = optim.RMSprop(model.parameters(), lr=lr, alpha=0.99, eps=1e-08, weight_decay=weight_decay)
else:
    # Fail loudly here; otherwise `optimizer` stays undefined (or stale from an
    # earlier run of this cell) and the error only surfaces during training.
    raise NotImplementedError("optimizer not implemented")

In [5]:
def test():
    """Evaluate the global ``model`` on ``test_loader`` and print the results.

    Reads the module-level ``model``, ``test_loader``, ``criterion`` and
    ``device``. Prints the per-sample average loss and the accuracy; returns
    nothing. The model is left in eval mode afterwards.
    """
    # Tell every layer we are evaluating, so batchnorm / dropout layers use
    # their inference behaviour instead of their training behaviour.
    model.eval()

    num_samples = len(test_loader.dataset)
    loss_sum = 0
    num_correct = 0

    # Autograd is not needed for evaluation; switching it off reduces memory
    # usage and speeds up computation.
    with torch.no_grad():
        for images, labels in test_loader:
            # flatten each image to a 28*28 vector on the target device
            images = images.to(device).view(images.shape[0], 28*28)
            labels = labels.to(device)

            scores = model(images)
            # scale the batch loss by the batch size so the final division by
            # the dataset size yields a per-sample average
            loss_sum += criterion(scores, labels).item() * labels.shape[0]

            # index of the largest logit is the predicted class
            predicted = scores.argmax(dim=1, keepdim=True)
            # count predictions that agree with the labels
            num_correct += predicted.eq(labels.view_as(predicted)).sum().item()

    avg_loss = loss_sum / num_samples
    print('\nTest set loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        avg_loss, num_correct, num_samples,
        100. * num_correct / num_samples))

In [7]:
# Put the layers into training behaviour before optimising.
model.train()
for epoch in range(1, epochs + 1):
    # running sum of the per-batch losses seen in this epoch
    cumm_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        # Move the batch to the selected device (the GPU when one is used) and
        # flatten every image into a 28*28 vector.
        data = data.to(device).view(data.shape[0], 28*28)
        target = target.to(device)

        # Important: reset the gradients of all optimized tensors. By default
        # .backward() accumulates into the gradient buffers instead of
        # overwriting them.
        optimizer.zero_grad()

        # Forward pass, then the loss for this batch.
        logits = model(data)
        loss = criterion(logits, target)
        cumm_loss += loss.item()

        # Backward pass: autograd computes the gradient of the loss with
        # respect to every tensor that has requires_grad=True, and frees the
        # computation graph afterwards.
        loss.backward()

        # Apply the parameter update.
        optimizer.step()

    # Report the average of the batch losses for this epoch.
    print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, cumm_loss / len(train_loader)))

# Evaluate once on the test set after the last epoch.
test()

Train Epoch: 1 	Loss: 0.171702
Train Epoch: 2 	Loss: 0.158831
Train Epoch: 3 	Loss: 0.145646
Train Epoch: 4 	Loss: 0.135475
Train Epoch: 5 	Loss: 0.126800

Test set loss: 0.1217, Accuracy: 9682/10000 (97%)

