In [1]:
# Visualize MNIST dataset in Visdom
# Train MNIST
# Plot loss curve with Visdom

In [2]:
# First, open Terminal and type:

# python -m visdom.server

# Then, open browser and go to http://localhost:8097/

In [3]:
import torch

In [4]:
from torchvision.datasets import MNIST

In [5]:
from torchvision import transforms

In [6]:
# Preprocessing applied to each image when a sample is read from the dataset:
#   1. ToTensor  - turns the PIL image into a float tensor scaled to [0, 1]
#   2. Normalize - subtracts the mean (0.5) and divides by the std (1.0) per channel,
#                  so pixel values end up in [-0.5, 0.5]
transf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(1.0,)),
])

In [7]:
# Load the training split. download=True only fetches the files into ./data when they
# are missing, so this cell also works on a fresh checkout (needs network access once).
train_set = MNIST(root='./data', train=True, transform=transf, download=True)

In [8]:
train_set.data.shape # see dataset size (`train_data` is a deprecated alias of `data`)

torch.Size([60000, 28, 28])

torch.Size([60000, 28, 28])

In [9]:
train_set.data[0] # raw pixel values of the first image (`train_data` is a deprecated alias of `data`)

tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   0,    0,

tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0],
        [   0,    0,

In [10]:
# To visualize the samples in the dataset, let's use Visdom.
# This client talks to the server started earlier with `python -m visdom.server`;
# everything sent through `vis` shows up on the http://localhost:8097/ page.
from visdom import Visdom
vis = Visdom()

In [11]:
vis.image(train_set.data[0]) # See image in Visdom page (`data` replaces the deprecated `train_data`)

'window_3629878c6dce0a'

'window_3629878c6dce0a'

In [12]:
train_set.targets[0] # label of the first image (`train_labels` is a deprecated alias of `targets`)

tensor(5)

tensor(5)

In [13]:
# Load the held-out test split (train=False); download=True only fetches missing files
test_set = MNIST(root='./data', train=False, transform=transf, download=True)

In [14]:
from torchvision.utils import make_grid # Combine images into a nice-looking grid layout

In [15]:
# first 10 samples combined together
# The raw MNIST image tensor has no channel dimension, so we use unsqueeze to manually add it
# (`data` is the current name of the deprecated `train_data` attribute)
print(train_set.data.shape)
print(train_set.data.unsqueeze(1).shape)
combined_image = make_grid(train_set.data[:10].unsqueeze(1))

torch.Size([60000, 28, 28])
torch.Size([60000, 1, 28, 28])
torch.Size([60000, 28, 28])
torch.Size([60000, 1, 28, 28])


In [16]:
vis.image(combined_image) # draw the grid in Visdom; the string echoed below is the id of the window it was drawn in

'window_3629878c7e7a50'

'window_3629878c7e7a50'

In [17]:
# A better way to visualize how well our model does: a confusion matrix
# For each target value (0-9), it shows how often our model predicts each digit
def confusion(model, n, dataset, viz=None):
    """Draw a row-normalized confusion matrix for ``model`` in Visdom.

    Entry ``[t][p]`` is the fraction of the samples with true label ``t`` that
    the model classified as ``p``, so every row that has samples sums to 1.

    Args:
        model: classifier; called on one sample at a time and expected to
            return one row of ``n`` class scores.
        n: number of classes.
        dataset: iterable of ``(data, target)`` pairs, one sample each;
            ``target`` must be convertible with ``int()``.
        viz: object whose ``image`` method receives the matrix. When omitted,
            the notebook-level ``vis`` Visdom client is used.
    """
    conf = torch.zeros(n, n)
    model.eval() # Put model in evaluation mode
    with torch.no_grad(): # inference only: no need to record gradients
        for data, target in dataset:
            # Insert a dimension at position 1, e.g. a (1, 28, 28) MNIST sample
            # becomes the (1, 1, 28, 28) batch-of-one the model expects
            output = model(data.unsqueeze(1))
            _, pred = torch.max(output, 1)
            conf[int(target), int(pred[0])] += 1

    # Normalize each row to fractions. A class without any samples has a row
    # sum of 0; dividing by 1 instead keeps that row at 0 rather than NaN.
    row_totals = conf.sum(dim=1, keepdim=True)
    row_totals[row_totals == 0] = 1
    conf = conf / row_totals

    (vis if viz is None else viz).image(conf)
    
# A perfect model should show a diagonal white line

In [18]:
import torch.nn as nn
import torch.nn.functional as F

In [19]:
# Define our own CNN
class CNN(nn.Module):
    """Small convolutional classifier for 1-channel 28x28 images such as MNIST.

    Per-sample shapes: 1x28x28 -> conv1 -> 10x24x24 -> pool -> 10x12x12
    -> conv2 -> 20x8x8 -> pool -> 20x4x4 -> flatten (320) -> fc1 (50) -> fc2 (10).
    """

    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.fc1 = nn.Linear(20 * 4 * 4, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        """Return log-probabilities of shape (batch, 10) for ``x`` of shape (batch, 1, 28, 28).

        Raises:
            RuntimeError: if the spatial size of ``x`` does not produce the 320
                features per sample that ``fc1`` expects.
        """
        # we can also use functional interface for Conv2d and pass in the weights and biases, useful in weight sharing scenario
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        # Flatten everything except the batch dimension. Keeping the batch size fixed
        # (instead of view(-1, 320)) means a wrongly sized input fails loudly in fc1
        # rather than being silently reshaped into a different number of rows.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=-1)

In [20]:
from torch import optim

In [21]:
from torch.optim import lr_scheduler

In [22]:
from torch.utils.data import TensorDataset, DataLoader

In [23]:
def train_model(model, train_set, test_set, nepochs, batch_size=100, lr=0.01, momentum=0.5, log_interval=10):
    """Train ``model`` with SGD and print test loss/accuracy after every epoch.

    Args:
        model: network returning log-probabilities per class (the input
            ``F.nll_loss`` expects).
        train_set: dataset of ``(data, target)`` pairs used for optimization.
        test_set: dataset of ``(data, target)`` pairs evaluated after each epoch.
        nepochs: number of passes over ``train_set``.
        batch_size: samples per batch for both training and evaluation.
        lr: SGD learning rate.
        momentum: SGD momentum.
        log_interval: print the training loss every this many batches
            (must be a positive integer).

    The model is updated in place and is left in evaluation mode; nothing is
    returned.
    """
    # dataloader can help you get batched data and shuffle the data
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    # Evaluation needs no shuffling; batching it avoids one forward pass per sample
    test_loader = DataLoader(test_set, batch_size=batch_size)

    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

    for epoch in range(nepochs):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            optimizer.zero_grad() # This will zero out all the gradients for the model's parameters
            output = model(data) # Log-probabilities over the possible labels
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()

            if batch_idx % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))

        model.eval()
        test_loss = 0.0
        correct = 0
        with torch.no_grad(): # evaluation only: skip gradient bookkeeping
            for data, target in test_loader:
                output = model(data)
                # reduction='sum' adds up the per-sample losses so that dividing by the
                # dataset size below gives the mean (replaces deprecated size_average=False)
                test_loss += F.nll_loss(output, target, reduction='sum').item()

                _, pred = torch.max(output, 1)

                correct += (pred == target).sum().item()

        test_loss /= len(test_set)
        print("Loss: ", test_loss, "Accuracy: ", float(correct)/len(test_set)) # TODO: use Visdom to plot the loss curve

In [24]:
torch.manual_seed(0) # fix the RNG so weight initialization and batch shuffling are reproducible
model = CNN()

In [25]:
train_model(model, train_set, test_set, 10) # train for 10 epochs; test loss and accuracy are printed after each epoch

Loss:  0.2870850748104363 Accuracy:  0.9171
Loss:  0.2870850748104363 Accuracy:  0.9171


Loss:  0.15707873255895274 Accuracy:  0.9539
Loss:  0.15707873255895274 Accuracy:  0.9539


Loss:  0.1126186316489756 Accuracy:  0.9648
Loss:  0.1126186316489756 Accuracy:  0.9648
Loss:  0.09032437564415531 Accuracy:  0.9724
Loss:  0.09032437564415531 Accuracy:  0.9724


Loss:  0.0785781591939229 Accuracy:  0.974
Loss:  0.0785781591939229 Accuracy:  0.974


Loss:  0.0696228728664324 Accuracy:  0.9773
Loss:  0.0696228728664324 Accuracy:  0.9773


Loss:  0.06639996984394331 Accuracy:  0.9769
Loss:  0.06639996984394331 Accuracy:  0.9769
Loss:  0.05440689856740562 Accuracy:  0.9836
Loss:  0.05440689856740562 Accuracy:  0.9836


Loss:  0.054457149425999024 Accuracy:  0.9821
Loss:  0.054457149425999024 Accuracy:  0.9821


Loss:  0.05136365399694581 Accuracy:  0.983
Loss:  0.05136365399694581 Accuracy:  0.983


In [28]:
confusion(model, 10, test_set) # See confusion matrix (10 classes, digits 0-9); it is drawn in Visdom

In [29]:
# Checkpoint: persist only the learned parameters (the state dict) for later use
checkpoint_path = 'mnist.pth'
torch.save(model.state_dict(), checkpoint_path)

# Restore: build a fresh CNN and copy the saved parameters into it
new_model = CNN()
saved_state = torch.load(checkpoint_path)
new_model.load_state_dict(saved_state)

In [30]:
new_model

CNN(
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
)

CNN(
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
)

In [31]:
new_model

CNN(
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
)

CNN(
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
)

In [32]:
model

CNN(
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
)

CNN(
  (conv1): Conv2d(1, 10, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(10, 20, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=320, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
)

In [None]:
# TODO: for transfer learning, make sure the two linear layers are replaced