In [2]:
import torch
import random
import numpy as np
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

# Reproducibility: ask cuDNN for deterministic kernels, then pin every random
# number generator the notebook touches to the same fixed seed (0).
torch.backends.cudnn.deterministic = True

torch.manual_seed(0)        # PyTorch default generator
torch.cuda.manual_seed(0)   # PyTorch CUDA generator
np.random.seed(0)           # NumPy legacy global generator
random.seed(0)              # Python standard-library generator

In [3]:
# Compute device: use CUDA when PyTorch reports it as available, else the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Model geometry: every image is consumed as `sequence_length` time steps,
# each carrying `input_size` features (one 28-pixel row per step)
sequence_length, input_size = 28, 28
hidden_size, num_layers = 128, 2
num_classes = 10

# Optimisation settings
num_epochs, batch_size = 2, 100
learning_rate = 1e-3

In [5]:
# Install a process-wide URL opener that sends a browser-like User-agent header
# on every urllib request made after this cell runs.
# The standard-library module is used directly: this notebook is Python 3 only,
# so the `six.moves` compatibility shim is an unnecessary extra dependency.
import urllib.request

opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

In [6]:
# MNIST splits, stored under ./data; each image is converted with ToTensor().
# Only the training split is constructed with download=True.
train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
test_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
)

# Mini-batch loaders: training batches are reshuffled on every pass,
# test batches keep the dataset order.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False
)


0it [00:00, ?it/s][ADownloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz

  0%|          | 0/9912422 [00:00<?, ?it/s][A
  0%|          | 16384/9912422 [00:00<01:14, 132679.22it/s][A
  1%|          | 81920/9912422 [00:00<00:24, 401763.36it/s][A
  1%|▏         | 147456/9912422 [00:00<00:19, 509027.51it/s][A
  3%|▎         | 303104/9912422 [00:00<00:10, 899585.34it/s][A
  6%|▌         | 565248/9912422 [00:01<00:06, 1486424.84it/s][A
  9%|▉         | 917504/9912422 [00:01<00:04, 2126516.97it/s][A
 15%|█▌        | 1499136/9912422 [00:01<00:02, 3292542.39it/s][A
 20%|█▉        | 1974272/9912422 [00:01<00:02, 3741149.12it/s][A
 24%|██▍       | 2359296/9912422 [00:01<00:02, 3752040.64it/s][A
 29%|██▉       | 2875392/9912422 [00:01<00:01, 4159323.57it/s][A
 33%|███▎      | 3293184/9912422 [00:01<00:01, 3412664.68it/s][A
 40%|████      | 3973120/9912422 [00:01<00:01, 3957611.97it/s][A
 44%|████▍     | 4382720/991242

In [18]:
# Peek at the image tensor of the first test batch to check its shape.
# Uses the builtin next(): DataLoader iterators follow the Python 3 iterator
# protocol (__next__) and do not provide a Python 2 style `.next()` method.
next(iter(test_loader))[0].shape

0it [1:30:41, ?it/s]


torch.Size([100, 1, 28, 28])

In [7]:
# Recurrent classifier: a multi-layer nn.RNN followed by a linear read-out
class RNN(nn.Module):
    """Sequence classifier built from a stacked ``nn.RNN`` and a linear head.

    Args:
        input_size: number of features in each time step of the input.
        hidden_size: width of the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
        num_classes: number of output scores produced per sequence.

    Note:
        ``nn.GRU`` or ``nn.LSTM`` can be used in place of ``nn.RNN`` with the
        same constructor arguments; an LSTM additionally needs an initial cell
        state ``c0`` shaped like ``h0`` and is called as ``lstm(x, (h0, c0))``.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        # batch_first=True -> inputs are laid out as (batch, seq, input_size)
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class scores for a batch of sequences.

        Args:
            x: tensor of shape (batch, seq_length, input_size).

        Returns:
            Tensor of shape (batch, num_classes).
        """
        # Zero initial hidden state, allocated on the same device and with the
        # same dtype as the input so the module does not depend on any global
        # `device` variable and keeps working after .to(...) / .double() etc.
        h0 = torch.zeros(
            self.num_layers,
            x.size(0),
            self.hidden_size,
            device=x.device,
            dtype=x.dtype,
        )

        # out: (batch, seq_length, hidden_size)
        out, _ = self.rnn(x, h0)

        # Keep only the hidden state of the last time step: (batch, hidden_size)
        last_step = out[:, -1, :]

        # Project to class scores: (batch, num_classes)
        return self.fc(last_step)

In [8]:
# Instantiate the network and move its parameters to the selected device
model = RNN(input_size, hidden_size, num_layers, num_classes)
model = model.to(device)

# Cross-entropy objective, minimised with Adam at the configured learning rate
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [9]:
# Training: iterate over the training loader for num_epochs passes and log the
# current batch loss on every 100th step.
n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Batches arrive as [N, 1, 28, 28]; flatten the channel axis away so
        # each image becomes a [sequence_length, input_size] sequence.
        images = images.reshape(-1, sequence_length, input_size)
        images, labels = images.to(device), labels.to(device)

        # Clear gradients from the previous step before computing new ones
        optimizer.zero_grad()

        # Forward pass and loss
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        if (i+1) % 100 != 0:
            continue
        print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')


Epoch [1/2], Step [100/600], Loss: 0.7935
Epoch [1/2], Step [200/600], Loss: 0.7263
Epoch [1/2], Step [300/600], Loss: 0.4443
Epoch [1/2], Step [400/600], Loss: 0.4806
Epoch [1/2], Step [500/600], Loss: 0.4036
Epoch [1/2], Step [600/600], Loss: 0.4303
Epoch [2/2], Step [100/600], Loss: 0.3552
Epoch [2/2], Step [200/600], Loss: 0.2609
Epoch [2/2], Step [300/600], Loss: 0.2514
Epoch [2/2], Step [400/600], Loss: 0.2695
Epoch [2/2], Step [500/600], Loss: 0.2530
Epoch [2/2], Step [600/600], Loss: 0.1610


In [10]:
# Evaluate classification accuracy on the test loader.
# The model is switched to eval mode for the duration of the pass (so layers
# such as dropout behave deterministically if they are ever enabled) and its
# previous train/eval mode is restored afterwards.
was_training = model.training
model.eval()

n_correct = 0
n_samples = 0
# Gradients are not needed for inference; disabling them saves memory
with torch.no_grad():
    for images, labels in test_loader:
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        outputs = model(images)
        # torch.max along dim 1 returns (values, indices); the index of the
        # largest score is the predicted class. No `.data` access is needed
        # because autograd is already disabled in this context.
        _, predicted = torch.max(outputs, 1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

model.train(was_training)

acc = 100.0 * n_correct / n_samples
# Report the number of images actually evaluated rather than a fixed count
print(f'Accuracy of the network on the {n_samples} test images: {acc} %')

Accuracy of the network on the 10000 test images: 94.92 %
