In [1]:
import os
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from torch.autograd import Variable
import torch
import torch.nn as nn
import torchvision
from torchvision import datasets, models, transforms
import math

In [2]:
class GRU(nn.Module):
    """A single GRU cell: computes one recurrent step from an input and a hidden state.

    Two linear layers project the input and the previous hidden state to
    ``3 * hid_sz`` features each. Both projections are split into three equal
    parts used, in order, for the reset gate, the update gate and the
    candidate state:

        r  = sigmoid(i_r + h_r)
        z  = sigmoid(i_z + h_z)
        n  = tanh(i_n + r * h_n)
        h' = n + z * (h - n)

    Args:
        in_sz: number of input features per time step.
        hid_sz: size of the hidden state.
        bias: whether both linear projections carry a bias term.
    """

    def __init__(self, in_sz, hid_sz, bias=True):
        super().__init__()
        self.in_sz = in_sz
        self.hid_sz = hid_sz
        self.bias = bias
        # Input-to-hidden and hidden-to-hidden projections; each output stacks
        # the three gate pre-activations along the feature axis.
        self.layer1 = nn.Linear(in_sz, 3 * hid_sz, bias=bias)
        self.layer2 = nn.Linear(hid_sz, 3 * hid_sz, bias=bias)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw every parameter from U(-1/sqrt(hid_sz), 1/sqrt(hid_sz))."""
        std = 1.0 / math.sqrt(self.hid_sz)
        with torch.no_grad():
            for w in self.parameters():
                w.uniform_(-std, std)

    def forward(self, x, hidden):
        """Run one GRU step.

        Args:
            x: input of shape (batch, in_sz).
            hidden: previous hidden state, shape (batch, hid_sz).

        Returns:
            The next hidden state, broadcast-compatible with ``hidden``.
        """
        x = x.view(-1, x.size(1))

        gate_x = self.layer1(x)
        gate_h = self.layer2(hidden)

        # Split along the feature (last) axis. No squeeze() here: squeezing
        # removed the batch axis for a batch of one and made the split fail.
        i_r, i_z, i_n = gate_x.chunk(3, dim=-1)
        h_r, h_z, h_n = gate_h.chunk(3, dim=-1)

        reset_gate = torch.sigmoid(i_r + h_r)
        update_gate = torch.sigmoid(i_z + h_z)
        candidate = torch.tanh(i_n + (reset_gate * h_n))

        # Equivalent to (1 - z) * candidate + z * hidden.
        hy = candidate + update_gate * (hidden - candidate)

        return hy

#create GRU model class
class GRUModel(nn.Module):
    """Sequence classifier: a GRU cell unrolled over time plus a linear read-out.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the recurrent hidden state.
        layer_dim: stored on the instance for interface compatibility;
            forward() unrolls a single GRU cell regardless of this value.
        output_dim: number of output logits.
        bias: whether the GRU cell's linear projections use a bias term.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim, bias=True):
        super().__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        # Pass `bias` by keyword: the cell's third positional parameter is
        # its bias flag, not a layer count.
        self.gru = GRU(input_dim, hidden_dim, bias=bias)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: tensor of shape (batch, seq_len, input_dim).

        Returns:
            Logits of shape (batch, output_dim), computed from the hidden
            state after the last time step.
        """
        # Zero initial state created on the input's own device and dtype, so
        # the model works wherever its inputs live.
        hn = torch.zeros(x.size(0), self.hidden_dim, device=x.device, dtype=x.dtype)

        # Unroll the cell over the time axis; only the final state is needed.
        for seq in range(x.size(1)):
            hn = self.gru(x[:, seq, :], hn)

        # No squeeze(): the batch axis is kept even for a batch of one.
        out = self.fc(hn)
        return out

In [3]:
# MNIST train and test splits rooted at ./data, with the ToTensor transform applied.
# Only the training split requests a download.
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
test_dataset = datasets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
)

In [4]:
# Mini-batch size shared by both loaders
B_SIZE = 256

# Training batches are reshuffled; test batches keep the dataset order
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=B_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=B_SIZE, shuffle=False)

In [5]:
def train_epoch(model, train_loader, optimizer, criterion, epoch, device, seq_dim):
    """Train a model for one epoch.

    Args:
        model: module called on batches shaped (batch, seq_dim, features).
        train_loader: sized iterable yielding (images, labels) batches.
        optimizer: optimizer updating the model's parameters.
        criterion: loss called as criterion(outputs, labels).
        epoch: zero-based epoch index, used only for the progress-bar text.
        device: device both images and labels are moved to.
        seq_dim: number of time steps each sample is reshaped into.

    Returns:
        (mean_loss, loss_list): the mean of the per-batch losses and the
        list of per-batch loss values.
    """
    loss_list = []
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader))
    for i, (images, labels) in progress_bar:
        # Reshape to (batch, seq_dim, features). The batch size is pinned and
        # the feature width inferred, so no notebook-level global is needed
        # and a size mismatch cannot silently change the batch size.
        images = images.view(images.size(0), seq_dim, -1).to(device)
        labels = labels.to(device)

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        outputs = model(images)

        # Calculate loss on the logits
        loss = criterion(outputs, labels)
        loss_value = loss.item()
        loss_list.append(loss_value)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        progress_bar.set_description(f"Epoch {epoch+1} Iter {i+1}: loss {loss_value:.5f}. ")

    mean_loss = np.mean(loss_list)
    return mean_loss, loss_list


@torch.no_grad()
def eval_model(model, eval_loader, criterion, device, seq_dim):
    """Evaluate the model on a validation or test loader, without gradients.

    Args:
        model: module called on batches shaped (batch, seq_dim, features).
        eval_loader: iterable yielding (images, labels) batches.
        criterion: loss called as criterion(outputs, labels).
        device: device both images and labels are moved to.
        seq_dim: number of time steps each sample is reshaped into.

    Returns:
        (accuracy, loss): accuracy in percent over all samples, and the mean
        of the per-batch losses.
    """
    correct = 0
    total = 0
    loss_list = []

    for images, labels in eval_loader:
        # Reshape to (batch, seq_dim, features); the feature width is inferred
        # so the function does not depend on a notebook-level global.
        images = images.view(images.size(0), seq_dim, -1).to(device)
        labels = labels.to(device)

        # Forward pass only to get logits/output
        outputs = model(images)

        loss = criterion(outputs, labels)
        loss_list.append(loss.item())

        # Predicted class is the index of the largest logit
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    # Total correct predictions and loss
    accuracy = correct / total * 100
    loss = np.mean(loss_list)

    return accuracy, loss


def train_model(model, optimizer, scheduler, criterion, train_loader, valid_loader, num_epochs, seq_dim, device=None):
    """Train a model for a given number of epochs.

    Each epoch first evaluates on `valid_loader` and then trains on
    `train_loader`. The validation numbers recorded and printed for epoch k
    therefore describe the model as it was *before* that epoch's training
    pass, and the weights produced by the last epoch are not evaluated here.

    Args:
        model: the module to train.
        optimizer: optimizer updating the model's parameters.
        scheduler: learning-rate scheduler, stepped once per epoch.
        criterion: loss function.
        train_loader: loader passed to train_epoch.
        valid_loader: loader passed to eval_model.
        num_epochs: number of epochs to run.
        seq_dim: number of time steps each sample is reshaped into.
        device: device batches are moved to. When None, the device of the
            model's first parameter is used (CPU if it has no parameters).

    Returns:
        (train_loss, val_loss, loss_iters, valid_acc): per-epoch mean training
        loss, per-epoch validation loss, the per-iteration training losses of
        all epochs concatenated, and per-epoch validation accuracy in percent.
    """
    if device is None:
        # Follow the model rather than relying on a notebook-level global.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    train_loss = []
    val_loss = []
    loss_iters = []
    valid_acc = []

    for epoch in range(num_epochs):

        # validation epoch
        model.eval()  # important for dropout and batch norms
        accuracy, loss = eval_model(
                    model=model, eval_loader=valid_loader,
                    criterion=criterion, device=device, seq_dim=seq_dim
            )
        valid_acc.append(accuracy)
        val_loss.append(loss)

        # training epoch
        model.train()  # important for dropout and batch norms
        mean_loss, cur_loss_iters = train_epoch(
                model=model, train_loader=train_loader, optimizer=optimizer,
                criterion=criterion, epoch=epoch, device=device, seq_dim=seq_dim
            )
        scheduler.step()
        train_loss.append(mean_loss)
        # extend() appends in place instead of rebuilding the list every epoch
        loss_iters.extend(cur_loss_iters)

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"    Train loss: {round(mean_loss, 5)}")
        print(f"    Valid loss: {round(loss, 5)}")
        # Fixed precision avoids float artefacts such as 81.82000000000001%
        print(f"    Accuracy: {accuracy:.2f}%")
        print("\n")

    print("Training completed")
    return train_loss, val_loss, loss_iters, valid_acc


def smooth(f, K=5):
    """Moving-average (low-pass) filter of width K over a 1-D sequence.

    The sequence is extended with copies of its first and last few samples
    before filtering, and that extension is cut off again afterwards, so the
    result has as many samples as the input.
    """
    box = np.ones(K) / K
    head = K // 2
    tail = -K // 2  # negative index: start of the trailing extension
    extended = np.concatenate([f[:head], f, f[tail:]])
    averaged = np.convolve(extended, box, mode="same")
    return averaged[head:tail]

def count_model_params(model):
    """Return how many scalar entries the trainable parameters of a nn.Module hold."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are left out of the count
        if param.requires_grad:
            total += param.numel()
    return total

In [6]:
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [7]:
# Features fed to the GRU cell at every time step
input_dim = 28
# Size of the recurrent hidden state
hidden_dim = 128
# Layer count handed to GRUModel
layer_dim = 1
# Number of logits produced by the final linear layer
output_dim = 10

In [8]:
# Build the classifier from the notebook-level hyperparameters
model_GRU = GRUModel(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    layer_dim=layer_dim,
    output_dim=output_dim,
)


In [9]:
# nn.Module.to() moves the parameters in place and returns the module itself,
# so no reassignment is needed before printing the layer structure.
model_GRU.to(device)
print(model_GRU)

GRUModel(
  (gru): GRU(
    (layer1): Linear(in_features=28, out_features=384, bias=True)
    (layer2): Linear(in_features=128, out_features=384, bias=True)
  )
  (fc): Linear(in_features=128, out_features=10, bias=True)
)


In [10]:
# Classification loss: cross-entropy computed on the logits returned by the model
criterion = nn.CrossEntropyLoss()

# Adam over every parameter of model_GRU, with learning rate 3e-4
optimizer = torch.optim.Adam(model_GRU.parameters(), lr=3e-4)

# Multiply the learning rate by gamma=0.2 after every 5 scheduler steps
# (train_model steps the scheduler once per epoch)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.2)

In [11]:
# Run 20 epochs; each 28x28 image is read as a sequence of 28 time steps.
# The test loader doubles as the validation set.
train_loss, val_loss, loss_iters, valid_acc = train_model(
    model=model_GRU,
    optimizer=optimizer,
    scheduler=scheduler,
    criterion=criterion,
    train_loader=train_loader,
    valid_loader=test_loader,
    num_epochs=20,
    seq_dim=28,
)

Epoch 1 Iter 235: loss 1.10431. : 100%|██████████████████████████████████████████████| 235/235 [00:40<00:00,  5.87it/s]


Epoch 1/20
    Train loss: 1.55078
    Valid loss: 2.30515
    Accuracy: 9.4%




Epoch 2 Iter 235: loss 0.49073. : 100%|██████████████████████████████████████████████| 235/235 [00:40<00:00,  5.85it/s]


Epoch 2/20
    Train loss: 0.69174
    Valid loss: 0.83831
    Accuracy: 72.41%




Epoch 3 Iter 235: loss 0.51085. : 100%|██████████████████████████████████████████████| 235/235 [00:40<00:00,  5.83it/s]


Epoch 3/20
    Train loss: 0.49299
    Valid loss: 0.56083
    Accuracy: 81.82000000000001%




Epoch 4 Iter 235: loss 0.45847. : 100%|██████████████████████████████████████████████| 235/235 [00:40<00:00,  5.83it/s]


Epoch 4/20
    Train loss: 0.36696
    Valid loss: 0.41034
    Accuracy: 87.31%




Epoch 5 Iter 235: loss 0.29104. : 100%|██████████████████████████████████████████████| 235/235 [00:40<00:00,  5.82it/s]


Epoch 5/20
    Train loss: 0.28554
    Valid loss: 0.30923
    Accuracy: 90.62%




Epoch 6 Iter 235: loss 0.24356. : 100%|██████████████████████████████████████████████| 235/235 [00:40<00:00,  5.87it/s]


Epoch 6/20
    Train loss: 0.24216
    Valid loss: 0.24634
    Accuracy: 92.62%




Epoch 7 Iter 235: loss 0.29799. : 100%|██████████████████████████████████████████████| 235/235 [00:39<00:00,  5.89it/s]


Epoch 7/20
    Train loss: 0.2316
    Valid loss: 0.22904
    Accuracy: 93.37%




Epoch 8 Iter 235: loss 0.26963. : 100%|██████████████████████████████████████████████| 235/235 [00:41<00:00,  5.63it/s]


Epoch 8/20
    Train loss: 0.22243
    Valid loss: 0.22043
    Accuracy: 93.47999999999999%




Epoch 9 Iter 235: loss 0.21602. : 100%|██████████████████████████████████████████████| 235/235 [00:40<00:00,  5.82it/s]


Epoch 9/20
    Train loss: 0.21378
    Valid loss: 0.21149
    Accuracy: 93.75%




Epoch 10 Iter 235: loss 0.20722. : 100%|█████████████████████████████████████████████| 235/235 [00:40<00:00,  5.81it/s]


Epoch 10/20
    Train loss: 0.20486
    Valid loss: 0.20214
    Accuracy: 93.99%




Epoch 11 Iter 235: loss 0.25198. : 100%|█████████████████████████████████████████████| 235/235 [00:39<00:00,  5.88it/s]


Epoch 11/20
    Train loss: 0.19867
    Valid loss: 0.19535
    Accuracy: 94.31%




Epoch 12 Iter 235: loss 0.18078. : 100%|█████████████████████████████████████████████| 235/235 [00:40<00:00,  5.87it/s]


Epoch 12/20
    Train loss: 0.19671
    Valid loss: 0.19232
    Accuracy: 94.31%




Epoch 13 Iter 235: loss 0.12293. : 100%|█████████████████████████████████████████████| 235/235 [00:39<00:00,  5.89it/s]


Epoch 13/20
    Train loss: 0.19498
    Valid loss: 0.19101
    Accuracy: 94.36%




Epoch 14 Iter 235: loss 0.25450. : 100%|█████████████████████████████████████████████| 235/235 [00:39<00:00,  5.88it/s]


Epoch 14/20
    Train loss: 0.19379
    Valid loss: 0.18945
    Accuracy: 94.45%




Epoch 15 Iter 235: loss 0.30398. : 100%|█████████████████████████████████████████████| 235/235 [00:39<00:00,  5.93it/s]


Epoch 15/20
    Train loss: 0.1923
    Valid loss: 0.1879
    Accuracy: 94.5%




Epoch 16 Iter 235: loss 0.21057. : 100%|█████████████████████████████████████████████| 235/235 [00:40<00:00,  5.83it/s]


Epoch 16/20
    Train loss: 0.1908
    Valid loss: 0.1866
    Accuracy: 94.61%




Epoch 17 Iter 235: loss 0.25099. : 100%|█████████████████████████████████████████████| 235/235 [00:40<00:00,  5.84it/s]


Epoch 17/20
    Train loss: 0.19056
    Valid loss: 0.18614
    Accuracy: 94.58%




Epoch 18 Iter 235: loss 0.09525. : 100%|█████████████████████████████████████████████| 235/235 [00:40<00:00,  5.83it/s]


Epoch 18/20
    Train loss: 0.18981
    Valid loss: 0.18587
    Accuracy: 94.6%




Epoch 19 Iter 235: loss 0.24636. : 100%|█████████████████████████████████████████████| 235/235 [00:39<00:00,  5.90it/s]


Epoch 19/20
    Train loss: 0.18992
    Valid loss: 0.18557
    Accuracy: 94.6%




Epoch 20 Iter 235: loss 0.11437. : 100%|█████████████████████████████████████████████| 235/235 [00:39<00:00,  5.93it/s]

Epoch 20/20
    Train loss: 0.18927
    Valid loss: 0.18529
    Accuracy: 94.58%


Training completed



