## Classification using a self-written Gated Recurrent Unit (GRU) model

In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from torch.autograd import Variable
from itertools import chain

In [4]:
'''
STEP 1: LOADING DATASET
'''
# Both MNIST splits live under ./data and are converted to tensors on access.
# Only the training split asks for a download.
train_dataset = dsets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)

test_dataset = dsets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
)

'''
STEP 2: MAKING DATASET ITERABLE
'''

batch_size = 100
n_iters = 6000
# Number of full passes over the training set needed to reach n_iters
# mini-batch updates, truncated to a whole number of epochs.
num_epochs = int(n_iters / (len(train_dataset) / batch_size))

# Training batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Test batches are served in dataset order.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)
 
'''
STEP 3: CREATE MODEL CLASS: GRU
'''
 
class GRUModel(nn.Module):
    """Multi-layer GRU classifier assembled from plain ``nn.Linear`` layers.

    For every layer and every time step the cell computes, with ``x`` the
    layer input and ``h`` that layer's hidden state from the previous step::

        r  = sigmoid(W_ir x + W_hr h)         # reset gate
        z  = sigmoid(W_iz x + W_hz h)         # update gate
        n  = tanh(W_in x + r * (W_hn h))      # candidate ("new") state
        h' = (1 - z) * n + z * h

    (each ``W`` is an ``nn.Linear`` and therefore includes its own bias).
    The first layer reads the raw input; every further layer reads the fresh
    hidden state of the layer below it. The hidden state of the top layer at
    the last time step is fed through a linear readout.

    Args:
        input_dim: Number of features per time step of the input.
        hidden_dim: Size of the hidden state of every layer.
        layer_dim: Number of stacked GRU layers (must be >= 1).
        output_dim: Number of output logits.

    Raises:
        ValueError: If ``layer_dim`` is smaller than 1.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(GRUModel, self).__init__()
        if layer_dim < 1:
            raise ValueError(
                'layer_dim must be at least 1, got {}'.format(layer_dim))

        # Hidden dimensions
        self.hidden_dim = hidden_dim

        # Number of hidden layers
        self.layer_dim = layer_dim

        # Sub-models of the GRU, grouped by role; each group holds one
        # nn.Linear per layer. Naming: 'wb' = weights and biases,
        # 'i*' = applied to the layer input, 'h*' = applied to the previous
        # hidden state, '*r' / '*z' / '*n' = reset / update / new gate.
        #
        # ModuleDict/ModuleList (instead of a plain dict of lists) register
        # the layers with nn.Module, so the inherited parameters(), cuda(),
        # to(), state_dict() etc. all see them without custom overrides.
        self.model_dict = nn.ModuleDict({
            'wb_ir': nn.ModuleList(),
            'wb_hr': nn.ModuleList(),
            'wb_iz': nn.ModuleList(),
            'wb_hz': nn.ModuleList(),
            'wb_in': nn.ModuleList(),
            'wb_hn': nn.ModuleList(),
        })

        self.sigmoid = nn.Sigmoid()  # Squashes the reset and update gates
        self.tanh = nn.Tanh()  # Squashes the candidate state

        gates = self.model_dict
        for layer in range(layer_dim):
            # Only the first layer sees the raw input; the others consume the
            # hidden state of the layer below.
            in_features = input_dim if layer == 0 else hidden_dim
            # Reset gate
            gates['wb_ir'].append(nn.Linear(in_features, hidden_dim))
            gates['wb_hr'].append(nn.Linear(hidden_dim, hidden_dim))
            # Update gate
            gates['wb_iz'].append(nn.Linear(in_features, hidden_dim))
            gates['wb_hz'].append(nn.Linear(hidden_dim, hidden_dim))
            # New gate
            gates['wb_in'].append(nn.Linear(in_features, hidden_dim))
            gates['wb_hn'].append(nn.Linear(hidden_dim, hidden_dim))

        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run the GRU over a batch of sequences and classify the last state.

        Args:
            x: Tensor of shape (batch_size, seq_dim, input_dim).

        Returns:
            Tensor of shape (batch_size, output_dim) holding the logits
            computed from the top layer's hidden state after the last step.
        """
        batch_size = x.size(0)
        time_steps = x.size(1)
        gates = self.model_dict

        # Hidden state of the previous time step, one tensor per layer,
        # starting at zero. new_zeros keeps device and dtype in sync with the
        # input instead of guessing from torch.cuda.is_available().
        prev_h = [x.new_zeros(batch_size, self.hidden_dim)
                  for _ in range(self.layer_dim)]

        for step in range(time_steps):
            # A fresh list per step: no tensor is ever modified in place, so
            # autograd can still reach the values it needs during backward.
            curr_h = []
            layer_input = x[:, step, :]
            for layer in range(self.layer_dim):
                h_prev = prev_h[layer]
                reset_gate = self.sigmoid(
                    gates['wb_ir'][layer](layer_input)
                    + gates['wb_hr'][layer](h_prev))
                update_gate = self.sigmoid(
                    gates['wb_iz'][layer](layer_input)
                    + gates['wb_hz'][layer](h_prev))
                new_gate = self.tanh(
                    gates['wb_in'][layer](layer_input)
                    + reset_gate * gates['wb_hn'][layer](h_prev))

                h_new = (1 - update_gate) * new_gate + update_gate * h_prev
                curr_h.append(h_new)
                # The next layer up consumes this layer's fresh hidden state.
                layer_input = h_new

            # Only the most recent step is needed for the next iteration.
            prev_h = curr_h

        # Classification from the top layer at the last time step
        # out.size() --> batch_size, output_dim
        out = self.fc(prev_h[-1])
        return out
 
'''
STEP 4: INSTANTIATE MODEL CLASS
'''
input_dim = 28   # Features per time step (one image row per step, see seq_dim)
hidden_dim = 100
layer_dim = 3    # Number of stacked GRU layers
output_dim = 10

model = GRUModel(input_dim, hidden_dim, layer_dim, output_dim)

# Query CUDA availability once and reuse the answer everywhere below, so the
# model and every batch are guaranteed to end up on the same device.
use_cuda = torch.cuda.is_available()
if use_cuda:
    model.cuda()

'''
STEP 5: INSTANTIATE LOSS CLASS
'''
criterion = nn.CrossEntropyLoss()

'''
STEP 6: INSTANTIATE OPTIMIZER CLASS
'''
learning_rate = 0.1

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

'''
STEP 7: TRAIN THE MODEL
'''

# Number of steps to unroll
seq_dim = 28

# Total number of optimizer updates so far (named to avoid shadowing the
# builtin iter()).
iteration = 0
for epoch in range(num_epochs):
    for images, labels in train_loader:
        # Reshape each batch to (batch, seq_dim, input_dim) for the GRU
        images = images.view(-1, seq_dim, input_dim)
        if use_cuda:
            images = images.cuda()
            labels = labels.cuda()

        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()

        # Forward pass to get output/logits
        # outputs.size() --> 100, 10
        outputs = model(images)

        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)

        # Getting gradients w.r.t. parameters
        loss.backward()

        # Updating parameters
        optimizer.step()

        iteration += 1

        if iteration % 300 == 0:
            # Calculate accuracy on the whole test set
            correct = 0
            total = 0
            # No gradients are needed for evaluation; skipping the autograd
            # graph saves memory and time.
            with torch.no_grad():
                # Separate names so the training batch above is not clobbered
                for test_images, test_labels in test_loader:
                    test_images = test_images.view(-1, seq_dim, input_dim)
                    if use_cuda:
                        test_images = test_images.cuda()

                    # Forward pass only to get logits/output
                    test_outputs = model(test_images)

                    # Get predictions from the maximum value
                    _, predicted = torch.max(test_outputs, 1)

                    # Total number of labels
                    total += test_labels.size(0)

                    # Total correct predictions; the labels stay on the CPU,
                    # and .item() keeps the running count a plain Python int.
                    correct += (predicted.cpu() == test_labels).sum().item()

            accuracy = 100 * correct / total

            # Print Loss (loss is a 0-dim tensor: .item(), not .data[0])
            print('Iteration: {}. Loss: {}. Accuracy: {}'.format(iteration, loss.item(), accuracy))
            
            

Iteration: 300. Loss: 1.2674241065979004. Accuracy: 57.31
Iteration: 600. Loss: 0.5368714332580566. Accuracy: 80.58
Iteration: 900. Loss: 0.29442766308784485. Accuracy: 88.35
Iteration: 1200. Loss: 0.18822206556797028. Accuracy: 92.07
Iteration: 1500. Loss: 0.16348975896835327. Accuracy: 94.54
Iteration: 1800. Loss: 0.3253040313720703. Accuracy: 94.11
Iteration: 2100. Loss: 0.07103712111711502. Accuracy: 95.69
Iteration: 2400. Loss: 0.1422758549451828. Accuracy: 96.68
Iteration: 2700. Loss: 0.051988113671541214. Accuracy: 97.06
Iteration: 3000. Loss: 0.0869653970003128. Accuracy: 96.86
Iteration: 3300. Loss: 0.026672834530472755. Accuracy: 97.28
Iteration: 3600. Loss: 0.06405368447303772. Accuracy: 97.8
Iteration: 3900. Loss: 0.014708809554576874. Accuracy: 97.63
Iteration: 4200. Loss: 0.03964027389883995. Accuracy: 96.32
Iteration: 4500. Loss: 0.054002925753593445. Accuracy: 97.89
Iteration: 4800. Loss: 0.037645693868398666. Accuracy: 97.88
Iteration: 5100. Loss: 0.009342103265225887.