## Libraries and functions

### Libraries

In [1]:
import torch
import torch.nn as nn
from torch.autograd.variable import Variable

# Device configuration: run on the GPU when CUDA is usable, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


### Functions

In [2]:
def underlined_text(text, char = '='):
    """Print `text`, then an underline on the next line.

    The underline consists of `char` repeated once per character of `text`,
    so an empty `text` produces two empty lines.

    Args:
        text: The heading to print; its length sets the underline length.
        char: The object whose string form is repeated to build the underline.
    """
    print(text)
    # str() mirrors how print() would render `char` if it is not already a string.
    print(str(char) * len(text))

## Dataset

### Creating the dataset

In [3]:
# Inputs: the digits 0..9 as a column, one sample per row.
underlined_text("X")
X = torch.arange(10).unsqueeze(1).to(device)
print("Shape = ", X.shape)
print(X)
print()

# Targets: the digit that follows each input, with 9 wrapping around to 0.
underlined_text("y")
z = torch.tensor([0])
y = torch.cat((torch.arange(1, 10), z)).to(device)
print(y.shape)
print(y)
print()

# Show every (input, target) pair side by side.
underlined_text("Sequence order: (Input => Output)")
for digit, successor in zip(X, y):
    print(digit.item(), '=>', successor.item())

X
=
Shape =  torch.Size([10, 1])
tensor([[0],
        [1],
        [2],
        [3],
        [4],
        [5],
        [6],
        [7],
        [8],
        [9]], device='cuda:0')

y
=
torch.Size([10])
tensor([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], device='cuda:0')

Sequence order: (Input => Output)
0 => 1
1 => 2
2 => 3
3 => 4
4 => 5
5 => 6
6 => 7
7 => 8
8 => 9
9 => 0


### Reshaping dataX

Reshape X into the format [batch_size, seq_len, num_features] expected by the LSTM when batch_first = True.

In [4]:
underlined_text("Reshaped X")
# One time step and one feature per sample: [batch_size, seq_len, features].
# The leading -1 lets view() infer the batch size from the number of elements.
X = X.view(-1, 1, 1)
print(X.shape)
print(X)

Reshaped X
torch.Size([10, 1, 1])
tensor([[[0]],

        [[1]],

        [[2]],

        [[3]],

        [[4]],

        [[5]],

        [[6]],

        [[7]],

        [[8]],

        [[9]]], device='cuda:0')


### Normalizing

In [5]:
underlined_text("Normalizing X")
# Scale the inputs by the number of samples (the size of the first dimension).
# NOTE: X is overwritten, so running this cell a second time divides again.
X = X.div(float(X.size(0)))
print(X.shape)
print(X)

Normalizing X
torch.Size([10, 1, 1])
tensor([[[0.0000]],

        [[0.1000]],

        [[0.2000]],

        [[0.3000]],

        [[0.4000]],

        [[0.5000]],

        [[0.6000]],

        [[0.7000]],

        [[0.8000]],

        [[0.9000]]], device='cuda:0')


## Creating the model

In the default mode (i.e. batch_first = False), the input should have the shape [seq_len, batch_size, 
features]. If you want the input to have the shape [batch_size, seq_len, features] instead, set batch_first = True.

seq_len - Number of time steps in each input stream
<br>
features - Number of input features per time-step
<br>
batch_size - Size of the batches (sample size)
<br>
hidden_size - Number of features in the hidden state (i.e. the number of hidden units in each RNN/LSTM layer)
<br>
num_layers - Number of hidden layers (default = 1)
<br>
<br>
<u>Input shapes</u>
<br>
Input's shape = [seq_len, batch_size, features] if batch_first = False
<br>
Input's shape = [batch_size, seq_len, features] if batch_first = True
<br>
Initial hidden layer's shape = [num_layers, batch_size, hidden_size]
<br>
<br>
<u>Hidden layer's shape</u>
<br>
[num_layers, batch_size, hidden_size]
<br>

<u>Output shape</u>
<br>
Hidden state's shape = [num_layers, batch_size, hidden_size] (an RNN returns its final hidden state alongside the output)
<br>
Output's shape = [seq_len, batch_size, hidden_size] if batch_first = False
<br>
Output's shape = [batch_size, seq_len, hidden_size] if batch_first = True

### LSTMDigit class

#### Class definition

In [6]:
class LSTMDigit(nn.Module):
    """An LSTM followed by a linear layer that turns each input sequence into class scores.

    The module is stateful: the (h, c) pair stored in `self.hidden` is fed into
    the LSTM on every forward pass and is then replaced by the state the LSTM
    returns. The stored state is always detached from the autograd graph, so a
    backward pass never reaches into a previous forward pass.

    Args:
        input_size: Number of features per time step.
        hidden_size: Number of features in the LSTM hidden state.
        batch_size: Batch size the stored hidden state is created for; inputs
            passed to `forward` must use the same batch size.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of output features (class scores) per sample.
    """

    def __init__(self, input_size, hidden_size, batch_size, num_layers, output_size):
        super(LSTMDigit, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.output_size = output_size

        self.lstm_layer = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size,
                               num_layers=self.num_layers, batch_first=True)

        # Honour the requested output size instead of a hard-coded 10.
        self.linear_layer = nn.Linear(in_features=self.hidden_size, out_features=self.output_size)

        self.hidden = self.init_hidden() # Initializing the hidden state

    def init_hidden(self):
        """Return a freshly sampled (h0, c0) pair of shape [num_layers, batch_size, hidden_size].

        The tensors are placed on the device the module's parameters currently
        live on; `forward` moves the stored state to the input's device if the
        module has been moved since.
        """
        param_device = next(self.parameters()).device
        state_shape = (self.num_layers, self.batch_size, self.hidden_size)
        h0 = torch.randn(*state_shape).to(param_device)
        c0 = torch.randn(*state_shape).to(param_device)
        return (h0, c0)

    def forward(self, x):
        """Run `x` through the LSTM and the linear layer.

        Args:
            x: Input of shape [batch_size, seq_len, input_size] (batch_first layout).

        Returns:
            Tensor of shape [batch_size, output_size] computed from the LSTM
            output at the last time step.
        """
        # Feed the carried state as a constant: detach() cuts any link to an
        # earlier graph, and .to() follows the input onto its device.
        carried = tuple(state.detach().to(x.device) for state in self.hidden)
        lstm_out, new_hidden = self.lstm_layer(x, carried)

        # Keep only the values of the new state, so the graph of this forward
        # pass can be freed by backward() without retain_graph=True.
        self.hidden = tuple(state.detach() for state in new_hidden)

        # Last time step of every sequence: [batch_size, hidden_size].
        # For seq_len == 1 this equals flattening lstm_out to that shape.
        last_step = lstm_out[:, -1, :]
        final_out = self.linear_layer(last_step)
        return final_out
        
# One input feature per time step, 16 hidden features, a batch of 10 samples,
# a single LSTM layer and 10 output scores.
lstm = LSTMDigit(
    input_size=1,
    hidden_size=16,
    batch_size=10,
    num_layers=1,
    output_size=10,
)
lstm = lstm.to(device)  # move the model's parameters to the selected device
print(lstm)

LSTMDigit(
  (lstm_layer): LSTM(1, 16, batch_first=True)
  (linear_layer): Linear(in_features=16, out_features=10, bias=True)
)


#### Loss function and optimizer

In [7]:
import torch.optim as optim

# Adam with its default hyper-parameters updates every parameter of the model.
optimizer = optim.Adam(params=lstm.parameters())
# Cross-entropy between the model's raw class scores and the integer targets.
loss_fn = nn.CrossEntropyLoss()

#### Model evaluation

In [8]:
def model_eval(preds, outputs):
    """Return the classification accuracy of `preds` against `outputs`, in percent.

    Args:
        preds: 2-D tensor of per-class scores with one row per sample; the
            predicted class of a sample is the index of its largest score.
        outputs: Target class indices, one per sample (a tensor or a sequence).

    Returns:
        The percentage of samples whose predicted class equals the target, as
        a Python float. Returns 0.0 when there are no samples.

    Raises:
        ValueError: If `preds` and `outputs` hold a different number of samples.
    """
    # Flatten the targets so that a column of shape [n, 1] is compared
    # element-wise instead of being broadcast against the predictions.
    targets = torch.as_tensor(outputs, device=preds.device).reshape(-1)
    total = targets.shape[0]
    if preds.shape[0] != total:
        raise ValueError(
            "preds has {} samples but outputs has {}".format(preds.shape[0], total)
        )
    if total == 0:
        return 0.0  # avoid dividing by zero on an empty batch

    pred_class = preds.argmax(dim=1)
    correct = (pred_class == targets).sum().item()
    acc = (correct / total) * 100.0
    return acc

#### Training loop

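Passing 'retain_graph=True' to loss.backward() is only necessary when the hidden state fed into the LSTM is still attached to the computational graph of a previous iteration: backward() frees the graph once it has been used, so a later backward pass that reaches back through the old hidden state would fail. This is not needed in most cases. In an RNN that carries its hidden state from one iteration to the next, the usual remedy is to detach (or re-initialize) the hidden state before each forward pass, so that every iteration builds a fresh graph.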

In [9]:
underlined_text("Training loop")
epochs = 500
for epoch in range(epochs):
    # The model feeds the (h, c) state of the previous forward pass into the
    # next one. Detach it first so each iteration builds its own graph;
    # otherwise every backward pass would reach back through all earlier
    # iterations, whose weights the optimizer has since updated in place.
    lstm.hidden = tuple(state.detach() for state in lstm.hidden)

    optimizer.zero_grad()
    y_pred = lstm(X)
    loss = loss_fn(y_pred, y)
    # The graph only spans this iteration, so retain_graph=True is not needed.
    loss.backward()
    optimizer.step()
    if ((epoch+1)%100 == 0):
        # Accuracy is only reported here, so only compute it here.
        accuracy = model_eval(y_pred, y)
        print("Epoch: ", "{:4d}/{:4d} ==>".format(epoch+1,epochs), 
              "Loss: ", "{:6.4f}, ".format(loss.item()), "Accuracy: ", "{:4.2f}".format(accuracy))

Training loop
Epoch:   100/ 500 ==> Loss:  2.1902,  Accuracy:  20.00
Epoch:   200/ 500 ==> Loss:  1.8156,  Accuracy:  30.00
Epoch:   300/ 500 ==> Loss:  1.7398,  Accuracy:  40.00
Epoch:   400/ 500 ==> Loss:  1.7166,  Accuracy:  40.00
Epoch:   500/ 500 ==> Loss:  1.7039,  Accuracy:  30.00
