# **LSTMs and GRUs in PyTorch**

* **Basic concepts learnt from: A Deep understanding of Deep Learning (with Python intro) - Mike X Cohen (Udemy) - https://www.udemy.com/course/deeplearning_x**
* **Extended learning and understanding by VigyannVeshi**

In [1]:
# basic deep learning libraries
import numpy as np
import torch as tr
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Explore the LSTM layer type

### Layer hyperparameters
input_size  = 9   # features per time step (e.g., number of data channels)
hidden_size = 16  # units in the hidden state
num_layers  = 2   # LSTM layers stacked on top of each other (note: only the final layer's output is returned)

### Instantiate the LSTM and display its summary
lstm = nn.LSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)
lstm


LSTM(9, 16, num_layers=2)

In [3]:
### check out the source code for more details of the class
### (`??` is IPython syntax that displays an object's source; it is not plain Python)
??nn.LSTM

Init signature: nn.LSTM(*args, **kwargs)
Source:
class LSTM(RNNBase):
    r"""__init__(self,input_size,hidden_size,num_layers=1,bias=True,batch_first=False,dropout=0.0,bidirectional=False,proj_size=0,device=None,dtype=None)

    Apply a multi-layer long short-term memory (LSTM) RNN to an input sequence.
    For each element in the input sequence, each layer computes the following
    function:

    .. math::
        \begin{array}{ll} \\
            i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{t-1} + b_{hi}) \\
            f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{t-1} + b_{hf}) \\
            g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{

In [4]:
# Data dimensions
seqlength = 5
batchsize = 2

# Random input laid out as (sequence, batch, features)
X = tr.rand(seqlength, batchsize, input_size)

# Initial hidden and cell states (typically zeros). These are the hidden
# layers' states / activations, not the hidden layers' weights.
H, C = (tr.zeros(num_layers, batchsize, hidden_size) for _ in range(2))
hidden_inputs = (H, C)

# Forward pass; `h` is the (hidden state, cell state) pair returned by the LSTM
y, h = lstm(X, hidden_inputs)

# Report the size of every tensor involved, one per line
print(
    f'Input shape: {list(X.shape)}',
    f'Hidden shape: {list(h[0].shape)}',
    f'Cell shape: {list(h[1].shape)}',
    f'Output shape: {list(y.shape)}',
    sep='\n',
)

Input shape: [5, 2, 9]
Hidden shape: [2, 2, 16]
Cell shape: [2, 2, 16]
Output shape: [5, 2, 16]


In [5]:
# List the LSTM's learned parameters (weights and biases) with their sizes
for name, param in lstm.named_parameters():
    # A single combined test replaces two branches with identical bodies, so a
    # parameter is reported exactly once even if its name matched both substrings.
    if 'weight' in name or 'bias' in name:
        print(f'{name} has size {list(param.shape)}')

weight_ih_l0 has size [64, 9]
weight_hh_l0 has size [64, 16]
bias_ih_l0 has size [64]
bias_hh_l0 has size [64]
weight_ih_l1 has size [64, 16]
weight_hh_l1 has size [64, 16]
bias_ih_l1 has size [64]
bias_hh_l1 has size [64]


**Building an LSTM model**

In [6]:
class LSTMnet(nn.Module):
    """Multi-layer LSTM followed by a linear readout with a single output unit.

    Args:
        input_size: number of data channels / features.
        num_hidden: number of units in each hidden layer.
        num_layers: number of stacked hidden (LSTM) layers.
    """

    def __init__(self,input_size,num_hidden,num_layers):
        super().__init__()

        # store parameters
        self.input_size=input_size  # number of data channels / features
        self.num_hidden=num_hidden  # number of units in hidden layer
        self.num_layers=num_layers  # number of hidden layers

        # LSTM layer
        self.lstm=nn.LSTM(input_size,num_hidden,num_layers)

        # linear layer for output (maps the num_hidden LSTM features to 1 value)
        self.out=nn.Linear(num_hidden,1)

    def forward(self,x):
        """Run `x` through the LSTM and the linear output layer.

        The size of every intermediate tensor is printed along the way so the
        data flow can be inspected.

        Args:
            x: input tensor whose last dimension is `input_size`.

        Returns:
            A tuple `(o, hidden)`: `o` is the linear layer applied to the LSTM
            output, and `hidden` is the `(hidden state, cell state)` tuple
            returned by `nn.LSTM`.
        """

        print(f'Input: {list(x.shape)}')

        # run through the LSTM layer; no initial state is passed, so nn.LSTM
        # starts from zero-valued hidden and cell states
        y,hidden = self.lstm(x)
        print(f'LSTM-out: {list(y.shape)}')
        print(f'LSTM-hidden: {list(hidden[0].shape)}')
        # hidden[1] is the cell state, so it gets its own label
        print(f'LSTM-cell: {list(hidden[1].shape)}')

        # pass the LSTM output through the linear output layer
        o=self.out(y)
        print(f'Output: {list(o.shape)}')

        return o,hidden

In [8]:
# Create an instance of the model and inspect its architecture.
# The two prints are separate statements (rather than a `print(...),print(...)`
# tuple expression) so no throwaway tuple is built just for its side effects.
net = LSTMnet(input_size, hidden_size, num_layers)
print(net)
print("")

# List every learnable parameter with its size
for name, param in net.named_parameters():
    print(f'{name:>20} has size {list(param.shape)}')

LSTMnet(
  (lstm): LSTM(9, 16, num_layers=2)
  (out): Linear(in_features=16, out_features=1, bias=True)
)

   lstm.weight_ih_l0 has size [64, 9]
   lstm.weight_hh_l0 has size [64, 16]
     lstm.bias_ih_l0 has size [64]
     lstm.bias_hh_l0 has size [64]
   lstm.weight_ih_l1 has size [64, 16]
   lstm.weight_hh_l1 has size [64, 16]
     lstm.bias_ih_l1 has size [64]
     lstm.bias_hh_l1 has size [64]
          out.weight has size [1, 16]
            out.bias has size [1]


In [9]:
# Smoke-test the model with random inputs and random targets
X = tr.rand(seqlength, batchsize, input_size)
print(X, end='\n\n')
y = tr.rand(seqlength, batchsize, 1)
print(y, end='\n\n')

# Forward pass, then show the predictions and the returned states
yHat, h = net(X)
print(yHat, end='\n\n')
print(h, end='\n\n')

# Try a loss function on predictions vs. targets (displayed as the cell result)
lossfun = nn.MSELoss()
lossfun(yHat, y)

tensor([[[0.7728, 0.1783, 0.9697, 0.1222, 0.6690, 0.7672, 0.5805, 0.0600,
          0.3308],
         [0.1062, 0.3803, 0.7148, 0.2098, 0.6911, 0.5275, 0.3172, 0.0371,
          0.3041]],

        [[0.5736, 0.7865, 0.5321, 0.3790, 0.2082, 0.5755, 0.4707, 0.0667,
          0.0910],
         [0.8389, 0.4066, 0.1717, 0.7268, 0.1750, 0.2621, 0.2374, 0.9550,
          0.7339]],

        [[0.8616, 0.2245, 0.3379, 0.4236, 0.9852, 0.8124, 0.1602, 0.1451,
          0.9386],
         [0.2925, 0.9569, 0.2452, 0.7065, 0.8578, 0.1131, 0.1126, 0.4191,
          0.4283]],

        [[0.2498, 0.9323, 0.2804, 0.2432, 0.0112, 0.0849, 0.8196, 0.8214,
          0.1087],
         [0.6805, 0.1647, 0.5082, 0.4830, 0.4888, 0.1602, 0.8022, 0.7925,
          0.5530]],

        [[0.3855, 0.1124, 0.1403, 0.6519, 0.5294, 0.3180, 0.5657, 0.7867,
          0.5968],
         [0.7033, 0.1861, 0.9755, 0.9050, 0.3899, 0.2862, 0.4403, 0.4577,
          0.9969]]])

tensor([[[0.8678],
         [0.0047]],

        [[0.1102],


tensor(0.2936, grad_fn=<MseLossBackward0>)

**Gated Recurrent Unit (GRU)**

In [10]:
# Instantiate a GRU layer and display its summary
gru = nn.GRU(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)
gru

GRU(9, 16, num_layers=2)

In [11]:
# show the source of the GRU class for more details (`??` is IPython syntax, not plain Python)
??nn.GRU

Init signature: nn.GRU(*args, **kwargs)
Source:
class GRU(RNNBase):
    r"""__init__(self,input_size,hidden_size,num_layers=1,bias=True,batch_first=False,dropout=0.0,bidirectional=False,device=None,dtype=None)

    Apply a multi-layer gated recurrent unit (GRU) RNN to an input sequence.
    For each element in the input sequence, each layer computes the following
    function:

    .. math::
        \begin{array}{ll}
            r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
            z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
            n_t = \tanh(W_{in} x_t + b_{in} + r_t \odot (W_{hn} h_{(t-1)

In [13]:
# Random input plus a zero-valued initial hidden state
X = tr.rand(seqlength, batchsize, input_size)
H = tr.zeros(num_layers, batchsize, hidden_size)

# Forward pass. No cell states in GRU! Only a hidden state is passed in
# and returned, so `h` is a single tensor.
y, h = gru(X, H)

# Report the tensor sizes, one per line
print(
    f' Input shape: {list(X.shape)}',
    f'Hidden shape: {list(h.shape)}',
    f'Output shape: {list(y.shape)}',
    sep='\n',
)

 Input shape: [5, 2, 9]
Hidden shape: [2, 2, 16]
Output shape: [5, 2, 16]


In [14]:
# Show each learned GRU parameter alongside its size
for name, param in gru.named_parameters():
    print(f'{name:>15} has size {list(param.shape)}')

   weight_ih_l0 has size [48, 9]
   weight_hh_l0 has size [48, 16]
     bias_ih_l0 has size [48]
     bias_hh_l0 has size [48]
   weight_ih_l1 has size [48, 16]
   weight_hh_l1 has size [48, 16]
     bias_ih_l1 has size [48]
     bias_hh_l1 has size [48]
