# **Recurrent Neural Networks**
* **Basic concepts learnt from: A Deep Understanding of Deep Learning (with Python intro) - Mike X Cohen (Udemy) - https://www.udemy.com/course/deeplearning_x**
* **Extended learning and understanding by VigyannVeshi**

In [1]:
# basic deep learning libraries
import numpy as np
import torch as tr
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Explore the nn.RNN layer type

### layer hyperparameters
input_size  = 9       # number of input features per time step (e.g., number of data channels)
hidden_size = 16      # number of units in the hidden state
num_layers  = 1       # number of vertically stacked recurrent layers (note: only the final layer produces the output)
actfun      = 'tanh'  # nonlinearity used by the recurrent update
bias        = True    # whether the layer uses bias terms

### build an RNN instance from those settings
rnn = nn.RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    nonlinearity=actfun,
    bias=bias,
)
print(rnn)


RNN(9, 16)


In [3]:
### Check out the source code for more details of the class.
### NOTE: the `??` prefix is IPython/Jupyter introspection syntax that displays the object's source;
### it is not plain Python, so this line only runs inside a notebook / IPython session.
??nn.RNN

Init signature: nn.RNN(*args, **kwargs)
Source:        
class RNN(RNNBase):
    r"""__init__(self,input_size,hidden_size,num_layers=1,nonlinearity='tanh',bias=True,batch_first=False,dropout=0.0,bidirectional=False,device=None,dtype=None)

    Apply a multi-layer Elman RNN with :math:`\tanh` or :math:`\text{ReLU}`
    non-linearity to an input sequence. For each element in the input sequence,
    each layer computes the following function:

    .. math::
        h_t = \tanh(x_t W_{ih}^T + b_{ih} + h_{t-1}W_{hh}^T + b_{hh})

    where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is
    the input at time `t`, and :math:`h_{(t-1)}` is the 

In [6]:
# data dimensions
seqlength = 5   # number of time steps in each sequence
batchsize = 2   # number of sequences in the batch

# random input data, laid out as (sequence, batch, features)
X = tr.rand(seqlength, batchsize, input_size)

# initial hidden state (typically all zeros).
# Note: these are the hidden layer's states / activations, NOT its weights.
hidden = tr.zeros(num_layers, batchsize, hidden_size)

# push the data through the RNN layer
y, h = rnn(X, hidden)

# report the sizes of the input, the returned hidden state, and the output
for label, tensor in (('Input', X), ('Hidden', h), ('Output', y)):
    print(f'{label} shape: {list(tensor.shape)}')

# Note: y is not yet the output of a complete RNN model; for that, the RNN
# instance has to be embedded in a full deep learning model.

Input shape: [5, 2, 9]
Hidden shape: [1, 2, 16]
Output shape: [5, 2, 16]


In [7]:
# When no hidden state is passed in, the RNN starts from all zeros.

# run with the explicit all-zeros initial state
y, h1 = rnn(X, hidden)
print(h1)
print('\n\n')

# run again without supplying an initial state
y, h2 = rnn(X)
print(h2)
print('\n\n')

# the difference is zero everywhere, i.e. the default initial state is zeros
print(h1 - h2)

tensor([[[ 0.0565, -0.1145,  0.3452, -0.4134, -0.0979, -0.0521,  0.4026,
          -0.1593, -0.1079, -0.3405,  0.6544,  0.4395,  0.8116,  0.0512,
          -0.3645,  0.5658],
         [ 0.1601,  0.0747,  0.4606, -0.2274,  0.1051, -0.3189,  0.5689,
          -0.3223, -0.2595, -0.2665,  0.3105,  0.2778,  0.7855,  0.4898,
          -0.4084,  0.2521]]], grad_fn=<StackBackward0>)



tensor([[[ 0.0565, -0.1145,  0.3452, -0.4134, -0.0979, -0.0521,  0.4026,
          -0.1593, -0.1079, -0.3405,  0.6544,  0.4395,  0.8116,  0.0512,
          -0.3645,  0.5658],
         [ 0.1601,  0.0747,  0.4606, -0.2274,  0.1051, -0.3189,  0.5689,
          -0.3223, -0.2595, -0.2665,  0.3105,  0.2778,  0.7855,  0.4898,
          -0.4084,  0.2521]]], grad_fn=<StackBackward0>)



tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]],
       grad_fn=<SubBackward0>)


In [9]:
# list the learned parameters of the RNN layer together with their sizes
for name, param in rnn.named_parameters():
    # report the parameter once for each keyword its name contains
    for keyword in ('weight', 'bias'):
        if keyword in name:
            print(f'{name} has size {list(param.shape)}')

weight_ih_l0 has size [16, 9]
weight_hh_l0 has size [16, 16]
bias_ih_l0 has size [16]
bias_hh_l0 has size [16]


**Building an RNN model**

In [12]:
class RNNnet(nn.Module):
    """Minimal recurrent model: an `nn.RNN` layer followed by a linear read-out.

    Args:
        input_size: number of data channels / features per time step.
        num_hidden: number of units in each hidden (recurrent) layer.
        num_layers: number of stacked hidden layers.
    """

    def __init__(self,input_size,num_hidden,num_layers):
        super().__init__()

        # store parameters
        self.input_size=input_size  # number of data channels / features
        self.num_hidden=num_hidden  # number of units in hidden layer
        self.num_layers=num_layers  # number of hidden layers

        # RNN layer
        self.rnn=nn.RNN(input_size,num_hidden,num_layers)

        # linear layer for output (maps every hidden vector to a single value)
        self.out=nn.Linear(num_hidden,1)

    def forward(self,x):
        """Run a batch of sequences through the RNN and the linear output layer.

        The intermediate tensor sizes are printed so the data flow can be followed.

        Args:
            x: input tensor laid out as (sequence length, batch size, input_size).

        Returns:
            A tuple `(o, hidden)`: `o` is the linear layer applied to the RNN
            output (last dimension of size 1) and `hidden` is the hidden state
            returned by the RNN layer.
        """

        print(f'Input: {list(x.shape)}')

        # initialize hidden state for first input.
        # The batch size is read from the input itself (dimension 1) rather than
        # from a notebook-level global, so the model works for any batch size and
        # outside this notebook; the state is created on the input's device/dtype.
        hidden=tr.zeros(self.num_layers,x.shape[1],self.num_hidden,
                        device=x.device,dtype=x.dtype)
        print(f'Hidden: {list(hidden.shape)}')

        # run through the RNN layer
        y,hidden = self.rnn(x,hidden)
        print(f'RNN-out: {list(y.shape)}')
        print(f'RNN-hidden: {list(hidden.shape)}')

        # pass the RNN output through the linear output layer
        o=self.out(y)
        print(f'Output: {list(o.shape)}')

        return o,hidden

In [13]:
# instantiate the model and show its architecture
net = RNNnet(input_size, hidden_size, num_layers)
print(net)
print("")

# list every learnable parameter of the model with its size
for name, param in net.named_parameters():
    print(f'{name} has size {list(param.shape)}')

RNNnet(
  (rnn): RNN(9, 16)
  (out): Linear(in_features=16, out_features=1, bias=True)
)

rnn.weight_ih_l0 has size [16, 9]
rnn.weight_hh_l0 has size [16, 16]
rnn.bias_ih_l0 has size [16]
rnn.bias_hh_l0 has size [16]
out.weight has size [1, 16]
out.bias has size [1]


In [16]:
# push some random data through the model
X = tr.rand(seqlength, batchsize, input_size)
print(X, end='\n\n')

# random targets: one value per time step and per sequence
y = tr.rand(seqlength, batchsize, 1)
print(y, end='\n\n')

# forward pass: model prediction and the returned hidden state
yHat, h = net(X)
print(yHat, end='\n\n')
print(h, end='\n\n')

# try a loss function (the loss is the cell's last expression, so it is displayed)
lossfun = nn.MSELoss()
lossfun(yHat, y)

tensor([[[0.0053, 0.5484, 0.0485, 0.5673, 0.5486, 0.3396, 0.5104, 0.8743,
          0.8748],
         [0.9904, 0.9605, 0.0279, 0.5992, 0.1111, 0.2123, 0.6276, 0.2713,
          0.9029]],

        [[0.7851, 0.5125, 0.3223, 0.7572, 0.4521, 0.6838, 0.1332, 0.8121,
          0.4930],
         [0.2134, 0.7056, 0.9400, 0.6815, 0.4034, 0.6869, 0.9908, 0.0294,
          0.8042]],

        [[0.0176, 0.0346, 0.4703, 0.7651, 0.1980, 0.4532, 0.3563, 0.5724,
          0.3856],
         [0.7774, 0.9542, 0.8925, 0.9184, 0.1487, 0.0777, 0.3813, 0.6353,
          0.5220]],

        [[0.9163, 0.1641, 0.7380, 0.0875, 0.7467, 0.8547, 0.5987, 0.8925,
          0.0074],
         [0.8996, 0.3087, 0.0327, 0.0039, 0.6696, 0.8271, 0.3799, 0.9454,
          0.2547]],

        [[0.1740, 0.6732, 0.0899, 0.3483, 0.0540, 0.5585, 0.8512, 0.9139,
          0.0817],
         [0.1723, 0.6622, 0.8708, 0.9502, 0.2611, 0.5740, 0.3063, 0.4764,
          0.1897]]])

tensor([[[0.1284],
         [0.7585]],

        [[0.1918],


tensor(0.2503, grad_fn=<MseLossBackward0>)

**Additional Exploration**

In [None]:
# 1) In the video, I asked about the "l0" in the parameter name "weight_ih_l0". To explore this further, recreate that RNN instance but set the number of layers to 3. Then go through the code again to print out all of the weight matrices. Refer back to the discussion of layers in the previous video. Do you understand the naming system of the weight matrices?
