In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable

In [2]:
# Toy dimensions shared by the shape demonstrations in the cells below.
batch_size, input_dim, hidden_dim, num_layers = 3, 2, 4, 1
# Example per-sequence lengths; the cells below use max(input_length[0]) as the
# (padded) sequence length.
input_length = [[2, 5, 3], [5, 3, 4]]

# RNN vs RNNCell

|              nn.RNN              |                 nn.RNNCell                 |
|:--------------------------------:|:------------------------------------------:|
|       unrolling whole steps      |            unrolling single step           |
| internally uses CuDNN => faster! | similar to TensorFlow's RNN implementation |

## nn.RNN

* Args:
    * input_size (int)
    * hidden_size (int)
    
    
* Optional Args
    * num_layers (int)
    * nonlinearity (str; either 'tanh' or 'relu', default 'tanh')
    * bias (bool)
    * batch_first (bool)
    * dropout (float)
    * bidirectional (bool)


* inputs:
    * input (seq_len, batch_size, input_size), or (batch_size, seq_len, input_size) when batch_first=True (as used in the cells below)
    * h_0 (num_layers*num_directions, batch_size, hidden_size) => default: zeros


* outputs:
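    * output (seq_len, batch_size, hidden_size*num_directions), or (batch_size, seq_len, hidden_size*num_directions) when batch_first=True => outputs from all time steps
    * h_n (num_layers*num_directions, batch_size, hidden_size) => hidden state at the last time step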

In [3]:
# Single-layer RNN that takes batch-major input: (batch_size, seq_len, input_dim).
rnn = nn.RNN(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
rnn

RNN(2, 4, batch_first=True)

In [4]:
# Dump the module's instance attributes to see its parameters
# (weight_ih_l0, weight_hh_l0, bias_ih_l0, bias_hh_l0) and other internals.
vars(rnn)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x60de6d8>,
 '_parameters': OrderedDict([('weight_ih_l0', Parameter containing:
                0.1513  0.2242
               -0.0615 -0.0894
                0.4656  0.0848
                0.4766 -0.4431
               [torch.FloatTensor of size 4x2]),
              ('weight_hh_l0', Parameter containing:
                0.0736  0.1527 -0.0864  0.0758
               -0.1699 -0.1154  0.4878  0.1662
               -0.2771  0.0470 -0.0406  0.1085
               -0.4232 -0.4884 -0.2893 -0.2546
               [torch.FloatTensor of size 4x4]),
              ('bias_ih_l0', Parameter containing:
               -0.2122
               -0.0692
               -0.2338
                0.3507
               [torch.FloatTensor of size 4]),
              ('bias_hh_l0', Parameter containing:
               -0.2776
               -0.0677
               -0.3371
               -0.2984
               [torch.FloatTensor of size 4])]),
 '_buffers': Or

In [13]:
# Inputs for nn.RNN: a random batch-major sequence and an explicit all-zero
# initial hidden state.
x = Variable(torch.randn(batch_size, max(input_length[0]), input_dim))
h_0 = Variable(torch.zeros(num_layers, batch_size, hidden_dim))
# Show both shapes on one line.
print(*(t.size() for t in (x, h_0)))

torch.Size([3, 5, 2]) torch.Size([1, 3, 4])


In [16]:
# outputs: (output, h_n)
# h_0 is omitted here: the initial hidden state defaults to zeros, so this call
# is meant to give the same result as rnn(x, h_0) with the all-zero h_0 built above.
rnn(x)

(Variable containing:
 (0 ,.,.) = 
  -0.0580 -0.2984 -0.7239 -0.9432
  -0.3937 -0.5754 -0.5055  0.4437
  -0.4399 -0.1965 -0.4601  0.2279
  -0.6302 -0.1277 -0.2696  0.8709
   0.2143 -0.2699  0.5924  0.2469
 
 (1 ,.,.) = 
  -0.7633  0.0701 -0.9039 -0.3039
  -0.4974 -0.4482  0.2635  0.9667
  -0.5247  0.2940 -0.0077  0.7316
  -0.3111 -0.0085  0.5353  0.8912
  -0.4104  0.2938 -0.0686  0.2933
 
 (2 ,.,.) = 
  -0.1368 -0.2716 -0.0215  0.1456
  -0.5410 -0.0370 -0.7020 -0.2264
  -0.7699 -0.2063 -0.7086  0.8003
  -0.1891 -0.2924  0.1094  0.4245
  -0.5353  0.0812 -0.4308  0.3676
 [torch.FloatTensor of size 3x5x4], Variable containing:
 (0 ,.,.) = 
   0.2143 -0.2699  0.5924  0.2469
  -0.4104  0.2938 -0.0686  0.2933
  -0.5353  0.0812 -0.4308  0.3676
 [torch.FloatTensor of size 1x3x4])

## nn.LSTM

* Args:
    Same as RNN


* inputs:
    * input (seq_len, batch_size, input_size), or (batch_size, seq_len, input_size) when batch_first=True
    * (h_0, c_0) ( (num_layers*num_directions, batch_size, hidden_size), (num_layers*num_directions, batch_size, hidden_size) )
    

* outputs:
    * output (seq_len, batch_size, hidden_size*num_directions), or (batch_size, seq_len, hidden_size*num_directions) when batch_first=True
    * (h_n, c_n) ( (num_layers*num_directions, batch_size, hidden_size), (num_layers*num_directions, batch_size, hidden_size) )

In [7]:
# Single-layer LSTM that takes batch-major input: (batch_size, seq_len, input_dim).
lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
lstm

LSTM(2, 4, batch_first=True)

In [8]:
# Dump the module's instance attributes; note the 16-row weight matrices in the
# output below (4 * hidden_dim rows, with hidden_dim = 4).
vars(lstm)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x60de6d8>,
 '_parameters': OrderedDict([('weight_ih_l0', Parameter containing:
               -0.3700  0.0155
                0.2055  0.3116
                0.4719 -0.0947
               -0.4622  0.0939
                0.2665 -0.2940
               -0.4777 -0.2390
               -0.1258  0.0485
               -0.2348 -0.0667
               -0.0078 -0.3306
                0.4739 -0.0022
                0.4337  0.0581
               -0.0034 -0.3160
                0.4169 -0.0563
                0.1696  0.2869
                0.0442  0.4298
               -0.0640 -0.3694
               [torch.FloatTensor of size 16x2]),
              ('weight_hh_l0', Parameter containing:
                0.2901 -0.4752  0.3359 -0.0818
                0.2891  0.1570  0.2586 -0.2904
               -0.1311 -0.3719  0.2260 -0.3754
                0.0380  0.1057 -0.3771 -0.0485
               -0.4337  0.2659  0.0312  0.1687
               -0.2301 -0.

In [9]:
# Inputs for nn.LSTM: a random batch-major sequence plus explicit all-zero
# initial hidden and cell states (same shape for both).
x = Variable(torch.randn(batch_size, max(input_length[0]), input_dim))
h_0 = Variable(torch.zeros(num_layers, batch_size, hidden_dim))
c_0 = Variable(torch.zeros(num_layers, batch_size, hidden_dim))
# Show all three shapes on one line.
print(*(t.size() for t in (x, h_0, c_0)))

torch.Size([3, 5, 2]) torch.Size([1, 3, 4]) torch.Size([1, 3, 4])


In [12]:
# outputs: (output, (h_n, c_n))
# (h_0, c_0) is omitted here: the initial states default to zeros, so this call
# is meant to give the same result as lstm(x, (h_0, c_0)) with the zero states built above.
lstm(x)

(Variable containing:
 (0 ,.,.) = 
  -0.0787 -0.1039 -0.0809 -0.1441
  -0.1235 -0.1224 -0.1028 -0.1770
  -0.2277  0.1088  0.1591 -0.1730
  -0.3193  0.2741  0.3421 -0.1345
  -0.2500  0.1349  0.1999 -0.0978
 
 (1 ,.,.) = 
  -0.0917  0.0111  0.0637 -0.0908
  -0.1116  0.1216  0.1719 -0.0053
  -0.1650  0.1685  0.2994 -0.0833
  -0.2222  0.1393  0.3304 -0.1721
  -0.2460  0.0905  0.2836 -0.2194
 
 (2 ,.,.) = 
   0.0726  0.0544  0.0793  0.1321
   0.0303  0.0765  0.1467  0.0973
  -0.1188  0.0769  0.2721 -0.1178
  -0.2039  0.0764  0.2692 -0.2036
  -0.2039 -0.0323  0.2360 -0.3087
 [torch.FloatTensor of size 3x5x4], (Variable containing:
  (0 ,.,.) = 
   -0.2500  0.1349  0.1999 -0.0978
   -0.2460  0.0905  0.2836 -0.2194
   -0.2039 -0.0323  0.2360 -0.3087
  [torch.FloatTensor of size 1x3x4], Variable containing:
  (0 ,.,.) = 
   -0.4636  0.4878  0.9469 -0.1271
   -0.5572  0.2467  0.8449 -0.3372
   -0.6386 -0.0783  0.4731 -0.5787
  [torch.FloatTensor of size 1x3x4]))

## nn.RNNCell

* Args:
    * input_size (int)
    * hidden_size (int)


* Optional Args
    * nonlinearity (str; either 'tanh' or 'relu', default 'tanh')
    * bias (bool)
    
    
* inputs:
    * input (batch_size, input_size)
    * hidden (batch_size, hidden_size)


* outputs:
    * h (batch, hidden_size) => current output

In [11]:
# One-step RNN cell: maps (batch_size, input_dim) and (batch_size, hidden_dim)
# to the next hidden state.
rnncell = nn.RNNCell(input_size=input_dim, hidden_size=hidden_dim)
rnncell

RNNCell(2, 4)

In [12]:
# Dump the cell's instance attributes (parameters, buffers, hooks, submodules).
vars(rnncell)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x10cce1358>,
 '_backward_hooks': OrderedDict(),
 '_buffers': OrderedDict(),
 '_forward_hooks': OrderedDict(),
 '_modules': OrderedDict(),
 '_parameters': OrderedDict([('weight_ih', Parameter containing:
                0.0462 -0.4683
                0.3904  0.3451
                0.4196  0.3587
               -0.0410  0.4752
               [torch.FloatTensor of size 4x2]),
              ('weight_hh', Parameter containing:
               -0.4353  0.3403  0.3750  0.2516
                0.3241 -0.2271 -0.0170  0.1571
               -0.0019 -0.2814 -0.0746  0.0587
               -0.1815 -0.0939  0.2721 -0.3193
               [torch.FloatTensor of size 4x4]),
              ('bias_ih', Parameter containing:
                0.4209
                0.0529
               -0.2916
                0.2168
               [torch.FloatTensor of size 4]),
              ('bias_hh', Parameter containing:
                0.1012
               -0.

In [13]:
# input
x = Variable(torch.randn([batch_size, max(input_length[0]), input_dim]))

# initial hidden
h = Variable(torch.zeros(batch_size, hidden_dim))

outputs = []
for i in range(max(input_length[0])):
    h = rnncell(x[:, i, :], h)
    outputs.append(h)

outputs

[Variable containing:
  0.4928  0.4206 -0.2249  0.2194
  0.5528 -0.5719 -0.8863  0.1242
  0.9087 -0.5160 -0.8612 -0.6502
 [torch.FloatTensor of size 3x4], Variable containing:
 -0.6993  0.8970  0.5518  0.8582
  0.3512 -0.7760 -0.9596 -0.6016
 -0.3001  0.2738 -0.5236 -0.1354
 [torch.FloatTensor of size 3x4], Variable containing:
  0.8768  0.1437 -0.4419  0.3393
  0.0924  0.1465 -0.4722 -0.3160
  0.6768 -0.3463 -0.7402 -0.1467
 [torch.FloatTensor of size 3x4], Variable containing:
 -0.6900  0.7996  0.1189  0.6671
  0.4396  0.2118 -0.4229 -0.0176
 -0.7075  0.3605 -0.4519  0.5858
 [torch.FloatTensor of size 3x4], Variable containing:
  0.8538 -0.1093 -0.6029  0.1427
  0.0054 -0.0776 -0.7328  0.2528
  0.3136  0.0215 -0.4674  0.5588
 [torch.FloatTensor of size 3x4]]

## nn.LSTMCell

* Args:
    * input_size (int)
    * hidden_size (int)


* Optional Args
    * bias (bool)
    
    
* inputs:
    * input (batch_size, input_size)
    * (h_0, c_0) ( (batch_size, hidden_size), (batch_size, hidden_size) )


* outputs:
    * (h_1, c_1) ( (batch_size, hidden_size), (batch_size, hidden_size) ) => current output

In [17]:
# One-step LSTM cell: maps an input and an (h, c) pair to the next (h, c) pair.
lstmcell = nn.LSTMCell(input_size=input_dim, hidden_size=hidden_dim)
lstmcell

LSTMCell(2, 4)

In [22]:
# Dump the cell's instance attributes; note the 16-row weight matrices in the
# output below (4 * hidden_dim rows, with hidden_dim = 4).
vars(lstmcell)

{'_backend': <torch.nn.backends.thnn.THNNFunctionBackend at 0x60de6d8>,
 '_parameters': OrderedDict([('weight_ih', Parameter containing:
                0.2813  0.1976
                0.4892 -0.3305
               -0.0484 -0.4095
                0.3371  0.4489
                0.3281  0.0793
                0.4522 -0.4446
                0.1620  0.4201
               -0.1592  0.1441
                0.3379 -0.0239
               -0.2831  0.2630
                0.4839 -0.0566
               -0.0617  0.1634
                0.0112  0.1042
               -0.0808 -0.3628
                0.0280 -0.4231
               -0.1648  0.3465
               [torch.FloatTensor of size 16x2]),
              ('weight_hh', Parameter containing:
               -0.1062  0.4362  0.1465  0.1387
               -0.1221  0.4445  0.4812  0.4958
               -0.4253 -0.0245  0.3111 -0.1871
                0.3447 -0.0616 -0.0035 -0.3534
               -0.3448 -0.4124 -0.3245  0.3042
                0.1346  0.3399 -

In [16]:
# input
x = Variable(torch.randn([batch_size, max(input_length[0]), input_dim]))

# initial hidden
h = Variable(torch.zeros(batch_size, hidden_dim))
c = Variable(torch.zeros(batch_size, hidden_dim))

outputs = []
for i in range(max(input_length[0])):
    h, c = lstmcell(x[:, i, :], (h, c))
    outputs.append((h, c))

outputs

[(Variable containing:
   0.0239  0.0653 -0.0766 -0.1305
  -0.0394 -0.0399 -0.1233 -0.2497
   0.0266  0.0803 -0.0728 -0.1402
  [torch.FloatTensor of size 3x4], Variable containing:
   0.0717  0.1722 -0.1206 -0.2100
  -0.0871 -0.1005 -0.2579 -0.3753
   0.0849  0.2139 -0.1184 -0.2210
  [torch.FloatTensor of size 3x4]), (Variable containing:
  -0.1323 -0.1289 -0.1874 -0.2948
  -0.0244 -0.0166 -0.1732 -0.3553
  -0.0934 -0.0941 -0.1721 -0.1685
  [torch.FloatTensor of size 3x4], Variable containing:
  -0.2126 -0.3268 -0.3410 -0.5166
  -0.0610 -0.0423 -0.3720 -0.5718
  -0.1590 -0.2392 -0.2537 -0.3164
  [torch.FloatTensor of size 3x4]), (Variable containing:
  -0.0877 -0.1197 -0.2127 -0.3029
   0.0088  0.0429 -0.1944 -0.3726
  -0.1382 -0.1583 -0.2033 -0.3050
  [torch.FloatTensor of size 3x4], Variable containing:
  -0.1927 -0.3154 -0.3559 -0.5410
   0.0256  0.1121 -0.4053 -0.6216
  -0.2564 -0.4212 -0.3663 -0.5253
  [torch.FloatTensor of size 3x4]), (Variable containing:
   0.0350  0.0593 -0.20

# (Naive) Speed Comparison between LSTM and LSTMCell

In [23]:
from time import time

In [24]:
# Benchmark dimensions. input_dim equals hidden_dim so that a step's output can
# be fed straight back in as the next step's input in the decoder-style loops.
batch_size, max_seq_len = 30, 50
input_dim = hidden_dim = 40
num_layers = 1

# Number of timed repetitions in each benchmark loop.
n_epochs = 1000

In [25]:
# Build the two modules under comparison with identical sizes:
# nn.LSTM processes a whole batch-major sequence per call, while nn.LSTMCell
# advances a single time step per call.
lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
lstmcell = nn.LSTMCell(input_dim, hidden_dim)

## Feeding predefined inputs
### - used in RNN Encoder

### LSTM

In [26]:
# Encoder-style benchmark for nn.LSTM: the whole predefined sequence is passed
# in a single call. Timing is done manually with time(); the printed value is
# the total wall-clock seconds for n_epochs forward passes.
x = Variable(torch.randn([batch_size, max_seq_len, input_dim]))
h_0 = Variable(torch.zeros(num_layers, batch_size, hidden_dim))
c_0 = Variable(torch.zeros(num_layers, batch_size, hidden_dim))

start = time()

for _ in range(n_epochs):
    outputs_ = lstm(x, (h_0, c_0))

print(time() - start)

10.429596662521362


### LSTMCell

In [27]:
# Encoder-style benchmark for nn.LSTMCell: the same kind of predefined sequence
# is consumed one time step at a time in a Python loop. Timing is done manually
# with time(); the printed value is the total wall-clock seconds for n_epochs passes.
x = Variable(torch.randn([batch_size, max_seq_len, input_dim]))

start = time()

for _ in range(n_epochs):

    # Fresh zero states for every pass over the sequence.
    h = Variable(torch.zeros(batch_size, hidden_dim))
    c = Variable(torch.zeros(batch_size, hidden_dim))

    for i in range(max_seq_len):
        h, c = lstmcell(x[:, i, :], (h, c))

print(time() - start)

10.514601469039917


## Feeding output from last step
### - used in RNN Decoder

### LSTM

In [28]:
# Decoder-style benchmark for nn.LSTM: each step's output becomes the next
# step's input, so the module has to be called once per time step.
# Timing is done manually with time(); the printed value is the total
# wall-clock seconds for n_epochs decoding passes.
x = Variable(torch.randn([batch_size, max_seq_len, input_dim]))

start = time()

for _ in range(n_epochs):

    # Fresh zero states for every decoding pass.
    h = Variable(torch.zeros(num_layers, batch_size, hidden_dim))
    c = Variable(torch.zeros(num_layers, batch_size, hidden_dim))
    # Seed with the first time step, kept 3-D as (batch_size, 1, input_dim).
    # Named step_input so the builtin input() is not shadowed.
    step_input = x[:, 0:1, :]

    for i in range(max_seq_len):
        output, (h, c) = lstm(step_input, (h, c))
        # Feed the output back in; valid because hidden_dim == input_dim here.
        step_input = output

print(time() - start)

16.514944791793823


In [29]:
# Output of the last decoding step from the nn.LSTM loop above.
output

Variable containing:
(0 ,.,.) = 
  0.1053 -0.1296 -0.0428  ...   0.0479  0.0860 -0.1053

(1 ,.,.) = 
  0.1053 -0.1296 -0.0428  ...   0.0479  0.0860 -0.1053

(2 ,.,.) = 
  0.1053 -0.1296 -0.0428  ...   0.0479  0.0860 -0.1053
...

(27,.,.) = 
  0.1053 -0.1296 -0.0428  ...   0.0479  0.0860 -0.1053

(28,.,.) = 
  0.1053 -0.1296 -0.0428  ...   0.0479  0.0860 -0.1053

(29,.,.) = 
  0.1053 -0.1296 -0.0428  ...   0.0479  0.0860 -0.1053
[torch.FloatTensor of size 30x1x40]

### LSTMCell

In [30]:
# Decoder-style benchmark for nn.LSTMCell: each step's hidden state is fed back
# as the next step's input. Timing is done manually with time(); the printed
# value is the total wall-clock seconds for n_epochs decoding passes.
x = Variable(torch.randn([batch_size, max_seq_len, input_dim]))

start = time()

for _ in range(n_epochs):

    # Fresh zero states for every decoding pass.
    h = Variable(torch.zeros(batch_size, hidden_dim))
    c = Variable(torch.zeros(batch_size, hidden_dim))
    # Seed with the first time step as a 2-D (batch_size, input_dim) slice.
    # Named step_input so the builtin input() is not shadowed.
    step_input = x[:, 0, :]

    for i in range(max_seq_len):
        h, c = lstmcell(step_input, (h, c))
        # Feed the new hidden state back as the next input (valid because
        # hidden_dim == input_dim here); a real decoder would usually take an
        # argmax over output scores first.
        step_input = h

print(time() - start)

10.070575952529907


In [31]:
# NOTE(review): the LSTMCell cell above never assigns `output`, so this still
# displays the value left over from the nn.LSTM decoder cell (the recorded
# output below is identical to that cell's). The LSTMCell result is in `h`.
output

Variable containing:
(0 ,.,.) = 
  0.1053 -0.1296 -0.0428  ...   0.0479  0.0860 -0.1053

(1 ,.,.) = 
  0.1053 -0.1296 -0.0428  ...   0.0479  0.0860 -0.1053

(2 ,.,.) = 
  0.1053 -0.1296 -0.0428  ...   0.0479  0.0860 -0.1053
...

(27,.,.) = 
  0.1053 -0.1296 -0.0428  ...   0.0479  0.0860 -0.1053

(28,.,.) = 
  0.1053 -0.1296 -0.0428  ...   0.0479  0.0860 -0.1053

(29,.,.) = 
  0.1053 -0.1296 -0.0428  ...   0.0479  0.0860 -0.1053
[torch.FloatTensor of size 30x1x40]

# Rules of thumb
## - Use RNN when Encoding
## - Use RNNCells when decoding + (complex architecture)