In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# LSTM

![LSTM](fig/LSTM.jpg)

Math formulas:  
$$
\begin{aligned}
i_t &= \mathrm{sigmoid}(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
f_t &= \mathrm{sigmoid}(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
g_t &= \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
o_t &= \mathrm{sigmoid}(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
c_t &= f_t * c_{(t-1)} + i_t * g_t \\
h_t &= o_t * \tanh(c_t)
\end{aligned}
$$

By default (`batch_first=False`), LSTM expects its input to be a 3D tensor:  
* 1st dimension: sequence (along words in a sentence)  
* 2nd dimension: mini-batch  
* 3rd dimension: elements (embedding vector)  

In [2]:
# Fix the random seed so the randomly initialised weights printed below are reproducible.
torch.manual_seed(0)

# 3 input dimensions, 2 hidden dimensions, 1 layer.
lstm = nn.LSTM(3, 2)

# Input-to-hidden weights, the four gate blocks stacked row-wise:
# (W_ii|W_if|W_ig|W_io), 2 rows per gate -> shape (8, 3).
print(lstm.weight_ih_l0)
# Hidden-to-hidden weights: (W_hi|W_hf|W_hg|W_ho) -> shape (8, 2).
print(lstm.weight_hh_l0)
# Input-side biases: (b_ii|b_if|b_ig|b_io) -> shape (8,).
print(lstm.bias_ih_l0)
# Hidden-side biases: (b_hi|b_hf|b_hg|b_ho) -> shape (8,).
print(lstm.bias_hh_l0)

Parameter containing:
tensor([[-0.0053,  0.3793, -0.5820],
        [-0.5204, -0.2723,  0.1896],
        [-0.0140,  0.5607, -0.0628],
        [ 0.1871, -0.2137, -0.1390],
        [-0.6755, -0.4683, -0.2915],
        [ 0.0262,  0.2795,  0.4243],
        [-0.4794, -0.3079,  0.2568],
        [ 0.5872, -0.1455,  0.5291]], requires_grad=True)
Parameter containing:
tensor([[-0.1140,  0.0748],
        [ 0.6403, -0.6560],
        [-0.4452, -0.1790],
        [-0.2756,  0.6109],
        [-0.4583, -0.3255],
        [-0.4940, -0.6622],
        [-0.4128,  0.6078],
        [ 0.3155,  0.3427]], requires_grad=True)
Parameter containing:
tensor([ 0.0372, -0.3625,  0.1196, -0.6602, -0.5109, -0.3645,  0.4461,  0.4146],
       requires_grad=True)
Parameter containing:
tensor([-0.3136, -0.0255,  0.4522,  0.7030,  0.2806,  0.0955,  0.4741, -0.4163],
       requires_grad=True)


In [3]:
# 1 seq length, 1 batch size, 3 embedding dimensions.
in1 = torch.randn(1, 1, 3)
print(in1)

# Initialize the state pair (h_0, c_0) shown in the LSTM structure picture;
# each tensor has shape (1, 1, 2).
hidden0 = (torch.randn(1, 1, 2),
           torch.randn(1, 1, 2))

# Pass through the LSTM cell.
out1, hidden1 = lstm(in1, hidden0)
# With a single time step, the output equals the new hidden state h_1.
print(out1)
# h_1
print(hidden1[0])
# c_1
print(hidden1[1])

tensor([[[-0.0209, -0.7185,  0.5186]]])
tensor([[[ 0.4164, -0.4423]]], grad_fn=<StackBackward>)
tensor([[[ 0.4164, -0.4423]]], grad_fn=<StackBackward>)
tensor([[[ 0.5179, -1.3637]]], grad_fn=<StackBackward>)


In [4]:
# Manually reproduce the single LSTM step above from the raw parameters.
# Both products have shape (1, 2*4): the four gate blocks (i | f | g | o)
# lie side by side along the last dimension.
W_ih_xb = in1[0].mm(lstm.weight_ih_l0.T) + lstm.bias_ih_l0
W_hh_xb = hidden0[0][0].mm(lstm.weight_hh_l0.T) + lstm.bias_hh_l0

# Add the input and recurrent contributions once, then slice out each gate.
gates = W_ih_xb + W_hh_xb

# Each gate has shape (1, 2).
# torch.sigmoid / torch.tanh are used instead of the F.* aliases, which
# PyTorch releases of this notebook's era flag as deprecated.
i1 = torch.sigmoid(gates[:, 0:2])
f1 = torch.sigmoid(gates[:, 2:4])
g1 = torch.tanh(gates[:, 4:6])
o1 = torch.sigmoid(gates[:, 6:8])
c1 = f1 * hidden0[1][0] + i1 * g1
h1 = o1 * torch.tanh(c1)

print(h1)
print(c1)

# The hand-computed state must agree with what nn.LSTM returned.
assert torch.allclose(h1, hidden1[0][0], atol=1e-5)
assert torch.allclose(c1, hidden1[1][0], atol=1e-5)

tensor([[ 0.4164, -0.4423]], grad_fn=<MulBackward0>)
tensor([[ 0.5179, -1.3637]], grad_fn=<AddBackward0>)


## LSTM: Step the Sequence One Element at a Time

In [5]:
# Re-seed so the weights, inputs and initial state are the same on every run.
torch.manual_seed(0)

# Single-layer LSTM mapping 3 input features to 2 hidden units.
lstm = nn.LSTM(input_size=3, hidden_size=2)

# Input laid out as (seq_len=5, batch=1, features=3).
ins = torch.randn(5, 1, 3)

# Random initial state (h_0, c_0) from the LSTM structure picture.
h_0 = torch.randn(1, 1, 2)
c_0 = torch.randn(1, 1, 2)
hidden = (h_0, c_0)

# Feed the time steps one by one, carrying the state forward.
for step in range(ins.size(0)):
    # ins[step] is (1, 3); add the sequence dimension back to get (1, 1, 3).
    out, hidden = lstm(ins[step].unsqueeze(0), hidden)
    # `out` always equals hidden[0]; `hidden` is (h_t, c_t) in the picture.
    print(out)
    print(hidden)

tensor([[[0.0083, 0.1808]]], grad_fn=<StackBackward>)
(tensor([[[0.0083, 0.1808]]], grad_fn=<StackBackward>), tensor([[[0.0114, 0.4527]]], grad_fn=<StackBackward>))
tensor([[[0.1124, 0.0478]]], grad_fn=<StackBackward>)
(tensor([[[0.1124, 0.0478]]], grad_fn=<StackBackward>), tensor([[[0.1329, 0.1232]]], grad_fn=<StackBackward>))
tensor([[[ 0.4619, -0.0474]]], grad_fn=<StackBackward>)
(tensor([[[ 0.4619, -0.0474]]], grad_fn=<StackBackward>), tensor([[[ 0.6229, -0.3593]]], grad_fn=<StackBackward>))
tensor([[[ 0.2640, -0.1513]]], grad_fn=<StackBackward>)
(tensor([[[ 0.2640, -0.1513]]], grad_fn=<StackBackward>), tensor([[[ 0.4306, -0.4890]]], grad_fn=<StackBackward>))
tensor([[[ 0.0712, -0.1522]]], grad_fn=<StackBackward>)
(tensor([[[ 0.0712, -0.1522]]], grad_fn=<StackBackward>), tensor([[[ 0.1088, -0.3212]]], grad_fn=<StackBackward>))


## LSTM: Step the Sequence All at Once

In [6]:
# Same seed as the step-by-step cell, so the results can be compared directly.
torch.manual_seed(0)

# Single-layer LSTM mapping 3 input features to 2 hidden units.
lstm = nn.LSTM(input_size=3, hidden_size=2)

# Input laid out as (seq_len=5, batch=1, features=3).
ins = torch.randn(5, 1, 3)

# Random initial state (h_0, c_0) from the LSTM structure picture.
h_0 = torch.randn(1, 1, 2)
c_0 = torch.randn(1, 1, 2)

# One call consumes the whole sequence: `outs` holds h_t for every step,
# `hidden` holds only the final (h_n, c_n).
outs, hidden = lstm(ins, (h_0, c_0))
print(outs)
print(hidden)

tensor([[[ 0.0083,  0.1808]],

        [[ 0.1124,  0.0478]],

        [[ 0.4619, -0.0474]],

        [[ 0.2640, -0.1513]],

        [[ 0.0712, -0.1522]]], grad_fn=<StackBackward>)
(tensor([[[ 0.0712, -0.1522]]], grad_fn=<StackBackward>), tensor([[[ 0.1088, -0.3212]]], grad_fn=<StackBackward>))


In [7]:
# Stack a second LSTM on top of the first one:
#   * the output sequence of the first layer is the input sequence of the next layer;
#   * the last state of the first layer is the initial state of the next layer.

# Input size 2 matches the hidden size of the previous layer.
lstm_next = nn.LSTM(2, 2)

# Bind the results to new names: `outs` / `hidden` keep referring to the first
# layer's results, so re-running this cell never feeds the layer its own output.
outs_next, hidden_next = lstm_next(outs, hidden)
print(outs_next)
print(hidden_next)

tensor([[[ 0.1108, -0.0602]],

        [[ 0.1081,  0.0063]],

        [[ 0.0357,  0.0238]],

        [[ 0.0341,  0.0623]],

        [[ 0.0752,  0.1061]]], grad_fn=<StackBackward>)
(tensor([[[0.0752, 0.1061]]], grad_fn=<StackBackward>), tensor([[[0.1004, 0.2453]]], grad_fn=<StackBackward>))


## LSTM: Batched Input and Output of LSTM Layer

In [8]:
# Re-seed so the weights and inputs are the same on every run.
torch.manual_seed(0)

# Single-layer LSTM mapping 3 input features to 2 hidden units.
lstm = nn.LSTM(input_size=3, hidden_size=2)

# Batched input: (seq_len=5, batch=16, features=3).
ins = torch.randn(5, 16, 3)

# Every sequence in the mini-batch starts from the same all-zero state;
# the batch size sits in the second dimension of h_0 and c_0.
h_0 = torch.zeros(1, 16, 2)
c_0 = torch.zeros_like(h_0)

outs, hidden = lstm(ins, (h_0, c_0))
print(outs.shape)
print(*(state.shape for state in hidden))

torch.Size([5, 16, 2])
torch.Size([1, 16, 2]) torch.Size([1, 16, 2])


## LSTM: Batch-First Input and Output

In [9]:
# Re-seed so the weights and inputs are the same on every run.
torch.manual_seed(0)

# Single-layer LSTM (3 -> 2) that takes and returns batch-first tensors.
lstm = nn.LSTM(input_size=3, hidden_size=2, batch_first=True)

# Batch-first input: (batch=16, seq_len=5, features=3).
ins = torch.randn(16, 5, 3)

# Every sequence in the mini-batch starts from the same all-zero state.
# NOTE: batch_first does not apply to the state; its batch size stays in
# the second dimension.
h_0 = torch.zeros(1, 16, 2)
c_0 = torch.zeros_like(h_0)

outs, hidden = lstm(ins, (h_0, c_0))
print(outs.shape)
print(*(state.shape for state in hidden))

torch.Size([16, 5, 2])
torch.Size([1, 16, 2]) torch.Size([1, 16, 2])


## LSTM: Packed (Masked) Input and Output of LSTM Layer

In [10]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Fix the random seed so the tensors shown below are reproducible.
torch.manual_seed(0)

# 3 input dimensions, 2 hidden dimensions, 1 layer.
# Use batch-first to show the input and output more clearly.
lstm = nn.LSTM(3, 2, batch_first=True)

# Padded input that will be masked via packing in the next cells.
# Batch-first layout: 4 batch size, 5 seq length, 3 embedding dimensions.
ins = torch.randn(4, 5, 3)
# Display the full padded batch so it can be compared with the packed data.
ins

tensor([[[ 0.2055, -0.4503, -0.5731],
         [-0.5554,  0.5943,  1.5419],
         [ 0.5073, -0.5910, -1.3253],
         [ 0.1886, -0.0691, -0.4949],
         [-1.4959, -0.1938,  0.4455]],

        [[ 1.3253,  1.5091,  2.0820],
         [ 1.7067,  2.3804, -1.1256],
         [-0.3170, -1.0925, -0.0852],
         [ 0.3276, -0.7607, -1.5991],
         [ 0.0185, -0.7504,  0.1854]],

        [[ 0.6211,  0.6382, -0.0033],
         [-0.5344,  1.1687,  0.3945],
         [ 1.9415,  0.7915, -0.0203],
         [-0.4372, -1.5353, -0.4127],
         [ 0.9663,  1.6248,  0.9625]],

        [[ 0.3492, -0.9215, -0.0562],
         [-0.7015,  1.0367, -0.6037],
         [-1.2788,  0.1239,  1.1648],
         [ 0.9234,  1.3873,  1.3750],
         [ 0.6596,  0.4766, -1.0163]]])

In [11]:
# The sequences have to be sorted in decreasing order of length.
# The four sentences have 5, 5, 4, 2 valid words, respectively.
# In the result, `data` is time-major (all first steps, then all second steps, ...)
# and `batch_sizes[t]` is the number of sequences still running at step t.
packed_ins = pack_padded_sequence(ins, lengths=[5, 5, 4, 2], batch_first=True)
packed_ins

PackedSequence(data=tensor([[ 0.2055, -0.4503, -0.5731],
        [ 1.3253,  1.5091,  2.0820],
        [ 0.6211,  0.6382, -0.0033],
        [ 0.3492, -0.9215, -0.0562],
        [-0.5554,  0.5943,  1.5419],
        [ 1.7067,  2.3804, -1.1256],
        [-0.5344,  1.1687,  0.3945],
        [-0.7015,  1.0367, -0.6037],
        [ 0.5073, -0.5910, -1.3253],
        [-0.3170, -1.0925, -0.0852],
        [ 1.9415,  0.7915, -0.0203],
        [ 0.1886, -0.0691, -0.4949],
        [ 0.3276, -0.7607, -1.5991],
        [-0.4372, -1.5353, -0.4127],
        [-1.4959, -0.1938,  0.4455],
        [ 0.0185, -0.7504,  0.1854]]), batch_sizes=tensor([4, 4, 3, 3, 2]), sorted_indices=None, unsorted_indices=None)

In [12]:
# Round trip back to a padded tensor: returns (padded batch, lengths).
# The values outside the specified valid lengths in the original input are lost;
# those positions come back as zeros.
pad_packed_sequence(packed_ins, batch_first=True)

(tensor([[[ 0.2055, -0.4503, -0.5731],
          [-0.5554,  0.5943,  1.5419],
          [ 0.5073, -0.5910, -1.3253],
          [ 0.1886, -0.0691, -0.4949],
          [-1.4959, -0.1938,  0.4455]],
 
         [[ 1.3253,  1.5091,  2.0820],
          [ 1.7067,  2.3804, -1.1256],
          [-0.3170, -1.0925, -0.0852],
          [ 0.3276, -0.7607, -1.5991],
          [ 0.0185, -0.7504,  0.1854]],
 
         [[ 0.6211,  0.6382, -0.0033],
          [-0.5344,  1.1687,  0.3945],
          [ 1.9415,  0.7915, -0.0203],
          [-0.4372, -1.5353, -0.4127],
          [ 0.0000,  0.0000,  0.0000]],
 
         [[ 0.3492, -0.9215, -0.0562],
          [-0.7015,  1.0367, -0.6037],
          [ 0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000]]]),
 tensor([5, 5, 4, 2]))

In [13]:
# Pass the packed batch through the LSTM, starting from an all-zero (h_0, c_0).
# NOTE: the batch size is still in the second place.
hidden = (torch.zeros(1, 4, 2),
          torch.zeros(1, 4, 2))

# A packed input yields a packed output with the same batch_sizes.
packed_outs, hidden = lstm(packed_ins, hidden)
packed_outs

PackedSequence(data=tensor([[ 0.0029, -0.0991],
        [-0.1605,  0.1643],
        [-0.2102, -0.0125],
        [-0.0048, -0.1174],
        [-0.1077,  0.1164],
        [-0.2453,  0.0519],
        [-0.3288,  0.0454],
        [-0.0139, -0.0448],
        [-0.0145, -0.0406],
        [-0.2614, -0.0780],
        [-0.3299,  0.0585],
        [-0.0623, -0.0949],
        [ 0.0572, -0.1128],
        [-0.1017, -0.1055],
        [ 0.1499, -0.0470],
        [ 0.0512, -0.1822]], grad_fn=<CatBackward>), batch_sizes=tensor([4, 4, 3, 3, 2]), sorted_indices=None, unsorted_indices=None)

In [14]:
# Unpack to a padded (batch, seq, hidden) tensor plus the valid lengths;
# positions past each sequence's length are filled with zeros.
outs, lengths = pad_packed_sequence(packed_outs, batch_first=True)
print(outs)
print(lengths)

tensor([[[ 0.0029, -0.0991],
         [-0.1077,  0.1164],
         [-0.0145, -0.0406],
         [-0.0623, -0.0949],
         [ 0.1499, -0.0470]],

        [[-0.1605,  0.1643],
         [-0.2453,  0.0519],
         [-0.2614, -0.0780],
         [ 0.0572, -0.1128],
         [ 0.0512, -0.1822]],

        [[-0.2102, -0.0125],
         [-0.3288,  0.0454],
         [-0.3299,  0.0585],
         [-0.1017, -0.1055],
         [ 0.0000,  0.0000]],

        [[-0.0048, -0.1174],
         [-0.0139, -0.0448],
         [ 0.0000,  0.0000],
         [ 0.0000,  0.0000],
         [ 0.0000,  0.0000]]], grad_fn=<TransposeBackward0>)
tensor([5, 5, 4, 2])


In [15]:
# Final (h_n, c_n): for each sequence, h_n is the output at its last *valid*
# step (compare with `outs` above), not at the padded end of the batch.
hidden

(tensor([[[ 0.1499, -0.0470],
          [ 0.0512, -0.1822],
          [-0.1017, -0.1055],
          [-0.0139, -0.0448]]], grad_fn=<StackBackward>),
 tensor([[[ 0.1771, -0.1396],
          [ 0.0684, -0.3450],
          [-0.1215, -0.2590],
          [-0.0207, -0.1593]]], grad_fn=<StackBackward>))

## LSTM: Bidirectional

In [16]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Fix the random seed so the tensors shown below are reproducible.
torch.manual_seed(0)

# 3 input dimensions, 2 hidden dimensions per direction, 1 layer, bidirectional.
bi_lstm = nn.LSTM(3, 2, batch_first=True, bidirectional=True)

# Padded input that is masked via packing below.
# Batch-first layout: 4 batch size, 5 seq length, 3 embedding dimensions.
ins = torch.randn(4, 5, 3)

# The sequences have to be sorted in decreasing order of length.
# The four sentences have 5, 5, 4, 2 valid words, respectively.
packed_ins = pack_padded_sequence(ins, lengths=[5, 5, 4, 2], batch_first=True)
packed_ins

PackedSequence(data=tensor([[-0.2188, -2.4351, -0.0729],
        [ 1.3873, -0.8834, -0.4189],
        [-0.3126,  0.2458, -0.2596],
        [-1.1964,  0.1970, -1.1773],
        [-0.0340,  0.9625,  0.3492],
        [-0.8048,  0.5656,  0.6104],
        [ 0.1183,  0.2440,  1.1646],
        [-0.0661, -0.3584, -1.5616],
        [-0.9215, -0.0562, -0.6227],
        [ 0.4669,  1.9507, -1.0631],
        [ 0.2886,  0.3866, -0.2011],
        [-0.4637,  1.9218, -0.4025],
        [-0.0773,  0.1164, -0.5940],
        [-0.1179,  0.1922, -0.7722],
        [ 0.1239,  1.1648,  0.9234],
        [-1.2439, -0.1021, -1.0335]]), batch_sizes=tensor([4, 4, 3, 3, 2]), sorted_indices=None, unsorted_indices=None)

In [17]:
# Pass the packed batch through the bidirectional LSTM from an all-zero state.
# First dim of the state: num_layers * num_directions = 1 * 2.
# NOTE: the batch size is still in the second place.
hidden = (torch.zeros(2, 4, 2),
          torch.zeros(2, 4, 2))

packed_outs, hidden = bi_lstm(packed_ins, hidden)
packed_outs

PackedSequence(data=tensor([[ 0.1599, -0.2296, -0.0549,  0.0859],
        [-0.1351, -0.1073,  0.1882,  0.0390],
        [-0.0206, -0.0524,  0.1111,  0.2702],
        [ 0.3011, -0.0614,  0.0943,  0.0871],
        [-0.0967, -0.0438,  0.1588,  0.2738],
        [-0.1369,  0.0257,  0.0899,  0.3400],
        [-0.1503,  0.0542,  0.0921,  0.2098],
        [ 0.2555, -0.1526,  0.1258, -0.1127],
        [ 0.1514, -0.0751,  0.0661,  0.2950],
        [-0.3010,  0.0029,  0.2636,  0.1700],
        [-0.2395, -0.0082,  0.1821,  0.1473],
        [-0.1441, -0.0259,  0.1622,  0.3950],
        [-0.3154, -0.0406,  0.1405,  0.1216],
        [-0.1410, -0.0510,  0.1272,  0.0702],
        [-0.3265,  0.0767,  0.0860,  0.2457],
        [ 0.1149, -0.0599,  0.0452,  0.1205]], grad_fn=<CatBackward>), batch_sizes=tensor([4, 4, 3, 3, 2]), sorted_indices=None, unsorted_indices=None)

In [18]:
# Unpack to a padded (batch, seq, 2 * hidden) tensor plus the valid lengths.
# In the last dimension, the first 2 values come from the forward direction
# and the last 2 from the backward direction.
outs, lengths = pad_packed_sequence(packed_outs, batch_first=True)
print(outs)
print(lengths)

tensor([[[ 0.1599, -0.2296, -0.0549,  0.0859],
         [-0.0967, -0.0438,  0.1588,  0.2738],
         [ 0.1514, -0.0751,  0.0661,  0.2950],
         [-0.1441, -0.0259,  0.1622,  0.3950],
         [-0.3265,  0.0767,  0.0860,  0.2457]],

        [[-0.1351, -0.1073,  0.1882,  0.0390],
         [-0.1369,  0.0257,  0.0899,  0.3400],
         [-0.3010,  0.0029,  0.2636,  0.1700],
         [-0.3154, -0.0406,  0.1405,  0.1216],
         [ 0.1149, -0.0599,  0.0452,  0.1205]],

        [[-0.0206, -0.0524,  0.1111,  0.2702],
         [-0.1503,  0.0542,  0.0921,  0.2098],
         [-0.2395, -0.0082,  0.1821,  0.1473],
         [-0.1410, -0.0510,  0.1272,  0.0702],
         [ 0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 0.3011, -0.0614,  0.0943,  0.0871],
         [ 0.2555, -0.1526,  0.1258, -0.1127],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000]]], grad_fn=<TransposeBackward0>)
tensor([5, 5, 4, 2])


In [19]:
# Final (h_n, c_n), one entry per direction. Comparing with `outs` above:
# h_n[0] is the forward output at each sequence's last valid step,
# h_n[1] is the backward output at each sequence's first step.
hidden

(tensor([[[-0.3265,  0.0767],
          [ 0.1149, -0.0599],
          [-0.1410, -0.0510],
          [ 0.2555, -0.1526]],
 
         [[-0.0549,  0.0859],
          [ 0.1882,  0.0390],
          [ 0.1111,  0.2702],
          [ 0.0943,  0.0871]]], grad_fn=<StackBackward>),
 tensor([[[-0.5174,  0.1323],
          [ 0.1446, -0.3065],
          [-0.2064, -0.1436],
          [ 0.4362, -0.5133]],
 
         [[-0.5596,  0.1723],
          [ 0.4508,  0.1184],
          [ 0.3183,  0.4854],
          [ 0.4917,  0.1074]]], grad_fn=<StackBackward>))

## LSTM: Multi-Layer

In [20]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Fix the random seed so the tensors shown below are reproducible.
torch.manual_seed(0)

# 3 input dimensions, 2 hidden dimensions per direction, 2 layers, bidirectional.
bi_lstm = nn.LSTM(3, 2, batch_first=True, bidirectional=True, num_layers=2)

# Padded input that is masked via packing below.
# Batch-first layout: 4 batch size, 5 seq length, 3 embedding dimensions.
ins = torch.randn(4, 5, 3)

# The sequences have to be sorted in decreasing order of length.
# The four sentences have 5, 5, 4, 2 valid words, respectively.
packed_ins = pack_padded_sequence(ins, lengths=[5, 5, 4, 2], batch_first=True)
packed_ins

PackedSequence(data=tensor([[-1.4777, -1.7557,  0.0762],
        [ 0.3904, -0.0394, -0.8015],
        [ 1.6953,  2.0655, -0.2340],
        [-1.2320,  0.6257, -1.2231],
        [-1.0786,  1.4403, -0.1106],
        [-0.4955, -0.3615,  0.5851],
        [ 0.7073,  0.5800,  0.2683],
        [-0.6232, -0.2162, -0.4887],
        [ 0.5769, -0.1692, -0.0640],
        [-1.1560, -0.1434, -0.1947],
        [-2.0589,  0.5340, -0.5354],
        [ 1.0384,  0.9068, -0.4755],
        [-0.0856,  1.3945,  0.5969],
        [-0.8637, -0.0235,  1.1717],
        [-0.8707,  0.1447,  1.9029],
        [-0.4828, -0.3661, -1.3271]]), batch_sizes=tensor([4, 4, 3, 3, 2]), sorted_indices=None, unsorted_indices=None)

In [21]:
# First dim: num_layers * num_directions = 2 * 2.
# The batch size (4) is still in the second place.
hidden = (torch.zeros(2*2, 4, 2),
          torch.zeros(2*2, 4, 2))

packed_outs, hidden = bi_lstm(packed_ins, hidden)
packed_outs

PackedSequence(data=tensor([[ 3.9679e-02,  3.9668e-02,  9.9274e-04, -3.3757e-01],
        [ 4.8100e-02,  1.1329e-01,  1.9735e-02, -3.2491e-01],
        [ 5.0718e-02,  1.4100e-01,  2.7250e-02, -3.2355e-01],
        [ 3.7809e-02,  5.7408e-02, -1.7073e-02, -3.5471e-01],
        [ 6.4809e-02,  9.5330e-02,  1.5299e-03, -3.6686e-01],
        [ 6.6838e-02,  8.3767e-02,  1.1935e-02, -3.2278e-01],
        [ 7.6019e-02,  1.6800e-01,  3.1728e-02, -3.0029e-01],
        [ 5.6300e-02,  5.6807e-02, -1.9340e-02, -2.9829e-01],
        [ 7.3877e-02,  1.2904e-01,  2.2365e-02, -3.2285e-01],
        [ 7.4595e-02,  6.2288e-02, -3.5284e-03, -3.5036e-01],
        [ 8.5724e-02,  1.0087e-01,  2.5473e-03, -3.2244e-01],
        [ 8.4051e-02,  1.6466e-01,  2.1183e-02, -3.0374e-01],
        [ 8.1718e-02,  1.3030e-01,  3.0468e-03, -3.4926e-01],
        [ 8.2684e-02,  9.0221e-02, -3.1224e-04, -2.6496e-01],
        [ 8.6588e-02,  1.2812e-01,  2.0970e-02, -2.1160e-01],
        [ 7.0915e-02,  8.8007e-02, -5.3031e-03, -2

In [22]:
# Unpack to a padded (batch, seq, 2 * hidden) tensor plus the valid lengths.
# The last dimension is still 4: only the top layer's outputs are returned.
outs, lengths = pad_packed_sequence(packed_outs, batch_first=True)
print(outs)
print(lengths)

tensor([[[ 3.9679e-02,  3.9668e-02,  9.9274e-04, -3.3757e-01],
         [ 6.4809e-02,  9.5330e-02,  1.5299e-03, -3.6686e-01],
         [ 7.3877e-02,  1.2904e-01,  2.2365e-02, -3.2285e-01],
         [ 8.4051e-02,  1.6466e-01,  2.1183e-02, -3.0374e-01],
         [ 8.6588e-02,  1.2812e-01,  2.0970e-02, -2.1160e-01]],

        [[ 4.8100e-02,  1.1329e-01,  1.9735e-02, -3.2491e-01],
         [ 6.6838e-02,  8.3767e-02,  1.1935e-02, -3.2278e-01],
         [ 7.4595e-02,  6.2288e-02, -3.5284e-03, -3.5036e-01],
         [ 8.1718e-02,  1.3030e-01,  3.0468e-03, -3.4926e-01],
         [ 7.0915e-02,  8.8007e-02, -5.3031e-03, -2.8080e-01]],

        [[ 5.0718e-02,  1.4100e-01,  2.7250e-02, -3.2355e-01],
         [ 7.6019e-02,  1.6800e-01,  3.1728e-02, -3.0029e-01],
         [ 8.5724e-02,  1.0087e-01,  2.5473e-03, -3.2244e-01],
         [ 8.2684e-02,  9.0221e-02, -3.1224e-04, -2.6496e-01],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]],

        [[ 3.7809e-02,  5.7408e-02, -1.7073e-02, 

In [23]:
# Final (h_n, c_n) with num_layers * num_directions = 4 entries each.
# The last two entries of h_n match the forward / backward halves of `outs`
# above, i.e. they belong to the top layer; the first two are presumably the
# first layer's forward / backward states (confirm against the nn.LSTM docs).
hidden

(tensor([[[-0.3424,  0.1747],
          [ 0.1782, -0.0752],
          [-0.0449,  0.0524],
          [ 0.2657, -0.1505]],
 
         [[-0.0359,  0.2987],
          [ 0.1657,  0.0949],
          [ 0.2546,  0.1280],
          [ 0.1036,  0.2277]],
 
         [[ 0.0866,  0.1281],
          [ 0.0709,  0.0880],
          [ 0.0827,  0.0902],
          [ 0.0563,  0.0568]],
 
         [[ 0.0010, -0.3376],
          [ 0.0197, -0.3249],
          [ 0.0273, -0.3236],
          [-0.0171, -0.3547]]], grad_fn=<StackBackward>),
 tensor([[[-0.4188,  0.3046],
          [ 0.2505, -0.2792],
          [-0.0534,  0.1012],
          [ 0.3787, -0.4299]],
 
         [[-0.4642,  0.4800],
          [ 0.4496,  0.1709],
          [ 0.3041,  0.4286],
          [ 0.4325,  0.2871]],
 
         [[ 0.3096,  0.2118],
          [ 0.2702,  0.1278],
          [ 0.2827,  0.1391],
          [ 0.1959,  0.0818]],
 
         [[ 0.0031, -0.5341],
          [ 0.0592, -0.5119],
          [ 0.0822, -0.4914],
          [-0.0508, -0.5

# An LSTM for Part-of-Speech Tagging

In [24]:
# Toy corpus: each item is (list of words, list of POS tags of equal length).
train_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
]

# Give every distinct word / tag the next free integer id, in order of
# first appearance.
word2idx, tag2idx = {}, {}
for words, labels in train_data:
    for token in words:
        word2idx.setdefault(token, len(word2idx))
    for label in labels:
        tag2idx.setdefault(label, len(tag2idx))
print(word2idx)
print(tag2idx)

# Model hyper-parameters and the vocabulary / tag-set sizes derived from the data.
EMB_DIM = HIDDEN_DIM = 6
VOC_SIZE, TAGSET_SIZE = len(word2idx), len(tag2idx)

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
{'DET': 0, 'NN': 1, 'V': 2}


In [25]:
class LSTMTagger(nn.Module):
    """Part-of-speech tagger: word embedding -> single-layer LSTM -> linear -> log-softmax.

    Args:
        emb_dim: Size of each word embedding vector.
        hidden_dim: Number of hidden units of the LSTM.
        voc_size: Number of distinct words (rows of the embedding table).
        tagset_size: Number of distinct tags (width of the output scores).
    """

    def __init__(self, emb_dim, hidden_dim, voc_size, tagset_size):
        super().__init__()
        self.word_emb = nn.Embedding(voc_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    @property
    def hidden_0(self):
        """All-zero initial state (h_0, c_0), each of shape (1, 1, hidden_dim).

        Built on access from the embedding weight's dtype and device, so it
        keeps matching the parameters after `model.to(...)` or
        `model.double()`. A pair of plain tensors created once in `__init__`
        would stay on the original device/dtype.
        """
        weight = self.word_emb.weight
        shape = (1, 1, self.lstm.hidden_size)
        return (torch.zeros(*shape, dtype=weight.dtype, device=weight.device),
                torch.zeros(*shape, dtype=weight.dtype, device=weight.device))

    def forward(self, sent):
        """Score every word of one sentence.

        Args:
            sent: 1-D LongTensor of word indices, one entry per word.

        Returns:
            Tensor of shape (len(sent), tagset_size) holding log-probabilities
            over the tags for each word.
        """
        emb = self.word_emb(sent)
        # Treat the single sentence as a batch of size 1: (1, seq_len, emb_dim).
        lstm_outs, _ = self.lstm(emb.view(1, sent.size(0), -1), self.hidden_0)

        # Drop the batch dimension again: one row of tag scores per word.
        tag_space = self.hidden2tag(lstm_outs.view(sent.size(0), -1))
        tag_scores = F.log_softmax(tag_space, dim=-1)
        return tag_scores

In [26]:
# Build the tagger, its loss and its optimizer.
model = LSTMTagger(EMB_DIM, HIDDEN_DIM, VOC_SIZE, TAGSET_SIZE)
# The model returns log-probabilities (log_softmax), which is what NLLLoss expects.
loss_func = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Encode the first training sentence as a tensor of word indices.
sent_idxes = [word2idx[w] for w in train_data[0][0]]
sent_ins = torch.tensor(sent_idxes, dtype=torch.long)
tag_scores = model(sent_ins)

# Scores for each word in the sentence, before any training step.
print(tag_scores)

tensor([[-1.4727, -1.2174, -0.7451],
        [-1.4879, -1.1658, -0.7712],
        [-1.4218, -1.1337, -0.8281],
        [-1.2431, -1.0778, -0.9911],
        [-1.3597, -1.3337, -0.7345]], grad_fn=<LogSoftmaxBackward>)


In [27]:
# Encode every (sentence, tags) pair as index tensors once, instead of
# rebuilding the same tensors in each of the 300 epochs.
train_tensors = [
    (torch.tensor([word2idx[w] for w in sent], dtype=torch.long),
     torch.tensor([tag2idx[tag] for tag in tags], dtype=torch.long))
    for sent, tags in train_data
]

for epoch in range(300):
    for sent_ins, targets in train_tensors:
        # Gradients accumulate across backward() calls; clear them before each step.
        model.zero_grad()

        tag_scores = model(sent_ins)
        loss = loss_func(tag_scores, targets)
        loss.backward()
        optimizer.step()

# Score the first training sentence again, now with the trained model.
sent_ins = train_tensors[0][0]
tag_scores = model(sent_ins)
print(tag_scores)
# Predicted tag index for each word.
print(tag_scores.argmax(dim=1))

tensor([[-0.1235, -3.0423, -2.6821],
        [-4.4550, -0.0218, -4.6157],
        [-3.1474, -4.6781, -0.0537],
        [-0.0183, -5.6775, -4.2190],
        [-3.8691, -0.0278, -5.0329]], grad_fn=<LogSoftmaxBackward>)
tensor([0, 1, 2, 0, 1], grad_fn=<NotImplemented>)


In [28]:
# Gold tag indices of the first training sentence, for comparison with the
# predicted indices printed by the previous cell.
[tag2idx[t] for t in train_data[0][1]]

[0, 1, 2, 0, 1]

# RNN: Recurrent Neural Network

![RNN](fig/RNN.jpg)

Math formulas: 
$$
h_t = \tanh(W_{ih} x_t + b_{ih} + W_{hh} h_{(t-1)} + b_{hh})
$$

In [29]:
# Fix the random seed so the randomly initialised weights printed below are reproducible.
torch.manual_seed(0)

# 3 input dimensions, 2 hidden dimensions, 1 layer.
rnn = nn.RNN(3, 2)

# W_ih, shape (2, 3)
print(rnn.weight_ih_l0)
# W_hh, shape (2, 2)
print(rnn.weight_hh_l0)
# b_ih, shape (2,)
print(rnn.bias_ih_l0)
# b_hh, shape (2,)
print(rnn.bias_hh_l0)

Parameter containing:
tensor([[-0.0053,  0.3793, -0.5820],
        [-0.5204, -0.2723,  0.1896]], requires_grad=True)
Parameter containing:
tensor([[-0.0140,  0.5607],
        [-0.0628,  0.1871]], requires_grad=True)
Parameter containing:
tensor([-0.2137, -0.1390], requires_grad=True)
Parameter containing:
tensor([-0.6755, -0.4683], requires_grad=True)


In [30]:
# 1 seq length, 1 batch size, 3 embedding dimensions.
in1 = torch.randn(1, 1, 3)
print(in1)

# Initialize the hidden state, which is h_0 in the RNN structure picture.
hidden0 = torch.randn(1, 1, 2)

# Pass through the RNN cell.
out1, hidden1 = rnn(in1, hidden0)
# With a single time step, the output and the new hidden state h_1 coincide.
print(out1)
print(hidden1)

tensor([[[-0.7831,  1.0622, -0.2613]]])
tensor([[[-0.1114, -0.4837]]], grad_fn=<StackBackward>)
tensor([[[-0.1114, -0.4837]]], grad_fn=<StackBackward>)


In [31]:
# Manually reproduce the single RNN step above from the raw parameters.
# Both products have shape (1, 2).
W_ih_xb = in1[0].mm(rnn.weight_ih_l0.T) + rnn.bias_ih_l0
W_hh_xb = hidden0[0].mm(rnn.weight_hh_l0.T) + rnn.bias_hh_l0

# Shape of (1, 2).
# torch.tanh is used instead of the F.tanh alias, which PyTorch releases of
# this notebook's era flag as deprecated.
h1 = torch.tanh(W_ih_xb + W_hh_xb)

print(h1)

# The hand-computed state must agree with what nn.RNN returned.
assert torch.allclose(h1, hidden1[0], atol=1e-5)

tensor([[-0.1114, -0.4837]], grad_fn=<TanhBackward>)


# GRU: Gated Recurrent Unit

![GRU](fig/RNN-vs-LSTM-vs-GRU.jpg)

Math formulas: 
$$
\begin{aligned}
r_t &= \mathrm{sigmoid}(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
z_t &= \mathrm{sigmoid}(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
n_t &= \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\
h_t &= (1 - z_t) * n_t + z_t * h_{(t-1)} \\
\end{aligned}
$$

In [32]:
# Fix the random seed so the randomly initialised weights printed below are reproducible.
torch.manual_seed(0)

# 3 input dimensions, 2 hidden dimensions, 1 layer.
gru = nn.GRU(3, 2)

# Input-to-hidden weights, the three gate blocks stacked row-wise:
# (W_ir|W_iz|W_in), 2 rows per gate -> shape (6, 3).
print(gru.weight_ih_l0)
# Hidden-to-hidden weights: (W_hr|W_hz|W_hn) -> shape (6, 2).
print(gru.weight_hh_l0)
# Input-side biases: (b_ir|b_iz|b_in) -> shape (6,).
print(gru.bias_ih_l0)
# Hidden-side biases: (b_hr|b_hz|b_hn) -> shape (6,).
print(gru.bias_hh_l0)

Parameter containing:
tensor([[-0.0053,  0.3793, -0.5820],
        [-0.5204, -0.2723,  0.1896],
        [-0.0140,  0.5607, -0.0628],
        [ 0.1871, -0.2137, -0.1390],
        [-0.6755, -0.4683, -0.2915],
        [ 0.0262,  0.2795,  0.4243]], requires_grad=True)
Parameter containing:
tensor([[-0.4794, -0.3079],
        [ 0.2568,  0.5872],
        [-0.1455,  0.5291],
        [-0.1140,  0.0748],
        [ 0.6403, -0.6560],
        [-0.4452, -0.1790]], requires_grad=True)
Parameter containing:
tensor([-0.2756,  0.6109, -0.4583, -0.3255, -0.4940, -0.6622],
       requires_grad=True)
Parameter containing:
tensor([-0.4128,  0.6078,  0.3155,  0.3427,  0.0372, -0.3625],
       requires_grad=True)


In [33]:
# 1 seq length, 1 batch size, 3 embedding dimensions.
in1 = torch.randn(1, 1, 3)
print(in1)

# Initialize the hidden state, which is h_0 in the GRU structure picture.
hidden0 = torch.randn(1, 1, 2)

# Pass through the GRU cell.
out1, hidden1 = gru(in1, hidden0)
# With a single time step, the output and the new hidden state h_1 coincide.
print(out1)
print(hidden1)

tensor([[[-0.2142, -0.4320, -0.7079]]])
tensor([[[ 0.2779, -1.0391]]], grad_fn=<StackBackward>)
tensor([[[ 0.2779, -1.0391]]], grad_fn=<StackBackward>)


In [34]:
# Manually reproduce the single GRU step above from the raw parameters.
# Both products have shape (1, 2*3): the three gate blocks (r | z | n)
# lie side by side along the last dimension.
W_ih_xb = in1[0].mm(gru.weight_ih_l0.T) + gru.bias_ih_l0
W_hh_xb = hidden0[0].mm(gru.weight_hh_l0.T) + gru.bias_hh_l0

# r and z use the plain sum of both contributions; compute that sum once.
gates = W_ih_xb + W_hh_xb

# Each gate has shape (1, 2).
# torch.sigmoid / torch.tanh are used instead of the F.* aliases, which
# PyTorch releases of this notebook's era flag as deprecated.
r1 = torch.sigmoid(gates[:, 0:2])
z1 = torch.sigmoid(gates[:, 2:4])
# For n, the reset gate scales only the recurrent part (bias b_hn included).
n1 = torch.tanh(W_ih_xb[:, 4:6] + r1 * W_hh_xb[:, 4:6])
h1 = (1 - z1) * n1 + z1 * hidden0[0]

print(h1)

# The hand-computed state must agree with what nn.GRU returned.
assert torch.allclose(h1, hidden1[0], atol=1e-5)

tensor([[ 0.2779, -1.0391]], grad_fn=<AddBackward0>)
