# 2021/02/16 Lecture Summary 

## LSTM

![](http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-chain.png)
[출처](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)
$$
f_t = \sigma(W_f[h_{t-1}, x_t] + b_f) \\
i_t = \sigma(W_i[h_{t-1}, x_t] + b_i) \\
\tilde{C_t} = \text{tanh}(W_c[h_{t-1}, x_t] + b_c) \\
C_t = f_t * C_{t-1} + i_t * \tilde{C_t} \\
o_t = \sigma(W_o[h_{t-1}, x_t] + b_o) \\
h_t = o_t * \text{tanh}(C_t)
$$


In [2]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Report the installed PyTorch version so the recorded outputs can be reproduced.
print(f"PyTorch version:[{torch.__version__}].")
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"device:[{device}].")

PyTorch version:[1.6.0].
device:[cpu].


In [3]:
class LSTMModel(nn.Module):
    '''
    Token embedding followed by a (multi-layer, optionally bidirectional) LSTM.

    Code reference
        https://colab.research.google.com/github/sjchoi86/upstage-basic-deeplearning/blob/main/notebook/lstm.ipynb
        https://drive.google.com/file/d/1DwMbVf9KVvWtGz_rGyhM1uXkmc33JUlM/view

    Args:
        x_dim: number of entries in the embedding table (vocabulary size).
        embedding_dim: size of each embedding vector (LSTM input size).
        h_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        n_dirs: 1 for a unidirectional LSTM, 2 for a bidirectional one.
        use_pack: when True, ``forward`` packs the padded batch before the
            LSTM and unpacks the output afterwards.

    Raises:
        ValueError: if ``n_dirs`` is neither 1 nor 2.
    '''
    def __init__(self, x_dim, embedding_dim, h_dim, n_layers, n_dirs, use_pack=False):
        super().__init__()
        # Any other value would make the initial-state shape built in forward()
        # disagree with the number of directions the LSTM actually has.
        if n_dirs not in (1, 2):
            raise ValueError(f"n_dirs must be 1 or 2, got {n_dirs}")
        self.x_dim = x_dim
        self.embedding_dim = embedding_dim
        self.h_dim = h_dim
        self.n_layers = n_layers
        self.n_dirs = n_dirs
        self.use_pack = use_pack
        self.embedding = nn.Embedding(x_dim, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.h_dim,
            num_layers=self.n_layers,
            bidirectional=n_dirs > 1,
        )

    def forward(self, x, valid_len, device):
        '''
        Run the embedded batch through the LSTM starting from zero states.

        Args:
            x: integer token ids, batch-first: (batch, seq_len).
            valid_len: unpadded length of every sequence in ``x``. Only used
                when ``use_pack`` is True; pack_padded_sequence is called with
                its default ``enforce_sorted=True``, so the batch must be
                ordered by decreasing length.
            device: device on which the initial hidden/cell states are created.

        Returns:
            ``(rnn_out, (hn, cn))``. ``hn`` and ``cn`` have shape
            (n_layers * n_dirs, batch, h_dim). With ``use_pack=False``,
            ``rnn_out`` is a tensor of shape (seq_len, batch, n_dirs * h_dim).
            With ``use_pack=True``, ``rnn_out`` is the
            ``(padded_output, lengths)`` tuple returned by pad_packed_sequence.
        '''
        # Allocate the zero initial states directly on the target device.
        state_shape = (self.n_layers * self.n_dirs, x.size(0), self.h_dim)
        h0 = torch.zeros(state_shape, device=device)
        c0 = torch.zeros(state_shape, device=device)

        # Embed, then switch to the time-major layout nn.LSTM uses by default.
        x_embed = self.embedding(x).transpose(0, 1)

        # Packing padded seq
        if self.use_pack:
            # pack_padded_sequence needs tensor lengths to live on the CPU.
            if torch.is_tensor(valid_len):
                valid_len = valid_len.cpu()
            x_embed = pack_padded_sequence(x_embed, valid_len)
        rnn_out, (hn, cn) = self.rnn(x_embed, (h0, c0))

        # Padding packed seq: the (padded_output, lengths) tuple is returned
        # as-is because existing callers index it with [0].
        if self.use_pack:
            rnn_out = pad_packed_sequence(rnn_out)
        return rnn_out, (hn, cn)

## GRU
![](http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-var-GRU.png)
[출처](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)
* LSTM을 경량화한 모델


In [4]:
class GRUModel(nn.Module):
    '''
    Token embedding followed by a (multi-layer, optionally bidirectional) GRU.

    Args:
        x_dim: number of entries in the embedding table (vocabulary size).
        embedding_dim: size of each embedding vector (GRU input size).
        h_dim: GRU hidden size.
        n_layers: number of stacked GRU layers.
        n_dirs: 1 for a unidirectional GRU, 2 for a bidirectional one.
        use_pack: when True, ``forward`` packs the padded batch before the
            GRU and unpacks the output afterwards.

    Raises:
        ValueError: if ``n_dirs`` is neither 1 nor 2.
    '''
    def __init__(self, x_dim, embedding_dim, h_dim, n_layers, n_dirs, use_pack=False):
        super().__init__()
        # Any other value would make the initial-state shape built in forward()
        # disagree with the number of directions the GRU actually has.
        if n_dirs not in (1, 2):
            raise ValueError(f"n_dirs must be 1 or 2, got {n_dirs}")
        self.x_dim = x_dim
        self.embedding_dim = embedding_dim
        self.h_dim = h_dim
        self.n_layers = n_layers
        self.n_dirs = n_dirs
        self.use_pack = use_pack
        self.embedding = nn.Embedding(x_dim, embedding_dim)
        self.rnn = nn.GRU(
            input_size=embedding_dim,
            hidden_size=h_dim,
            num_layers=n_layers,
            bidirectional=n_dirs > 1,
        )

    def forward(self, x, valid_len, device):
        '''
        Run the embedded batch through the GRU starting from a zero state.

        Args:
            x: integer token ids, batch-first: (batch, seq_len).
            valid_len: unpadded length of every sequence in ``x``. Only used
                when ``use_pack`` is True; pack_padded_sequence is called with
                its default ``enforce_sorted=True``, so the batch must be
                ordered by decreasing length.
            device: device on which the initial hidden state is created.

        Returns:
            ``(gru_out, hn)``. ``hn`` has shape
            (n_layers * n_dirs, batch, h_dim). With ``use_pack=False``,
            ``gru_out`` is a tensor of shape (seq_len, batch, n_dirs * h_dim).
            With ``use_pack=True``, ``gru_out`` is the
            ``(padded_output, lengths)`` tuple returned by pad_packed_sequence.
        '''
        # Allocate the zero initial state directly on the target device.
        h0 = torch.zeros(
            (self.n_layers * self.n_dirs, x.size(0), self.h_dim), device=device
        )

        # Embed, then switch to the time-major layout nn.GRU uses by default.
        x_embed = self.embedding(x).transpose(0, 1)
        if self.use_pack:
            # pack_padded_sequence needs tensor lengths to live on the CPU.
            if torch.is_tensor(valid_len):
                valid_len = valid_len.cpu()
            x_embed = pack_padded_sequence(x_embed, valid_len)
        gru_out, hn = self.rnn(x_embed, h0)

        # The (padded_output, lengths) tuple is returned as-is because
        # existing callers index it with [0].
        if self.use_pack:
            gru_out = pad_packed_sequence(gru_out)
        return gru_out, hn

In [5]:
# Data Preprocessing
from tqdm import tqdm

vocab_size = 100
pad_id = 0

# Toy corpus: ten token-id sequences of different lengths.
data = [
  [85,14,80,34,99,20,31,65,53,86,3,58,30,4,11,6,50,71,74,13],
  [62,76,79,66,32],
  [93,77,16,67,46,74,24,70],
  [19,83,88,22,57,40,75,82,4,46],
  [70,28,30,24,76,84,92,76,77,51,7,20,82,94,57],
  [58,13,40,61,88,18,92,89,8,14,61,67,49,59,45,12,47,5],
  [22,5,21,84,39,6,9,84,36,59,32,30,69,70,82,56,1],
  [94,21,79,24,3,86],
  [80,80,33,63,34,63],
  [87,32,79,65,2,96,43,80,85,20,41,52,95,50,35,96,24,80]
]

# The longest sequence fixes the width every other sequence is padded to.
max_len = max(len(seq) for seq in data)
print(f"Maximum sequence length: {max_len}")

# Record each true length, then right-pad the shorter sequences with pad_id.
valid_lens = []
for i, seq in enumerate(tqdm(data)):
    seq_len = len(seq)
    valid_lens.append(seq_len)
    n_missing = max_len - seq_len
    if n_missing > 0:
        data[i] = seq + [pad_id] * n_missing

# B: batch size, L: maximum sequence length
batch = torch.LongTensor(data)  # (B, L)
batch_lens = torch.LongTensor(valid_lens)  # (B)

# Reorder the batch from longest to shortest sequence; the same permutation
# is applied to the lengths and to the padded rows.
batch_lens, sorted_idx = batch_lens.sort(descending=True)
batch = batch[sorted_idx]

print(batch)
print(batch_lens)

100%|██████████| 10/10 [00:00<00:00, 27850.62it/s]

Maximum sequence length: 20
tensor([[85, 14, 80, 34, 99, 20, 31, 65, 53, 86,  3, 58, 30,  4, 11,  6, 50, 71,
         74, 13],
        [58, 13, 40, 61, 88, 18, 92, 89,  8, 14, 61, 67, 49, 59, 45, 12, 47,  5,
          0,  0],
        [87, 32, 79, 65,  2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80,
          0,  0],
        [22,  5, 21, 84, 39,  6,  9, 84, 36, 59, 32, 30, 69, 70, 82, 56,  1,  0,
          0,  0],
        [70, 28, 30, 24, 76, 84, 92, 76, 77, 51,  7, 20, 82, 94, 57,  0,  0,  0,
          0,  0],
        [19, 83, 88, 22, 57, 40, 75, 82,  4, 46,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [93, 77, 16, 67, 46, 74, 24, 70,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [94, 21, 79, 24,  3, 86,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [80, 80, 33, 63, 34, 63,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [62, 76, 79, 66, 32,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]])
tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])




In [6]:
# Single-layer, unidirectional LSTM and GRU sharing the same hyperparameters
embedding_size, hidden_size = 256, 512
num_layers, num_dirs = 1, 1

# use_pack=True makes both models pack the padded batch before the RNN.
lstm_model = LSTMModel(
    vocab_size, embedding_size, hidden_size, num_layers, num_dirs, use_pack=True
)
gru_model = GRUModel(
    vocab_size, embedding_size, hidden_size, num_layers, num_dirs, use_pack=True
)

In [7]:
# The models allocate their initial states on `device`, so their parameters
# and the input batch must live there too; otherwise a CUDA machine fails
# with a device mismatch. batch_lens is left on the CPU because
# pack_padded_sequence expects its lengths there.
lstm_model.to(device)
gru_model.to(device)
batch_on_device = batch.to(device)

# With use_pack=True the first return value is the (padded_output, lengths)
# tuple from pad_packed_sequence, hence the [0] before .shape.
lstm_outputs, (h_lstm, c_lstm) = lstm_model(batch_on_device, batch_lens, device)
print(lstm_outputs[0].shape)
print(h_lstm.shape)
print(c_lstm.shape)
gru_outputs, h_gru = gru_model(batch_on_device, batch_lens, device)
print(gru_outputs[0].shape)
print(h_gru.shape)

torch.Size([20, 10, 512])
torch.Size([1, 10, 512])
torch.Size([1, 10, 512])
torch.Size([20, 10, 512])
torch.Size([1, 10, 512])


In [8]:
# Two-layer LSTM and GRU; num_dirs = 2 makes both of them bidirectional
embedding_size, hidden_size = 256, 512
num_layers, num_dirs = 2, 2

# use_pack=True makes both models pack the padded batch before the RNN.
lstm_model = LSTMModel(
    vocab_size, embedding_size, hidden_size, num_layers, num_dirs, use_pack=True
)
gru_model = GRUModel(
    vocab_size, embedding_size, hidden_size, num_layers, num_dirs, use_pack=True
)

In [9]:
# The models allocate their initial states on `device`, so their parameters
# and the input batch must live there too; otherwise a CUDA machine fails
# with a device mismatch. batch_lens is left on the CPU because
# pack_padded_sequence expects its lengths there.
lstm_model.to(device)
gru_model.to(device)
batch_on_device = batch.to(device)

# With use_pack=True the first return value is the (padded_output, lengths)
# tuple from pad_packed_sequence, hence the [0] before .shape.
lstm_outputs, (h_lstm, c_lstm) = lstm_model(batch_on_device, batch_lens, device)
print(lstm_outputs[0].shape)
print(h_lstm.shape)
print(c_lstm.shape)
gru_outputs, h_gru = gru_model(batch_on_device, batch_lens, device)
print(gru_outputs[0].shape)
print(h_gru.shape)

torch.Size([20, 10, 1024])
torch.Size([4, 10, 512])
torch.Size([4, 10, 512])
torch.Size([20, 10, 1024])
torch.Size([4, 10, 512])


## Bidirectional LSTM & GRU
* Bidirectional을 쓰는 이유?
    * Word2Vec의 Window와 유사한 이유

> 나는 ____를 뒤집어 쓰고 펑펑 울었다.

* 위의 문장에서 빈 칸의 단어를 추론하려면 앞부분이 아닌 뒷부분에 들어가는 단어가 더 중요.
* 정확한 학습을 위해서 일방향이 아닌 양방향 학습을 사용!


![](https://paperswithcode.com/media/methods/Screen_Shot_2020-05-25_at_8.54.27_PM.png)
[출처](https://paperswithcode.com/method/bilstm#)
* 두 개의 LSTM Layer가 각각 학습을 하고, 각 결과를 Concat & Flatten 하여 최종 결과를 내는 형태

## Reference
* Reference: http://intelligence.korea.ac.kr/members/wschoi/nlp/deeplearning/Bidirectional-RNN-and-LSTM/