## **4. LSTM, GRU**
1. 기존 RNN과 다른 부분에 대해서 배웁니다.
2. 이전 실습에 이어 다양한 적용법을 배웁니다.

### **필요 패키지 import**

In [1]:
from tqdm.auto import tqdm
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import torch

### **데이터 전처리**

아래의 sample data를 확인해봅시다.  
이전 실습과 동일합니다.

In [2]:
# Vocabulary size; every token id in `data` lies in [1, vocab_size - 1].
vocab_size = 100
# Id reserved for padding; it does not appear in any raw sequence below.
pad_id = 0

# Toy batch: ten variable-length sequences of token ids (lengths 5 to 20).
data = [
    [85, 14, 80, 34, 99, 20, 31, 65, 53, 86, 3, 58, 30, 4, 11, 6, 50, 71, 74, 13],
    [62, 76, 79, 66, 32],
    [93, 77, 16, 67, 46, 74, 24, 70],
    [19, 83, 88, 22, 57, 40, 75, 82, 4, 46],
    [70, 28, 30, 24, 76, 84, 92, 76, 77, 51, 7, 20, 82, 94, 57],
    [58, 13, 40, 61, 88, 18, 92, 89, 8, 14, 61, 67, 49, 59, 45, 12, 47, 5],
    [22, 5, 21, 84, 39, 6, 9, 84, 36, 59, 32, 30, 69, 70, 82, 56, 1],
    [94, 21, 79, 24, 3, 86],
    [80, 80, 33, 63, 34, 63],
    [87, 32, 79, 65, 2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80],
]

In [3]:
# Longest sequence length in the batch; shorter sequences are padded up to it.
max_len = max(len(seq) for seq in data)
print(f"Maximum sequence length: {max_len}")

# Sequence lengths before padding, in the original order of `data`.
valid_lens = []

for i, seq in enumerate(tqdm(data)):
    seq_len = len(seq)
    valid_lens.append(seq_len)

    # Right-pad in place with pad_id; sequences already at max_len are left untouched.
    n_pad = max_len - seq_len
    if n_pad > 0:
        data[i] = seq + [pad_id] * n_pad

Maximum sequence length: 20


  0%|          | 0/10 [00:00<?, ?it/s]

In [4]:
# B: batch size, L: maximum sequence length
batch = torch.tensor(data, dtype=torch.long)  # (B, L)
batch_lens = torch.tensor(valid_lens, dtype=torch.long)  # (B,)

# Sort by decreasing length, which is what pack_padded_sequence expects
# with its default enforce_sorted=True, and reorder the rows to match.
batch_lens, sorted_idx = torch.sort(batch_lens, descending=True)
batch = batch.index_select(0, sorted_idx)

print(batch)
print(batch_lens)

tensor([[85, 14, 80, 34, 99, 20, 31, 65, 53, 86,  3, 58, 30,  4, 11,  6, 50, 71,
         74, 13],
        [58, 13, 40, 61, 88, 18, 92, 89,  8, 14, 61, 67, 49, 59, 45, 12, 47,  5,
          0,  0],
        [87, 32, 79, 65,  2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80,
          0,  0],
        [22,  5, 21, 84, 39,  6,  9, 84, 36, 59, 32, 30, 69, 70, 82, 56,  1,  0,
          0,  0],
        [70, 28, 30, 24, 76, 84, 92, 76, 77, 51,  7, 20, 82, 94, 57,  0,  0,  0,
          0,  0],
        [19, 83, 88, 22, 57, 40, 75, 82,  4, 46,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [93, 77, 16, 67, 46, 74, 24, 70,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [94, 21, 79, 24,  3, 86,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [80, 80, 33, 63, 34, 63,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [62, 76, 79, 66, 32,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]])
tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])

### **LSTM 사용**

LSTM에선 cell state가 추가됩니다.  
Cell state의 shape는 hidden state의 그것과 동일합니다.

In [5]:
# Model hyperparameters for the single-layer, unidirectional LSTM.
embedding_size = 256
hidden_size = 512
num_layers = 1
num_dirs = 1

# Token id -> dense vector lookup table, (vocab_size, d_w).
embedding = nn.Embedding(vocab_size, embedding_size)

lstm = nn.LSTM(
    input_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    bidirectional=num_dirs > 1,
)

# The hidden state and the cell state share one shape:
# (num_layers * num_dirs, B, d_h).
state_shape = (num_layers * num_dirs, batch.shape[0], hidden_size)

# Initial hidden state.
h_0 = torch.zeros(state_shape)

# Initial cell state.
c_0 = torch.zeros(state_shape)

In [6]:
# d_w: word embedding size
batch_emb = embedding(batch)  # (B, L, d_w)

# Switch to time-major (L, B, d_w), then pack so that padded positions
# are skipped by the LSTM (padded -> packed).
time_major_emb = batch_emb.permute(1, 0, 2)
packed_batch = pack_padded_sequence(time_major_emb, batch_lens)

# The LSTM takes and returns a (hidden state, cell state) pair.
packed_outputs, (h_n, c_n) = lstm(packed_batch, (h_0, c_0))
print(packed_outputs)
print(packed_outputs.data.shape)  # (sum of valid lengths, d_h)
print(h_n.shape)
print(c_n.shape)

PackedSequence(data=tensor([[-0.1781, -0.0983, -0.1026,  ..., -0.0290,  0.0718,  0.0746],
        [-0.1070, -0.1882,  0.0805,  ..., -0.1675,  0.0276,  0.0482],
        [ 0.0890,  0.0528, -0.1291,  ..., -0.1900, -0.1401, -0.1384],
        ...,
        [-0.1729,  0.0395,  0.0961,  ...,  0.1353,  0.0452,  0.1362],
        [-0.1118, -0.0988, -0.0047,  ...,  0.0575,  0.0925, -0.1169],
        [-0.0121, -0.3091,  0.0047,  ...,  0.0702,  0.1550, -0.0522]],
       grad_fn=<CatBackward0>), batch_sizes=tensor([10, 10, 10, 10, 10,  9,  7,  7,  6,  6,  5,  5,  5,  5,  5,  4,  4,  3,
         1,  1]), sorted_indices=None, unsorted_indices=None)
torch.Size([123, 512])
torch.Size([1, 10, 512])
torch.Size([1, 10, 512])


In [7]:
# Restore the padded layout from the packed LSTM outputs (packed -> padded).
# `output_lens` holds the valid length of each sequence.
outputs, output_lens = pad_packed_sequence(packed_outputs)
print(outputs.size())  # (L, B, d_h)
print(output_lens)

torch.Size([20, 10, 512])
tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])


### **GRU 사용**

GRU는 cell state가 없어 RNN과 동일하게 사용 가능합니다.   
GRU를 이용하여 LM task를 수행해봅시다.

In [8]:
# GRU with the same sizes as the LSTM above; it carries a hidden state only.
gru = nn.GRU(
    input_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    bidirectional=num_dirs > 1,
)

# Projects each hidden state (d_h) to one score per vocabulary entry (V).
output_layer = nn.Linear(in_features=hidden_size, out_features=vocab_size)

In [11]:
# First token of every sequence: column 0 of the (B, L) batch.
input_id = batch[:, 0]  # (B,)

# Initial GRU hidden state, all zeros.
hidden_shape = (num_layers * num_dirs, batch.shape[0], hidden_size)
hidden = torch.zeros(hidden_shape)  # (1, B, d_h)
print(input_id)

tensor([85, 58, 87, 22, 70, 19, 93, 94, 80, 62])


In [10]:
# Show the length-sorted, padded batch of token ids, shape (B, L).
print(batch)

tensor([[85, 14, 80, 34, 99, 20, 31, 65, 53, 86,  3, 58, 30,  4, 11,  6, 50, 71,
         74, 13],
        [58, 13, 40, 61, 88, 18, 92, 89,  8, 14, 61, 67, 49, 59, 45, 12, 47,  5,
          0,  0],
        [87, 32, 79, 65,  2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80,
          0,  0],
        [22,  5, 21, 84, 39,  6,  9, 84, 36, 59, 32, 30, 69, 70, 82, 56,  1,  0,
          0,  0],
        [70, 28, 30, 24, 76, 84, 92, 76, 77, 51,  7, 20, 82, 94, 57,  0,  0,  0,
          0,  0],
        [19, 83, 88, 22, 57, 40, 75, 82,  4, 46,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [93, 77, 16, 67, 46, 74, 24, 70,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [94, 21, 79, 24,  3, 86,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [80, 80, 33, 63, 34, 63,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [62, 76, 79, 66, 32,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]])


Teacher forcing 없이 이전에 얻은 결과를 다음 input으로 이용합니다.

In [13]:
# Greedy generation without teacher forcing: feed the first token of every
# sequence (input_id) to the GRU, then feed each predicted token (top_id)
# back in as the next input, for max_len steps.
#
# This loop is inference only, so it runs under torch.no_grad(). Otherwise
# autograd would keep extending a graph through `hidden` across every time
# step, costing memory for gradients that are never used.
with torch.no_grad():
    for t in range(max_len):

        input_emb = embedding(input_id).unsqueeze(0)  # (1, B, d_w)
        output, hidden = gru(input_emb, hidden)  # output: (1, B, d_h), hidden: (1, B, d_h)

        # V: vocab size
        output = output_layer(output)  # (1, B, V)

        # NOTE: no softmax is applied, so `probs` holds the maximum raw logits
        # rather than probabilities; the argmax (top_id) is the same either way.
        probs, top_id = torch.max(output, dim=-1)  # probs: (1, B), top_id: (1, B)

        print("*"*50)
        print(f"Time step: {t}")
        print(output.shape)
        print(probs.shape)
        print(top_id.shape)

        # The predicted token becomes the input of the next time step.
        input_id = top_id.squeeze(0)  # (B,)
        print(input_id)


**************************************************
Time step: 0
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
tensor([96, 37, 30, 41, 24, 53, 63, 96, 53, 25])
**************************************************
Time step: 1
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
tensor([83, 25, 30, 41, 74, 53, 49, 83, 53, 71])
**************************************************
Time step: 2
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
tensor([78, 71, 30,  6, 96, 53, 32, 78, 53, 66])
**************************************************
Time step: 3
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
tensor([50, 66, 30,  6, 83, 53, 14, 50, 53, 97])
**************************************************
Time step: 4
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
tensor([78, 97, 30, 14, 78, 53, 24, 78, 53, 59])
**************************************************
Time step: 5
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
... (이후 출력 생략)

위에서는 `max_len`만큼 for 문을 돌면서 모든 time step의 결과물 shape를 확인했지만, 종료 조건(예: 문장의 끝을 나타내는 end token이 생성된 경우)을 만족하면 중간에 생성을 멈출 수도 있습니다.

### **양방향 및 여러 layer 사용**

이번엔 양방향 + 2개 이상의 layer를 쓸 때 얻을 수 있는 결과에 대해 알아봅니다.

In [14]:
# Two stacked layers, both directions.
num_layers = 2
num_dirs = 2
# Per the PyTorch GRU docs, dropout is applied to the outputs of every
# layer except the last one.
dropout = 0.1

gru = nn.GRU(
    input_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=dropout,
    bidirectional=num_dirs > 1,
)

Bidirectional이 되었고 layer의 개수가 $2$로 늘었기 때문에 hidden state의 shape도 `(4, B, d_h)`가 됩니다.

In [15]:
# d_w: word embedding size, num_layers: number of layers, num_dirs: number of directions
batch_emb = embedding(batch)  # (B, L, d_w)

# Initial hidden state: (num_layers * num_dirs, B, d_h) = (4, B, d_h)
h_0 = torch.zeros((num_layers * num_dirs, batch.shape[0], hidden_size))

# Switch to time-major (L, B, d_w), then pack (padded -> packed).
time_major_emb = batch_emb.permute(1, 0, 2)
packed_batch = pack_padded_sequence(time_major_emb, batch_lens)

packed_outputs, h_n = gru(packed_batch, h_0)

print(packed_outputs)
print(packed_outputs.data.shape)  # (sum of valid lengths, num_dirs * d_h)
print(h_n.shape)

PackedSequence(data=tensor([[-0.1087,  0.0186,  0.0140,  ..., -0.0073, -0.0446, -0.1167],
        [-0.1673,  0.1159,  0.0775,  ...,  0.0371,  0.0408, -0.1132],
        [-0.0362,  0.0031,  0.0895,  ..., -0.1158,  0.0877,  0.0352],
        ...,
        [-0.2166, -0.1926, -0.0344,  ...,  0.0584, -0.0657, -0.1129],
        [-0.0354, -0.0105, -0.0502,  ...,  0.0617, -0.1308, -0.0194],
        [-0.0494, -0.0155, -0.0422,  ..., -0.0141, -0.0173,  0.0516]],
       grad_fn=<CatBackward0>), batch_sizes=tensor([10, 10, 10, 10, 10,  9,  7,  7,  6,  6,  5,  5,  5,  5,  5,  4,  4,  3,
         1,  1]), sorted_indices=None, unsorted_indices=None)
torch.Size([123, 1024])
torch.Size([4, 10, 512])


In [16]:
# Restore the padded layout from the packed GRU outputs (packed -> padded).
outputs, output_lens = pad_packed_sequence(packed_outputs)

# Compared with the unidirectional run, the last dimension grows from
# torch.Size([20, 10, 512]) to torch.Size([20, 10, 1024]).
print(outputs.size())  # (L, B, num_dirs * d_h)
print(output_lens)  # valid length of each sequence

torch.Size([20, 10, 1024])
tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])


각각의 결과물의 shape는 다음과 같습니다.

`outputs`: `(max_len, batch_size, num_dirs * hidden_size)`  
`h_n`: `(num_layers*num_dirs, batch_size, hidden_size)`

In [17]:
# h_n is (num_layers * num_dirs, B, d_h); split its first axis into
# separate layer and direction axes: (num_layers, num_dirs, B, d_h).
batch_size = h_n.size(1)
h_n_by_layer = h_n.view(num_layers, num_dirs, batch_size, hidden_size)

print(h_n_by_layer)
print(h_n_by_layer.shape)

tensor([[[[-0.0344, -0.0999, -0.0332,  ...,  0.1596, -0.0114,  0.1092],
          [ 0.0309,  0.0545, -0.3604,  ...,  0.0787, -0.2013, -0.0200],
          [ 0.1993,  0.2800,  0.1776,  ..., -0.0023, -0.0332,  0.0918],
          ...,
          [-0.2209,  0.0403, -0.0859,  ..., -0.1432, -0.1502,  0.0081],
          [ 0.0603,  0.0279,  0.2117,  ...,  0.3436,  0.4257, -0.1044],
          [ 0.1715, -0.2391,  0.0117,  ...,  0.0733, -0.3136,  0.3930]],

         [[ 0.1796,  0.1758,  0.2883,  ..., -0.0791,  0.0096, -0.1504],
          [-0.1571, -0.0368,  0.2900,  ...,  0.1826, -0.2215,  0.3945],
          [ 0.3618, -0.2935,  0.0416,  ..., -0.5553,  0.2999,  0.1469],
          ...,
          [-0.1177,  0.0489,  0.1471,  ...,  0.4341, -0.3699,  0.3331],
          [-0.2041, -0.0821,  0.1842,  ..., -0.1407,  0.2886,  0.0604],
          [ 0.1708,  0.0786,  0.0387,  ..., -0.1393,  0.1067,  0.0362]]],


        [[[-0.0494, -0.0155, -0.0422,  ...,  0.2201,  0.0925, -0.0751],
          [-0.3110,  0.0594,