# 3. 실습: RNN

1. 주어진 데이터를 RNN에 넣을 수 있는 형태로 만듭니다.
2. 기본적인 RNN 사용법 및 적용법을 익힙니다.
3. PackedSequence의 필요성에 대해 배우고 적용법을 실습합니다.

<br>

## 3.1 필요 패키지 import

In [None]:
from tqdm import tqdm
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import torch

<br>

## 3.2 데이터 전처리

- 아래의 sample data를 확인해봅시다.  
- 전체 단어 수와 pad token의 id도 아래와 같습니다.

In [None]:
# Vocabulary size and the id reserved for the padding token.
vocab_size = 100
pad_id = 0

# Toy batch: ten token-id sequences of varying length (not yet padded).
data = [
    [85, 14, 80, 34, 99, 20, 31, 65, 53, 86, 3, 58, 30, 4, 11, 6, 50, 71, 74, 13],
    [62, 76, 79, 66, 32],
    [93, 77, 16, 67, 46, 74, 24, 70],
    [19, 83, 88, 22, 57, 40, 75, 82, 4, 46],
    [70, 28, 30, 24, 76, 84, 92, 76, 77, 51, 7, 20, 82, 94, 57],
    [58, 13, 40, 61, 88, 18, 92, 89, 8, 14, 61, 67, 49, 59, 45, 12, 47, 5],
    [22, 5, 21, 84, 39, 6, 9, 84, 36, 59, 32, 30, 69, 70, 82, 56, 1],
    [94, 21, 79, 24, 3, 86],
    [80, 80, 33, 63, 34, 63],
    [87, 32, 79, 65, 2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80],
]

In [None]:
# Number of sequences in the sample batch.
len(data)

10

In [None]:
# Smallest and largest token id across all sequences; generator expressions
# avoid building throwaway lists just to reduce them.
min(min(v) for v in data), max(max(v) for v in data)

(1, 99)

<br>

- Padding 처리를 해주면서 padding 전 길이도 저장합니다.

In [None]:
# Record every sequence's true length before any padding is applied.
valid_lens = [len(seq) for seq in data]
max_len = max(valid_lens)
print(f"Maximum sequence length: {max_len}")

# Right-pad each shorter sequence with pad_id up to max_len.
for i, seq_len in enumerate(tqdm(valid_lens)):
    n_pad = max_len - seq_len
    if n_pad > 0:
        data[i] = data[i] + [pad_id] * n_pad

Maximum sequence length: 20


100%|██████████| 10/10 [00:00<00:00, 11052.18it/s]


In [None]:
# Show each padded sequence on its own line.
for padded_seq in data:
    print(padded_seq)

[85, 14, 80, 34, 99, 20, 31, 65, 53, 86, 3, 58, 30, 4, 11, 6, 50, 71, 74, 13]
[62, 76, 79, 66, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[93, 77, 16, 67, 46, 74, 24, 70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[19, 83, 88, 22, 57, 40, 75, 82, 4, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[70, 28, 30, 24, 76, 84, 92, 76, 77, 51, 7, 20, 82, 94, 57, 0, 0, 0, 0, 0]
[58, 13, 40, 61, 88, 18, 92, 89, 8, 14, 61, 67, 49, 59, 45, 12, 47, 5, 0, 0]
[22, 5, 21, 84, 39, 6, 9, 84, 36, 59, 32, 30, 69, 70, 82, 56, 1, 0, 0, 0]
[94, 21, 79, 24, 3, 86, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[80, 80, 33, 63, 34, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[87, 32, 79, 65, 2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80, 0, 0]


In [None]:
# Lengths recorded before padding, in the same order as the rows of data.
print(valid_lens)

[20, 5, 8, 10, 15, 18, 17, 6, 6, 18]


<br>

- 위 데이터를 하나의 batch 로 만들어 실습에 이용하겠습니다.

In [None]:
# Shape legend -- B: batch size, L: maximum sequence length
batch = torch.tensor(data, dtype=torch.long)  # (B, L)
batch_lens = torch.tensor(valid_lens, dtype=torch.long)  # (B,)

<br>

## 3.3 RNN 사용해보기

- RNN 에 넣기 전 word embedding 을 위한 embedding layer 를 만듭니다.

In [None]:
embedding_size = 256
# padding_idx keeps the pad token's vector at zero and excludes it from
# gradient updates, so padding never contributes a learned embedding.
embedding = nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=embedding_size,
    padding_idx=pad_id,
)

# d_w: embedding size
batch_emb = embedding(batch)  # (B, L) -> (B, L, d_w)
print(batch_emb.size())

torch.Size([10, 20, 256])


<br>

- 아래와 같이 RNN 모델 및 초기 hidden state를 정의합니다.

In [None]:
hidden_size = 512  # dimension of the RNN hidden state
num_layers = 1  # number of stacked RNN layers
num_dirs = 1  # 1: unidirectional RNN, 2: bidirectional RNN

rnn = nn.RNN(
    input_size=embedding_size,  # dimension of each embedded token
    hidden_size=hidden_size,  # dimension of the hidden state
    num_layers=num_layers,
    bidirectional=num_dirs > 1,  # the comparison already yields a bool
)

# Initial hidden state: one zero vector per (layer, direction) and batch element.
h_0 = torch.zeros((num_layers * num_dirs, batch.shape[0], hidden_size))  # (num_layers * num_dirs, B, d_h)
print(h_0.size())

torch.Size([1, 10, 512])


<br>

- RNN 에 batch data 를 넣으면 아래와 같이 2가지 output 을 얻습니다.
  - `hidden_states`: 각 time step 에 해당하는 hidden state 들의 묶음
  - `h_n`: sequence 의 모든 time step 을 거치고 나온 마지막 hidden state

In [None]:
# Compare the batch-major shape with the shape after swapping the first two
# axes, which is the layout passed to the RNN in the next cell.
print(batch_emb.size())
print(batch_emb.transpose(0,1).size())

torch.Size([10, 20, 256])
torch.Size([20, 10, 256])


In [None]:
# rnn was built without batch_first, so put the time axis first: (B, L, d_w) -> (L, B, d_w)
rnn_input = batch_emb.transpose(0, 1)
hidden_states, h_n = rnn(rnn_input, h_0)

# d_h: hidden size, num_layers: number of layers, num_dirs: number of directions
print(hidden_states.size())  # (L, B, d_h)
print(h_n.size())  # (num_layers*num_dirs, B, d_h) = (1, B, d_h)

torch.Size([20, 10, 512])
torch.Size([1, 10, 512])


<br>

## 3.4 RNN 활용법

- 마지막 hidden state 를 이용하여 text classification task 에 적용할 수 있습니다.

In [None]:
num_classes = 2
classification_layer = nn.Linear(hidden_size, num_classes)

# C: number of classes
# h_n is (num_layers * num_dirs, B, d_h). Taking the last entry always yields a
# (B, d_h) tensor, whereas squeeze(0) only drops the axis when its size is 1.
# NOTE(review): for a bidirectional RNN the last entry is a single direction;
# concatenating both directions would also need a wider Linear layer.
output = classification_layer(h_n[-1])  # (B, d_h) -> (B, C)
print(output.shape)

torch.Size([10, 2])


<br>

- 각 time step 에 대한 hidden state 를 이용하여 token-level 의 task 를 수행할 수도 있습니다.

In [None]:
# Token-level head: the same linear map is applied to every time step's hidden state.
num_classes = 5
entity_layer = nn.Linear(in_features=hidden_size, out_features=num_classes)

# C: number of classes
output = entity_layer(hidden_states)  # (L, B, d_h) -> (L, B, C)
print(output.size())

torch.Size([20, 10, 5])


<br>

## 3.5 PackedSequence 사용법

- 앞서 padding 처리했던 데이터를 다시 확인해봅시다.

In [None]:
# Display the padded sequences again; trailing entries equal to pad_id are padding.
data

[[85, 14, 80, 34, 99, 20, 31, 65, 53, 86, 3, 58, 30, 4, 11, 6, 50, 71, 74, 13],
 [62, 76, 79, 66, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [93, 77, 16, 67, 46, 74, 24, 70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [19, 83, 88, 22, 57, 40, 75, 82, 4, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [70, 28, 30, 24, 76, 84, 92, 76, 77, 51, 7, 20, 82, 94, 57, 0, 0, 0, 0, 0],
 [58, 13, 40, 61, 88, 18, 92, 89, 8, 14, 61, 67, 49, 59, 45, 12, 47, 5, 0, 0],
 [22, 5, 21, 84, 39, 6, 9, 84, 36, 59, 32, 30, 69, 70, 82, 56, 1, 0, 0, 0],
 [94, 21, 79, 24, 3, 86, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [80, 80, 33, 63, 34, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [87, 32, 79, 65, 2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80, 0, 0]]

<br>

- 아래 그림과 같이 불필요한 pad 계산이 포함됩니다.

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<img src='https://drive.google.com/uc?id=1AmyvjeWuEDtzFpuYZ11ItfV3DTe832xs' width=800/>

<br>

- 데이터를 padding 전 원래 길이 기준으로 정렬합니다.

In [None]:
# Order the batch from the longest to the shortest pre-padding length.
length_order = batch_lens.sort(descending=True)
sorted_lens = length_order.values
sorted_idx = length_order.indices
sorted_batch = batch[sorted_idx]  # (B, L), rows permuted into sorted order

In [None]:
# Rows now appear in descending order of their original (unpadded) length.
print(sorted_batch)

tensor([[85, 14, 80, 34, 99, 20, 31, 65, 53, 86,  3, 58, 30,  4, 11,  6, 50, 71,
         74, 13],
        [58, 13, 40, 61, 88, 18, 92, 89,  8, 14, 61, 67, 49, 59, 45, 12, 47,  5,
          0,  0],
        [87, 32, 79, 65,  2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80,
          0,  0],
        [22,  5, 21, 84, 39,  6,  9, 84, 36, 59, 32, 30, 69, 70, 82, 56,  1,  0,
          0,  0],
        [70, 28, 30, 24, 76, 84, 92, 76, 77, 51,  7, 20, 82, 94, 57,  0,  0,  0,
          0,  0],
        [19, 83, 88, 22, 57, 40, 75, 82,  4, 46,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [93, 77, 16, 67, 46, 74, 24, 70,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [94, 21, 79, 24,  3, 86,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [80, 80, 33, 63, 34, 63,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [62, 76, 79, 66, 32,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]])


In [None]:
# Original lengths in the same descending order as the rows of sorted_batch.
print(sorted_lens)

tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])


<br>

- 아래와 같은 padding 무시 효과를 얻을 수 있습니다.

&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<img src='https://drive.google.com/uc?id=1lz3wvWAD0EkKsOqJ0C9q9MPvOTObfKH1' width=800/>

<br>

- `pack_padded_sequence` 를 이용하여 PackedSequence object 를 만듭니다.

In [None]:
sorted_batch_emb = embedding(sorted_batch)
packed_batch = pack_padded_sequence(sorted_batch_emb.transpose(0, 1), sorted_lens)

print(packed_batch)
print(packed_batch[0].shape)

PackedSequence(data=tensor([[ 0.5675,  0.0037, -1.0423,  ...,  0.1366,  0.3375, -0.1567],
        [ 0.3605,  0.4326,  0.6642,  ...,  0.2936, -1.2819, -1.9344],
        [-0.8049, -0.1083, -0.2633,  ...,  1.3228, -0.5326, -0.7870],
        ...,
        [-0.8811, -0.3274,  0.7290,  ...,  2.0362, -1.2792, -1.2598],
        [-0.9903,  0.8745,  2.5449,  ..., -0.0331, -1.5226,  1.4135],
        [-0.6412, -1.4564, -0.5998,  ...,  0.3748, -1.2355,  0.3707]],
       grad_fn=<PackPaddedSequenceBackward>), batch_sizes=tensor([10, 10, 10, 10, 10,  9,  7,  7,  6,  6,  5,  5,  5,  5,  5,  4,  4,  3,
         1,  1]), sorted_indices=None, unsorted_indices=None)
torch.Size([123, 256])


In [None]:
# With a PackedSequence as input, the per-step outputs come back packed as well.
packed_outputs, h_n = rnn(packed_batch, h_0)

print(packed_outputs)
print(packed_outputs.data.shape)  # first field of the PackedSequence, accessed by name
print(h_n.size())

PackedSequence(data=tensor([[-0.0704,  0.6465,  0.0291,  ..., -0.3865, -0.3754, -0.1105],
        [-0.2697, -0.1752,  0.0622,  ...,  0.2062,  0.7887,  0.1123],
        [-0.1499,  0.1588,  0.2349,  ...,  0.4513, -0.5608, -0.4134],
        ...,
        [ 0.5305, -0.4452,  0.6720,  ...,  0.6249, -0.1087,  0.3764],
        [ 0.3286,  0.0144, -0.7842,  ...,  0.5720,  0.0443,  0.3746],
        [ 0.4228,  0.3772,  0.4790,  ...,  0.6667, -0.1504, -0.6163]],
       grad_fn=<CatBackward>), batch_sizes=tensor([10, 10, 10, 10, 10,  9,  7,  7,  6,  6,  5,  5,  5,  5,  5,  4,  4,  3,
         1,  1]), sorted_indices=None, unsorted_indices=None)
torch.Size([123, 512])
torch.Size([1, 10, 512])


<br>

- `packed_outputs`는 PackedSequence이므로 원래 output 형태와 다릅니다.  
- 이를 다시 원래 형태로 바꿔주기 위해 `pad_packed_sequence`를 이용합니다.

In [None]:
# Convert the packed outputs back into a padded tensor plus per-sequence lengths.
unpacked = pad_packed_sequence(packed_outputs)
outputs, outputs_lens = unpacked

print(outputs.size())  # (L, B, d_h)
print(outputs_lens)

torch.Size([20, 10, 512])
tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])
