# RNN
1. 주어진 데이터를 RNN에 넣을 수 있는 형태로 만든다.
2. 기본적인 RNN 사용법 및 적용법을 익힌다.
3. PackedSequence의 필요성에 대해 배우고 적용법을 실습한다.

## 필요 패키지 import

In [2]:
from tqdm import tqdm
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import torch

## 데이터 전처리
아래의 sample data 를 확인
전체 단어 수와 pad token의 id도 아래와 같음.

In [3]:
# Token id reserved for padding, and the total number of distinct token ids.
pad_id = 0
vocab_size = 100

# Toy corpus: ten variable-length sequences of token ids (lengths 5 to 20).
# None of them is padded yet.
data = [
    [85, 14, 80, 34, 99, 20, 31, 65, 53, 86, 3, 58, 30, 4, 11, 6, 50, 71, 74, 13],
    [62, 76, 79, 66, 32],
    [93, 77, 16, 67, 46, 74, 24, 70],
    [19, 83, 88, 22, 57, 40, 75, 82, 4, 46],
    [70, 28, 30, 24, 76, 84, 92, 76, 77, 51, 7, 20, 82, 94, 57],
    [58, 13, 40, 61, 88, 18, 92, 89, 8, 14, 61, 67, 49, 59, 45, 12, 47, 5],
    [22, 5, 21, 84, 39, 6, 9, 84, 36, 59, 32, 30, 69, 70, 82, 56, 1],
    [94, 21, 79, 24, 3, 86],
    [80, 80, 33, 63, 34, 63],
    [87, 32, 79, 65, 2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80],
]

Padding 처리를 해주면서 padding 전 길이도 저장

In [4]:
# Length of the longest sequence; every shorter one is padded up to it.
max_len = max(map(len, data))
print(f"Maximum sequence length: {max_len}")

# Record each sequence's true (pre-padding) length, then right-pad it in place.
# NOTE: running this cell a second time would record the padded lengths,
# because `data` itself is overwritten below.
valid_lens = []
for idx, tokens in enumerate(tqdm(data)):
    n_tokens = len(tokens)
    valid_lens.append(n_tokens)
    shortfall = max_len - n_tokens
    if shortfall > 0:
        data[idx] = tokens + [pad_id] * shortfall

100%|██████████| 10/10 [00:00<00:00, 5852.25it/s]

Maximum sequence length: 20





In [5]:
# Show the padded sequences followed by their original lengths, one per line.
print(data, valid_lens, sep="\n")

[[85, 14, 80, 34, 99, 20, 31, 65, 53, 86, 3, 58, 30, 4, 11, 6, 50, 71, 74, 13], [62, 76, 79, 66, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [93, 77, 16, 67, 46, 74, 24, 70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [19, 83, 88, 22, 57, 40, 75, 82, 4, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [70, 28, 30, 24, 76, 84, 92, 76, 77, 51, 7, 20, 82, 94, 57, 0, 0, 0, 0, 0], [58, 13, 40, 61, 88, 18, 92, 89, 8, 14, 61, 67, 49, 59, 45, 12, 47, 5, 0, 0], [22, 5, 21, 84, 39, 6, 9, 84, 36, 59, 32, 30, 69, 70, 82, 56, 1, 0, 0, 0], [94, 21, 79, 24, 3, 86, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [80, 80, 33, 63, 34, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [87, 32, 79, 65, 2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80, 0, 0]]
[20, 5, 8, 10, 15, 18, 17, 6, 6, 18]


위 데이터를 하나의 batch로 만들어 실습에 이용

In [6]:
# Wrap the padded token ids and their true lengths as int64 tensors so the
# whole dataset can be used as a single batch.
batch = torch.tensor(data, dtype=torch.long)
batch_lens = torch.tensor(valid_lens, dtype=torch.long)

## RNN 사용해보기
RNN에 넣기 전 word embedding을 위한 embedding layer를 만듭니다.

In [7]:
# Width of each word-embedding vector.
embedding_size = 256

# Register pad_id as the padding index: the pad token then maps to a fixed
# all-zero vector that receives no gradient, instead of a random trainable
# vector like the real tokens.
embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_id)

# Look up one embedding vector per token id in the batch.
batch_emb = embedding(batch)

아래와 같이 RNN 모델 및 초기 hidden state를 정의

In [8]:
# RNN hyperparameters: hidden width, stacked layers, and number of directions
# (1 = unidirectional, 2 = bidirectional).
hidden_size = 512
num_layers = 1
num_dirs = 1

rnn = nn.RNN(
    input_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    bidirectional=num_dirs > 1,
)

# All-zero initial hidden state: one slot per (layer, direction) pair for
# every sequence in the batch.
h_0 = torch.zeros(num_layers * num_dirs, batch.size(0), hidden_size)

RNN에 batch data를 넣으면 아래와 같이 2가지 output을 얻음
- hidden_states: 각 time step에 해당되는 hidden state들의 묶음
- h_n: 모든 sequence를 거치고 나온 마지막 hidden state

In [10]:
# The RNN was built without batch_first, so swap the first two axes of the
# embedded batch (batch <-> time) before feeding it in.
hidden_states, h_n = rnn(torch.transpose(batch_emb, 0, 1), h_0)

# hidden_states holds the hidden state at every time step;
# h_n holds only the final hidden state.
print(hidden_states.size())
print(h_n.size())

torch.Size([20, 10, 512])
torch.Size([1, 10, 512])


## RNN 활용법
마지막 hidden state를 이용하여 text classification task에 적용할 수 있음

In [11]:
# Sequence-level (text classification) head built on the final hidden state.
num_classes = 2
# A bidirectional RNN yields one final state per direction, so the input width
# scales with num_dirs.
classification_layer = nn.Linear(hidden_size * num_dirs, num_classes)

# h_n is laid out as (num_layers * num_dirs, batch, hidden_size), so its last
# num_dirs entries are the top layer's final states. Concatenate them along
# the feature axis. The previous h_n.squeeze(0) only worked when
# num_layers * num_dirs == 1 and silently did nothing otherwise.
last_layer_states = h_n[-num_dirs:]
sequence_repr = torch.cat(tuple(last_layer_states), dim=-1)

output = classification_layer(sequence_repr)
print(output.shape)

torch.Size([10, 2])


각 time step에 대한 hidden state를 이용하여 token-level task를 수행할 수도 있음

In [12]:
# Token-level head (e.g. tagging): one prediction per time step.
num_classes = 5
# The per-step outputs of a bidirectional RNN concatenate both directions, so
# the input width is hidden_size * num_dirs (equal to hidden_size when
# num_dirs == 1).
entity_layer = nn.Linear(hidden_size * num_dirs, num_classes)

# nn.Linear acts on the last axis only, so the leading (time, batch) axes of
# hidden_states are preserved.
output = entity_layer(hidden_states)
print(output.shape)

torch.Size([20, 10, 5])


## PackedSequence 사용법
앞서 padding 처리했던 데이터를 다시 확인

In [13]:
# Bare expression: the notebook displays the padded `data` list as this cell's output.
data

[[85, 14, 80, 34, 99, 20, 31, 65, 53, 86, 3, 58, 30, 4, 11, 6, 50, 71, 74, 13],
 [62, 76, 79, 66, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [93, 77, 16, 67, 46, 74, 24, 70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [19, 83, 88, 22, 57, 40, 75, 82, 4, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [70, 28, 30, 24, 76, 84, 92, 76, 77, 51, 7, 20, 82, 94, 57, 0, 0, 0, 0, 0],
 [58, 13, 40, 61, 88, 18, 92, 89, 8, 14, 61, 67, 49, 59, 45, 12, 47, 5, 0, 0],
 [22, 5, 21, 84, 39, 6, 9, 84, 36, 59, 32, 30, 69, 70, 82, 56, 1, 0, 0, 0],
 [94, 21, 79, 24, 3, 86, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [80, 80, 33, 63, 34, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [87, 32, 79, 65, 2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80, 0, 0]]

In [14]:
# Order the sequences from longest to shortest; sorted_idx records where each
# sorted row came from in the original batch.
sorted_lens, sorted_idx = torch.sort(batch_lens, descending=True)
sorted_batch = batch.index_select(0, sorted_idx)

print(sorted_batch)
print(sorted_lens)

tensor([[85, 14, 80, 34, 99, 20, 31, 65, 53, 86,  3, 58, 30,  4, 11,  6, 50, 71,
         74, 13],
        [58, 13, 40, 61, 88, 18, 92, 89,  8, 14, 61, 67, 49, 59, 45, 12, 47,  5,
          0,  0],
        [87, 32, 79, 65,  2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80,
          0,  0],
        [22,  5, 21, 84, 39,  6,  9, 84, 36, 59, 32, 30, 69, 70, 82, 56,  1,  0,
          0,  0],
        [70, 28, 30, 24, 76, 84, 92, 76, 77, 51,  7, 20, 82, 94, 57,  0,  0,  0,
          0,  0],
        [19, 83, 88, 22, 57, 40, 75, 82,  4, 46,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [93, 77, 16, 67, 46, 74, 24, 70,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [94, 21, 79, 24,  3, 86,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [80, 80, 33, 63, 34, 63,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [62, 76, 79, 66, 32,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]])
tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])

pack_padded_sequence를 이용하여 PackedSequence object를 사용

In [15]:
# Embed the length-sorted batch, then pack it so that padded positions are
# dropped from the data the RNN will process.
sorted_batch_emb = embedding(sorted_batch)
packed_batch = pack_padded_sequence(
    torch.transpose(sorted_batch_emb, 0, 1),  # swap batch and time axes
    sorted_lens,
    batch_first=False,    # the default, spelled out
    enforce_sorted=True,  # the default: lengths must already be descending
)

print(packed_batch)
# `.data` is the first field of the PackedSequence: all non-pad steps, flattened.
print(packed_batch.data.shape)

PackedSequence(data=tensor([[-1.5062, -1.2851,  0.0734,  ..., -0.5527, -0.8514, -0.0896],
        [ 0.6514, -0.8530,  0.4354,  ...,  1.0369, -0.5971,  2.0721],
        [ 0.5329, -1.6209,  1.2343,  ...,  1.0563, -0.9422,  0.7974],
        ...,
        [-0.7497, -1.8698, -0.0186,  ...,  0.3211, -0.3768, -0.2142],
        [-0.3757,  1.1620, -1.4107,  ...,  0.2446, -0.3722, -0.0692],
        [-1.8552, -1.0839, -0.0381,  ...,  0.9833,  0.1891,  0.5047]],
       grad_fn=<PackPaddedSequenceBackward>), batch_sizes=tensor([10, 10, 10, 10, 10,  9,  7,  7,  6,  6,  5,  5,  5,  5,  5,  4,  4,  3,
         1,  1]), sorted_indices=None, unsorted_indices=None)
torch.Size([123, 256])


In [16]:
# Run the same RNN on the packed batch. The per-step outputs come back as a
# PackedSequence as well.
# NOTE: the batch was sorted by length before packing, so the rows of h_n
# follow that sorted order rather than the original order of `batch`.
packed_outputs, h_n = rnn(packed_batch, h_0)

print(packed_outputs)
print(packed_outputs.data.shape)
print(h_n.size())

PackedSequence(data=tensor([[-0.0615,  0.4742, -0.6462,  ...,  0.1795,  0.5001,  0.4599],
        [ 0.0331, -0.7967, -0.0922,  ..., -0.4491,  0.1275, -0.2828],
        [ 0.0009,  0.0765, -0.2428,  ..., -0.0012, -0.2997,  0.3682],
        ...,
        [ 0.2131,  0.6968,  0.2036,  ..., -0.3289,  0.5836,  0.7497],
        [ 0.7189,  0.2809, -0.2263,  ...,  0.7283, -0.6944, -0.2432],
        [-0.2386,  0.1049,  0.3530,  ...,  0.3856, -0.2154,  0.2319]],
       grad_fn=<CatBackward>), batch_sizes=tensor([10, 10, 10, 10, 10,  9,  7,  7,  6,  6,  5,  5,  5,  5,  5,  4,  4,  3,
         1,  1]), sorted_indices=None, unsorted_indices=None)
torch.Size([123, 512])
torch.Size([1, 10, 512])


packed_outputs는 PackedSequence이므로 원래 output 형태와 다름
이를 다시 원래 형태로 바꿔주기 위해 pad_packed_sequence를 이용

In [19]:
# Unpack back into a padded tensor plus the true length of each sequence.
# The batch axis keeps the length-sorted order used when packing.
outputs, outputs_lens = pad_packed_sequence(packed_outputs, batch_first=False)

print(outputs.size())
print(outputs_lens)

torch.Size([20, 10, 512])
tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])
