# Language processing & Transformer

음성 AI를 위한 자연어 처리와 Transformer의 핵심 구조인 Multi-head Attention을 구현하는 실습입니다.
1. 텍스트 전처리 과정 이해
    - tokenizing
    - cleaning
2. Multi-head attention 및 self-attention 구현.
3. 각 과정에서 일어나는 연산과 input/output 형태 이해.

### 필요 패키지 install & import

In [27]:
!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [28]:
from torch import nn
from torch.nn import functional as F
from tqdm import tqdm
import re
import torch
import math

from konlpy.tag import Okt
from tensorflow.keras.preprocessing.text import Tokenizer

## Req. 1-1 텍스트 전처리

주어진 문장 5개를 cleaning, tokenizing 한 뒤 정수 인코딩 하시오.  

원하는 다른 tokenizer를 사용해도 좋습니다.

In [41]:
# Raw input: five single-element lists, each wrapping one noisy sentence.
sentences = [
    ["안녕하세요 음성 AI 실!@습에 오신 것을 환영#$^&@$&$합니다."],
    ["이네들은 7895435너무나 멀리 있습니다."],
    ["계절이 지나가는 하늘에는가을로 가&^%@!$!^득 차 있습니다."],
    ["아직 나의 청!@$!%춘이 다하지!@% 않은 까닭입니다."],
    ["가슴 속에 하!@$나 둘 새겨지는 별을"],
]

okt = Okt()
tokenizer = Tokenizer()

# Cleaning: delete every character that is neither Hangul (syllables or jamo)
# nor whitespace. Digits and Latin letters (e.g. "AI") are removed as well.
non_hangul = re.compile(r"[^가-힣ㄱ-ㅎㅏ-ㅣ\s]")
cleaned_sentences = [non_hangul.sub("", wrapped[0]) for wrapped in sentences]

# Tokenizing: run KoNLPy's Okt.morphs on each cleaned sentence.
tokenized_sentences = [okt.morphs(text) for text in cleaned_sentences]

# Integer encoding: fit the vocabulary on the tokens, then map tokens to ids.
tokenizer.fit_on_texts(tokenized_sentences)
encoded_sentences = tokenizer.texts_to_sequences(tokenized_sentences)

print(encoded_sentences)

[[5, 6, 7, 2, 8, 9, 3, 10, 11], [1, 12, 13, 14, 15, 16, 4], [17, 1, 18, 19, 20, 21, 22, 23, 4], [24, 25, 26, 27, 1, 28, 29, 30, 31, 32], [33, 34, 2, 35, 36, 37, 38, 3]]


결과는 다음과 같이 나와야 합니다.  


[[5, 6, 7, 2, 8, 9, 3, 10, 11],  
 [1, 12, 13, 14, 15, 16, 4],  
 [17, 1, 18, 19, 20, 21, 22, 23, 4],  
 [24, 25, 26, 27, 1, 28, 29, 30, 31, 32],  
 [33, 34, 2, 35, 36, 37, 38, 3]]  
 

## Req. 1-2 Multi-head self-attention 구조 익히기

위에서 전처리한 데이터를 가져와 아래 과정을 실행하면서 시퀀스 입력이 multi-head self attention으로 어떻게 모델링 되는지 파악하시오.

In [42]:
# Token id reserved for padding; the real token ids in `data` start at 1.
pad_id = 0
# Vocabulary size; larger than the largest id in `data` (38).
vocab_size = 40

# Integer-encoded sentences, copied from the Req. 1-1 result above.
data = [[5, 6, 7, 2, 8, 9, 3, 10, 11], [1, 12, 13, 14, 15, 16, 4], [17, 1, 18, 19, 20, 21, 22, 23, 4], [24, 25, 26, 27, 1, 28, 29, 30, 31, 32], [33, 34, 2, 35, 36, 37, 38, 3]]

In [43]:
# Pad every sequence so that all of them share the same length.
def padding(data, pad_value=None):
  """Right-pad each sequence in ``data`` to the length of the longest one.

  The list is modified in place (shorter entries are replaced by padded
  copies) and the same list object is returned. The maximum length is
  printed and a tqdm progress bar is shown while padding.

  Args:
    data: List of token-id lists. May be empty.
    pad_value: Id appended to short sequences. When ``None`` the
      module-level ``pad_id`` is used.

  Returns:
    Tuple ``(data, max_len)``; ``max_len`` is 0 for an empty ``data``.
  """
  if pad_value is None:
    pad_value = pad_id  # notebook-level default

  # `default=0` keeps an empty batch from raising ValueError inside max().
  max_len = max((len(seq) for seq in data), default=0)
  print(f"Maximum sequence length: {max_len}")

  for i, seq in enumerate(tqdm(data)):
    if len(seq) < max_len:
      data[i] = seq + [pad_value] * (max_len - len(seq))

  return data, max_len

In [44]:
# padding() pads `data` in place and returns that same list plus the target length.
data, max_len = padding(data)

Maximum sequence length: 10


100%|██████████| 5/5 [00:00<00:00, 8969.85it/s]


In [45]:
# Display the padded sequences.
data

[[5, 6, 7, 2, 8, 9, 3, 10, 11, 0],
 [1, 12, 13, 14, 15, 16, 4, 0, 0, 0],
 [17, 1, 18, 19, 20, 21, 22, 23, 4, 0],
 [24, 25, 26, 27, 1, 28, 29, 30, 31, 32],
 [33, 34, 2, 35, 36, 37, 38, 3, 0, 0]]

### Hyperparameter 세팅 및 embedding

In [46]:
d_model = 512  # hidden size of the model
num_heads = 8  # number of attention heads

# d_model is the dimension of the embedding space the input is projected into, so it must be divisible by num_heads.

In [47]:
# Lookup table with vocab_size rows of d_model-dimensional vectors.
# NOTE: no padding_idx is passed, so pad_id rows get an ordinary embedding vector.
embedding = nn.Embedding(vocab_size, d_model)

# B: batch size, L: maximum sequence length
batch = torch.LongTensor(data)  # (B, L)
batch_emb = embedding(batch)  # (B, L, d_model)

In [48]:
# Inspect the embedded batch and its shape.
print(batch_emb)
print(batch_emb.shape)

tensor([[[ 0.1649,  1.4291, -1.5957,  ..., -1.3773, -2.3836,  0.7976],
         [ 0.4425, -1.3559, -0.1385,  ...,  0.7305, -0.0766, -0.0829],
         [-1.8174, -1.4233,  0.4261,  ...,  1.3366,  0.4890, -0.3784],
         ...,
         [ 0.2151,  1.1739,  1.1136,  ...,  0.9966,  0.6873, -0.1065],
         [-0.4710, -0.0341, -0.4657,  ..., -0.2656,  0.3721,  1.0845],
         [-0.2671,  0.6407, -0.0417,  ..., -0.7076,  0.9632, -1.3252]],

        [[-0.3132, -0.6645, -0.3721,  ...,  0.3150,  0.1436,  1.4154],
         [-2.2975,  0.2167, -1.6276,  ..., -0.8593, -0.5642,  0.2590],
         [ 0.4863,  0.3360, -1.6987,  ..., -0.4397, -1.3170,  0.8126],
         ...,
         [-0.2671,  0.6407, -0.0417,  ..., -0.7076,  0.9632, -1.3252],
         [-0.2671,  0.6407, -0.0417,  ..., -0.7076,  0.9632, -1.3252],
         [-0.2671,  0.6407, -0.0417,  ..., -0.7076,  0.9632, -1.3252]],

        [[ 0.6086, -2.2518,  0.4145,  ...,  0.7958, -1.9388,  1.0116],
         [-0.3132, -0.6645, -0.3721,  ...,  0

### Linear projection & 여러 head로 나누기

Multi-head attention 내에서 쓰이는 linear projection matrix들을 정의합니다.

In [49]:
# Linear layers (d_model -> d_model) that produce the queries, keys and values.
w_q = nn.Linear(d_model, d_model)
w_k = nn.Linear(d_model, d_model)
w_v = nn.Linear(d_model, d_model)

In [50]:
# Linear layer (d_model -> d_model) applied to the concatenated head outputs.
w_0 = nn.Linear(d_model, d_model)

In [51]:
# Self-attention: queries, keys and values are all projected from the same embeddings.
q = w_q(batch_emb)  # (B, L, d_model)
k = w_k(batch_emb)  # (B, L, d_model)
v = w_v(batch_emb)  # (B, L, d_model)

print(q.shape)
print(k.shape)
print(v.shape)

torch.Size([5, 10, 512])
torch.Size([5, 10, 512])
torch.Size([5, 10, 512])


q, k, v를 `num_heads`개의 head로 나누어, 각 head가 `d_k = d_model / num_heads` 차원의 vector를 갖도록 만듭니다.

- 이론적으로 multi-head attention은 input을 head 개수만큼의 서로 다른 Wq, Wk, Wv로 각각 linear transformation 한 뒤, head마다 attention을 수행하고, 그 결과를 concat 하여 다시 한 번 linear transformation 합니다.
- 구현에서는 Wq, Wk, Wv를 한 개씩만 두고, projection 결과를 head 개수만큼 쪼개서 사용합니다.
- 실제 `attention is all you need` 논문의 구현 예시는 Query vector 한개를 dim으로 쪼개서 진행한다

In [52]:
batch_size = q.shape[0]
d_k = d_model // num_heads  # feature size handled by each head

# Split the last dimension into num_heads chunks of size d_k.
q = q.view(batch_size, -1, num_heads, d_k)  # (B, L, num_heads, d_k)
k = k.view(batch_size, -1, num_heads, d_k)  # (B, L, num_heads, d_k)
v = v.view(batch_size, -1, num_heads, d_k)  # (B, L, num_heads, d_k)

print(q.shape)
print(k.shape)
print(v.shape)

torch.Size([5, 10, 8, 64])
torch.Size([5, 10, 8, 64])
torch.Size([5, 10, 8, 64])


In [53]:
# Move the num_heads axis in front of the sequence axis so that
# each head runs self-attention on its own (L, d_k) matrix.

q = q.transpose(1, 2)  # (B, num_heads, L, d_k)
k = k.transpose(1, 2)  # (B, num_heads, L, d_k)
v = v.transpose(1, 2)  # (B, num_heads, L, d_k)

print(q.shape)
print(k.shape)
print(v.shape)

torch.Size([5, 8, 10, 64])
torch.Size([5, 8, 10, 64])
torch.Size([5, 8, 10, 64])


### Scaled dot-product self-attention 구현

각 head에서 실행되는 self-attention 과정입니다.

In [54]:
# Per head, the score matrix has shape (L, L):
# how much weight each token should give to every token of the same sequence.
# NOTE: no mask is applied, so padded positions take part in the softmax too.
attn_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)  # (B, num_heads, L, L)
# softmax is taken row-wise, hence dim=-1
attn_dists = F.softmax(attn_scores, dim=-1)  # (B, num_heads, L, L)

print(attn_dists)
print(attn_dists.shape)

tensor([[[[0.1203, 0.0889, 0.1398,  ..., 0.0566, 0.0950, 0.0793],
          [0.0807, 0.1447, 0.1238,  ..., 0.0945, 0.0765, 0.0876],
          [0.1660, 0.0787, 0.1068,  ..., 0.1316, 0.1139, 0.0767],
          ...,
          [0.0812, 0.1493, 0.1321,  ..., 0.0850, 0.0854, 0.0846],
          [0.0571, 0.1540, 0.0882,  ..., 0.1488, 0.0568, 0.0881],
          [0.1155, 0.0513, 0.1685,  ..., 0.0382, 0.0619, 0.1479]],

         [[0.0883, 0.0609, 0.1218,  ..., 0.0887, 0.1091, 0.1019],
          [0.1105, 0.0967, 0.1160,  ..., 0.0582, 0.0686, 0.1360],
          [0.0976, 0.1040, 0.1511,  ..., 0.0767, 0.0934, 0.1336],
          ...,
          [0.1507, 0.0774, 0.1474,  ..., 0.0640, 0.1208, 0.1014],
          [0.0889, 0.0475, 0.0583,  ..., 0.1148, 0.0949, 0.1348],
          [0.0712, 0.0644, 0.0755,  ..., 0.0549, 0.0594, 0.2382]],

         [[0.1284, 0.0916, 0.0823,  ..., 0.0967, 0.1013, 0.0857],
          [0.0801, 0.0682, 0.2024,  ..., 0.0753, 0.0636, 0.0987],
          [0.0571, 0.0754, 0.1120,  ..., 0

In [55]:
# Weighted sum of the value vectors, using the attention distribution as weights.
attn_values = torch.matmul(attn_dists, v)  # (B, num_heads, L, d_k)

print(attn_values.shape)

torch.Size([5, 8, 10, 64])


### 각 head의 결과물 병합

각 head의 결과물을 concat하고 동일 차원으로 linear projection합니다.

In [56]:
# Put the head axis back next to d_k, then merge the two axes into d_model.
attn_values = attn_values.transpose(1, 2)  # (B, L, num_heads, d_k)
# contiguous() is required because view() cannot reshape the transposed (non-contiguous) tensor.
attn_values = attn_values.contiguous().view(batch_size, -1, d_model)  # (B, L, d_model)

print(attn_values.shape)

torch.Size([5, 10, 512])


In [57]:
# w_0 : (d_model, d_model)
# Combines the self-attention information of the heads, each of which focuses on a different aspect.
outputs = w_0(attn_values)

print(outputs)
print(outputs.shape)

tensor([[[ 0.0994, -0.1570, -0.1709,  ...,  0.0245, -0.0737, -0.1185],
         [ 0.0224, -0.1881, -0.1721,  ...,  0.0708,  0.0220, -0.1557],
         [-0.0166, -0.1710, -0.1809,  ...,  0.0805, -0.0464, -0.1088],
         ...,
         [ 0.0498, -0.1743, -0.1456,  ...,  0.0802, -0.0274, -0.1016],
         [ 0.0944, -0.1896, -0.2113,  ...,  0.0815, -0.0234, -0.1454],
         [-0.0029, -0.1904, -0.1720,  ...,  0.0619, -0.0521, -0.0912]],

        [[ 0.0114, -0.0481, -0.2094,  ...,  0.0385,  0.0205,  0.1621],
         [ 0.0091, -0.0389, -0.1172,  ...,  0.0161, -0.0126,  0.1375],
         [ 0.0087,  0.0373, -0.1128,  ..., -0.0181,  0.0266,  0.1884],
         ...,
         [ 0.1558,  0.0251, -0.1835,  ...,  0.1119, -0.0358,  0.1411],
         [ 0.1558,  0.0251, -0.1835,  ...,  0.1119, -0.0358,  0.1411],
         [ 0.1558,  0.0251, -0.1835,  ...,  0.1119, -0.0358,  0.1411]],

        [[-0.1066, -0.0521,  0.0267,  ...,  0.2098,  0.0577,  0.2374],
         [-0.1114, -0.0409,  0.0949,  ...,  0

## Req. 1-3 Multi-head self-attention 모듈 클래스 구현

위의 과정을 모두 합쳐 하나의 Multi-head attention 모듈 class를 구현하겠습니다.

아래 코드의 TODO 부분을 채워주세요.

In [67]:
class MultiheadAttention(nn.Module):
  """Multi-head scaled dot-product attention (no masking, no dropout).

  The hyperparameters are stored on the instance, so ``forward`` does not
  depend on module-level variables that may have changed since construction.

  Args:
    d_model: Feature size of the inputs and outputs. When ``None`` the
      module-level ``d_model`` is used.
    num_heads: Number of attention heads. When ``None`` the module-level
      ``num_heads`` is used.

  Raises:
    ValueError: If ``d_model`` is not divisible by ``num_heads``.
  """

  def __init__(self, d_model=None, num_heads=None):
    super().__init__()

    # Keep the zero-argument call `MultiheadAttention()` working: when no
    # value is passed, read the hyperparameters defined at module level.
    if d_model is None:
      d_model = globals()["d_model"]
    if num_heads is None:
      num_heads = globals()["num_heads"]
    if d_model % num_heads != 0:
      raise ValueError(
          f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
      )

    self.d_model = d_model
    self.num_heads = num_heads
    self.d_k = d_model // num_heads  # feature size handled by each head

    # Q, K, V learnable matrices
    self.w_q = nn.Linear(d_model, d_model)
    self.w_k = nn.Linear(d_model, d_model)
    self.w_v = nn.Linear(d_model, d_model)

    # Linear projection for concatenated outputs
    self.w_0 = nn.Linear(d_model, d_model)

  # scaled-dot product attention
  def self_attention(self, q, k, v):
    """Return softmax(q @ k^T / sqrt(d_k)) @ v, computed per head.

    Args:
      q, k, v: Tensors shaped (B, num_heads, L, d_k).

    Returns:
      Tensor shaped (B, num_heads, L, d_k).
    """
    # Scale by the size of the tensor's own last dimension instead of a global d_k.
    scale = math.sqrt(q.size(-1))
    attn_scores = torch.matmul(q, k.transpose(-2, -1)) / scale  # (B, num_heads, L, L)
    attn_dists = F.softmax(attn_scores, dim=-1)  # (B, num_heads, L, L)

    attn_values = torch.matmul(attn_dists, v)  # (B, num_heads, L, d_k)

    return attn_values

  def _split_heads(self, x, batch_size):
    """Reshape (B, L, d_model) into (B, num_heads, L, d_k)."""
    x = x.view(batch_size, -1, self.num_heads, self.d_k)  # (B, L, num_heads, d_k)
    # Let each head own an (L, d_k) matrix.
    return x.transpose(1, 2)  # (B, num_heads, L, d_k)

  def forward(self, q, k, v):
    """Apply multi-head attention.

    Args:
      q, k, v: Tensors shaped (B, L, d_model); pass the same tensor three
        times for self-attention.

    Returns:
      Tensor shaped (B, L, d_model), where L is the query length.
    """
    batch_size = q.shape[0]

    # linear projection
    ################################################################################
    # TODO 1: Implement the forward pass for linear projection.                #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    q = self.w_q(q)  # (B, L, d_model)
    k = self.w_k(k)  # (B, L, d_model)
    v = self.w_v(v)  # (B, L, d_model)

    # split into heads
    ################################################################################
    # TODO 2: Implement the forward pass for split head.                #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    q = self._split_heads(q, batch_size)  # (B, num_heads, L, d_k)
    k = self._split_heads(k, batch_size)  # (B, num_heads, L, d_k)
    v = self._split_heads(v, batch_size)  # (B, num_heads, L, d_k)

    attn_values = self.self_attention(q, k, v)  # (B, num_heads, L, d_k)
    # (B, num_heads, L, d_k) => (B, L, num_heads, d_k) => (B, L, d_model)
    attn_values = attn_values.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

    return self.w_0(attn_values)

In [68]:
multihead_attn = MultiheadAttention()

# Self-attention: the same embeddings are passed as query, key and value.
outputs = multihead_attn(batch_emb, batch_emb, batch_emb)  # (B, L, d_model)

In [69]:
# Inspect the module output and its shape.
print(outputs)
print(outputs.shape)  # (batch_size, length, d_model)

tensor([[[-5.7519e-02,  8.3672e-02,  3.5657e-03,  ..., -1.2171e-01,
           1.0617e-01,  1.3804e-02],
         [ 1.9101e-02,  1.9733e-02,  3.3341e-02,  ..., -1.3445e-01,
           3.9254e-02,  1.3933e-02],
         [-1.0876e-01,  1.6079e-02, -3.1137e-02,  ..., -2.2008e-01,
           8.2375e-02,  4.9501e-02],
         ...,
         [ 1.1818e-02,  9.5664e-02, -2.9019e-02,  ..., -3.9713e-02,
           5.2620e-02,  4.7286e-02],
         [ 8.0589e-03,  1.0880e-01, -3.2934e-02,  ..., -1.6008e-01,
           1.5856e-01,  8.1992e-02],
         [-1.5246e-02,  7.1493e-02, -6.8657e-02,  ..., -1.1417e-01,
           5.4012e-02,  3.8931e-02]],

        [[ 1.1694e-01,  7.3469e-02, -1.2465e-01,  ..., -4.6682e-02,
          -1.0961e-01, -2.2308e-01],
         [ 1.3113e-01,  5.8297e-02, -1.5927e-01,  ..., -3.3941e-02,
          -1.6876e-01, -1.8437e-01],
         [ 1.6579e-01,  6.9502e-02, -8.0857e-02,  ..., -9.2919e-02,
          -9.6314e-02, -2.8773e-01],
         ...,
         [ 1.8572e-01,  3