# Transformer

Transformer의 핵심 구조인 Multi-head Attention을 구현하는 실습입니다.
1. Multi-head attention 및 self-attention 구현.
2. 각 과정에서 일어나는 연산과 input/output 형태 이해.

### 필요 패키지 import

In [1]:
from torch import nn
from torch.nn import functional as F
from tqdm import tqdm

import torch
import math

## Req. 2-1 Multi-head self-attention 구조 익히기

### **데이터 전처리**
vocab_size 100인 가상의 시퀀스 데이터를 생성합니다. 

각 데이터에 할당된 숫자는 tokenizing과 정수화가 이뤄진 형태입니다.

In [2]:
# Token id used to fill sequences up to a common length; none of the
# sequences below contains id 0, so it cannot be confused with a real token.
pad_id = 0
# Size of the toy vocabulary; every token id below is smaller than this.
vocab_size = 100

# Toy batch of variable-length sequences that are already tokenized and
# converted to integer ids.
data = [
  [62, 13, 47, 39, 78, 33, 56, 13, 39, 29, 44, 86, 71, 36, 18, 75],
  [60, 96, 51, 32, 90],
  [35, 45, 48, 65, 91, 99, 92, 10, 3, 21, 54],
  [75, 51],
  [66, 88, 98, 47],
  [21, 39, 10, 64, 21],
  [98],
  [77, 65, 51, 77, 19, 15, 35, 19, 23, 97, 50, 46, 53, 42, 45, 91, 66, 3, 43, 10],
  [70, 64, 98, 25, 99, 53, 4, 13, 69, 62, 66, 76, 15, 75, 45, 34],
  [20, 64, 81, 35, 76, 85, 1, 62, 8, 45, 99, 77, 19, 43]
]

In [3]:
# Right-pad every sequence with `pad_id` so that all sequences share one length.
def padding(data):
  """Pad all sequences in ``data`` to the length of the longest one.

  The list is modified in place: every sequence shorter than the maximum is
  replaced by a new list extended with the module-level ``pad_id``.

  Args:
    data: List of token-id lists. May be empty.

  Returns:
    Tuple ``(data, max_len)`` where ``data`` is the same list object that was
    passed in and ``max_len`` is the length of the longest sequence (0 when
    ``data`` is empty).
  """
  # `default=0` keeps an empty batch from raising ValueError inside max().
  max_len = max(map(len, data), default=0)
  print(f"Maximum sequence length: {max_len}")

  for i, seq in enumerate(tqdm(data)):
    shortfall = max_len - len(seq)
    if shortfall > 0:
      data[i] = seq + [pad_id] * shortfall

  return data, max_len

In [4]:
data, max_len = padding(data)

Maximum sequence length: 20


100%|██████████| 10/10 [00:00<00:00, 55924.05it/s]


In [5]:
data

[[62, 13, 47, 39, 78, 33, 56, 13, 39, 29, 44, 86, 71, 36, 18, 75, 0, 0, 0, 0],
 [60, 96, 51, 32, 90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [35, 45, 48, 65, 91, 99, 92, 10, 3, 21, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [75, 51, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [66, 88, 98, 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [21, 39, 10, 64, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [98, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [77,
  65,
  51,
  77,
  19,
  15,
  35,
  19,
  23,
  97,
  50,
  46,
  53,
  42,
  45,
  91,
  66,
  3,
  43,
  10],
 [70, 64, 98, 25, 99, 53, 4, 13, 69, 62, 66, 76, 15, 75, 45, 34, 0, 0, 0, 0],
 [20, 64, 81, 35, 76, 85, 1, 62, 8, 45, 99, 77, 19, 43, 0, 0, 0, 0, 0, 0]]

### Hyperparameter 세팅 및 embedding

In [6]:
d_model = 512  # hidden size of the model
num_heads = 8  # number of attention heads

# d_model is the dimension of the embedding space the input is projected into
# and it is split evenly across the heads, so it must be divisible by
# num_heads. Fail here rather than later: with an incompatible pair,
# `view(B, -1, num_heads, d_model // num_heads)` can either raise an obscure
# shape error or silently infer a wrong sequence length.
if d_model % num_heads != 0:
  raise ValueError(
    f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
  )

In [7]:
# Lookup table that maps each token id to a d_model-dimensional vector.
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

# B: batch size, L: maximum sequence length
batch = torch.tensor(data, dtype=torch.long)  # (B, L)
batch_emb = embedding(batch)  # (B, L, d_model)

In [8]:
print(batch_emb)
print(batch_emb.shape)

tensor([[[ 0.0227, -0.7169, -1.0666,  ..., -0.0915, -2.4909,  0.6048],
         [-0.0783, -0.1332,  0.2458,  ...,  0.9972,  0.4791, -0.1637],
         [ 0.1512,  1.6798,  0.1060,  ...,  2.1160,  0.1851, -0.0871],
         ...,
         [-1.3291, -0.6499,  1.4460,  ..., -0.6595, -1.7504,  0.1500],
         [-1.3291, -0.6499,  1.4460,  ..., -0.6595, -1.7504,  0.1500],
         [-1.3291, -0.6499,  1.4460,  ..., -0.6595, -1.7504,  0.1500]],

        [[-0.5863,  0.5547, -1.3158,  ..., -1.4122, -0.7298, -0.5086],
         [ 0.1425,  1.7762, -0.1879,  ..., -0.2370, -0.6788, -1.2901],
         [ 0.3872,  1.2684, -0.4120,  ...,  2.0639, -0.2172, -0.9816],
         ...,
         [-1.3291, -0.6499,  1.4460,  ..., -0.6595, -1.7504,  0.1500],
         [-1.3291, -0.6499,  1.4460,  ..., -0.6595, -1.7504,  0.1500],
         [-1.3291, -0.6499,  1.4460,  ..., -0.6595, -1.7504,  0.1500]],

        [[-1.1946, -0.2537, -1.5862,  ...,  0.1333, -0.1476,  0.0090],
         [ 0.3754,  0.9026, -0.3693,  ...,  1

### Linear projection & 여러 head로 나누기

Multi-head attention 내에서 쓰이는 linear projection matrix들을 정의합니다.

In [9]:
w_q = nn.Linear(d_model, d_model)
w_k = nn.Linear(d_model, d_model)
w_v = nn.Linear(d_model, d_model)

In [10]:
w_0 = nn.Linear(d_model, d_model)

In [11]:
# Self-attention: the same embeddings are projected into query, key and
# value spaces by three separate linear layers.
q, k, v = (proj(batch_emb) for proj in (w_q, w_k, w_v))  # each (B, L, d_model)

for projected in (q, k, v):
  print(projected.shape)

torch.Size([10, 20, 512])
torch.Size([10, 20, 512])
torch.Size([10, 20, 512])


q, k, v를 각각 `num_heads`개의 head로 차원 분할하여 여러 vector로 만듭니다.

- 이론적으로 multi-head attention은 head마다 서로 다른 Wq, Wk, Wv로 input을 linear transformation 하여 head 개수만큼 attention을 수행하고, 그 결과를 concat 한 뒤 한 번 더 linear transformation 한다.
- 구현에서는 Wq, Wk, Wv를 한 개씩만 둔다.
- 실제 `Attention Is All You Need` 논문의 구현 예시는 query vector 한 개를 dim 방향으로 쪼개서 진행한다.

In [12]:
batch_size = q.size(0)
d_k = d_model // num_heads  # width of a single head

# Split the last axis (d_model) into num_heads chunks of width d_k.
head_shape = (batch_size, -1, num_heads, d_k)
q = q.view(*head_shape)  # (B, L, num_heads, d_k)
k = k.view(*head_shape)  # (B, L, num_heads, d_k)
v = v.view(*head_shape)  # (B, L, num_heads, d_k)

print(q.shape, k.shape, v.shape, sep="\n")

torch.Size([10, 20, 8, 64])
torch.Size([10, 20, 8, 64])
torch.Size([10, 20, 8, 64])


In [13]:
# Move the num_heads axis in front of the sequence axis so that each head
# performs self-attention on its own (L, d_k) matrix.
q, k, v = (t.transpose(1, 2) for t in (q, k, v))  # each (B, num_heads, L, d_k)

for per_head in (q, k, v):
  print(per_head.shape)

torch.Size([10, 8, 20, 64])
torch.Size([10, 8, 20, 64])
torch.Size([10, 8, 20, 64])


### Scaled dot-product self-attention 구현

각 head에서 실행되는 self-attention 과정입니다.

In [14]:
# Per head the score matrix has shape (L, L): how much weight each token
# should put on every token of the same sequence when attending.
raw_scores = q @ k.transpose(-2, -1)  # (B, num_heads, L, L)
attn_scores = raw_scores / math.sqrt(d_k)  # (B, num_heads, L, L)
# The softmax is taken row-wise, hence dim=-1.
attn_dists = attn_scores.softmax(dim=-1)  # (B, num_heads, L, L)

print(attn_dists, attn_dists.shape, sep="\n")

tensor([[[[0.0308, 0.0823, 0.0586,  ..., 0.0437, 0.0437, 0.0437],
          [0.0589, 0.0795, 0.0593,  ..., 0.0325, 0.0325, 0.0325],
          [0.0417, 0.0448, 0.0348,  ..., 0.0507, 0.0507, 0.0507],
          ...,
          [0.0412, 0.0508, 0.0277,  ..., 0.0433, 0.0433, 0.0433],
          [0.0412, 0.0508, 0.0277,  ..., 0.0433, 0.0433, 0.0433],
          [0.0412, 0.0508, 0.0277,  ..., 0.0433, 0.0433, 0.0433]],

         [[0.0502, 0.0701, 0.0431,  ..., 0.0451, 0.0451, 0.0451],
          [0.0365, 0.0530, 0.0504,  ..., 0.0483, 0.0483, 0.0483],
          [0.0667, 0.0509, 0.0410,  ..., 0.0611, 0.0611, 0.0611],
          ...,
          [0.0341, 0.0499, 0.0515,  ..., 0.0365, 0.0365, 0.0365],
          [0.0341, 0.0499, 0.0515,  ..., 0.0365, 0.0365, 0.0365],
          [0.0341, 0.0499, 0.0515,  ..., 0.0365, 0.0365, 0.0365]],

         [[0.0909, 0.0403, 0.0583,  ..., 0.0514, 0.0514, 0.0514],
          [0.0405, 0.0422, 0.0721,  ..., 0.0436, 0.0436, 0.0436],
          [0.0615, 0.0378, 0.0598,  ..., 0

In [15]:
attn_values = torch.matmul(attn_dists, v)  # (B, num_heads, L, d_k)

print(attn_values.shape)

torch.Size([10, 8, 20, 64])


### 각 head의 결과물 병합

각 head의 결과물을 concat하고 동일 차원으로 linear projection합니다.

In [16]:
# Put the head axis back next to d_k, then fold both axes into d_model again.
# transpose() returns a non-contiguous view, so contiguous() is needed before view().
per_token = attn_values.transpose(1, 2).contiguous()  # (B, L, num_heads, d_k)
attn_values = per_token.view(batch_size, -1, d_model)  # (B, L, d_model)

print(attn_values.shape)

torch.Size([10, 20, 512])


In [17]:
# w_0: (d_model, d_model)
# Combines the self-attention results of the individual heads, each of which
# focuses on a different aspect of the sequence.
outputs = w_0(attn_values)

print(outputs, outputs.shape, sep="\n")

tensor([[[ 0.1001, -0.0317,  0.0495,  ..., -0.0762,  0.0256, -0.0998],
         [ 0.1160, -0.0281,  0.0075,  ..., -0.0090,  0.0475, -0.0746],
         [ 0.0866, -0.0017,  0.0315,  ...,  0.0158,  0.0203, -0.1333],
         ...,
         [ 0.1087,  0.0687,  0.0247,  ..., -0.0050,  0.0923, -0.0934],
         [ 0.1087,  0.0687,  0.0247,  ..., -0.0050,  0.0923, -0.0934],
         [ 0.1087,  0.0687,  0.0247,  ..., -0.0050,  0.0923, -0.0934]],

        [[-0.0061,  0.0622,  0.1436,  ..., -0.0452, -0.2533, -0.3990],
         [ 0.0024,  0.0518,  0.1457,  ...,  0.0029, -0.2689, -0.4206],
         [ 0.0319,  0.1129,  0.1208,  ..., -0.0017, -0.1423, -0.3227],
         ...,
         [ 0.0271,  0.1041,  0.1683,  ...,  0.0197, -0.1913, -0.3267],
         [ 0.0271,  0.1041,  0.1683,  ...,  0.0197, -0.1913, -0.3267],
         [ 0.0271,  0.1041,  0.1683,  ...,  0.0197, -0.1913, -0.3267]],

        [[-0.0612, -0.0607,  0.1817,  ..., -0.0835, -0.1291, -0.2943],
         [-0.0128, -0.0667,  0.1512,  ..., -0

## Req. 2-2 Multi-head self-attention 모듈 클래스 구현

위의 과정을 모두 합쳐 하나의 Multi-head attention 모듈 class를 구현하겠습니다.

아래 코드의 TODO 부분을 채워주세요.

In [27]:
class MultiheadAttention(nn.Module):
  """Multi-head scaled dot-product attention.

  Args:
    d_model: Size of the feature axis of the inputs and the output. When
      omitted, the module-level ``d_model`` is used, so the original
      ``MultiheadAttention()`` call keeps working.
    num_heads: Number of attention heads. When omitted, the module-level
      ``num_heads`` is used.

  Raises:
    ValueError: If ``d_model`` is not divisible by ``num_heads``.
  """

  def __init__(self, d_model=None, num_heads=None):
    super().__init__()

    # Backward compatibility: fall back to the notebook-level hyperparameters
    # when the caller does not pass explicit sizes.
    if d_model is None:
      d_model = globals()["d_model"]
    if num_heads is None:
      num_heads = globals()["num_heads"]
    if d_model % num_heads != 0:
      raise ValueError(
        f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
      )

    # Keep the sizes on the instance so forward() does not depend on globals
    # (the global d_k only exists after an earlier notebook cell has run).
    self.d_model = d_model
    self.num_heads = num_heads
    self.d_k = d_model // num_heads

    # Q, K, V learnable matrices
    self.w_q = nn.Linear(d_model, d_model)
    self.w_k = nn.Linear(d_model, d_model)
    self.w_v = nn.Linear(d_model, d_model)

    # Linear projection for concatenated outputs
    self.w_0 = nn.Linear(d_model, d_model)

  # scaled-dot product attention
  def self_attention(self, q, k, v, mask=None):
    """Apply scaled dot-product attention independently per head.

    Args:
      q, k, v: Tensors of shape (B, num_heads, L, d_k).
      mask: Optional tensor broadcastable to (B, num_heads, L, L). Positions
        where it equals 0 / False are excluded from attention.

    Returns:
      Tensor of shape (B, num_heads, L, d_k).
    """
    # Take the scale from the tensor itself instead of a global d_k.
    d_k = q.size(-1)
    attn_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)  # (B, num_heads, L, L)
    if mask is not None:
      # Use the most negative finite value rather than -inf so that a row
      # with every position masked yields a uniform distribution, not NaN.
      attn_scores = attn_scores.masked_fill(
        mask == 0, torch.finfo(attn_scores.dtype).min
      )
    attn_dists = F.softmax(attn_scores, dim=-1)  # (B, num_heads, L, L)

    attn_values = torch.matmul(attn_dists, v)  # (B, num_heads, L, d_k)

    return attn_values

  def forward(self, q, k, v, mask=None):
    """Run multi-head attention.

    Args:
      q, k, v: Tensors of shape (B, L, d_model).
      mask: Optional tensor broadcastable to (B, num_heads, L, L), e.g. a
        (B, 1, 1, L) padding mask; 0 / False marks positions to ignore.

    Returns:
      Tensor of shape (B, L, d_model).
    """
    batch_size = q.shape[0]

    # linear projection
    ################################################################################
    # TODO 1: Implement the forward pass for linear projection.                #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    q = self.w_q(q)
    k = self.w_k(k)
    v = self.w_v(v)

    # Shape trace kept from the exercise.
    print(q.shape)
    print(k.shape)
    print(v.shape)

    # Split into heads
    ################################################################################
    # TODO 2: Implement the forward pass for split head.                #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    q = q.view(batch_size, -1, self.num_heads, self.d_k)
    k = k.view(batch_size, -1, self.num_heads, self.d_k)
    v = v.view(batch_size, -1, self.num_heads, self.d_k)

    print(q.shape)
    print(k.shape)
    print(v.shape)

    # Let each head handle its own (L, d_k) matrix
    q = q.transpose(1, 2)  # (B, num_heads, L, d_k)
    k = k.transpose(1, 2)  # (B, num_heads, L, d_k)
    v = v.transpose(1, 2)  # (B, num_heads, L, d_k)

    attn_values = self.self_attention(q, k, v, mask=mask)  # (B, num_heads, L, d_k)
    # (B, L, num_heads, d_k) => (B, L, d_model)
    attn_values = attn_values.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

    return self.w_0(attn_values)

In [28]:
multihead_attn = MultiheadAttention()

outputs = multihead_attn(batch_emb, batch_emb, batch_emb)  # (B, L, d_model)

torch.Size([10, 20, 512])
torch.Size([10, 20, 512])
torch.Size([10, 20, 512])
torch.Size([10, 20, 8, 64])
torch.Size([10, 20, 8, 64])
torch.Size([10, 20, 8, 64])


In [29]:
print(outputs)
print(outputs.shape)  # (batch_size, length, d_model)

tensor([[[-0.1155,  0.0739,  0.0743,  ..., -0.0894, -0.0701, -0.0931],
         [-0.1487,  0.1092,  0.0011,  ..., -0.0477, -0.0676, -0.0509],
         [-0.1373,  0.1096,  0.0160,  ..., -0.0412, -0.0627, -0.0777],
         ...,
         [-0.0853,  0.0809,  0.0524,  ..., -0.0196, -0.0607, -0.1230],
         [-0.0853,  0.0809,  0.0524,  ..., -0.0196, -0.0607, -0.1230],
         [-0.0853,  0.0809,  0.0524,  ..., -0.0196, -0.0607, -0.1230]],

        [[ 0.1945,  0.3697,  0.2906,  ...,  0.3264,  0.0169, -0.4615],
         [ 0.1712,  0.4035,  0.2827,  ...,  0.3155,  0.0780, -0.4422],
         [ 0.1881,  0.4075,  0.2655,  ...,  0.2364,  0.0469, -0.4114],
         ...,
         [ 0.1928,  0.3302,  0.3160,  ...,  0.3036,  0.0672, -0.4063],
         [ 0.1928,  0.3302,  0.3160,  ...,  0.3036,  0.0672, -0.4063],
         [ 0.1928,  0.3302,  0.3160,  ...,  0.3036,  0.0672, -0.4063]],

        [[ 0.0447,  0.1728,  0.1319,  ...,  0.2321, -0.1352, -0.3282],
         [ 0.0408,  0.2158,  0.1147,  ...,  0