## **7. Multi-head Attention**
1. Multi-head attention 및 self-attention 구현.
2. 각 과정에서 일어나는 연산과 input/output 형태 이해.

### **필요 패키지 import**

In [1]:
from torch import nn
from torch.nn import functional as F
from tqdm.auto import tqdm

import torch
import math

### **데이터 전처리**

In [2]:
# Token id used to fill short sequences; read by `padding`.
pad_id = 0
# Number of distinct token ids; used as the row count of the embedding table.
vocab_size = 100

# Toy batch: 10 sequences of token ids with differing lengths.
data = [
  [62, 13, 47, 39, 78, 33, 56, 13, 39, 29, 44, 86, 71, 36, 18, 75],
  [60, 96, 51, 32, 90],
  [35, 45, 48, 65, 91, 99, 92, 10, 3, 21, 54],
  [75, 51],
  [66, 88, 98, 47],
  [21, 39, 10, 64, 21],
  [98],
  [77, 65, 51, 77, 19, 15, 35, 19, 23, 97, 50, 46, 53, 42, 45, 91, 66, 3, 43, 10],
  [70, 64, 98, 25, 99, 53, 4, 13, 69, 62, 66, 76, 15, 75, 45, 34],
  [20, 64, 81, 35, 76, 85, 1, 62, 8, 45, 99, 77, 19, 43]
]

In [3]:
def padding(data):
    """Right-pad every sequence in ``data`` to the length of the longest one.

    Each sequence shorter than the maximum is replaced, in place, by a new
    list extended with the module-level ``pad_id``. The maximum length is
    printed and a ``tqdm`` progress bar is shown while iterating.

    Args:
        data: List of token-id lists. The list is modified in place.

    Returns:
        A ``(data, max_len)`` tuple: the same list object, now padded, and
        the length of the longest sequence (``0`` when ``data`` is empty).
    """
    # `default=0` keeps an empty batch from raising ValueError inside max().
    max_len = max(map(len, data), default=0)

    print(f"Maximum sequence length: {max_len}")

    for i, seq in enumerate(tqdm(data)):
        shortfall = max_len - len(seq)
        if shortfall > 0:
            data[i] = seq + [pad_id] * shortfall

    return data, max_len

In [4]:
# Pad every sequence to the longest length; `max_len` is that length.
data,max_len = padding(data)

Maximum sequence length: 20


  0%|          | 0/10 [00:00<?, ?it/s]

In [6]:
# Display the padded batch.
data

[[62, 13, 47, 39, 78, 33, 56, 13, 39, 29, 44, 86, 71, 36, 18, 75, 0, 0, 0, 0],
 [60, 96, 51, 32, 90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [35, 45, 48, 65, 91, 99, 92, 10, 3, 21, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [75, 51, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [66, 88, 98, 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [21, 39, 10, 64, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [98, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [77,
  65,
  51,
  77,
  19,
  15,
  35,
  19,
  23,
  97,
  50,
  46,
  53,
  42,
  45,
  91,
  66,
  3,
  43,
  10],
 [70, 64, 98, 25, 99, 53, 4, 13, 69, 62, 66, 76, 15, 75, 45, 34, 0, 0, 0, 0],
 [20, 64, 81, 35, 76, 85, 1, 62, 8, 45, 99, 77, 19, 43, 0, 0, 0, 0, 0, 0]]

### **Hyperparameter 세팅 및 embedding**

In [7]:
d_model = 512 # hidden size of the model
num_heads = 8 # number of attention heads

In [8]:
# B: batch size, L: maximum sequence length
# Turn the padded id lists into an int64 tensor, then look up one
# d_model-sized vector per token id.
batch = torch.tensor(data, dtype=torch.long)  # (B, L)
embedding = nn.Embedding(vocab_size, d_model)
batch_emb = embedding(batch)  # (B, L, d_model)

In [9]:
# Inspect the embedded batch and its shape.
print(batch_emb)
print(batch_emb.shape)

tensor([[[-0.8534,  1.7007, -1.2924,  ...,  0.9932,  0.3819, -0.6464],
         [ 0.2977, -0.0166,  0.5648,  ...,  0.9012, -1.2296, -0.5126],
         [-1.3034, -1.3391,  0.0420,  ...,  0.0115,  0.4022,  0.4236],
         ...,
         [-0.9489,  0.4054, -0.8236,  ...,  0.5951,  1.4274, -0.0594],
         [-0.9489,  0.4054, -0.8236,  ...,  0.5951,  1.4274, -0.0594],
         [-0.9489,  0.4054, -0.8236,  ...,  0.5951,  1.4274, -0.0594]],

        [[ 0.9465,  0.7638,  0.0633,  ...,  0.2461,  0.1262,  0.6830],
         [-0.2418, -0.5267,  0.1105,  ...,  0.1857, -1.3234, -0.6660],
         [ 0.0977,  1.2537,  1.1130,  ..., -1.2878,  0.0729,  0.8466],
         ...,
         [-0.9489,  0.4054, -0.8236,  ...,  0.5951,  1.4274, -0.0594],
         [-0.9489,  0.4054, -0.8236,  ...,  0.5951,  1.4274, -0.0594],
         [-0.9489,  0.4054, -0.8236,  ...,  0.5951,  1.4274, -0.0594]],

        [[-0.2536, -0.0621,  0.6154,  ..., -1.1294,  1.2604, -0.5828],
         [ 0.6329,  0.1130,  1.4359,  ...,  0

### **Linear transformation & 여러 head로 나누기**

Multi-head attention 내에서 쓰이는 linear transformation matrix들을 정의합니다.

In [15]:
# Query, key and value projections plus the output projection; every layer
# maps d_model -> d_model. The layers are created in the order
# w_q, w_k, w_v, w_o.
w_q, w_k, w_v, w_o = (nn.Linear(d_model, d_model) for _ in range(4))

In [16]:
# Project the same embeddings into query, key and value tensors.
q, k, v = (proj(batch_emb) for proj in (w_q, w_k, w_v))  # each (B, L, d_model)

print(q.shape, k.shape, v.shape, sep="\n")

torch.Size([10, 20, 512])
torch.Size([10, 20, 512])
torch.Size([10, 20, 512])


Q, K, V를 `num_heads`개의 head로 나누어, 각 head가 `d_k` 차원의 vector를 갖도록 만듭니다.

In [17]:
batch_size = q.size(0)
d_k = d_model // num_heads  # feature size handled by each head

# Split the last axis into heads: (B, L, d_model) -> (B, L, num_heads, d_k)
q, k, v = (t.view(batch_size, -1, num_heads, d_k) for t in (q, k, v))

print(q.shape, k.shape, v.shape, sep="\n")

torch.Size([10, 20, 8, 64])
torch.Size([10, 20, 8, 64])
torch.Size([10, 20, 8, 64])


In [18]:
# Swap the sequence and head axes so attention is computed per head:
# (B, L, num_heads, d_k) -> (B, num_heads, L, d_k)
q, k, v = (t.transpose(1, 2) for t in (q, k, v))

print(q.shape, k.shape, v.shape, sep="\n")

torch.Size([10, 8, 20, 64])
torch.Size([10, 8, 20, 64])
torch.Size([10, 8, 20, 64])


### **Scaled dot-product self-attention 구현**

각 head에서 실행되는 self-attention 과정입니다.

In [20]:
# Dot product of every query with every key, scaled by sqrt(d_k).
attn_scores = (q @ k.transpose(-2, -1)) / math.sqrt(d_k)  # (B, num_heads, L, L)
# Softmax over the key axis turns each query's scores into a distribution.
attn_dists = attn_scores.softmax(dim=-1)  # (B, num_heads, L, L)

print(attn_dists)
print(attn_dists.shape)

tensor([[[[0.0381, 0.0541, 0.0620,  ..., 0.0434, 0.0434, 0.0434],
          [0.0612, 0.0668, 0.0313,  ..., 0.0384, 0.0384, 0.0384],
          [0.0635, 0.0322, 0.0611,  ..., 0.0588, 0.0588, 0.0588],
          ...,
          [0.0317, 0.0573, 0.0624,  ..., 0.0416, 0.0416, 0.0416],
          [0.0317, 0.0573, 0.0624,  ..., 0.0416, 0.0416, 0.0416],
          [0.0317, 0.0573, 0.0624,  ..., 0.0416, 0.0416, 0.0416]],

         [[0.0330, 0.0364, 0.0590,  ..., 0.0450, 0.0450, 0.0450],
          [0.0260, 0.0518, 0.0369,  ..., 0.0340, 0.0340, 0.0340],
          [0.0525, 0.0380, 0.0289,  ..., 0.0676, 0.0676, 0.0676],
          ...,
          [0.0748, 0.0512, 0.0390,  ..., 0.0323, 0.0323, 0.0323],
          [0.0748, 0.0512, 0.0390,  ..., 0.0323, 0.0323, 0.0323],
          [0.0748, 0.0512, 0.0390,  ..., 0.0323, 0.0323, 0.0323]],

         [[0.0491, 0.0211, 0.0622,  ..., 0.0695, 0.0695, 0.0695],
          [0.0331, 0.0516, 0.0401,  ..., 0.0679, 0.0679, 0.0679],
          [0.0611, 0.0566, 0.0793,  ..., 0

In [21]:
# Weighted sum of the value vectors under the attention distribution.
attn_values = attn_dists @ v  # (B, num_heads, L, d_k)

print(attn_values.shape)

torch.Size([10, 8, 20, 64])


### **각 head의 결과물 병합**

각 head의 결과물을 concat하고 동일 차원으로 linear transformation합니다.

In [22]:
# Bring the head axis back next to the features and flatten the two:
# (B, num_heads, L, d_k) -> (B, L, num_heads, d_k) -> (B, L, d_model).
# `contiguous()` is needed because `view` cannot reshape the transposed layout.
attn_values = attn_values.transpose(1, 2).contiguous().view(batch_size, -1, d_model)

print(attn_values.shape)

torch.Size([10, 20, 512])


In [23]:
# Final linear projection applied to the concatenated head outputs.
outputs = w_o(attn_values)

print(outputs)
print(outputs.shape)

tensor([[[ 1.0522e-01,  2.9473e-01,  8.8185e-02,  ..., -2.6664e-01,
           1.2772e-01, -2.1247e-01],
         [ 8.9886e-02,  2.3670e-01,  7.7320e-02,  ..., -2.0706e-01,
           1.1879e-01, -1.9448e-01],
         [ 1.5877e-01,  2.5747e-01,  2.8416e-02,  ..., -2.0461e-01,
           1.1436e-01, -1.7485e-01],
         ...,
         [ 1.1115e-01,  1.7071e-01,  7.2013e-02,  ..., -2.5760e-01,
           9.6523e-02, -2.2751e-01],
         [ 1.1115e-01,  1.7071e-01,  7.2013e-02,  ..., -2.5760e-01,
           9.6523e-02, -2.2751e-01],
         [ 1.1115e-01,  1.7071e-01,  7.2013e-02,  ..., -2.5760e-01,
           9.6523e-02, -2.2751e-01]],

        [[ 2.8415e-01,  4.9839e-01,  8.1970e-02,  ...,  1.4600e-01,
           2.1596e-01,  3.9828e-04],
         [ 2.1643e-01,  4.2862e-01,  2.9722e-02,  ...,  7.5932e-02,
           1.9498e-01,  1.5535e-02],
         [ 2.3110e-01,  4.6421e-01,  6.1392e-04,  ...,  7.3851e-02,
           1.6699e-01, -7.7949e-02],
         ...,
         [ 2.2830e-01,  4

### **전체 코드**

위의 과정을 모두 합쳐 하나의 Multi-head attention 모듈을 구현하겠습니다.

In [24]:
class MultiheadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The inputs are linearly projected, split into ``num_heads`` heads of size
    ``d_k = d_model // num_heads``, attended per head, concatenated again and
    passed through a final linear layer.

    Args:
        model_dim: Hidden size of the model. When omitted, the module-level
            ``d_model`` is used, so ``MultiheadAttention()`` keeps working.
        head_count: Number of attention heads. When omitted, the module-level
            ``num_heads`` is used.

    Raises:
        ValueError: If the hidden size is not divisible by the number of heads.
    """

    def __init__(self, model_dim=None, head_count=None):
        super().__init__()

        # The globals are only looked up when no explicit value is given.
        self.d_model = d_model if model_dim is None else model_dim
        self.num_heads = num_heads if head_count is None else head_count

        if self.d_model % self.num_heads != 0:
            raise ValueError(
                f"d_model ({self.d_model}) must be divisible by "
                f"num_heads ({self.num_heads})"
            )
        # Per-head feature size, derived here instead of read from a global.
        self.d_k = self.d_model // self.num_heads

        # Q, K, V learnable matrices
        self.w_q = nn.Linear(self.d_model, self.d_model)
        self.w_k = nn.Linear(self.d_model, self.d_model)
        self.w_v = nn.Linear(self.d_model, self.d_model)

        # Linear transformation for concatenated outputs
        self.w_o = nn.Linear(self.d_model, self.d_model)

    def _split_heads(self, x, batch_size):
        """Reshape ``(B, L, d_model)`` into ``(B, num_heads, L, d_k)``."""
        return x.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

    def self_attention(self, q, k, v, mask=None):
        """Compute scaled dot-product attention for every head.

        Args:
            q: Queries of shape ``(B, num_heads, L_q, d_k)``.
            k: Keys of shape ``(B, num_heads, L_k, d_k)``.
            v: Values of shape ``(B, num_heads, L_k, d_k)``.
            mask: Optional tensor broadcastable to
                ``(B, num_heads, L_q, L_k)``. Positions where it equals 0 get
                zero attention weight. A query whose keys are all masked
                produces NaN, so leave at least one key visible per query.

        Returns:
            Attention output of shape ``(B, num_heads, L_q, d_k)``.
        """
        # Scale by the actual per-head size of the incoming tensors.
        scale = math.sqrt(q.size(-1))
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / scale  # (B, num_heads, L_q, L_k)

        if mask is not None:
            # -inf scores become exactly 0 after the softmax.
            attn_scores = attn_scores.masked_fill(mask == 0, float("-inf"))

        attn_dists = F.softmax(attn_scores, dim=-1)  # (B, num_heads, L_q, L_k)

        return torch.matmul(attn_dists, v)  # (B, num_heads, L_q, d_k)

    def forward(self, q, k, v, mask=None):
        """Apply multi-head attention.

        Args:
            q: Query input of shape ``(B, L_q, d_model)``.
            k: Key input of shape ``(B, L_k, d_model)``.
            v: Value input of shape ``(B, L_k, d_model)``.
            mask: Optional mask forwarded to ``self_attention``.

        Returns:
            Tensor of shape ``(B, L_q, d_model)``.
        """
        batch_size = q.shape[0]

        # Project, then split into heads: (B, L, d_model) -> (B, num_heads, L, d_k)
        q = self._split_heads(self.w_q(q), batch_size)
        k = self._split_heads(self.w_k(k), batch_size)
        v = self._split_heads(self.w_v(v), batch_size)

        attn_values = self.self_attention(q, k, v, mask)  # (B, num_heads, L_q, d_k)

        # Concatenate heads: (B, num_heads, L_q, d_k) -> (B, L_q, d_model)
        attn_values = (
            attn_values.transpose(1, 2)
            .contiguous()
            .view(batch_size, -1, self.d_model)
        )

        return self.w_o(attn_values)


In [25]:
multihead_attn = MultiheadAttention()

# Self-attention: the same embeddings are passed as query, key and value.
outputs = multihead_attn(batch_emb, batch_emb, batch_emb) # (B, L, d_model)

In [26]:
# Inspect the module output and its shape.
print(outputs)
print(outputs.shape)

tensor([[[ 0.0645, -0.0619, -0.0445,  ...,  0.0419,  0.0232,  0.2504],
         [ 0.1557, -0.0769, -0.0327,  ...,  0.0721,  0.0847,  0.1345],
         [ 0.0980, -0.0377, -0.0379,  ...,  0.0770,  0.0285,  0.1623],
         ...,
         [ 0.0915, -0.0602, -0.0503,  ...,  0.0725,  0.0727,  0.1785],
         [ 0.0915, -0.0602, -0.0503,  ...,  0.0725,  0.0727,  0.1785],
         [ 0.0915, -0.0602, -0.0503,  ...,  0.0725,  0.0727,  0.1785]],

        [[ 0.0536, -0.1680, -0.1299,  ...,  0.0798,  0.0284,  0.3093],
         [ 0.1877, -0.2128, -0.1683,  ...,  0.0924,  0.0506,  0.3370],
         [ 0.1306, -0.2158, -0.1210,  ...,  0.1101,  0.0632,  0.3397],
         ...,
         [ 0.1318, -0.2429, -0.0956,  ...,  0.0957,  0.0865,  0.3430],
         [ 0.1318, -0.2429, -0.0956,  ...,  0.0957,  0.0865,  0.3430],
         [ 0.1318, -0.2429, -0.0956,  ...,  0.0957,  0.0865,  0.3430]],

        [[ 0.0850, -0.1528, -0.1028,  ..., -0.1613,  0.0125,  0.1932],
         [-0.0011, -0.1294, -0.1337,  ..., -0