<a href="https://colab.research.google.com/github/ydandy6/genAI/blob/main/chapter_2_transformer_with_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 예제 2.1 토큰화 코드

In [None]:
# Split the sentence on whitespace; every chunk becomes one token.
input_text = "나는 최근 파리 여행을 다녀왔다. 파리가 좋았어"
input_text_list = input_text.split()
print("input_text_list: ", input_text_list)

# Build the token -> id and id -> token lookup tables.
# The vocabulary is de-duplicated first (dict.fromkeys keeps first-seen order) so ids
# stay contiguous in range(len(str2idx)) even when a token repeats. Enumerating the raw
# token list would hand a repeated token an id >= len(str2idx), which overflows an
# embedding table sized with len(str2idx).
vocab = list(dict.fromkeys(input_text_list))
str2idx = {word: idx for idx, word in enumerate(vocab)}
idx2str = {idx: word for idx, word in enumerate(vocab)}
print("str2idx: ", str2idx)
print("idx2str: ", idx2str)

# Convert every token of the sentence to its id.
input_ids = [str2idx[word] for word in input_text_list]
print("input_ids: ", input_ids)

input_text_list:  ['나는', '최근', '파리', '여행을', '다녀왔다.', '파리가', '좋았어']
str2idx:  {'나는': 0, '최근': 1, '파리': 2, '여행을': 3, '다녀왔다.': 4, '파리가': 5, '좋았어': 6}
idx2str:  {0: '나는', 1: '최근', 2: '파리', 3: '여행을', 4: '다녀왔다.', 5: '파리가', 6: '좋았어'}
input_ids:  [0, 1, 2, 3, 4, 5, 6]


## 예제 2.2 토큰 아이디에서 벡터로 변환

In [None]:
import torch
import torch.nn as nn

# Every token id is mapped to a learnable 16-dimensional vector.
embedding_dim = 16
embed_layer = nn.Embedding(num_embeddings=len(str2idx), embedding_dim=embedding_dim)

# Row lookup gives (len(input_ids), embedding_dim); unsqueeze(0) prepends the batch
# axis, so the result is (1, len(input_ids), embedding_dim).
input_embeddings = embed_layer(torch.tensor(input_ids)).unsqueeze(0)
input_embeddings.shape

torch.Size([1, 7, 16])

In [None]:
# Vocabulary size: the number of entries in str2idx, which is also the row count
# the embedding tables are created with.
print(len(str2idx))

7


## 예제 2.3 절대적 위치 인코딩

In [None]:
embedding_dim = 16
max_position = 12  # number of positions the position table can address

# Token embedding table (kept first so parameter initialization order is unchanged).
embed_layer = nn.Embedding(num_embeddings=len(str2idx), embedding_dim=embedding_dim)
# Learned absolute-position table: one vector per position index.
position_embed_layer = nn.Embedding(num_embeddings=max_position, embedding_dim=embedding_dim)

# Token vectors: (len(input_ids), embedding_dim) -> (1, len(input_ids), embedding_dim).
token_embeddings = embed_layer(torch.tensor(input_ids)).unsqueeze(0)

# Position ids 0..len(input_ids)-1 with a leading batch axis, then their vectors.
position_ids = torch.arange(len(input_ids), dtype=torch.long).unsqueeze(0)
position_encodings = position_embed_layer(position_ids)

# The model input is the element-wise sum of token and position vectors.
input_embeddings = token_embeddings + position_encodings
input_embeddings.shape

## 예제 2.4 쿼리, 키, 값 벡터를 만드는 nn.Linear 층

In [None]:
head_dim = 16

# Three independent linear maps (created in query, key, value order) that project the
# input embeddings into query, key and value spaces.
weight_q, weight_k, weight_v = (
    nn.Linear(embedding_dim, head_dim) for _ in range(3)
)

# Apply the projections; each result is (1, len(input_ids), head_dim).
querys, keys, values = (
    projection(input_embeddings) for projection in (weight_q, weight_k, weight_v)
)

## 예제 2.5. 스케일 점곱 방식의 어텐션

In [None]:
# Show the query vectors that weight_q produced for every token.
print(querys)

tensor([[[ 0.3786, -0.5800,  1.0169,  0.9738,  0.2214,  0.7925,  0.5012,
           0.1506, -0.8194, -1.3387,  0.7269, -0.1177, -0.8939,  0.8756,
          -0.6137, -0.9647],
         [ 1.0808, -0.1047, -0.0982,  0.1023,  0.1297,  0.6953,  0.7626,
          -1.0505, -0.5499, -1.2547,  0.0382, -0.4115,  0.0724,  0.1315,
          -1.4001, -0.3283],
         [-0.5575, -0.6064,  0.3807,  0.2899,  0.6409,  0.2535,  0.1195,
          -0.7423,  0.7272,  0.0894, -0.8272,  0.2651,  1.4306,  0.1532,
           0.6631,  0.7149],
         [-0.3763, -0.0448,  0.1174,  0.0965, -0.5448,  0.1974, -0.0835,
          -0.4694, -0.6586,  0.4734, -0.2839, -0.1150,  0.1532, -0.9643,
           0.0031, -0.5935],
         [-0.2383, -0.2460, -0.7402, -0.0127,  0.3045, -0.1004,  0.9954,
           0.1625,  0.2350,  0.7611, -0.6246, -0.0646, -0.1221,  0.0544,
          -0.2276,  0.8921],
         [-0.3403, -0.8502,  0.8460,  0.3944,  0.3689,  0.2789,  0.0879,
          -0.5865, -0.6478, -0.9286,  0.3739,  0.085

In [None]:
from math import sqrt
import torch.nn.functional as F

def compute_attention(querys, keys, values, is_causal=False):
    """Scaled dot-product attention.

    Args:
        querys: tensor of shape (..., query_len, dim_k).
        keys: tensor of shape (..., key_len, dim_k).
        values: tensor of shape (..., key_len, dim_v).
        is_causal: when True, query position i may only attend to key
            positions <= i (lower-triangular mask).

    Returns:
        Tensor of shape (..., query_len, dim_v): for each query, the
        softmax-weighted sum of the value vectors.
    """
    dim_k = querys.size(-1)
    # Dividing by sqrt(dim_k) keeps the logits in a range where softmax is not saturated.
    scores = querys @ keys.transpose(-2, -1) / sqrt(dim_k)
    if is_causal:
        # Sequence lengths are read from dim -2 so both (batch, seq, dim) and
        # (batch, head, seq, dim) inputs are handled.
        query_length, key_length = querys.size(-2), keys.size(-2)
        allowed = torch.ones(
            query_length, key_length, dtype=torch.bool, device=scores.device
        ).tril(diagonal=0)
        # -inf logits become exactly zero weight after softmax.
        scores = scores.masked_fill(~allowed, float("-inf"))
    weights = F.softmax(scores, dim=-1)
    return weights @ values

In [None]:
# Show the value vectors that weight_v produced for every token.
print(values)

tensor([[[ 0.4591,  0.9048,  0.4076, -0.3501, -0.2559,  0.4911, -0.1154,
          -0.9263,  0.2169,  0.0663, -1.0405,  0.9713, -0.7006,  0.3283,
           1.1647,  1.7170],
         [ 0.0564,  0.0614, -0.4325, -0.6356, -0.3683,  0.4350,  0.1000,
          -0.2797, -0.3382, -0.4526,  0.4166, -0.0135, -1.0697,  0.1202,
           0.0879,  0.6486],
         [-0.4273,  0.3486,  0.3937,  0.2343,  0.1519, -0.8295,  0.2493,
           0.2235, -0.1613,  0.7933,  0.7065, -0.2206, -0.4277,  0.4562,
          -0.0240, -0.2502],
         [-0.2478, -0.1194,  0.3058, -0.5148,  0.2385, -0.4475,  0.0826,
          -0.3301, -0.4548,  0.3995,  0.7923, -0.3362, -0.8612,  0.9770,
          -0.7764, -0.9706],
         [-1.1230,  0.1400,  0.0157, -0.2467,  0.0467, -0.0793,  0.0440,
           0.8722, -0.7163,  0.5302,  0.5456, -0.0468,  0.0675, -0.5473,
          -0.2705, -0.5187],
         [ 0.1290,  0.7769,  0.1402, -1.3459,  0.2560, -0.5554, -0.8203,
          -0.0905,  0.1075,  0.0601,  0.0942,  0.357

## 예제 2.6. 어텐션 연산의 입력과 출력

In [None]:
# Run attention once, then compare the shapes going in and coming out.
after_attention_embeddings = compute_attention(querys, keys, values)

print("원본 입력 형태: ", input_embeddings.shape)
print("어텐션 적용 후 형태: ", after_attention_embeddings.shape)
# Both lines print the same size because head_dim equals embedding_dim here:
# torch.Size([1, len(input_ids), 16]), i.e. torch.Size([1, 7, 16]) for the
# 7-token example sentence.

원본 입력 형태:  torch.Size([1, 7, 16])
어텐션 적용 후 형태:  torch.Size([1, 7, 16])


## 예제 2.7. 어텐션 연산을 수행하는 AttentionHead 클래스

In [None]:
class AttentionHead(nn.Module):
  """A single attention head.

  Projects the query, key and value inputs with separate linear layers and
  passes the projections to compute_attention.
  """

  def __init__(self, token_embed_dim, head_dim, is_causal=False):
    """Create the three projections from token_embed_dim to head_dim.

    is_causal is stored and forwarded to compute_attention on every call.
    """
    super().__init__()
    self.is_causal = is_causal
    # Layers are created in query, key, value order.
    self.weight_q = nn.Linear(in_features=token_embed_dim, out_features=head_dim)
    self.weight_k = nn.Linear(in_features=token_embed_dim, out_features=head_dim)
    self.weight_v = nn.Linear(in_features=token_embed_dim, out_features=head_dim)

  def forward(self, querys, keys, values):
    """Project the three inputs and return the attention output."""
    projected_q = self.weight_q(querys)
    projected_k = self.weight_k(keys)
    projected_v = self.weight_v(values)
    return compute_attention(
        projected_q, projected_k, projected_v, is_causal=self.is_causal
    )

# Self-attention: the same embeddings are used as query, key and value input.
attention_head = AttentionHead(token_embed_dim=embedding_dim, head_dim=embedding_dim)
after_attention_embeddings = attention_head(
    querys=input_embeddings, keys=input_embeddings, values=input_embeddings
)

In [None]:
# Show the head's sub-modules, then the tensor it produced.
print(attention_head)
print(after_attention_embeddings)

AttentionHead(
  (weight_q): Linear(in_features=16, out_features=16, bias=True)
  (weight_k): Linear(in_features=16, out_features=16, bias=True)
  (weight_v): Linear(in_features=16, out_features=16, bias=True)
)
tensor([[[-0.0690,  0.2420, -0.2890,  0.0628,  0.1173, -0.0031,  0.1127,
          -0.0609, -0.0318, -0.1841,  0.2240, -0.1713,  0.2732,  0.2338,
          -0.2762, -0.0900],
         [-0.0884,  0.2936, -0.2547, -0.0983, -0.0491,  0.0967,  0.1677,
          -0.2342, -0.0785, -0.2241,  0.2499, -0.0358,  0.5087,  0.2882,
          -0.1870, -0.0426],
         [-0.0621,  0.2846, -0.2221, -0.1026, -0.0680,  0.0997,  0.1711,
          -0.2265, -0.0712, -0.2080,  0.2572, -0.0507,  0.5073,  0.3028,
          -0.1828, -0.0147],
         [-0.0755,  0.3013, -0.2008, -0.1500, -0.1463,  0.1539,  0.1797,
          -0.2812, -0.0926, -0.2159,  0.2490, -0.0067,  0.6047,  0.3327,
          -0.1395,  0.0311],
         [-0.0566,  0.3002, -0.2104, -0.1297, -0.1246,  0.1374,  0.1675,
          -0.28

## 예제 2.8. 멀티 헤드 어텐션 구현

In [None]:
class MultiheadAttention(nn.Module):
  """Multi-head attention built on compute_attention.

  Inputs are projected to d_model features, split into n_head heads of
  d_model // n_head features each, attended per head, merged back and passed
  through a final linear layer.
  """

  def __init__(self, token_embed_dim, d_model, n_head, is_causal=False):
    """Create the projections.

    Args:
      token_embed_dim: feature size of the incoming query/key/value tensors.
      d_model: total feature size after projection (all heads together).
      n_head: number of attention heads; must divide d_model.
      is_causal: default masking mode used when forward() is not told otherwise.

    Raises:
      ValueError: if d_model is not divisible by n_head.
    """
    super().__init__()
    if d_model % n_head != 0:
      raise ValueError(f"d_model ({d_model}) must be divisible by n_head ({n_head})")
    self.n_head = n_head
    self.d_model = d_model
    self.head_dim = d_model // n_head
    self.is_causal = is_causal
    self.weight_q = nn.Linear(token_embed_dim, d_model)
    self.weight_k = nn.Linear(token_embed_dim, d_model)
    self.weight_v = nn.Linear(token_embed_dim, d_model)
    self.concat_linear = nn.Linear(d_model, d_model)

  def _split_heads(self, projected):
    """Reshape (batch, length, d_model) into (batch, n_head, length, head_dim)."""
    batch, length, _ = projected.size()
    return projected.view(batch, length, self.n_head, self.head_dim).transpose(1, 2)

  def forward(self, querys, keys, values, is_causal=None):
    """Attend from querys to keys/values.

    Args:
      querys: (batch, query_len, token_embed_dim).
      keys: (batch, key_len, token_embed_dim).
      values: (batch, key_len, token_embed_dim).
      is_causal: optional per-call override; None falls back to self.is_causal.

    Returns:
      Tensor of shape (batch, query_len, d_model).
    """
    if is_causal is None:
      is_causal = self.is_causal
    batch, query_len, _ = querys.size()
    # Each tensor is split using its own sequence length, so keys/values may be
    # longer or shorter than the queries (cross-attention), and the head width is
    # derived from d_model rather than from the input feature size.
    querys = self._split_heads(self.weight_q(querys))
    keys = self._split_heads(self.weight_k(keys))
    values = self._split_heads(self.weight_v(values))
    attention = compute_attention(querys, keys, values, is_causal)
    # Merge the heads back into one d_model-wide vector per query position.
    output = attention.transpose(1, 2).contiguous().view(batch, query_len, self.d_model)
    return self.concat_linear(output)

# Four heads over the 16-dimensional embeddings, used as self-attention.
n_head = 4
mh_attention = MultiheadAttention(
    token_embed_dim=embedding_dim, d_model=embedding_dim, n_head=n_head
)
after_attention_embeddings = mh_attention(
    querys=input_embeddings, keys=input_embeddings, values=input_embeddings
)
after_attention_embeddings.shape

## 예제 2.9. 층 정규화 코드

In [None]:
# Layer normalization over the last (embedding) dimension of every token vector.
norm = nn.LayerNorm(embedding_dim)
norm_x = norm(input_embeddings)
norm_x.shape # same shape as input_embeddings: torch.Size([1, 7, 16]) for the 7-token example

# Per-token mean and standard deviation after normalization.
norm_x.mean(dim=-1).data, norm_x.std(dim=-1).data

# Expected: one mean (~0, only float rounding noise) and one std (~1.0328) per token.
# The std is sqrt(16/15) rather than exactly 1 because LayerNorm divides by the biased
# standard deviation while Tensor.std() applies Bessel's correction by default.

(tensor([[-2.9802e-08, -1.4901e-08,  0.0000e+00, -1.8626e-08, -2.2352e-08,
           0.0000e+00,  7.4506e-09]]),
 tensor([[1.0328, 1.0328, 1.0328, 1.0328, 1.0328, 1.0328, 1.0328]]))

## 예제 2.10. 피드 포워드 층 코드

In [None]:
class PreLayerNormFeedForward(nn.Module):
  """Pre-layer-norm position-wise feed-forward block with a residual connection.

  Computes src + dropout2(linear2(dropout1(GELU(linear1(norm(src)))))).
  """

  def __init__(self, d_model, dim_feedforward, dropout):
    """Create the block.

    Args:
      d_model: feature size of the input and output.
      dim_feedforward: width of the hidden layer.
      dropout: drop probability used by both dropout layers.
    """
    super().__init__()
    self.linear1 = nn.Linear(d_model, dim_feedforward) # expansion layer
    self.linear2 = nn.Linear(dim_feedforward, d_model) # projection back to d_model
    self.dropout1 = nn.Dropout(dropout) # dropout on the hidden activations
    self.dropout2 = nn.Dropout(dropout) # dropout on the branch output
    self.activation = nn.GELU() # non-linearity
    self.norm = nn.LayerNorm(d_model) # normalization applied before the branch

  def forward(self, src):
    """Return src plus the feed-forward branch computed on the normalized src."""
    x = self.norm(src)
    x = self.linear2(self.dropout1(self.activation(self.linear1(x))))
    # The skip connection carries the un-normalized input, and dropout touches only
    # the branch, so the residual path is never zeroed or rescaled.
    return src + self.dropout2(x)

## 예제 2.11. 인코더 층

In [None]:
class TransformerEncoderLayer(nn.Module):
  """One encoder layer: pre-norm self-attention with a residual, then feed-forward."""

  def __init__(self, d_model, nhead, dim_feedforward, dropout):
    """Build the attention, normalization, dropout and feed-forward sub-modules."""
    super().__init__()
    # Multi-head self-attention whose input and output width are both d_model.
    self.attn = MultiheadAttention(token_embed_dim=d_model, d_model=d_model, n_head=nhead)
    # Normalization applied to the input of the attention branch.
    self.norm1 = nn.LayerNorm(d_model)
    # Dropout applied to the attention output.
    self.dropout1 = nn.Dropout(p=dropout)
    # Feed-forward sub-block.
    self.feed_forward = PreLayerNormFeedForward(
        d_model=d_model, dim_feedforward=dim_feedforward, dropout=dropout
    )

  def forward(self, src):
    """Apply self-attention with a residual connection, then the feed-forward block."""
    normalized = self.norm1(src)
    attended = self.dropout1(self.attn(normalized, normalized, normalized))
    # Residual connection around the attention branch, then the feed-forward block.
    return self.feed_forward(src + attended)

## 예제 2.12. 인코더 구현

In [None]:
import copy
def get_clones(module, N):
  """Return an nn.ModuleList holding N independent deep copies of module."""
  clones = []
  for _ in range(N):
    # deepcopy gives every clone its own parameters, initially equal to the original's.
    clones.append(copy.deepcopy(module))
  return nn.ModuleList(clones)

class TransformerEncoder(nn.Module):
  """A stack of num_layers independent copies of encoder_layer."""

  def __init__(self, encoder_layer, num_layers, norm=None):
    """Clone the layer and store the optional final normalization.

    Args:
      encoder_layer: module to replicate; each copy is called with one tensor.
      num_layers: number of copies in the stack.
      norm: optional module applied to the output of the last layer. It is an
        explicit argument so the class no longer depends on a module-level
        variable that happens to be named `norm`.
    """
    super().__init__()
    self.layers = get_clones(encoder_layer, num_layers)
    self.num_layers = num_layers
    self.norm = norm

  def forward(self, src):
    """Pass src through every layer in order, then through norm when one was given."""
    output = src
    for mod in self.layers:
      output = mod(output)
    if self.norm is not None:
      output = self.norm(output)
    return output

## 예제 2.13. 디코더에서 어텐션 연산(마스크 어텐션)

In [None]:
def compute_attention(querys, keys, values, is_causal=False):
    """Scaled dot-product attention with optional causal masking.

    Args:
        querys: tensor of shape (..., query_len, dim_k).
        keys: tensor of shape (..., key_len, dim_k).
        values: tensor of shape (..., key_len, dim_v).
        is_causal: when True, query position i may only attend to key
            positions <= i (lower-triangular mask).

    Returns:
        Tensor of shape (..., query_len, dim_v).
    """
    dim_k = querys.size(-1)
    scores = querys @ keys.transpose(-2, -1) / sqrt(dim_k)  # (..., query_len, key_len)
    if is_causal:
        # Read the sequence lengths from dim -2: with size(2) a 3-D
        # (batch, seq, dim) input would yield the feature size instead.
        query_length, key_length = querys.size(-2), keys.size(-2)
        # Build the mask on the same device as the scores.
        allowed = torch.ones(
            query_length, key_length, dtype=torch.bool, device=scores.device
        ).tril(diagonal=0)
        # -inf logits become exactly zero weight after softmax.
        scores = scores.masked_fill(~allowed, float("-inf"))
    weights = F.softmax(scores, dim=-1)  # (..., query_len, key_len)
    return weights @ values  # (..., query_len, dim_v)

## 예제 2.14. 크로스 어텐션이 포함된 디코더 층

In [None]:
class TransformerDecoderLayer(nn.Module):
  """One decoder layer: self-attention, cross-attention over the encoder output, feed-forward."""

  def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
    """Build the two attention modules, the feed-forward block, norms and dropouts.

    Args:
      d_model: feature size of the decoder stream.
      nhead: number of attention heads.
      dim_feedforward: hidden width of the feed-forward block.
      dropout: drop probability for every dropout layer.
    """
    super().__init__()
    self.self_attn = MultiheadAttention(d_model, d_model, nhead)
    self.multihead_attn = MultiheadAttention(d_model, d_model, nhead)
    self.feed_forward = PreLayerNormFeedForward(d_model, dim_feedforward, dropout)

    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(dropout)

  def forward(self, tgt, encoder_output, is_causal=True):
    """Run the layer.

    Args:
      tgt: decoder input tensor.
      encoder_output: encoder result, used as key and value in cross-attention.
      is_causal: whether self-attention is restricted to earlier positions.

    Returns:
      The output of the feed-forward block.
    """
    # Self-attention. The masking mode is set on the module's is_causal attribute
    # (which its forward reads) instead of being passed as a keyword argument the
    # attention forward is not guaranteed to accept.
    self.self_attn.is_causal = is_causal
    norm_tgt = self.norm1(tgt)
    # Residual on the un-normalized stream, matching TransformerEncoderLayer.
    x = tgt + self.dropout1(self.self_attn(norm_tgt, norm_tgt, norm_tgt))
    # Cross-attention: queries from the decoder stream, keys/values from the encoder.
    norm_x = self.norm2(x)
    x = x + self.dropout2(self.multihead_attn(norm_x, encoder_output, encoder_output))
    # Feed-forward block.
    x = self.feed_forward(x)
    return x

## 예제 2.15. 디코더 구현

In [None]:
import copy
def get_clones(module, N):
  """Build an nn.ModuleList of N deep copies of module, each with its own parameters."""
  return nn.ModuleList(copy.deepcopy(module) for _ in range(N))

class TransformerDecoder(nn.Module):
  """A stack of num_layers independent copies of decoder_layer."""

  def __init__(self, decoder_layer, num_layers):
    """Clone decoder_layer num_layers times and remember the depth."""
    super().__init__()
    self.layers = get_clones(decoder_layer, num_layers)
    self.num_layers = num_layers

  def forward(self, tgt, src):
    """Feed tgt through every layer in order; each layer also receives the same src."""
    hidden = tgt
    for layer in self.layers:
      hidden = layer(hidden, src)
    return hidden