## Transformer 아키텍처 구현 실습

#### 0. prelims

In [4]:
# Downgrade numpy to a release below 2.0, then install torch.
!pip install "numpy<2" --user
!pip install torch

Collecting torch
  Downloading torch-2.5.0-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.5.0-cp311-cp311-win_amd64.whl (203.1 MB)
   ---------------------------------------- 0.0/203.1 MB ? eta -:--:--
   ---------------------------------------- 0.2/203.1 MB 5.0 MB/s eta 0:00:41
   ---------------------------------------- 0.4/203.1 MB 6.3 MB/s eta 0:00:32
   ---------------------------------------- 0.7/203.1 MB 6.2 MB/s eta 0:00:33
   ---------------------------------------- 1.0/203.1 MB 7.1 MB/s eta 0:00:29
   ---------------------------------------- 1.4/203.1 MB 7.9 MB/s eta 0:00:26
   ---------------------------------------- 1.7/203.1 MB 7.9 MB/s eta 0:00:26
   ---------------------------------------- 2.1/203.1 MB 8.3 MB/s eta 0:00:25
   ---------------------------------------- 2.5/203.1 MB 8.7 MB/s eta 0:00:24
    --------------------------------------- 2.8/203.1 MB 8.

In [3]:
# Check the installed numpy version. numpy is imported here so this cell also
# works when the notebook is run top to bottom on a fresh kernel.
import numpy as np
print(np.__version__)

1.26.4


In [8]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context='talk')
%matplotlib inline

#### 1. Standard Encoder-decoder model

In [None]:
class EncoderDecoder(nn.Module):
    """
    A standard encoder-decoder architecture.

    Stores the five components it is given and wires the first four
    together: the input is embedded and encoded into ``memory``, then the
    embedded target is decoded against that memory. ``generator`` is only
    stored here; none of the methods in this class call it.
    """

    def __init__(self, encoder, decoder, input_embed, target_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.input_embed = input_embed
        self.target_embed = target_embed
        self.generator = generator

    def forward(self, input, target, input_mask, target_mask):
        """Encode the masked input, then decode the masked target against it.

        Returns whatever ``self.decoder`` returns.
        """
        memory = self.encode(input, input_mask)
        # Arguments must follow decode()'s declared order
        # (memory, target, input_mask, target_mask); passing input_mask
        # second would embed the mask and use the target as a mask.
        return self.decode(memory, target, input_mask, target_mask)

    def encode(self, input, input_mask):
        """Embed ``input`` and pass it, with ``input_mask``, to the encoder."""
        return self.encoder(self.input_embed(input), input_mask)

    def decode(self, memory, target, input_mask, target_mask):
        """Embed ``target`` and pass it to the decoder.

        The decoder is called as
        ``decoder(embedded_target, memory, input_mask, target_mask)``.
        """
        return self.decoder(self.target_embed(target), memory, input_mask, target_mask)

In [None]:
class Generator(nn.Module):
    """Linear projection followed by a log-softmax over the last dimension."""

    def __init__(self, d_model, vocab):
        super().__init__()
        # Maps a d_model-sized vector to one score per vocabulary entry.
        self.proj = nn.Linear(in_features=d_model, out_features=vocab)

    def forward(self, x):
        """Project ``x`` and return log-probabilities along the last axis."""
        logits = self.proj(x)
        log_probs = F.log_softmax(logits, dim=-1)
        return log_probs

#### 2. Encoder and Decoder Stacks

In [None]:
def clones(layer, N):
    """Return an ``nn.ModuleList`` holding N independent deep copies of ``layer``."""
    replicas = nn.ModuleList()
    for _ in range(N):
        # deepcopy so every replica owns its own parameters.
        replicas.append(copy.deepcopy(layer))
    return replicas

In [None]:
class LayerNorm(nn.Module):
    """Normalize over the last dimension, then apply a learned scale and shift."""

    def __init__(self, features, eps=1e-6):
        super().__init__()
        # `features` is the length of the vectors being normalized (the size
        # of the last dimension). Wrapping the tensors in nn.Parameter
        # registers them as learnable parameters of the module.
        self.a_2 = nn.Parameter(torch.ones(features))   # scale, starts at 1
        self.b_2 = nn.Parameter(torch.zeros(features))  # shift, starts at 0
        # Small constant added to the std so the division never hits zero.
        self.eps = eps

    def forward(self, x):
        """Return ``a_2 * (x - mean) / (std + eps) + b_2`` along the last axis."""
        # keepdim=True keeps the reduced axis so the statistics broadcast
        # against x.
        mu = x.mean(-1, keepdim=True)
        sigma = x.std(-1, keepdim=True)
        scaled = self.a_2 * (x - mu)
        denom = sigma + self.eps
        return scaled / denom + self.b_2

In [None]:
class Encoder(nn.Module):
    """A stack of N copies of ``layer`` followed by a final LayerNorm."""

    def __init__(self, layer, N):
        super().__init__()
        # N replicas of `layer`, iterated in order by forward().
        self.layers = clones(layer, N)
        # Final normalization, sized from the layer's `size` attribute.
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed ``x`` (with ``mask``) through every layer, then normalize."""
        hidden = x
        for sublayer in self.layers:
            # Each layer is called as layer(tensor, mask) and its output
            # becomes the next layer's input.
            hidden = sublayer(hidden, mask)
        # The output of the last layer is normalized before being returned.
        return self.norm(hidden)