In [1]:
import math
import numpy as np
from copy import copy
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from torch import nn

### 트랜스포머 구현을 위한 변수 설정

In [2]:
B = 64      # batch size
M = 10      # maximum token sequence length
V = 1024    # number of tokens (vocabulary size passed to the embedding)
N = 8       # number of attention heads
H = 512     # token embedding size
EXP = 2048  # expansion size (see the FeedForward class)
L = 6       # number of encoder/decoder layers

### 인코더 클래스 정의

In [4]:
class Encoder(nn.Module):
    """Token embedding followed by a stack of ``n_layers`` encoder layers.

    Args:
        n_layers: number of ``EncoderLayer`` modules to stack.
    """

    def __init__(self, n_layers):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.embedding = Embedding(V, H)
        # nn.ModuleList instead of a plain list: a plain list hides the layers
        # from nn.Module, so their parameters would be missing from
        # .parameters() / .state_dict() and would not follow .to(device).
        self.layers = nn.ModuleList([EncoderLayer(H) for _ in range(n_layers)])

    def forward(self, x):
        """Embed the token ids and run them through every layer in order.

        Args:
            x: integer tensor of token ids.

        Returns:
            The output of the last encoder layer.

        Example:
            data = np.random.randint(0, V, (B, M))
            x = torch.from_numpy(data)
            m = Encoder(L)
            v = m(x)
            v.shape  # torch.Size([64, 10, 512])
        """
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x)
        return x

### 임베딩 클래스 정의하기

In [5]:
class Embedding(nn.Module):
    """Lookup table that turns token ids into ``hidden_size``-dim vectors.

    Args:
        n_vocab: number of rows in the lookup table.
        hidden_size: size of each embedding vector.
    """

    def __init__(self, n_vocab, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=hidden_size,
        )

    def forward(self, x):
        """Return the embedding vectors for the token ids in ``x``.

        Example:
            data = np.random.randint(0, V, (B, M))
            x = torch.from_numpy(data)
            m = Embedding(V, H)
            v = m(x)
            v.shape  # torch.Size([64, 10, 512])
        """
        vectors = self.embedding(x)
        return vectors

In [6]:
class EncoderLayer(nn.Module):
    """One encoder layer: multi-head self-attention, then a feed-forward block.

    The two sub-layers are applied back to back; this class adds no residual
    connection or normalization around them.

    Args:
        hidden_size: feature size of the input and output.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.self_attention = MultiHeadAttention(N, hidden_size)
        self.feedforward = FeedForward(hidden_size, EXP)

    def forward(self, x):
        """Attend ``x`` to itself and feed the result through the FFN.

        Example:
            x = torch.rand((B, M, H))
            m = EncoderLayer(H)
            v = m(x)
            v.shape  # torch.Size([64, 10, 512])
        """
        # Query, key and value are all the same tensor (self-attention).
        attended = self.self_attention(x, x, x)
        return self.feedforward(attended)

### 멀티헤드 어텐션 클래스 정의하기

In [10]:
def attention(query, key, value):
    """Scaled dot-product attention.

    Computes ``softmax(query @ key^T / sqrt(d)) @ value`` where ``d`` is the
    size of the last dimension of ``query``. The softmax runs over the last
    dimension of the score matrix.

    Args:
        query: tensor whose last two dims are (query positions, d).
        key: tensor whose last two dims are (key positions, d).
        value: tensor whose second-to-last dim matches the key positions.

    Returns:
        The attention-weighted combination of ``value``.
    """
    d_k = query.size(-1)
    logits = query.matmul(key.transpose(-2, -1)) / math.sqrt(d_k)
    weights = logits.softmax(dim=-1)
    return weights.matmul(value)

In [8]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention that splits the feature dim into ``num_head`` heads.

    The inputs are reshaped into heads, attended per head with ``attention``
    and concatenated back. This class holds no learnable projection weights.

    Args:
        num_head: number of attention heads.
        hidden_size: feature size of the inputs; must be divisible by
            ``num_head``.

    Raises:
        ValueError: if ``num_head`` is not positive or does not divide
            ``hidden_size``.
    """

    def __init__(self, num_head, hidden_size):
        super(MultiHeadAttention, self).__init__()
        # Without this check a non-divisible hidden_size makes the .view()
        # in forward() either fail with an opaque shape error or, worse,
        # silently fold features into the sequence dimension.
        if num_head <= 0 or hidden_size % num_head != 0:
            raise ValueError(
                "hidden_size ({}) must be divisible by num_head ({})".format(
                    hidden_size, num_head))
        self.num_head = num_head
        self.dk = hidden_size // self.num_head

    def _split_heads(self, tensor, n_batch):
        """Reshape (batch, seq, hidden) into (batch, head, seq, dk)."""
        return tensor.view(n_batch, -1, self.num_head, self.dk).transpose(1, 2)

    def forward(self, query, key, value):
        """Run per-head attention and merge the heads back together.

        Args:
            query, key, value: tensors of shape (batch, seq, hidden_size).

        Returns:
            Tensor of shape (batch, query seq, hidden_size).

        Example:
            x = torch.rand((B, M, H))
            m = MultiHeadAttention(N, H)
            v = m(x, x, x)
            v.shape  # torch.Size([64, 10, 512])
        """
        n_batch = query.shape[0]
        query = self._split_heads(query, n_batch)
        key = self._split_heads(key, n_batch)
        value = self._split_heads(value, n_batch)

        x = attention(query, key, value)
        # transpose() returns a non-contiguous view, so contiguous() is
        # required before view() can merge the head and dk dimensions.
        x = x.transpose(1, 2).contiguous().view(n_batch, -1, self.dk * self.num_head)
        return x

In [11]:
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Linear.

    Args:
        hidden_size: input and output feature size.
        expand_size: size of the intermediate (expanded) representation.
    """

    def __init__(self, hidden_size, expand_size):
        super(FeedForward, self).__init__()
        self.linear_1 = nn.Linear(hidden_size, expand_size)
        self.linear_2 = nn.Linear(expand_size, hidden_size)

    def forward(self, x):
        """Expand, apply ReLU, and project back to ``hidden_size``.

        Example:
            x = torch.rand((B, M, H))
            m = FeedForward(H, EXP)
            v = m(x)
            v.shape  # torch.Size([64, 10, 512])
        """
        # The non-linearity is essential: two stacked linear layers with
        # nothing in between are equivalent to a single linear layer.
        x = F.relu(self.linear_1(x))
        x = self.linear_2(x)
        return x

In [13]:
# Shape check: MultiHeadAttention on a random (B, M, H) batch, used as
# query, key and value at once.
x = torch.rand((B, M, H))
m = MultiHeadAttention(N, H)
v = m(x, x, x)
v.shape  # torch.Size([64, 10, 512])

torch.Size([64, 10, 512])

In [14]:
# Shape check: FeedForward on a random (B, M, H) batch.
x = torch.rand((B, M, H))
m = FeedForward(H, EXP)
v = m(x)
v.shape  # torch.Size([64, 10, 512])

torch.Size([64, 10, 512])

In [12]:
# Shape check: a single EncoderLayer on a random (B, M, H) batch.
x = torch.rand((B, M, H))
m = EncoderLayer(H)
v = m(x)
v.shape  # torch.Size([64, 10, 512])

torch.Size([64, 10, 512])

In [15]:
# Shape check: Embedding on random token ids drawn from [0, V).
data = np.random.randint(0, V, (B, M))
x = torch.from_numpy(data)
m = Embedding(V, H)
v = m(x)
v.shape  # torch.Size([64, 10, 512])

torch.Size([64, 10, 512])

In [16]:
# Shape check: the full L-layer Encoder on random token ids drawn from [0, V).
data = np.random.randint(0, V, (B, M))
x = torch.from_numpy(data)
m = Encoder(L)
v = m(x)
v.shape  # torch.Size([64, 10, 512])

torch.Size([64, 10, 512])

### 디코더 클래스 정의하기

In [17]:
class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, encoder-decoder attention, FFN.

    Args:
        n_head: number of attention heads for both attention sub-layers.
        hidden_size: feature size of the input and output.
    """

    def __init__(self, n_head, hidden_size):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(n_head, hidden_size)
        self.encdec_attention = MultiHeadAttention(n_head, hidden_size)
        # Use the shared EXP constant (2048) like EncoderLayer does, rather
        # than repeating the literal.
        self.feedforward = FeedForward(hidden_size, EXP)

    def forward(self, x, memory):
        """Decode ``x`` while attending to the encoder output ``memory``.

        Args:
            x: decoder-side input.
            memory: encoder output used as key and value of the
                encoder-decoder attention.

        Returns:
            The layer output, with the same feature size as ``x``.

        Example:
            x = torch.rand((B, M, H))
            mem = copy(x)
            m = DecoderLayer(N, H)
            v = m(x, mem)
            v.shape  # torch.Size([64, 10, 512])
        """
        # 1) Self-attention: the decoder input attends to itself.
        #    NOTE: MultiHeadAttention is called without a mask here, so this
        #    step does not hide later positions.
        x = self.self_attention(x, x, x)
        # 2) Encoder-decoder attention: queries come from the decoder,
        #    keys and values from the encoder output.
        x = self.encdec_attention(x, memory, memory)
        # 3) Position-wise feed-forward block.
        x = self.feedforward(x)
        return x

In [18]:
class Decoder(nn.Module):
    """Token embedding followed by a stack of ``n_layers`` decoder layers.

    Args:
        n_layers: number of ``DecoderLayer`` modules to stack.
    """

    def __init__(self, n_layers):
        super(Decoder, self).__init__()
        self.embedding = Embedding(V, H)
        # nn.ModuleList instead of a plain list: a plain list hides the layers
        # from nn.Module, so their parameters would be missing from
        # .parameters() / .state_dict() and would not follow .to(device).
        self.layers = nn.ModuleList([DecoderLayer(N, H) for _ in range(n_layers)])

    def forward(self, x, memory):
        """Embed the target token ids and decode them against ``memory``.

        Args:
            x: integer tensor of target token ids.
            memory: encoder output, passed unchanged to every layer.

        Returns:
            The output of the last decoder layer.

        Example:
            data = np.random.randint(0, V, (B, M))
            x = torch.from_numpy(data)
            mem = torch.rand((B, M, H))
            m = Decoder(L)
            v = m(x, mem)
            v.shape  # torch.Size([64, 10, 512])
        """
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x, memory)
        return x

In [19]:
# Shape check: a single DecoderLayer; a copy of the input stands in for the
# encoder memory.
x = torch.rand((B, M, H))
mem = copy(x)
m = DecoderLayer(N, H)
v = m(x, mem)
v.shape  # torch.Size([64, 10, 512])

torch.Size([64, 10, 512])

In [20]:
# Shape check: the full L-layer Decoder on random token ids, with a random
# (B, M, H) tensor standing in for the encoder memory.
data = np.random.randint(0, V, (B, M))
x = torch.from_numpy(data)
mem = torch.rand((B, M, H))
m = Decoder(L)
v = m(x, mem)
v.shape  # torch.Size([64, 10, 512])

torch.Size([64, 10, 512])

### 포지셔널 인코딩 클래스 정의하기

In [21]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    Even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``, with ``freq = 10000 ** (-2i / hidden_size)``.

    Args:
        hidden_size: feature size of the embeddings.
        max_len: number of positions to precompute. Defaults to the
            notebook-wide maximum sequence length ``M``.
    """

    def __init__(self, hidden_size, max_len=None):
        super(PositionalEncoding, self).__init__()
        if max_len is None:
            max_len = M
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, hidden_size, 2, dtype=torch.float) *
                             -(math.log(10000.0) / hidden_size))
        angles = position * div_term
        pos_encoding = torch.zeros(max_len, hidden_size)
        pos_encoding[:, 0::2] = torch.sin(angles)
        # For an odd hidden_size there is one fewer odd column than even
        # columns, so drop the surplus frequency instead of failing.
        pos_encoding[:, 1::2] = torch.cos(angles[:, :hidden_size // 2])
        # Registered as a buffer (not a plain attribute) so the table moves
        # with the module on .to(device) while staying out of .parameters().
        self.register_buffer('pos_encoding', pos_encoding.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: tensor of shape (batch, seq, hidden_size) with seq no larger
                than the precomputed number of positions.

        Example:
            x = torch.rand((B, M, H))
            m = PositionalEncoding(H)
            v = m(x)
            v.shape  # torch.Size([64, 10, 512])
        """
        # Buffers never require grad, so no Variable wrapper is needed.
        return x + self.pos_encoding[:, :x.size(1)]

In [22]:
# Shape check: PositionalEncoding on a random (B, M, H) batch.
x = torch.rand((B, M, H))
m = PositionalEncoding(H)
v = m(x)
v.shape  # torch.Size([64, 10, 512])

torch.Size([64, 10, 512])

### 인코더와 디코더 클래스에 포지셔널 인코딩 추가하기

In [23]:
class Encoder(nn.Module):
    """Encoder: embedding + positional encoding + ``n_layers`` encoder layers.

    Redefines the earlier ``Encoder`` in this notebook, adding the positional
    encoding step after the embedding.

    Args:
        n_layers: number of ``EncoderLayer`` modules to stack.
    """

    def __init__(self, n_layers):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.embedding = Embedding(V, H)
        self.position = PositionalEncoding(H)
        # nn.ModuleList instead of a plain list: a plain list hides the layers
        # from nn.Module, so their parameters would be missing from
        # .parameters() / .state_dict() and would not follow .to(device).
        self.layers = nn.ModuleList([EncoderLayer(H) for _ in range(n_layers)])

    def forward(self, x):
        """Embed the token ids, add position info and run all layers in order.

        Args:
            x: integer tensor of token ids.

        Returns:
            The output of the last encoder layer.

        Example:
            data = np.random.randint(0, V, (B, M))
            x = torch.from_numpy(data)
            m = Encoder(L)
            v = m(x)
            v.shape  # torch.Size([64, 10, 512])
        """
        x = self.embedding(x)
        x = self.position(x)
        for layer in self.layers:
            x = layer(x)
        return x

In [24]:
class Decoder(nn.Module):
    """Decoder: embedding + positional encoding + ``n_layers`` decoder layers.

    Redefines the earlier ``Decoder`` in this notebook, adding the positional
    encoding step after the embedding.

    Args:
        n_layers: number of ``DecoderLayer`` modules to stack.
    """

    def __init__(self, n_layers):
        super(Decoder, self).__init__()
        self.embedding = Embedding(V, H)
        self.position = PositionalEncoding(H)
        # nn.ModuleList instead of a plain list: a plain list hides the layers
        # from nn.Module, so their parameters would be missing from
        # .parameters() / .state_dict() and would not follow .to(device).
        self.layers = nn.ModuleList([DecoderLayer(N, H) for _ in range(n_layers)])

    def forward(self, x, memory):
        """Embed the target ids, add position info and decode against ``memory``.

        Args:
            x: integer tensor of target token ids.
            memory: encoder output, passed unchanged to every layer.

        Returns:
            The output of the last decoder layer.

        Example:
            data = np.random.randint(0, V, (B, M))
            x = torch.from_numpy(data)
            mem = torch.rand((B, M, H))
            m = Decoder(L)
            v = m(x, mem)
            v.shape  # torch.Size([64, 10, 512])
        """
        x = self.embedding(x)
        x = self.position(x)
        for layer in self.layers:
            x = layer(x, memory)
        return x

In [25]:
# Shape check: the Encoder redefined with positional encoding, on random
# token ids drawn from [0, V).
data = np.random.randint(0, V, (B, M))
x = torch.from_numpy(data)
m = Encoder(L)
v = m(x)
v.shape  # torch.Size([64, 10, 512])

torch.Size([64, 10, 512])

In [26]:
# Shape check: the Decoder redefined with positional encoding; a random
# (B, M, H) tensor stands in for the encoder memory.
data = np.random.randint(0, V, (B, M))
x = torch.from_numpy(data)
mem = torch.rand((B, M, H))
m = Decoder(L)
v = m(x, mem)
v.shape  # torch.Size([64, 10, 512])

torch.Size([64, 10, 512])

### 트랜스포머 클래스 정의

In [3]:
class Transformer(nn.Module):
    """Encoder-decoder model built from ``Encoder(L)`` and ``Decoder(L)``."""

    def __init__(self):
        super().__init__()
        self.encoder = Encoder(L)
        self.decoder = Decoder(L)

    def forward(self, src, dst):
        """Encode ``src`` and decode ``dst`` against the encoded source.

        Args:
            src: integer tensor of source token ids.
            dst: integer tensor of target token ids.

        Returns:
            The decoder output.

        Example:
            data = np.random.randint(0, V, (B, M))
            src = torch.from_numpy(data)
            data = np.random.randint(0, V, (B, M))
            dst = torch.from_numpy(data)
            src.shape, dst.shape

            m = Transformer()
            v = m(src, dst)
            v.shape  # torch.Size([64, 10, 512])
        """
        # The encoder output serves as the decoder's memory.
        memory = self.encoder(src)
        return self.decoder(dst, memory)

In [27]:
# Shape check: the full Transformer on random source and target token ids.
data = np.random.randint(0, V, (B, M))
src = torch.from_numpy(data)
data = np.random.randint(0, V, (B, M))
dst = torch.from_numpy(data)
# Bare expression in the middle of the cell: evaluated but not displayed,
# because only a cell's last expression is shown.
src.shape, dst.shape

m = Transformer()
v = m(src, dst)
v.shape  # torch.Size([64, 10, 512])

torch.Size([64, 10, 512])