## 데이터 준비

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
def tokenizer_de(text):
    """Tokenize ``text`` with ``spacy_de`` and return the token strings as a list."""
    # NOTE(review): spacy_de is defined outside this cell — presumably a
    # German spaCy pipeline; confirm where it is loaded.
    tokens = []
    for tok in spacy_de.tokenizer(text):
        tokens.append(tok.text)
    return tokens

def tokenizer_en(text):
    """Tokenize ``text`` with ``spacy_en`` and return the token strings as a list."""
    # NOTE(review): spacy_en is defined outside this cell — presumably an
    # English spaCy pipeline; confirm where it is loaded.
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens

# Source (German) and target (English) fields share every preprocessing
# option except the tokenizer: wrap sentences in <SOS>/<EOS>, lowercase,
# and produce batch-first tensors.
_field_kwargs = dict(init_token='<SOS>',
                     eos_token='<EOS>',
                     lower=True,
                     batch_first=True)
SRC = Field(tokenize=tokenizer_de, **_field_kwargs)
TRG = Field(tokenize=tokenizer_en, **_field_kwargs)

# Multi30k German/English pairs, split into train / validation / test.
train_data, valid_data, test_data = Multi30k.splits(
    exts=('.de', '.en'),
    fields=(SRC, TRG),
)

# Vocabularies come from the training split only; min_freq=2 is passed
# for both languages.
for _field in (SRC, TRG):
    _field.build_vocab(train_data, min_freq=2)

# One iterator per split, batches of 16 placed on `device`.
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=16,
    device=device,
)

## Encoder 종합

In [None]:
class Encoder(nn.Module):
    """Transformer encoder: scaled token embeddings plus learned positional
    embeddings, followed by a stack of ``EncoderLayer`` blocks.

    Args:
        input_dim: size of the source vocabulary (rows of the token embedding).
        hid_dim: model width (d_model).
        n_layers: number of stacked ``EncoderLayer`` blocks.
        n_heads: attention heads per layer.
        pf_dim: inner width of the position-wise feed-forward network (d_ff).
        dropout: dropout probability.
        device: device on which the scale tensor and position indices live.
        max_length: maximum number of tokens per sentence that the
            positional embedding table can address.
    """

    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length=100):
        super().__init__()
        self.device = device
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)   # [len(src vocab), hid dim]
        self.pos_embedding = nn.Embedding(max_length, hid_dim)  # one row per position, up to max_length
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, n_heads, pf_dim, dropout, device) for _ in range(n_layers)])
        # Fixed: the attribute was misspelled `droput`, but forward() reads `self.dropout`.
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, src, src_mask):
        """Encode a batch of source token indices.

        Args:
            src: [batch size, src time steps] token indices.
            src_mask: [batch size, 1, 1, src time steps] padding mask.

        Returns:
            [batch size, src time steps, hid dim] output of the last layer.
        """
        batch_size = src.shape[0]
        src_time_steps = src.shape[1]

        # Position indices 0..T-1, repeated for every sentence in the batch.
        pos = torch.arange(0, src_time_steps).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        # pos : [batch size, src time steps]

        # Fixed: the token embedding has to be looked up with `src`; the
        # original multiplied the nn.Embedding module itself by the scale.
        # Both lookups yield [batch size, src time steps, hid dim], so they add.
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))

        for layer in self.layers:
            src = layer(src, src_mask)
        return src  # final output of the stacked encoder

## Encoder Block

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and a position-wise feed-forward
    network, each followed by dropout, a residual connection and LayerNorm.
    """

    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()
        # Normalisation applied after each residual connection.
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        # The two sub-layers.
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.feedforward = PositionWiseFeedForward(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """Apply the block.

        Args:
            src: [batch size, src time steps, hid dim]
            src_mask: [batch size, 1, 1, src time steps]

        Returns:
            Tensor with the same shape as ``src``.
        """
        # Self-attention: the same tensor serves as query, key and value.
        attended, _ = self.self_attention(src, src, src, src_mask)
        attended = self.dropout(attended)
        src = self.self_attn_layer_norm(src + attended)  # residual + norm

        # Position-wise feed-forward, again with residual + norm.
        transformed = self.dropout(self.feedforward(src))
        return self.ff_layer_norm(src + transformed)

## Multi-Head Attention

In [None]:
class MultiHeadAttentionLayer(nn.Module):
    """Multi-head scaled dot-product attention.

    Args:
        hid_dim: model width; must be divisible by ``n_heads``.
        n_heads: number of attention heads.
        dropout: dropout probability applied to the attention weights.
        device: device on which the scaling constant is placed.
    """

    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        assert hid_dim % n_heads == 0
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads

        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)
        # Scores are divided by sqrt(head_dim).
        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` positions over ``key``/``value`` positions.

        Args:
            query: [batch size, query time steps, hid dim]
            key: [batch size, key time steps, hid dim]
            value: [batch size, key time steps, hid dim]
            mask: optional tensor broadcastable to
                [batch size, n heads, query time steps, key time steps];
                positions where it equals 0 are excluded from attention.

        Returns:
            (att_values, att_weights) where att_values is
            [batch size, query time steps, hid dim] and att_weights is
            [batch size, n heads, query time steps, key time steps].
        """
        batch_size = query.shape[0]
        Q = self.fc_q(query)  # [batch size, time steps, hid dim]
        K = self.fc_k(key)
        V = self.fc_v(value)

        # Split hid_dim into heads: [batch size, n heads, time steps, head dim]
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        # Attention scores: [batch size, n heads, query time steps, key time steps]
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale

        if mask is not None:
            # Fixed: the original read `mask == 0. -1e10` (missing comma), which
            # passed a single argument to masked_fill. Masked positions get a
            # large negative score so softmax assigns them ~0 weight.
            energy = energy.masked_fill(mask == 0, -1e10)

        att_weights = torch.softmax(energy, dim=-1)

        att_values = torch.matmul(self.dropout(att_weights), V)
        # Back to [batch size, time steps, n heads, head dim]; contiguous() is
        # required before view() because permute only changes strides.
        att_values = att_values.permute(0, 2, 1, 3).contiguous()
        # Merging the last two axes with view() is the "concat heads" step.
        att_values = att_values.view(batch_size, -1, self.hid_dim)
        # Fixed: the output projection is a layer on self, not a tensor method.
        att_values = self.fc_o(att_values)

        return att_values, att_weights

## Position Wise Feed Forward

In [None]:
class PositionWiseFeedForward(nn.Module):
    """Two-layer feed-forward network applied over the last dimension:
    hid_dim -> pf_dim (ReLU, dropout) -> hid_dim.
    """

    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        self.fc_1 = nn.Linear(hid_dim, pf_dim)  # expand
        self.fc_2 = nn.Linear(pf_dim, hid_dim)  # project back
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape [batch size, time steps, hid dim] to a tensor
        of the same shape.
        """
        hidden = torch.relu(self.fc_1(x))
        hidden = self.dropout(hidden)
        return self.fc_2(hidden)

## Decoder 종합

In [None]:
class Decoder(nn.Module):
    """Transformer decoder: scaled token embeddings plus learned positional
    embeddings, a stack of ``DecoderLayer`` blocks, and a final projection
    onto the target vocabulary.

    Args:
        output_dim: size of the target vocabulary.
        hid_dim: model width (d_model).
        n_layers: number of stacked ``DecoderLayer`` blocks.
        n_heads: attention heads per layer.
        pf_dim: inner width of the position-wise feed-forward network.
        dropout: dropout probability.
        device: device on which the scale tensor and position indices live.
        max_length: maximum number of target positions that the positional
            embedding table can address.
    """

    def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim, dropout, device, max_length=100):
        super().__init__()
        self.device = device
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, n_heads, pf_dim, dropout, device) for _ in range(n_layers)])
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Decode target tokens against the encoder output.

        Args:
            trg: [batch, trg time steps] token indices.
            enc_src: [batch, src time steps, hid dim] encoder output.
            trg_mask: [batch, 1, trg time steps, trg time steps]
            src_mask: [batch, 1, 1, src time steps]

        Returns:
            (output, attention): output is [batch, trg time steps, len(trg vocab)];
            attention is the encoder-decoder attention of the last layer,
            [batch, n heads, trg time steps, src time steps], or ``None``
            when the decoder has no layers.
        """
        batch_size = trg.shape[0]
        trg_time_steps = trg.shape[1]
        # Fixed: the original referenced an undefined `trg_len` and the global
        # `device` instead of the values owned by this module.
        pos = torch.arange(0, trg_time_steps).unsqueeze(0).repeat(batch_size, 1).to(self.device)  # [batch, trg time steps]
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))  # [batch, trg time steps, hid dim]

        attention = None  # stays None only if there are no layers
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        output = self.fc_out(trg)
        return output, attention

## Decoder Block

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder-decoder attention
    and a position-wise feed-forward network, each followed by dropout,
    a residual connection and LayerNorm.
    """

    def __init__(self, hid_dim, n_heads, pf_dim, dropout, device):
        super().__init__()
        # One LayerNorm per sub-layer.
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        # The three sub-layers.
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.feedforward = PositionWiseFeedForward(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Apply the block.

        Args:
            trg: [batch, trg time steps, hid dim]
            enc_src: [batch, src time steps, hid dim]
            trg_mask: [batch, 1, trg time steps, trg time steps]
            src_mask: [batch, 1, 1, src time steps]

        Returns:
            (trg, attention): the updated target representation and the
            weights returned by the encoder-decoder attention.
        """
        # 1) Self-attention over the target, restricted by trg_mask.
        self_out, _ = self.self_attention(trg, trg, trg, trg_mask)
        self_out = self.dropout(self_out)
        trg = self.self_attn_layer_norm(trg + self_out)

        # 2) Queries from the target; keys and values from the encoder output.
        cross_out, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        cross_out = self.dropout(cross_out)
        trg = self.enc_attn_layer_norm(trg + cross_out)

        # 3) Position-wise feed-forward.
        ff_out = self.dropout(self.feedforward(trg))
        trg = self.ff_layer_norm(trg + ff_out)
        return trg, attention

## 연결

In [None]:
class Seq2seq(nn.Module):
    """Ties an encoder and a decoder together and builds the attention
    masks they need from the raw token-index batches.
    """

    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        """Padding mask for the source batch.

        ``src`` is [batch, src time steps]; the result is a boolean tensor of
        shape [batch, 1, 1, src time steps] that is True at non-pad tokens.
        """
        not_pad = src != self.src_pad_idx
        return not_pad.unsqueeze(1).unsqueeze(2)

    def make_trg_mask(self, trg):
        """Combined padding + look-ahead mask for the target batch.

        ``trg`` is [batch, trg time steps]; the result is a boolean tensor of
        shape [batch, 1, trg time steps, trg time steps].
        """
        # Padding part: True where the token is NOT the pad token.
        # [batch, 1, 1, trg time steps]
        not_pad = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)

        # Look-ahead part: lower-triangular matrix, so everything above the
        # diagonal is False. [trg time steps, trg time steps]
        steps = trg.shape[1]
        square = torch.ones((steps, steps), device=self.device)
        causal = torch.tril(square).bool()

        # A position is kept only when both masks allow it; broadcasting
        # expands the pair to [batch, 1, trg time steps, trg time steps].
        return not_pad & causal

    def forward(self, src, trg):
        """Run the full encoder-decoder pass.

        Args:
            src: [batch, src time steps] source token indices.
            trg: [batch, trg time steps] target token indices.

        Returns:
            (output, attention) exactly as returned by the decoder:
            output is [batch, trg time steps, trg vocab size] and attention
            is [batch, n heads, trg time steps, src time steps].
        """
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        memory = self.encoder(src, src_mask)
        return self.decoder(trg, memory, trg_mask, src_mask)

In [16]:
import torch

# Toy example: how a lower-triangular mask and a [N, 1, 1, T] mask
# broadcast together under `&`.
a = torch.FloatTensor([[0, 0], [1, 1], [2, 2]])
b = torch.FloatTensor([[3, 3], [1, 1], [4, 4]])

# Lower-triangular mask over the last axis of `a`: [T, T]
steps = a.shape[1]
sub = torch.ones((steps, steps)).tril().bool()
# Element-wise inequality plays the role of the "token != pad" test: [N, 1, 1, T]
pad = (a != b).unsqueeze(1).unsqueeze(2)
print(sub, '\n', pad, '\n', sub & pad)

tensor([[ True, False],
        [ True,  True]]) 
 tensor([[[[ True,  True]]],


        [[[False, False]]],


        [[[ True,  True]]]]) 
 tensor([[[[ True, False],
          [ True,  True]]],


        [[[False, False],
          [False, False]]],


        [[[ True, False],
          [ True,  True]]]])


## 모델 준비

In [None]:
# Hyperparameters: vocabulary sizes come from the built vocabularies,
# the rest are fixed model dimensions.
INPUT_DIM, OUTPUT_DIM = len(SRC.vocab), len(TRG.vocab)
HID_DIM = 256
ENC_LAYERS, DEC_LAYERS = 3, 3
ENC_HEADS, DEC_HEADS = 8, 8
ENC_PF_DIM, DEC_PF_DIM = 512, 512
ENC_DROPOUT, DEC_DROPOUT = 0.1, 0.1

enc = Encoder(INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, ENC_DROPOUT, device)
dec = Decoder(OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM, DEC_DROPOUT, device)

# Index of the padding token in each vocabulary; used for the attention
# masks and to exclude padding from the loss.
src_pad_idx = SRC.vocab.stoi[SRC.pad_token]
trg_pad_idx = TRG.vocab.stoi[TRG.pad_token]

model = Seq2seq(enc, dec, src_pad_idx, trg_pad_idx, device).to(device)


def initialize_weights(module):
    """Xavier-uniform initialise ``module.weight`` when it has more than one dimension.

    Fixed: the original lambda was a syntax error (conditional expression
    without ``else``) and tested for an attribute named 'weights', which no
    module has, so the initialisation could never have run.
    """
    weight = getattr(module, 'weight', None)
    # 1-D weights (e.g. LayerNorm) and modules without a weight are skipped.
    if weight is not None and weight.dim() > 1:
        nn.init.xavier_uniform_(weight.data)


model.apply(initialize_weights)

lr = 1e-5
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

## 훈련 준비

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: called as ``model(src, trg_input)``; must return
            ``(output, attention)`` with output of shape
            [batch, trg time steps - 1, len(trg vocab)].
        iterator: iterable of batches exposing ``.src`` and ``.trg``;
            must support ``len()``.
        optimizer: optimizer over ``model.parameters()``.
        criterion: loss taking ([N, vocab] logits, [N] targets).
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    epoch_loss = 0
    for batch in iterator:
        src, trg = batch.src, batch.trg
        optimizer.zero_grad()

        # Decoder input drops the last token; the model predicts the next
        # token at every position.
        output, _ = model(src, trg[:, :-1])
        # output : [batch, trg time steps - 1, len(trg vocab)]
        output_dim = output.shape[-1]

        # Flatten for the loss. contiguous() guarantees view() is legal
        # after slicing.
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1)  # targets drop the leading <SOS>
        # output : [batch * (trg time steps - 1), output dim]
        # trg    : [batch * (trg time steps - 1)]

        loss = criterion(output, trg)
        loss.backward()
        # Fixed: `model.parmeters()` was a typo and raised AttributeError.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [29]:
import torch
# 32 consecutive integers laid out as a 2 x 4 x 4 tensor.
a = torch.arange(32).reshape(2, 4, 4)
(a, a.shape, a.stride())

# The stride of an axis is the product of the sizes of all axes after it:
# stride[0] = 16 = shape[1] * shape[2] = 4 * 4
# stride[1] = 4  = shape[2]

(tensor([[[ 0,  1,  2,  3],
          [ 4,  5,  6,  7],
          [ 8,  9, 10, 11],
          [12, 13, 14, 15]],
 
         [[16, 17, 18, 19],
          [20, 21, 22, 23],
          [24, 25, 26, 27],
          [28, 29, 30, 31]]]),
 torch.Size([2, 4, 4]),
 (16, 4, 1))

In [44]:
b = torch.randn(2, 4, 4)
b_t = b.transpose(0, 1)
print(' size: ', b.size(), '\n', 'stride: ', b.stride())
print(' transpose size: ', b_t.size(), '\n','transpose stride: ', b_t.stride())
# Why is the transposed stride not (8, 4, 1)?
# Because transpose does not move any data: it swaps the strides together
# with the sizes, so only the way memory is indexed changes.

## Conclusion
# After transpose(), call contiguous() before view().
# contiguous() copies the data into the layout the new shape implies,
# which makes stride and shape compatible again.

 size:  torch.Size([2, 4, 4]) 
 stride:  (16, 4, 1)
 transpose size:  torch.Size([4, 2, 4]) 
 transpose stride:  (4, 16, 1)


## 추론 준비

In [None]:
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss over ``iterator`` without training.

    Args:
        model: called as ``model(src, trg_input)``; must return
            ``(output, attention)``.
        iterator: iterable of batches exposing ``.src`` and ``.trg``;
            must support ``len()``.
        criterion: loss taking ([N, vocab] logits, [N] targets).

    Returns:
        Sum of per-batch losses divided by ``len(iterator)``.
    """
    model.eval()
    epoch_loss = 0
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every batch (the original omitted this).
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.src, batch.trg
            # Same shift as in training: the input drops the last token,
            # the targets drop the leading <SOS>.
            output, _ = model(src, trg[:, :-1])
            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

## 에포크 동작

In [None]:
# Gradient-norm ceiling handed to train().
clip = 1
N_EPOCHS = 100

# Each epoch: one pass over the training set, then a validation pass.
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_iter, optimizer, criterion, clip)
    valid_loss = evaluate(model, valid_iter, criterion)
    print('train loss: ', train_loss, '\n', 'valid loss: ', valid_loss)

# Final check on the held-out test split.
test_loss = evaluate(model, test_iter, criterion)
print('test_loss: ', test_loss)