### 류옹(Luong) Dot 어텐션 vs 바다나우(Bahdanau) 어텐션  
| Dot Attention | Bahdanau Attention |
|--|--|
| $score(s_{t},\ h_{i}) = s^{T}_{t}h_{i}$ | $score(s_{t-1},\ H) = W_{a}^{T}\ tanh(W_{b}s_{t-1}+W_{c}H)$ |
| RNN cell 초기값을 encoder의 last hidden state로 설정한다 | 1) **context vector를 input과 concat**해서 RNN cell의 입력으로 보낸다 |
| 1) context vector를 s_t와 concat한 뒤 출력층 계산 | 2) 그렇게 나온 s_t와 context vector를 다시 concat해서 출력층 계산 |

\*) context vector == attention value == softmax(attention score) * Values  
\**) s_t == decoder output == decoder hidden state  

1. decoder att은 무엇과 concat 하는가?  
 v[s1 ; att_0] => Dense(len(vocab))  

2. code 상에 query와 key는 어떻게 표현되는가?(하이퍼 파라미터, 인풋)  
encoder의 hidden states가 K, V다  
쿼리(Q)는 decoder의 이전 시점 hidden state $s_{t-1}$ 이다 (GRU에는 별도의 cell state가 없다)

3. attention에서 bidirectional은 default인가?  
아니다. 이 구현의 encoder는 `nn.GRU(..., bidirectional=True)`로 명시적으로 켰을 뿐이다.  

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

def tokenize_de(text):
    """Tokenize ``text`` with the module-level ``spacy_de`` tokenizer.

    Returns a list holding the ``.text`` string of every token, in order.
    """
    doc = spacy_de.tokenizer(text)
    return [token.text for token in doc]

def tokenize_en(text):
    """Tokenize ``text`` with the module-level ``spacy_en`` tokenizer.

    Returns a list holding the ``.text`` string of every token, in order.
    """
    doc = spacy_en.tokenizer(text)
    return [token.text for token in doc]

# Source-side field: tokenized with tokenize_de, lower-cased, and wrapped in
# <SOS>/<EOS> markers.
SRC = Field(tokenize=tokenize_de,
           init_token='<SOS>',
           eos_token='<EOS>',
           lower=True)
# Target-side field, tokenized with tokenize_en. It is named TRG (not TGT)
# because the Multi30k split below and the model-setup code further down
# both refer to it as TRG; the old TGT binding left TRG undefined.
TRG = Field(tokenize=tokenize_en,
           init_token='<SOS>',
           eos_token='<EOS>',
           lower=True)

# '.de' files are read through SRC and '.en' files through TRG.
train_data, valid_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                   fields=(SRC, TRG))

# Vocabularies are built from the training split only.
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

batch_size = 128

# Batches are created directly on `device`.
train_iter, valid_iter, test_iter = BucketIterator.splits((train_data, valid_data, test_data),
                                                         batch_size = batch_size,
                                                         device = device)

In [None]:
class Encoder(nn.Module):
    """Single-layer bidirectional GRU encoder.

    Args:
        input_dim: size of the source vocabulary (number of embedding rows).
        emb_dim: embedding width.
        enc_hid_dim: GRU hidden size per direction.
        dec_hid_dim: width of the initial decoder state produced by ``fc``.
        dropout: dropout probability applied to the embedded input.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        # Sequence length and batch size are not hyperparameters: they are
        # simply the shape of whatever input tensor is passed to forward().
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        # Both directions are concatenated, hence enc_hid_dim * 2 inputs.
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        # Honor the caller-supplied rate instead of a hard-coded constant.
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source token indices.

        Args:
            src: index tensor of shape (time-step, batch size).

        Returns:
            A pair ``(output, hidden)``: ``output`` has shape
            (time-step, batch size, enc_hid_dim * 2) and holds the GRU output
            at every step; ``hidden`` has shape (batch size, dec_hid_dim) and
            is the initial decoder state.
        """
        # emb: (time-step, batch size, emb dim)
        emb = self.dropout(self.embedding(src))
        # last_hidden_state: (num directions * layers, batch size, hid dim),
        # laid out as [forward, backward] per layer.
        output, last_hidden_state = self.rnn(emb)
        # Index (not slice) the last layer's two directions so each is
        # (batch size, hid dim) and they can be concatenated on dim=1.
        final_forward = last_hidden_state[-2, :, :]
        final_backward = last_hidden_state[-1, :, :]
        hidden = torch.tanh(self.fc(torch.cat((final_forward, final_backward), dim=1)))
        return output, hidden
    
        
# Additive attention: the query is concatenated with every key, projected to
# the query width by one linear layer, then squeezed to a single score by `v`.
class Attention(nn.Module):
    """Scores each encoder step against the previous decoder state.

    Args:
        enc_hid_dim: encoder hidden size per direction (keys are twice this
            wide because the encoder is bidirectional).
        dec_hid_dim: decoder hidden size (width of the query).
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        # Input width = query width + key width (both encoder directions).
        query_and_key_dim = dec_hid_dim + 2 * enc_hid_dim
        self.attn = nn.Linear(query_and_key_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape (batch_size, time-step).

        Args:
            hidden: query s_{t-1}, shape (batch_size, dec hid dim).
            encoder_outputs: keys, shape (time-step, batch_size, enc hid dim * 2).

        The weights sum to 1 over the time-step axis; multiplying them with
        the values gives the context vector.
        """
        time_steps = encoder_outputs.size(0)

        # Batch-first keys: (batch_size, time-step, enc hid dim * 2)
        keys = encoder_outputs.permute(1, 0, 2)
        # Repeat the query once per encoder step: (batch_size, time-step, dec hid dim)
        queries = hidden.unsqueeze(1).repeat(1, time_steps, 1)

        # One linear layer over the concatenation replaces the separate
        # W1(keys) + W2(query) projections. Shape: (batch_size, time-step, dec hid dim)
        joined = torch.cat((queries, keys), dim=2)
        energy = torch.tanh(self.attn(joined))

        # Unnormalized scores: (batch_size, time-step)
        scores = self.v(energy).squeeze(2)
        return F.softmax(scores, dim=1)


class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Args:
        output_dim: size of the target vocabulary.
        emb_dim: embedding width of the target tokens.
        enc_hid_dim: encoder hidden size per direction.
        dec_hid_dim: decoder hidden size.
        dropout: dropout probability applied to the embedded input.
        attention: module called as ``attention(hidden, encoder_outputs)``
            that returns weights of shape (batch_size, time-step).
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim  # len(vocab)
        self.attention = attention
        # The embedding width is emb_dim; the attention module is not a size.
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # The GRU consumes the context vector concatenated with the embedding.
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        # Honor the caller-supplied rate instead of a hard-coded constant.
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs, hidden, encoder_outputs):
        """Decode one time step.

        Args:
            inputs: current input token indices, shape [batch_size].
            hidden: previous decoder state s_{t-1} (the query),
                shape [batch_size, dec hid dim].
            encoder_outputs: keys/values,
                shape [time-step, batch_size, enc hid dim * 2].

        Returns:
            ``(prediction, hidden)``: vocabulary logits of shape
            [batch_size, output_dim] and the new state s_t of shape
            [batch_size, dec hid dim].
        """
        inputs = inputs.unsqueeze(0)  # [1, batch_size]
        embedded = self.dropout(self.embedding(inputs))  # [1, batch_size, emb dim]

        attention_weights = self.attention(hidden, encoder_outputs)
        # Add a length-1 decoder-step axis: [batch_size, 1, time-step]
        attention_weights = attention_weights.unsqueeze(1)
        # Batch-first values: [batch_size, time-step, enc hid dim * 2]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        # Context vector via batched matmul: [batch_size, 1, enc hid dim * 2]
        weighted = torch.bmm(attention_weights, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)  # [1, batch_size, enc hid dim * 2]

        # GRU input = embedding concatenated with context vector.
        rnn_input = torch.cat([embedded, weighted], dim=2)
        # The previous state is fed in as the GRU's initial hidden state.
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        # output : [time-step(=1), batch_size, dec hid dim * n direction(=1)]
        # hidden : [n layers * n direction(=1), batch_size, dec hid dim]
        # With one layer, one direction and one step these must coincide.
        assert (output == hidden).all()

        embedded = embedded.squeeze(0)  # [batch_size, emb dim]
        weighted = weighted.squeeze(0)  # [batch_size, enc hid dim * 2]
        output = output.squeeze(0)      # [batch_size, dec hid dim]

        # Output layer sees GRU output + context vector + embedding (feeding
        # the embedding here is specific to this implementation).
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        # prediction : [batch_size, vocab_size]
        return prediction, hidden.squeeze(0)

In [46]:
import torch

# Small (2, 3, 1) tensor used to see what permute does to the axes.
tmp = torch.FloatTensor([
    [[4], [5], [6]],
    [[1], [2], [3]],
])
print(tmp, tmp.shape)

# Reorder the axes from (0, 1, 2) to (1, 0, 2): the first two axes swap.
temp = tmp.permute(1, 0, 2)
temp, temp.shape

tensor([[[4.],
         [5.],
         [6.]],

        [[1.],
         [2.],
         [3.]]]) torch.Size([2, 3, 1])


(tensor([[[4.],
          [1.]],
 
         [[5.],
          [2.]],
 
         [[6.],
          [3.]]]),
 torch.Size([3, 2, 1]))

In [None]:
class Seq2seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: module called as ``encoder(src)`` returning
            ``(encoder_outputs, initial_hidden)``.
        decoder: module called as ``decoder(inputs, hidden, encoder_outputs)``
            returning ``(prediction, hidden)``; must expose ``output_dim``.
        device: device on which the output buffer is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        # Store the objects that were passed in; do not build new ones.
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.75):
        """Run the full sequence and return per-step vocabulary logits.

        Args:
            src: source indices, shape [encoder time-step, batch_size].
            trg: target indices, shape [decoder time-step, batch_size].
            teacher_forcing_ratio: probability, per step, of feeding the
                ground-truth token instead of the model's own prediction.

        Returns:
            Tensor of shape [decoder time-step, batch_size, decoder vocab
            size]. Row 0 stays all zeros because decoding starts at step 1.
        """
        batch_size = src.shape[1]
        trg_len = trg.shape[0]  # decoder time-step count
        trg_vocab_size = self.decoder.output_dim

        # Encoder states for every step, plus the initial decoder state.
        encoder_outputs, hidden = self.encoder(src)

        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        inputs = trg[0, :]  # first row of trg is the start-of-sequence token
        for token in range(1, trg_len):
            # Carry the decoder state forward: each step consumes the state
            # produced by the previous step, not the encoder state again.
            output, hidden = self.decoder(inputs, hidden, encoder_outputs)
            outputs[token] = output

            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)

            inputs = trg[token] if teacher_force else top1
        return outputs
            

In [44]:
# Vocabulary sizes determine the embedding and output-layer dimensions.
input_dim, output_dim = len(SRC.vocab), len(TRG.vocab)
enc_emb_dim = dec_emb_dim = 256
enc_hid_dim = dec_hid_dim = 512
enc_dropout = dec_dropout = 0.4

# Assemble attention -> encoder -> decoder -> full model, then move to `device`.
attn = Attention(enc_hid_dim=enc_hid_dim, dec_hid_dim=dec_hid_dim)
enc = Encoder(input_dim=input_dim,
              emb_dim=enc_emb_dim,
              enc_hid_dim=enc_hid_dim,
              dec_hid_dim=dec_hid_dim,
              dropout=enc_dropout)
dec = Decoder(output_dim=output_dim,
              emb_dim=dec_emb_dim,
              enc_hid_dim=enc_hid_dim,
              dec_hid_dim=dec_hid_dim,
              dropout=dec_dropout,
              attention=attn)
model = Seq2seq(encoder=enc, decoder=dec, device=device).to(device)

optimizer = torch.optim.Adam(model.parameters())
# Padding positions in the target are excluded from the loss.
trg_pad_idx = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

def init_weights(m):
    """Re-initialize every parameter of module ``m`` in place.

    Parameters whose name contains ``'weight'`` are drawn from a normal
    distribution with mean 0 and std 0.01; all other parameters are set to 0.
    """
    for param_name, tensor in m.named_parameters():
        if 'weight' not in param_name:
            nn.init.constant_(tensor.data, 0)
            continue
        nn.init.normal_(tensor.data, mean=0, std=0.01)
# Run init_weights over the model and each of its submodules.
model.apply(init_weights)

0.16781948911776257

In [None]:
def count_params(model):
    """Print how many trainable parameters ``model`` has.

    Only parameters with ``requires_grad`` set are counted. Returns whatever
    ``print`` returns, i.e. ``None``.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return print(f'The model has {total:,} trainable parameters')
# Print the trainable-parameter count of the model.
count_params(model)

In [None]:
def train_model(model, iterator, optimizer, criterion, clip):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: called as ``model(src, trg)``; returns logits of shape
            [trg time-step, batch_size, trg vocab size].
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: optimizer over the model's parameters.
        criterion: loss called as ``criterion(logits_2d, targets_1d)``.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    epoch_loss = 0
    for batch in iterator:
        src, trg = batch.src, batch.trg

        optimizer.zero_grad()
        pred = model(src, trg)  # [trg time-step, batch_size, trg vocab_size]

        output_dim = pred.shape[-1]
        # Drop step 0 (the start token, never predicted) and keep ALL later
        # steps: `[1:]` slices the time axis, whereas `[1, :]` would keep
        # only step 1 and ignore the rest of the sequence.
        pred = pred[1:].reshape(-1, output_dim)  # [(trg - 1) * batch_size, trg vocab size]
        trg = trg[1:].reshape(-1)                # [(trg - 1) * batch_size]
        loss = criterion(pred, trg)
        loss.backward()

        # Cap the gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def eval_model(model, iterator, criterion):
    """Compute the mean per-batch loss of ``model`` over ``iterator``.

    The model is put in eval mode, gradients are disabled, and the model is
    called with a teacher-forcing ratio of 0 so it decodes from its own
    predictions.

    Args:
        model: called as ``model(src, trg, 0)``; returns logits of shape
            [trg time-step, batch_size, trg vocab size].
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: loss called as ``criterion(logits_2d, targets_1d)``.

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.src, batch.trg

            pred = model(src, trg, 0)  # teacher forcing turned off

            output_dim = pred.shape[-1]
            # Drop step 0 and keep ALL later steps: `[1:]` slices the time
            # axis, whereas `[1, :]` would score only step 1.
            pred = pred[1:].reshape(-1, output_dim)
            trg = trg[1:].reshape(-1)
            loss = criterion(pred, trg)

            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns a ``(minutes, seconds)`` tuple of ints; both parts are truncated
    with ``int``.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover = elapsed - (whole_minutes * 60)
    return whole_minutes, int(leftover)

# Training schedule: number of passes over the training data, and the maximum
# gradient norm that train_model hands to clip_grad_norm_.
max_epochs = 10
clip = 1

# Lowest validation loss seen so far; starts at +inf so the first epoch
# always counts as an improvement.
best_valid_loss = float('inf')

for epoch in range(max_epochs):
    start_time = time.time()
    
    # One training pass, then a validation pass with the updated weights.
    train_loss = train_model(model, train_iter, optimizer, criterion, clip)
    valid_loss = eval_model(model, valid_iter, criterion)
    
    end_time = time.time()
    
    # Wall-clock duration of this epoch (training + validation).
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint the weights only when validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')
    print('Epoch: {:02}, time: {}m{}s train loss: {:.3f}, valid loss: {:.3f}'.format(epoch+1, epoch_mins, epoch_secs, train_loss, valid_loss))    

In [None]:
# Loss on the test split; eval_model calls the model with teacher forcing off.
# NOTE(review): this scores the in-memory model from the final epoch. The best
# checkpoint saved to 'tut3-model.pt' is not reloaded here - confirm whether
# that is intended.
test_loss = eval_model(model, test_iter, criterion)