# 실습 내용: Transformer 모델링

## 1. Vocab: 한국어 음절 단위

## 2. 데이터: 한국어 Q&A 문장
 - ex.)공무원 시험 죽을 거 같아 --> 철밥통 되기가 어디 쉽겠어요.

# Installation (초기 환경 세팅)

In [1]:
import time
import math
import random
import numpy as np
import json

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader


print(torch.__version__)

1.6.0


In [2]:
# Choose the compute device used by the rest of the notebook.
import os

# Restrict CUDA to GPU index 0 through the environment.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"device: {device}")

device: cuda


In [3]:
# Seed every random number generator used below so runs are repeatable.
seed = 0

# Python and NumPy generators.
random.seed(seed)
np.random.seed(seed)

# PyTorch generators: CPU, current CUDA device, all CUDA devices.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# cuDNN flags: request deterministic kernels and turn benchmark mode off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# 0. 데이터 확인
 - 출처: https://github.com/eagle705/pytorch-transformer-chatbot/tree/master/data_in

In [4]:
!head -n 10 "./data/train_chatbot.txt"

'head'은(는) 내부 또는 외부 명령, 실행할 수 있는 프로그램, 또는
배치 파일이 아닙니다.


In [5]:
!head -n 10 "./data/valid_chatbot.txt"

'head'은(는) 내부 또는 외부 명령, 실행할 수 있는 프로그램, 또는
배치 파일이 아닙니다.


In [6]:
# (1) 데이터 확보 
# (2) 데이터 검증 
# (3) 데이터 전처리 (Text Normalization, 띄어쓰기, 오타교정)
# (4) Vocab 생성 (Tokenizer, Token 정의) # Token: 음절

# 1. 어휘사전 (Vocab) 생성 // (음절 단위)

In [7]:
# Special tokens. The *_ID constants match the positions at which
# create_vocab places the corresponding token strings in the vocabulary list.
PAD_TOKEN_ID = 0
UNK_TOKEN_ID = 1
SOS_TOKEN_ID = 2
EOS_TOKEN_ID = 3

PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'
SOS_TOKEN = '<sos>'
EOS_TOKEN = '<eos>'


def create_vocab(train_path, valid_path, vocab_path):
    """Build a character-level vocabulary and save it as a JSON list.

    Every line of both corpora is stripped and split on tabs, and each
    character of each resulting sentence becomes a vocabulary entry. The
    saved list starts with the four special tokens (pad, unk, sos, eos)
    followed by the sorted characters. The list is also printed.

    Args:
        train_path: path of the UTF-8 training corpus.
        valid_path: path of the UTF-8 validation corpus.
        vocab_path: path the JSON vocabulary is written to.
    """
    charset = set()
    for corpus_path in (train_path, valid_path):
        with open(corpus_path, 'r', encoding='utf-8') as corpus:
            for line in corpus:
                for sent in line.strip().split('\t'):
                    charset.update(sent)

    vocab_list = [PAD_TOKEN, UNK_TOKEN, SOS_TOKEN, EOS_TOKEN] + sorted(charset)

    print(vocab_list)

    # Save the vocabulary to a file.
    serialized = json.dumps(vocab_list, indent=4, ensure_ascii=False)
    with open(vocab_path, 'w', encoding='utf-8') as out:
        out.write(serialized)
    

In [8]:
create_vocab(train_path="./data/train_chatbot.txt",
             valid_path="./data/valid_chatbot.txt",
             vocab_path="./vocab.json")

['<pad>', '<unk>', '<sos>', '<eos>', ' ', '!', '%', "'", ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ';', '?', 'A', 'B', 'C', 'D', 'L', 'N', 'O', 'P', 'S', 'X', '_', 'a', 'c', 'g', 'j', 'k', 'n', 'o', 's', '~', '…', 'ㅊ', 'ㅋ', 'ㅎ', 'ㅜ', 'ㅠ', '가', '각', '간', '갇', '갈', '감', '갑', '값', '갔', '강', '갖', '같', '갚', '개', '객', '갠', '갯', '갱', '걍', '걔', '거', '걱', '건', '걷', '걸', '검', '겁', '것', '겉', '게', '겐', '겜', '겟', '겠', '겨', '격', '겪', '견', '결', '겹', '겼', '경', '곁', '계', '곗', '고', '곡', '곤', '곧', '골', '곰', '곱', '곳', '공', '과', '관', '광', '괘', '괜', '괴', '교', '구', '국', '군', '굳', '굴', '굶', '굽', '굿', '궁', '궈', '권', '궜', '귀', '귄', '귈', '귐', '규', '균', '귤', '그', '극', '근', '글', '긁', '금', '급', '긋', '긍', '기', '긴', '길', '김', '깃', '깅', '깊', '까', '깍', '깎', '깐', '깔', '깜', '깝', '깠', '깡', '깨', '깬', '깰', '깼', '꺼', '꺽', '껀', '껄', '껏', '께', '껴', '꼈', '꼬', '꼭', '꼰', '꼴', '꼼', '꼿', '꽁', '꽂', '꽃', '꽈', '꽉', '꽝', '꽤', '꾸', '꾹', '꾼', '꿀', '꿈', '꿎', '꿔', '꿧', '꿨', '꿩', '꿰', '뀌', '뀐', '뀔', '끄', '끈', '끊', '끌'

# 2. Dataset, DataLoader

In [9]:
class QnADataset(Dataset):
    """Character-level question/answer pairs read from a tab-separated file.

    Each item is a pair of index tensors ``(src, tgt)``; both are wrapped in
    the start- and end-of-sentence token ids, and characters missing from the
    vocabulary map to ``UNK_TOKEN_ID``.
    """

    def __init__(self, data_path, vocab_path):
        super().__init__()

        # Preprocessing step: load the vocabulary, then the sentence pairs.
        self.char2index, self.index2char = self._read_vocab(vocab_path)
        self.data = self._preprocess(data_path)

    def _read_vocab(self, vocab_path):
        """Load the JSON vocabulary list and build both lookup directions."""
        with open(vocab_path, encoding="utf-8") as vocab_file:
            labels = json.load(vocab_file)

        char2index = {char: index for index, char in enumerate(labels)}
        index2char = dict(enumerate(labels))
        return char2index, index2char

    def _preprocess(self, data_path):
        """Read ``question<TAB>answer`` lines into a list of string pairs."""
        pairs = []
        with open(data_path, encoding="utf-8") as data_file:
            for line in data_file:
                sents = line.strip().split('\t')
                assert len(sents) == 2, "data error!!"
                question_sent, answer_sent = sents
                pairs.append((question_sent, answer_sent))
        return pairs

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary."""
        return len(self.char2index)

    # Required by Dataset.
    def __len__(self):
        return len(self.data)

    # Required by Dataset.
    def __getitem__(self, index):
        q_sent, a_sent = self.data[index]

        sos_id = self.char2index.get(SOS_TOKEN)
        eos_id = self.char2index.get(EOS_TOKEN)

        def encode(sent):
            # <sos> + one id per character + <eos>
            ids = [self.char2index.get(token, UNK_TOKEN_ID) for token in sent]
            return torch.LongTensor([sos_id] + ids + [eos_id])

        return encode(q_sent), encode(a_sent)
    

def text_collate_fn(batch):
    """Collate ``(src, tgt)`` index tensors into padded batch tensors.

    Args:
        batch: list of ``(src, tgt)`` 1-D tensor pairs.

    Returns:
        ``(xs_pad, xs_lengths, ys_pad, ys_lengths)``: the padded tensors are
        batch-first and filled with ``PAD_TOKEN_ID``; the lengths are
        LongTensors holding each sequence's length before padding.
    """
    pad_sequence = torch.nn.utils.rnn.pad_sequence

    sources, targets = [], []
    for src, tgt in batch:
        sources.append(src)
        targets.append(tgt)

    xs_pad = pad_sequence(sources, batch_first=True, padding_value=PAD_TOKEN_ID)
    xs_lengths = torch.LongTensor([src.size(0) for src in sources])

    ys_pad = pad_sequence(targets, batch_first=True, padding_value=PAD_TOKEN_ID)
    ys_lengths = torch.LongTensor([tgt.size(0) for tgt in targets])

    return xs_pad, xs_lengths, ys_pad, ys_lengths

In [10]:
# Build the training and validation datasets. Both read the same vocabulary
# file, so they share one character <-> index mapping.
train_dataset = QnADataset(data_path="./data/train_chatbot.txt",
                           vocab_path="./vocab.json")

valid_dataset = QnADataset(data_path="./data/valid_chatbot.txt",
                           vocab_path="./vocab.json")

In [11]:
# train_dataset[0]  # train_dataset.__getitem__(0)

In [12]:
# Number of (question, answer) pairs per batch.
batch_size = 32

# Training batches are reshuffled; text_collate_fn pads each batch and also
# returns the unpadded lengths.
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          collate_fn=text_collate_fn,
                          drop_last=False)

# Validation batches keep the file order.
valid_loader = DataLoader(dataset=valid_dataset,
                          batch_size=batch_size,
                          shuffle=False,
                          collate_fn=text_collate_fn,
                          drop_last=False)

In [13]:
# Pull a single batch to inspect the padded tensors and their lengths.
for x, x_len, y, y_len in train_loader:
    print(x, y)
    print(x_len, y_len)
    break

tensor([[   2,   69,  936,  ...,    0,    0,    0],
        [   2,  779,  478,  ...,    0,    0,    0],
        [   2,  822,  444,  ...,    0,    0,    0],
        ...,
        [   2,  205,  465,  ...,    0,    0,    0],
        [   2, 1195,  773,  ...,    0,    0,    0],
        [   2,  805,  268,  ...,    0,    0,    0]]) tensor([[   2,  932,  707,  ...,    0,    0,    0],
        [   2,  704,  773,  ...,    0,    0,    0],
        [   2,  648,  444,  ...,    0,    0,    0],
        ...,
        [   2,  647,  908,  ...,    0,    0,    0],
        [   2, 1195,  773,  ...,  825,   10,    3],
        [   2,  129,  480,  ...,    0,    0,    0]])
tensor([14, 14, 15, 15, 15, 12, 11, 14,  9, 11, 10,  8, 10, 16, 13, 40, 17, 13,
        24, 11, 11,  4, 13,  8, 16, 35, 27, 20, 17, 19, 11, 12]) tensor([20, 18, 17, 10, 21, 17, 14, 24, 10, 13, 15, 14, 15, 12, 24, 20, 13, 22,
        25, 12, 15, 13, 14, 24, 11, 22, 21, 30, 15, 10, 32, 19])


# 3. Transformer Model

## 3-1. Embedding 

In [14]:
class Embeddings(nn.Module):
    """Token embedding plus fixed sinusoidal positional encoding.

    Args:
        vocab_size: number of entries in the token embedding table.
        embed_size: embedding dimension, also used as the positional
            encoding width.
        pad_id: index passed to the token embedding as its padding index.
        max_len: number of positions the positional encoding table covers.
            Defaults to 256, the value that was previously hard-coded.
    """
    def __init__(self, vocab_size, embed_size, pad_id=0, max_len=256):
        super(Embeddings, self).__init__()

        self.token_embedding = TokenEmbedding(vocab_size=vocab_size,
                                              embed_size=embed_size,
                                              pad_id=pad_id)
        self.pos_encoding = PositionalEncoding(d_model=embed_size,
                                               max_len=max_len)

    def forward(self, x):
        """Return the token embeddings of ``x`` plus positional encodings.

        ``x`` is expected to be a (batch, seq) index tensor: the positional
        encoding is sliced to ``x.size(1)`` positions and broadcast over the
        batch dimension by the addition.
        """
        token_emb = self.token_embedding(x)
        pos_enc = self.pos_encoding(x)
        return token_emb + pos_enc


class TokenEmbedding(nn.Module):
    """Thin wrapper around ``nn.Embedding`` with a padding index."""

    def __init__(self, vocab_size, embed_size, pad_id):
        super().__init__()

        self.token_embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_size,
            padding_idx=pad_id,
        )

    def forward(self, x):
        """Look up the embedding vector of every index in ``x``."""
        return self.token_embedding(x)


class PositionalEncoding(nn.Module):
    """
    Fixed (non-trainable) sinusoidal positional encoding.

    The table is computed once in ``__init__`` and stored as the buffer
    ``pe`` of shape (1, max_len, d_model): even columns hold sines, odd
    columns hold cosines.

    ref: https://github.com/codertimo/BERT-pytorch/blob/master/bert_pytorch/model/embedding/position.py

    Args:
        d_model: width of the encoding; may be even or odd.
        max_len: number of positions in the table.
    """
    def __init__(self, d_model, max_len=256):
        super().__init__()

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).float()
        # The original spelled this ``require_grad``, which only attached an
        # unused attribute to the tensor.
        pe.requires_grad = False

        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()

        pe[:, 0::2] = torch.sin(position * div_term)
        # An odd d_model has one fewer odd column than even columns, so trim
        # the frequency vector to match (a no-op when d_model is even).
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return the first ``x.size(1)`` rows of the table, shape (1, seq, d_model)."""
        return self.pe[:, :x.size(1)]

## 3-2. Transformer

In [15]:
class Transformer(nn.Module):
    """Sequence-to-sequence model built around ``torch.nn.Transformer``.

    Source and target indices are embedded (token + positional encoding),
    passed through dropout and the encoder/decoder stack, then projected to
    the target vocabulary and turned into log-probabilities.

    Args:
        input_size: source vocabulary size.
        output_size: target vocabulary size.
        pad_id: padding index for both embedding tables.
        device: stored as ``self.device`` for callers that read it; tensors
            created inside the model follow the device of the inputs.
        d_model, n_head, num_encoder_layers, num_decoder_layers,
        dim_feedforward, dropout: hyper-parameters forwarded to
            ``torch.nn.Transformer``. The defaults are the values that were
            previously hard-coded.
    """
    def __init__(self, input_size, output_size, pad_id, device,
                 d_model=512, n_head=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
        super(Transformer, self).__init__()

        self.device = device

        src_vocab_size = input_size
        tgt_vocab_size = output_size

        self.encoder_dropout = nn.Dropout(dropout)
        self.decoder_dropout = nn.Dropout(dropout)

        # embed_size = d_model
        self.encoder_embedding = Embeddings(vocab_size=src_vocab_size, embed_size=d_model, pad_id=pad_id)
        self.decoder_embedding = Embeddings(vocab_size=tgt_vocab_size, embed_size=d_model, pad_id=pad_id)

        self.transformer = torch.nn.Transformer(d_model=d_model,
                                                nhead=n_head,
                                                num_encoder_layers=num_encoder_layers,
                                                num_decoder_layers=num_decoder_layers,
                                                dim_feedforward=dim_feedforward,
                                                dropout=dropout)

        self.proj_vocab_layer = nn.Linear(in_features=d_model, out_features=tgt_vocab_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def _generate_mask(self, x, length):
        """Build a key-padding mask for a batch-first index tensor.

        Args:
            x: (batch, seq) tensor; only its width and device are used.
            length: per-sample valid lengths (tensor or sequence of ints).

        Returns:
            Bool tensor of shape (batch, seq) on ``x``'s device, where
            ``True`` marks positions at or beyond the sample's length.
        """
        lengths = torch.as_tensor(length, device=x.device)
        positions = torch.arange(x.size(1), device=x.device)
        # Vectorised comparison: no Python loop over (possibly CUDA) lengths,
        # and the mask width always matches the input width.
        return positions.unsqueeze(0) >= lengths.unsqueeze(1)

    def forward(self, enc_input, dec_input, enc_input_len, dec_input_len):
        """Teacher-forced forward pass.

        Args:
            enc_input: (batch, src_len) source indices.
            dec_input: (batch, tgt_len) decoder input indices.
            enc_input_len: valid length of each source sequence.
            dec_input_len: valid length of each decoder input sequence.

        Returns:
            Log-probabilities of shape (batch, tgt_len, target vocab size).
        """
        x_enc_embed = self.encoder_dropout(self.encoder_embedding(enc_input.long()))
        x_dec_embed = self.decoder_dropout(self.decoder_embedding(dec_input.long()))

        src_key_padding_mask = self._generate_mask(enc_input, enc_input_len)
        tgt_key_padding_mask = self._generate_mask(dec_input, dec_input_len)

        # The decoder attends to the encoder output with the same padding mask.
        memory_key_padding_mask = src_key_padding_mask
        # Causal mask: each target position may only look at earlier positions.
        tgt_mask = self.transformer.generate_square_subsequent_mask(
            x_dec_embed.size(1)).to(x_dec_embed.device)

        # torch.nn.Transformer is used here with (seq, batch, feature) inputs.
        x_enc_embed = x_enc_embed.transpose(0, 1)
        x_dec_embed = x_dec_embed.transpose(0, 1)

        feature = self.transformer(src=x_enc_embed,
                                   tgt=x_dec_embed,
                                   src_key_padding_mask=src_key_padding_mask,
                                   tgt_key_padding_mask=tgt_key_padding_mask,
                                   memory_key_padding_mask=memory_key_padding_mask,
                                   src_mask=None,
                                   tgt_mask=tgt_mask)

        log_probs = self.softmax(self.proj_vocab_layer(feature))
        # Back to batch-first for the caller.
        return log_probs.transpose(0, 1)

    def search(self, enc_input, max_length=255, sos_id=2, eos_id=3):
        """Greedy decoding for a single source sequence (batch size 1).

        The dropout layers are applied here as in ``forward``, and this
        method does not switch the module mode itself, so call
        ``model.eval()`` first for deterministic output.

        Args:
            enc_input: (1, src_len) source indices.
            max_length: maximum number of tokens to generate.
            sos_id: start-of-sentence index fed as the first decoder input.
            eos_id: end-of-sentence index; decoding stops once it is produced.

        Returns:
            ``(y_hats, indice)``: ``y_hats`` holds the log-probabilities of
            each generated token, shape (1, generated_len, vocab); ``indice``
            holds the generated indices, shape (1, generated_len), including
            the final ``eos_id`` when one was produced.
        """
        run_device = enc_input.device
        y_hats = []

        with torch.no_grad():

            # ENCODER
            x_enc_embed = self.encoder_dropout(self.encoder_embedding(enc_input.long()))

            enc_input_len = torch.LongTensor([enc_input.size(1)])
            src_key_padding_mask = self._generate_mask(enc_input, enc_input_len)
            memory_key_padding_mask = src_key_padding_mask

            memory = self.transformer.encoder(x_enc_embed.transpose(0, 1),
                                              mask=None,
                                              src_key_padding_mask=src_key_padding_mask)

            # DECODER
            dec_input = torch.LongTensor([[sos_id]]).to(run_device)

            for _ in range(max_length):
                x_dec_embed = self.decoder_dropout(self.decoder_embedding(dec_input.long()))
                tgt_mask = self.transformer.generate_square_subsequent_mask(
                    x_dec_embed.size(1)).to(run_device)
                dec_input_len = torch.LongTensor([dec_input.size(-1)])
                tgt_key_padding_mask = self._generate_mask(dec_input, dec_input_len)

                output = self.transformer.decoder(x_dec_embed.transpose(0, 1),
                                                  memory,
                                                  tgt_mask=tgt_mask,
                                                  memory_mask=None,
                                                  tgt_key_padding_mask=tgt_key_padding_mask,
                                                  memory_key_padding_mask=memory_key_padding_mask)

                # The whole prefix is re-decoded every step, but only the
                # newest position is a new prediction. Keeping just that one
                # gives y_hats exactly one entry per generated token.
                step_log_probs = self.softmax(self.proj_vocab_layer(output[-1:])).transpose(0, 1)
                y_hats.append(step_log_probs)

                next_id = step_log_probs[0, -1].max(dim=-1)[1]
                dec_input = torch.cat([dec_input, next_id.view(1, 1)], dim=-1)

                if next_id.item() == eos_id:
                    break

            y_hats = torch.cat(y_hats, dim=1)

            # Drop the leading <sos>.
            indice = dec_input[:, 1:]

        return y_hats, indice
    

In [16]:
# Source and target sizes are both read from the training dataset's
# vocabulary, so encoder and decoder use the same size.
input_size = train_dataset.vocab_size
output_size = train_dataset.vocab_size

print(input_size, output_size)

# Build the model and move it to the selected device.
model = Transformer(input_size=input_size, 
                    output_size=output_size,
                    pad_id=PAD_TOKEN_ID, 
                    device=device).to(device)

print(model)

1246 1246
Transformer(
  (encoder_dropout): Dropout(p=0.1, inplace=False)
  (decoder_dropout): Dropout(p=0.1, inplace=False)
  (encoder_embedding): Embeddings(
    (token_embedding): TokenEmbedding(
      (token_embedding): Embedding(1246, 512, padding_idx=0)
    )
    (pos_encoding): PositionalEncoding()
  )
  (decoder_embedding): Embeddings(
    (token_embedding): TokenEmbedding(
      (token_embedding): Embedding(1246, 512, padding_idx=0)
    )
    (pos_encoding): PositionalEncoding()
  )
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((

# 4. Train

In [17]:
# Optimizer: Adam with a small weight decay (the trailing comment keeps a
# previously tried learning rate).
learning_rate = 0.0001  # 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

# Halve the learning rate once the monitored loss has stopped improving for
# more than `patience` epochs.
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1, verbose=True)

# criterion = nn.CrossEntropyLoss(reduction='mean', ignore_index=PAD_TOKEN_ID).to(device)

# log_softmax + NLLLoss == CrossEntropyLoss
# The model already applies LogSoftmax, so NLLLoss is used here; padding
# targets are excluded from the loss.
criterion = torch.nn.NLLLoss(reduction='mean', ignore_index=PAD_TOKEN_ID).to(device)

In [18]:
def train(model, data_loader, optimizer, criterion, clip, epoch):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: module called as ``model(enc_input=..., dec_input=...,
            enc_input_len=..., dec_input_len=...)`` and returning
            (batch, tgt_len, vocab) scores.
        data_loader: sized iterable of ``(src, src_len, trg, trg_len)`` batches.
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss taking flattened scores and flattened targets.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.
        epoch: zero-based epoch index, used only in the progress log.

    Returns:
        Sum of the batch losses divided by ``len(data_loader)``.
    """
    model.train()

    # Move batches to wherever the model's parameters live, rather than
    # relying on a module-level ``device`` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    log_interval = 100
    num_batches = len(data_loader)
    epoch_loss = 0

    for i, (src, src_len, trg, trg_len) in enumerate(data_loader):
        src = src.to(run_device)
        trg = trg.to(run_device)
        src_len = src_len.to(run_device)
        trg_len = trg_len.to(run_device)

        optimizer.zero_grad()

        # Teacher forcing: the decoder sees the target without its last
        # token and is trained to predict the target without its first token.
        output = model(enc_input=src,
                       dec_input=trg[:, :-1],
                       enc_input_len=src_len,
                       dec_input_len=(trg_len - 1))

        # trg = [batch size, trg len]
        # output = [batch size, trg len - 1, output dim]
        output_dim = output.shape[-1]

        output = output.contiguous().view(-1, output_dim)
        target = trg[:, 1:].contiguous().view(-1)

        # target = [(trg len - 1) * batch size]
        # output = [(trg len - 1) * batch size, output dim]
        loss = criterion(output, target)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss

        if i % log_interval == 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches | loss {:.4f}'.format(epoch+1, i+1, num_batches, batch_loss))

    return epoch_loss / num_batches

In [19]:
def evaluate(model, data_loader, criterion):
    """Compute the mean per-batch loss without updating the model.

    The model is switched to eval mode and gradients are disabled.

    Args:
        model: module called as ``model(enc_input=..., dec_input=...,
            enc_input_len=..., dec_input_len=...)`` and returning
            (batch, tgt_len, vocab) scores.
        data_loader: sized iterable of ``(src, src_len, trg, trg_len)`` batches.
        criterion: loss taking flattened scores and flattened targets.

    Returns:
        Sum of the batch losses divided by ``len(data_loader)``.
    """
    model.eval()

    # Move batches to wherever the model's parameters live, rather than
    # relying on a module-level ``device`` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0

    with torch.no_grad():

        for src, src_len, trg, trg_len in data_loader:
            src = src.to(run_device)
            trg = trg.to(run_device)
            src_len = src_len.to(run_device)
            trg_len = trg_len.to(run_device)

            # Same input/target shift as in training.
            output = model(enc_input=src,
                           dec_input=trg[:, :-1],
                           enc_input_len=src_len,
                           dec_input_len=(trg_len - 1))

            # trg = [batch size, trg len]
            # output = [batch size, trg len - 1, output dim]
            output_dim = output.shape[-1]

            output = output.contiguous().view(-1, output_dim)
            target = trg[:, 1:].contiguous().view(-1)

            # target = [(trg len - 1) * batch size]
            # output = [(trg len - 1) * batch size, output dim]
            loss = criterion(output, target)

            epoch_loss += loss.item()

    return epoch_loss / len(data_loader)

In [20]:
def epoch_time(start_time, end_time):
    """Split the time between two timestamps into whole minutes and seconds.

    Both parts are truncated toward zero.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

In [21]:
# Training driver: run N_EPOCHS epochs, checkpoint the best-so-far weights
# by training loss and by validation loss, and print per-epoch statistics.
N_EPOCHS = 100 # 25
CLIP = 5  # maximum gradient norm passed to train()

# Create the checkpoint directory up front; otherwise the first torch.save
# fails only after a full epoch has already been spent.
os.makedirs('./models/transformer', exist_ok=True)

best_train_loss = float('inf')
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_loader, optimizer, criterion, CLIP, epoch)
    valid_loss = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint with the lowest training loss so far.
    if train_loss < best_train_loss:
        best_train_loss = train_loss
        torch.save(model.state_dict(), './models/transformer/train.loss.best.pt')
        print(f'Epoch: {epoch+1:02} | train.loss.best: {epoch+1}')
    
    # Checkpoint with the lowest validation loss so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './models/transformer/valid.loss.best.pt')
        print(f'Epoch: {epoch+1:02} | valid.loss.best: {epoch+1}')
        
    # The scheduler monitors the training loss (the validation-loss variant
    # is kept commented out).
    # lr_scheduler.step(valid_loss)
    lr_scheduler.step(train_loss)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.4f} | Train PPL: {math.exp(train_loss):8.4f}')
    print(f'\t Val. Loss: {valid_loss:.4f} |  Val. PPL: {math.exp(valid_loss):8.4f}')
    print("#"*100)

| epoch   1 |     1/  351 batches | loss 7.3256
| epoch   1 |   101/  351 batches | loss 3.4596
| epoch   1 |   201/  351 batches | loss 3.2762
| epoch   1 |   301/  351 batches | loss 2.8678
Epoch: 01 | train.loss.best: 1
Epoch: 01 | valid.loss.best: 1
Epoch: 01 | Time: 0m 50s
	Train Loss: 3.4564 | Train PPL:  31.7038
	 Val. Loss: 2.7499 |  Val. PPL:  15.6408
####################################################################################################
| epoch   2 |     1/  351 batches | loss 2.7533
| epoch   2 |   101/  351 batches | loss 2.7647
| epoch   2 |   201/  351 batches | loss 2.5173
| epoch   2 |   301/  351 batches | loss 2.7535
Epoch: 02 | train.loss.best: 2
Epoch: 02 | valid.loss.best: 2
Epoch: 02 | Time: 0m 49s
	Train Loss: 2.6090 | Train PPL:  13.5851
	 Val. Loss: 2.4009 |  Val. PPL:  11.0330
####################################################################################################
| epoch   3 |     1/  351 batches | loss 2.3137
| epoch   3 |   101/  35

| epoch  19 |   101/  351 batches | loss 1.1675
| epoch  19 |   201/  351 batches | loss 1.1103
| epoch  19 |   301/  351 batches | loss 0.9882
Epoch: 19 | train.loss.best: 19
Epoch: 19 | valid.loss.best: 19
Epoch: 19 | Time: 0m 47s
	Train Loss: 1.0176 | Train PPL:   2.7666
	 Val. Loss: 1.4759 |  Val. PPL:   4.3749
####################################################################################################
| epoch  20 |     1/  351 batches | loss 0.9503
| epoch  20 |   101/  351 batches | loss 0.9377
| epoch  20 |   201/  351 batches | loss 0.9543
| epoch  20 |   301/  351 batches | loss 0.9824
Epoch: 20 | train.loss.best: 20
Epoch: 20 | valid.loss.best: 20
Epoch: 20 | Time: 0m 48s
	Train Loss: 0.9751 | Train PPL:   2.6514
	 Val. Loss: 1.4724 |  Val. PPL:   4.3595
####################################################################################################
| epoch  21 |     1/  351 batches | loss 1.0032
| epoch  21 |   101/  351 batches | loss 0.8949
| epoch  21 |   201/

| epoch  38 |   101/  351 batches | loss 0.2925
| epoch  38 |   201/  351 batches | loss 0.3077
| epoch  38 |   301/  351 batches | loss 0.4195
Epoch: 38 | train.loss.best: 38
Epoch: 38 | Time: 0m 48s
	Train Loss: 0.3637 | Train PPL:   1.4387
	 Val. Loss: 1.6419 |  Val. PPL:   5.1648
####################################################################################################
| epoch  39 |     1/  351 batches | loss 0.3023
| epoch  39 |   101/  351 batches | loss 0.2780
| epoch  39 |   201/  351 batches | loss 0.3616
| epoch  39 |   301/  351 batches | loss 0.3718
Epoch: 39 | train.loss.best: 39
Epoch: 39 | Time: 0m 48s
	Train Loss: 0.3315 | Train PPL:   1.3931
	 Val. Loss: 1.6902 |  Val. PPL:   5.4204
####################################################################################################
| epoch  40 |     1/  351 batches | loss 0.2578
| epoch  40 |   101/  351 batches | loss 0.3102
| epoch  40 |   201/  351 batches | loss 0.2900
| epoch  40 |   301/  351 batches | 

| epoch  57 |   101/  351 batches | loss 0.0962
| epoch  57 |   201/  351 batches | loss 0.0700
| epoch  57 |   301/  351 batches | loss 0.0949
Epoch: 57 | train.loss.best: 57
Epoch: 57 | Time: 0m 47s
	Train Loss: 0.1078 | Train PPL:   1.1138
	 Val. Loss: 2.0340 |  Val. PPL:   7.6448
####################################################################################################
| epoch  58 |     1/  351 batches | loss 0.0908
| epoch  58 |   101/  351 batches | loss 0.1020
| epoch  58 |   201/  351 batches | loss 0.0854
| epoch  58 |   301/  351 batches | loss 0.1123
Epoch: 58 | train.loss.best: 58
Epoch: 58 | Time: 0m 48s
	Train Loss: 0.1058 | Train PPL:   1.1116
	 Val. Loss: 2.0500 |  Val. PPL:   7.7678
####################################################################################################
| epoch  59 |     1/  351 batches | loss 0.0801
| epoch  59 |   101/  351 batches | loss 0.0962
| epoch  59 |   201/  351 batches | loss 0.1272
| epoch  59 |   301/  351 batches | 

| epoch  76 |   201/  351 batches | loss 0.0654
| epoch  76 |   301/  351 batches | loss 0.1071
Epoch: 76 | train.loss.best: 76
Epoch: 76 | Time: 0m 48s
	Train Loss: 0.0691 | Train PPL:   1.0716
	 Val. Loss: 2.1882 |  Val. PPL:   8.9187
####################################################################################################
| epoch  77 |     1/  351 batches | loss 0.0908
| epoch  77 |   101/  351 batches | loss 0.0775
| epoch  77 |   201/  351 batches | loss 0.0558
| epoch  77 |   301/  351 batches | loss 0.0852
Epoch: 77 | Time: 0m 48s
	Train Loss: 0.0692 | Train PPL:   1.0716
	 Val. Loss: 2.2230 |  Val. PPL:   9.2350
####################################################################################################
| epoch  78 |     1/  351 batches | loss 0.0658
| epoch  78 |   101/  351 batches | loss 0.0370
| epoch  78 |   201/  351 batches | loss 0.0597
| epoch  78 |   301/  351 batches | loss 0.0804
Epoch: 78 | train.loss.best: 78
Epoch: 78 | Time: 0m 48s
	Train Loss

Epoch: 95 | Time: 0m 48s
	Train Loss: 0.0189 | Train PPL:   1.0191
	 Val. Loss: 2.2706 |  Val. PPL:   9.6849
####################################################################################################
| epoch  96 |     1/  351 batches | loss 0.0356
| epoch  96 |   101/  351 batches | loss 0.0108
| epoch  96 |   201/  351 batches | loss 0.0301
| epoch  96 |   301/  351 batches | loss 0.0100
Epoch    96: reducing learning rate of group 0 to 2.5000e-05.
Epoch: 96 | Time: 0m 48s
	Train Loss: 0.0199 | Train PPL:   1.0201
	 Val. Loss: 2.2739 |  Val. PPL:   9.7172
####################################################################################################
| epoch  97 |     1/  351 batches | loss 0.0256
| epoch  97 |   101/  351 batches | loss 0.0209
| epoch  97 |   201/  351 batches | loss 0.0043
| epoch  97 |   301/  351 batches | loss 0.0075
Epoch: 97 | train.loss.best: 97
Epoch: 97 | Time: 0m 49s
	Train Loss: 0.0153 | Train PPL:   1.0154
	 Val. Loss: 2.2537 |  Val. PPL:   

In [22]:
# Reload the checkpoint with the lowest *training* loss and score it on the
# validation loader (the printout labels this as "Test").
# map_location lets a checkpoint saved on a GPU load on whatever `device`
# is in use now, including a CPU-only machine.
model.load_state_dict(torch.load('./models/transformer/train.loss.best.pt', map_location=device))

test_loss = evaluate(model, valid_loader, criterion)

print(f'| Test Loss: {test_loss:.4f} | Test PPL: {math.exp(test_loss):8.4f} |')

| Test Loss: 2.2774 | Test PPL:   9.7518 |


# 5. Inference (검증용)

In [23]:
def inference(model, q_sent="", a_sent=None, char2index=None, index2char=None):
    """Greedy-decode an answer for ``q_sent`` and print it.

    Args:
        model: model exposing ``search(enc_input=..., max_length=...)``.
        q_sent: question string; it is encoded character by character, and
            unknown characters map to ``UNK_TOKEN_ID``.
        a_sent: reference answer. Kept for API compatibility; it does not
            take part in decoding, so the default ``None`` is accepted.
        char2index: character -> index mapping (required).
        index2char: index -> character mapping (required).

    Prints the predicted index array followed by ``H: (<decoded text>)``.
    """
    model.eval()
    with torch.no_grad():

        src = [char2index.get(SOS_TOKEN)]
        src += [char2index.get(token, UNK_TOKEN_ID) for token in q_sent]
        src += [char2index.get(EOS_TOKEN)]

        # Batch of one sequence.
        src = torch.LongTensor([src]).to(device)

        _, hyp_indice = model.search(enc_input=src, max_length=80)

        pred = hyp_indice[0].detach().cpu().numpy()
        print(pred)

        pred_sent = ''.join(index2char[token_id] for token_id in pred)

        print(f"H: ({pred_sent})")

In [24]:
# Draw one random pair from the *training* set and compare the reference
# answer with the model's decoded output.
idx = np.random.randint(len(train_dataset), size=1)[0]
print("random idx : ", idx)

q_sent = train_dataset.data[idx][0]
a_sent = train_dataset.data[idx][1]
print("Q: ", q_sent)
print("A: ", a_sent)

inference(model, q_sent, a_sent, char2index=train_dataset.char2index, index2char=train_dataset.index2char)

random idx :  2732
Q:  사랑이 밥 먹여주나
A:  사랑이 밥은 먹여주지 않지만 행복을 줘요.
[ 638  406  857    4  549  852    4  497  791  913  930    4  756  930
  480    4 1186  579  853    4  920  825   10    3]
H: (사랑이 밥은 먹여주지 않지만 행복을 줘요.<eos>)
