## 실습 내용: SEQ2SEQ 모델링

### 1. Vocab: 한국어 음절 단위

### 2. 데이터: 한국어 Q&A 문장
 - ex.)공무원 시험 죽을 거 같아 --> 철밥통 되기가 어디 쉽겠어요.

### Installation (초기 환경 세팅)

In [1]:
import time
import math
import random
import numpy as np
import json

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

print(torch.__version__)

1.6.0


In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"]="0"

device = 'cuda' if torch.cuda.is_available() else 'cpu'

print("device: {}".format(device))

device: cuda


In [3]:
seed = 0

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

np.random.seed(seed)
random.seed(seed)

## 0. 데이터 확인
 - 출처: https://github.com/eagle705/pytorch-transformer-chatbot/tree/master/data_in

In [4]:
!head -n 10 "./data/train_chatbot.txt"

'head'은(는) 내부 또는 외부 명령, 실행할 수 있는 프로그램, 또는
배치 파일이 아닙니다.


In [5]:
!head -n 10 "./data/valid_chatbot.txt"

'head'은(는) 내부 또는 외부 명령, 실행할 수 있는 프로그램, 또는
배치 파일이 아닙니다.


## 1. 어휘사전 (Vocab) 생성 // (음절 단위)

In [6]:
PAD_TOKEN_ID = 0
UNK_TOKEN_ID = 1

PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'
SOS_TOKEN = '<sos>'
EOS_TOKEN = '<eos>'

def create_vocab(train_path, valid_path, vocab_path):
    
    data = []
    
    with open(train_path, 'r', encoding='utf-8') as f:
        for line in f:
            for sent in line.strip().split('\t'):
                data.append(sent) 
    
    with open(valid_path, 'r', encoding='utf-8') as f:
        for line in f:
            for sent in line.strip().split('\t'):
                data.append(sent) 
    
    vocab = set()
    for sent in data:
        for char in sent:
            vocab.add(char)
            
    vocab_list = list(sorted(vocab))
    
    vocab_list.insert(0, PAD_TOKEN)
    vocab_list.insert(1, UNK_TOKEN)
    vocab_list.insert(2, SOS_TOKEN)
    vocab_list.insert(3, EOS_TOKEN)
    
    print(vocab_list)

    # 파일로 어휘사전 저장
    with open(vocab_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(vocab_list, indent=4, ensure_ascii=False))
    

In [7]:
create_vocab(train_path="./data/train_chatbot.txt",
             valid_path="./data/valid_chatbot.txt",
             vocab_path="./vocab.json")

['<pad>', '<unk>', '<sos>', '<eos>', ' ', '!', '%', "'", ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ';', '?', 'A', 'B', 'C', 'D', 'L', 'N', 'O', 'P', 'S', 'X', '_', 'a', 'c', 'g', 'j', 'k', 'n', 'o', 's', '~', '…', 'ㅊ', 'ㅋ', 'ㅎ', 'ㅜ', 'ㅠ', '가', '각', '간', '갇', '갈', '감', '갑', '값', '갔', '강', '갖', '같', '갚', '개', '객', '갠', '갯', '갱', '걍', '걔', '거', '걱', '건', '걷', '걸', '검', '겁', '것', '겉', '게', '겐', '겜', '겟', '겠', '겨', '격', '겪', '견', '결', '겹', '겼', '경', '곁', '계', '곗', '고', '곡', '곤', '곧', '골', '곰', '곱', '곳', '공', '과', '관', '광', '괘', '괜', '괴', '교', '구', '국', '군', '굳', '굴', '굶', '굽', '굿', '궁', '궈', '권', '궜', '귀', '귄', '귈', '귐', '규', '균', '귤', '그', '극', '근', '글', '긁', '금', '급', '긋', '긍', '기', '긴', '길', '김', '깃', '깅', '깊', '까', '깍', '깎', '깐', '깔', '깜', '깝', '깠', '깡', '깨', '깬', '깰', '깼', '꺼', '꺽', '껀', '껄', '껏', '께', '껴', '꼈', '꼬', '꼭', '꼰', '꼴', '꼼', '꼿', '꽁', '꽂', '꽃', '꽈', '꽉', '꽝', '꽤', '꾸', '꾹', '꾼', '꿀', '꿈', '꿎', '꿔', '꿧', '꿨', '꿩', '꿰', '뀌', '뀐', '뀔', '끄', '끈', '끊', '끌'

## 2. Dataset, DataLoader

In [8]:
class QnADataset(Dataset):
    def __init__(self, data_path, vocab_path):
        super().__init__()
        
        self.char2index, self.index2char = self._read_vocab(vocab_path)
        self.data = self._preprocess(data_path)
    
    def _read_vocab(self, vocab_path):
        with open(vocab_path, encoding="utf-8") as f:
            labels = json.load(f)
            char2index = dict()
            index2char = dict()

            for index, char in enumerate(labels):
                char2index[char] = index
                index2char[index] = char
            
        return char2index, index2char
    
    def _preprocess(self, data_path):
        data = []
        
        with open(data_path, encoding="utf-8") as f:
            for line in f:
                sents = line.strip().split('\t')
                assert len(sents) == 2, "data error!!"
                question_sent, answer_sent = sents[0], sents[1]
                
                data.append((question_sent, answer_sent))
        return data
    
    @property
    def vocab_size(self):
        return len(self.char2index)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        qna = self.data[index]
        q_sent, a_sent = qna[0], qna[1]
        
        src = [self.char2index.get(SOS_TOKEN)]
        src += [self.char2index.get(token, UNK_TOKEN_ID) for token in q_sent]
        src += [self.char2index.get(EOS_TOKEN)]
        
        tgt = [self.char2index.get(SOS_TOKEN)]
        tgt += [self.char2index.get(token, UNK_TOKEN_ID) for token in a_sent]
        tgt += [self.char2index.get(EOS_TOKEN)]
        
        return torch.LongTensor(src), torch.LongTensor(tgt)
    

def text_collate_fn(batch):
    xs = [x for x, y in batch]
    xs_pad = torch.nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=PAD_TOKEN_ID)
    xs_lengths = [x.size(0) for x, y in batch]
    xs_lengths = torch.LongTensor(xs_lengths)

    ys = [y for x, y in batch]
    ys_pad = torch.nn.utils.rnn.pad_sequence(ys, batch_first=True, padding_value=PAD_TOKEN_ID)
    ys_lengths = [y.size(0) for x, y in batch]
    ys_lengths = torch.LongTensor(ys_lengths)

    return xs_pad, xs_lengths, ys_pad, ys_lengths

In [9]:
train_dataset = QnADataset(data_path="./data/train_chatbot.txt",
                           vocab_path="./vocab.json")

valid_dataset = QnADataset(data_path="./data/valid_chatbot.txt",
                           vocab_path="./vocab.json")

In [10]:
# train_dataset[0]

In [11]:
batch_size = 32 # 4

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          collate_fn=text_collate_fn,
                          drop_last=False)

valid_loader = DataLoader(dataset=valid_dataset,
                          batch_size=batch_size,
                          shuffle=False,
                          collate_fn=text_collate_fn,
                          drop_last=False)

In [12]:
for x, x_len, y, y_len in train_loader:
    print(x, y)
    print(x_len, y_len)
    break

tensor([[   2,   69,  936,  ...,    0,    0,    0],
        [   2,  779,  478,  ...,    0,    0,    0],
        [   2,  822,  444,  ...,    0,    0,    0],
        ...,
        [   2,  205,  465,  ...,    0,    0,    0],
        [   2, 1195,  773,  ...,    0,    0,    0],
        [   2,  805,  268,  ...,    0,    0,    0]]) tensor([[   2,  932,  707,  ...,    0,    0,    0],
        [   2,  704,  773,  ...,    0,    0,    0],
        [   2,  648,  444,  ...,    0,    0,    0],
        ...,
        [   2,  647,  908,  ...,    0,    0,    0],
        [   2, 1195,  773,  ...,  825,   10,    3],
        [   2,  129,  480,  ...,    0,    0,    0]])
tensor([14, 14, 15, 15, 15, 12, 11, 14,  9, 11, 10,  8, 10, 16, 13, 40, 17, 13,
        24, 11, 11,  4, 13,  8, 16, 35, 27, 20, 17, 19, 11, 12]) tensor([20, 18, 17, 10, 21, 17, 14, 24, 10, 13, 15, 14, 15, 12, 24, 20, 13, 22,
        25, 12, 15, 13, 14, 24, 11, 22, 21, 30, 15, 10, 32, 19])


## 3. Seq2Seq Model

### 3-1. Encoder

- src = [batch size. src len]
- embedded = [batch size, src len, emb dim]
- outputs = [batch size, src len, hid dim * n directions]
- hidden = [batch size, n layers * n directions, hid dim]
- cell = [batch size, n layers * n directions, hid dim]
- outputs are always from the top hidden layer

In [14]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True) # batch_first=True
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        
        return hidden, cell

### 3-2. Decoder

- input = [batch size]
- hidden = [batch size, n layers * n directions, hid dim]
- cell = [batch size, n layers * n directions, hid dim]
        
- n directions in the decoder will both always be 1, therefore:
- hidden = [batch size, n layers, hid dim]
- context = [batch size, n layers, hid dim]

- input = [batch size, 1]
- embedded = [batch size, 1, emb dim]
- prediction = [batch size, output dim]
- output = [batch size, seq len, hid dim * n directions]
- hidden = [batch size, n layers * n directions, hid dim]
- cell = [batch size, n layers * n directions, hid dim]
        
- seq len and n directions will always be 1 in the decoder, therefore:
- output = [batch size, 1, hid dim]
- hidden = [batch size, n layers, hid dim]
- cell = [batch size, n layers, hid dim]

In [15]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True) # batch_first=True
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(1))
        
        return prediction, hidden, cell

### 3-3. Seq2Seq

- src = [batch size, src len]
- trg = [batch size, trg len]
- teacher_forcing_ratio is probability to use teacher forcing
- e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
- tensor to store decoder outputs
- last hidden state of the encoder is used as the initial hidden state of the decoder
- first input to the decoder is the `<sos>` tokens
- insert input token embedding, previous hidden and previous cell states
- receive output tensor (predictions) and new hidden and cell states
- place predictions in a tensor holding predictions for each token
- decide if we are going to use teacher forcing or not
- get the highest predicted token from our predictions
- if teacher forcing, use actual next token as next input
- if not, use predicted token

In [16]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder mu st be equal!"
        
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[0]
        
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size)
        outputs = outputs.transpose(1, 0).to(self.device)
        
        hidden, cell = self.encoder(src)
        
        input = trg[:, 0]
        
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            
            outputs[t] = output
            
            teacher_force = random.random() < teacher_forcing_ratio
            
            top1 = output.argmax(1) 
 
            input = trg[:, t] if teacher_force else top1
        
        return outputs.transpose(1, 0)

In [17]:
INPUT_DIM = train_dataset.vocab_size
OUTPUT_DIM = train_dataset.vocab_size
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5 
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

print(model)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(1246, 256)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(1246, 256)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=1246, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


## 4. Train

- trg = [batch size, trg len]
- output = [batch size, trg len, output dim]
- trg = [(trg len - 1) * batch size]
- output = [(trg len - 1) * batch size, output dim]
- trg = [batch size, trg len]
- output = [batch size, trg len, output dim]

In [18]:
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1, verbose=True)
criterion = nn.CrossEntropyLoss(reduction='mean', ignore_index=PAD_TOKEN_ID).to(device)

In [19]:
def train(model, data_loader, optimizer, criterion, clip, epoch):
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(data_loader):
        src, src_len, trg, trg_len = batch
        src = src.to(device)
        trg = trg.to(device)
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        
        trg = trg[1:].view(-1)
        
        loss = criterion(output, trg)        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
        log_interval = 100
        
        if i % log_interval == 0 and i >= 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches | loss {:.4f}'.format(epoch+1, i+1, len(data_loader), loss.detach().item()))
        
    return epoch_loss / len(data_loader)

- trg = [batch size, trg len]
- output = [batch size, trg len, output dim]
- trg = [(trg len - 1) * batch size]
- output = [(trg len - 1) * batch size, output dim]

In [20]:
def evaluate(model, data_loader, criterion):
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src, src_len, trg, trg_len = batch
            src = src.to(device)
            trg = trg.to(device)

            output = model(src, trg, 0)
            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(data_loader)

In [21]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    
    return elapsed_mins, elapsed_secs

In [22]:
N_EPOCHS = 25
CLIP = 5

best_train_loss = float('inf')
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_loader, optimizer, criterion, CLIP, epoch)
    valid_loss = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if train_loss < best_train_loss:
        best_train_loss = train_loss
        torch.save(model.state_dict(), './models/s2s/train.loss.best.pt')
        print(f'Epoch: {epoch+1:02} | train.loss.best: {epoch+1}')
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './models/s2s/valid.loss.best.pt')
        print(f'Epoch: {epoch+1:02} | valid.loss.best: {epoch+1}')
        
    lr_scheduler.step(valid_loss)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.4f} | Train PPL: {math.exp(train_loss):8.4f}')
    print(f'\t Val. Loss: {valid_loss:.4f} |  Val. PPL: {math.exp(valid_loss):8.4f}')
    print("#"*100)

| epoch   1 |     1/  351 batches | loss 7.1277
| epoch   1 |   101/  351 batches | loss 4.2795
| epoch   1 |   201/  351 batches | loss 4.2831
| epoch   1 |   301/  351 batches | loss 3.9870
Epoch: 01 | train.loss.best: 1
Epoch: 01 | valid.loss.best: 1
Epoch: 01 | Time: 0m 25s
	Train Loss: 4.2795 | Train PPL:  72.2058
	 Val. Loss: 4.5660 |  Val. PPL:  96.1544
####################################################################################################
| epoch   2 |     1/  351 batches | loss 4.0984
| epoch   2 |   101/  351 batches | loss 3.7460
| epoch   2 |   201/  351 batches | loss 4.0170
| epoch   2 |   301/  351 batches | loss 4.0312
Epoch: 02 | train.loss.best: 2
Epoch: 02 | Time: 0m 24s
	Train Loss: 3.9178 | Train PPL:  50.2901
	 Val. Loss: 4.5894 |  Val. PPL:  98.4327
####################################################################################################
| epoch   3 |     1/  351 batches | loss 3.9413
| epoch   3 |   101/  351 batches | loss 3.8912
| epoch

| epoch  19 |   201/  351 batches | loss 3.3776
| epoch  19 |   301/  351 batches | loss 3.2502
Epoch: 19 | train.loss.best: 19
Epoch: 19 | Time: 0m 25s
	Train Loss: 3.4585 | Train PPL:  31.7687
	 Val. Loss: 4.5301 |  Val. PPL:  92.7662
####################################################################################################
| epoch  20 |     1/  351 batches | loss 3.3811
| epoch  20 |   101/  351 batches | loss 3.5513
| epoch  20 |   201/  351 batches | loss 3.2378
| epoch  20 |   301/  351 batches | loss 3.4636
Epoch    20: reducing learning rate of group 0 to 3.9063e-06.
Epoch: 20 | Time: 0m 25s
	Train Loss: 3.4602 | Train PPL:  31.8228
	 Val. Loss: 4.5312 |  Val. PPL:  92.8677
####################################################################################################
| epoch  21 |     1/  351 batches | loss 3.3594
| epoch  21 |   101/  351 batches | loss 3.6014
| epoch  21 |   201/  351 batches | loss 3.3225
| epoch  21 |   301/  351 batches | loss 3.6279
Epoch:

In [23]:
model.load_state_dict(torch.load('./models/s2s/train.loss.best.pt'))

test_loss = evaluate(model, valid_loader, criterion)

print(f'| Test Loss: {test_loss:.4f} | Test PPL: {math.exp(test_loss):8.4f} |')

| Test Loss: 4.5272 | Test PPL:  92.5031 |


## 5. Inference

In [24]:
def inference(model, q_sent="", a_sent=None, char2index=None, index2char=None):
    
    model.eval()
    with torch.no_grad():
        
        src = [char2index.get(SOS_TOKEN)]
        src += [char2index.get(token, UNK_TOKEN_ID) for token in q_sent]
        src += [char2index.get(EOS_TOKEN)]
        
        trg = [char2index.get(SOS_TOKEN)]
        trg += [char2index.get(token, UNK_TOKEN_ID) for token in a_sent]
        trg += [char2index.get(EOS_TOKEN)]

        src = torch.LongTensor([src]).to(device)
        trg = torch.LongTensor([trg]).to(device)

        hyp_ys = model(src, trg, 1)
        pred = torch.argmax(hyp_ys[0], dim=-1).detach().cpu().numpy()
        
        pred_sent = [index2char[token_id] for token_id in pred[1:]]
        pred_sent = ''.join(pred_sent)
        
        print(pred_sent)

In [25]:
idx = np.random.randint(len(train_dataset), size=1)[0]

print("random idx : ", idx)

q_sent = train_dataset.data[idx][0]
a_sent = train_dataset.data[idx][1]

print("Q: ", q_sent)
print("A: ", a_sent)

inference(model, q_sent, a_sent, char2index=train_dataset.char2index, index2char=train_dataset.index2char)

random idx :  2732
Q:  사랑이 밥 먹여주나
A:  사랑이 밥은 먹여주지 않지만 행복을 줘요.
그랑     사어 세 않아  좋복해 해보.<eos>
