### Recurrent Neural Network Language Modeling (RNNLM)

#### 목표: 한국어 철자(또는 음절) 인식 단위 언어모델링 
 - P('안녕하세요') = 
     P('안') * 
     P('녕'|'안') * 
     P('하'|'안', '녕') * 
     P('세'|'안', '녕', '하') * 
     P('요'|'안', '녕', '하', '세')

### 0. 데이터
 - 출처: https://github.com/eagle705/pytorch-transformer-chatbot/tree/master/data_in


In [1]:
# Out-Of-Vocabulary (OOV) // UNKNOWN (OOV, UNK)

- 인식 단위: 철자(음절, Syllable) // ('가', '갸', '겨', ...)

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
import time

from torch.utils.data import Dataset
from tqdm import tqdm
from functools import lru_cache

In [3]:
import os

# Expose only GPU 0 to this process (set the value to "0,1" to expose two GPUs).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"device: {device}")

device: cuda


In [4]:
# Seed the Python and PyTorch random number generators with one fixed value.
_SEED = 777
random.seed(_SEED)
torch.manual_seed(_SEED)

# When running on GPU, seed the CUDA generators as well.
if device == 'cuda':
    torch.cuda.manual_seed_all(_SEED)

In [5]:
# with open("./data/train.txt", 'w', encoding='utf-8') as f_w:
#     with open('./data/valid_train.txt', encoding='utf-8') as f:
#         for line in f:
#             pair = line.strip().split('\t')
#             # print(pair[0])
#             # print(pair[1])
#             f_w.write("{}\n".format(pair[0]))
#             f_w.write("{}\n".format(pair[1]))

### 0. 인식 단위 VOCAB 생성

In [6]:
# Reserved ids of the special symbols: padding, start-of-sentence, end-of-sentence.
PAD_token, SOS_token, EOS_token = 0, 1, 2

In [7]:
import json

def create_vocab(train_text_file, valid_text_file, vocab_file='./data/vocab.json'):
    """Build a character-level vocabulary and store it as JSON.

    Every distinct character found in the two text files (each line is
    stripped of surrounding whitespace first) is collected, sorted, and
    numbered from 3 upwards; ids 0-2 belong to the special tokens.

    Args:
        train_text_file: Path of the UTF-8 training text, one sentence per line.
        valid_text_file: Path of the UTF-8 validation text, same format.
        vocab_file: Where the ``{id: symbol}`` JSON mapping is written.
            Defaults to the path that used to be hard-coded.
    """
    chars = set()
    for text_file in (train_text_file, valid_text_file):
        with open(text_file, encoding='utf-8') as f:
            for line in f:
                # A str is iterable, so this adds one entry per character.
                chars.update(line.strip())

    vocab = {PAD_token: "<pad>", SOS_token: "<sos>", EOS_token: "<eos>"}
    # Regular symbols follow the three reserved ids, in sorted order.
    vocab.update(enumerate(sorted(chars), start=3))

    with open(vocab_file, 'w', encoding='utf-8') as f:
        # JSON object keys are always strings, so the ids end up as "0", "1", ...
        json.dump(vocab, f, ensure_ascii=False)
    
# Generate the vocabulary file from the training and validation text.
create_vocab(train_text_file='./data/train.txt', valid_text_file='./data/valid.txt')

### 1. DATASET & DATALOADER

In [8]:
class TextDataset(Dataset):
    """Character-level language-modeling dataset backed by a plain-text file.

    Each line of ``text_path`` is one sentence. Indexing returns the pair
    ``(src, tgt)`` of id tensors, where ``src`` is ``<sos>`` followed by the
    characters and ``tgt`` is the characters followed by ``<eos>``; ``tgt`` is
    therefore ``src`` shifted left by one position.
    """

    def __init__(self, text_path, vocab_path):
        """Load the vocabulary and the sentences.

        Args:
            text_path: UTF-8 text file, one sentence per line.
            vocab_path: JSON file mapping id (string key) to symbol.
        """
        self.text_path = text_path
        self.vocab_path = vocab_path

        self.data = []

        self._read_vocab()
        self._read_text()

    def _read_vocab(self):
        """Read the id -> symbol mapping and derive both lookup tables."""
        with open(self.vocab_path, encoding='utf-8') as f:
            self.vocab = json.load(f)

        # Built once, so that encoding a sentence costs one dict lookup per
        # character instead of rebuilding a vocabulary-sized dict each time.
        self._class_to_idx = {_class: _cid for _cid, _class in self.vocab.items()}
        self._idx_to_class = dict(self.vocab)

    def _read_text(self):
        """Append every line of the text file, stripped, to ``self.data``."""
        with open(self.text_path, encoding='utf-8') as f:
            self.data.extend(line.strip() for line in f)

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.vocab)

    @property
    def class_to_idx(self):
        """Symbol -> id mapping; ids are strings, as loaded from the JSON keys.

        The returned dict is shared between calls; treat it as read-only.
        """
        return self._class_to_idx

    @property
    def idx_to_class(self):
        """Id (string) -> symbol mapping; treat the returned dict as read-only."""
        return self._idx_to_class

    def __len__(self):
        """Number of sentences."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode sentence ``index`` as ``(src, tgt)`` LongTensors.

        The result is not memoised: every call returns fresh tensors, so a
        caller may modify them in place without affecting later lookups.

        Raises:
            KeyError: if the sentence contains a character that is not in
                the vocabulary.
        """
        c2i = self._class_to_idx

        ids = [int(c2i['<sos>'])]
        ids.extend(int(c2i[char]) for char in self.data[index])
        ids.append(int(c2i['<eos>']))

        # Predict position t+1 from position t.
        src = ids[:-1]
        tgt = ids[1:]

        return torch.LongTensor(src), torch.LongTensor(tgt)

    @classmethod
    def text_collate_fn(cls, batch):
        """Pad a list of ``(src, tgt)`` pairs into two batch-first tensors.

        Args:
            batch: Sequence of ``(src, tgt)`` tensor pairs as produced by
                ``__getitem__``.

        Returns:
            ``(padded_src, padded_tgt)``, each with one row per sample and
            as many columns as the longest sequence in the batch.
        """
        batch_X = [x for x, _ in batch]
        batch_Y = [y for _, y in batch]

        # Shorter sequences are right-padded with 0 (pad_sequence's default
        # value, spelled out here); this must equal the id of "<pad>".
        padded_batch_X = torch.nn.utils.rnn.pad_sequence(batch_X, batch_first=True, padding_value=0)
        padded_batch_Y = torch.nn.utils.rnn.pad_sequence(batch_Y, batch_first=True, padding_value=0)

        return padded_batch_X, padded_batch_Y

In [9]:
# Both splits are encoded with the same vocabulary file.
train_dataset = TextDataset('./data/train.txt', './data/vocab.json')
valid_dataset = TextDataset('./data/valid.txt', './data/vocab.json')

In [10]:
# Show the first two raw sentences next to their encoded (src, tgt) tensors,
# separated by a blank line.
for sample_idx in (0, 1):
    if sample_idx:
        print()
    print(train_dataset.data[sample_idx])
    print(train_dataset[sample_idx])

헤어지고 차단하는 이유는 뭘까?
(tensor([   1, 1194,  772,  929,   93,    3,  974,  283, 1173,  266,    3,  856,
         846,  266,    3,  528,  144,   21]), tensor([1194,  772,  929,   93,    3,  974,  283, 1173,  266,    3,  856,  846,
         266,    3,  528,  144,   21,    2]))

연락하지 않는 게 서로에게 좋으니가요.
(tensor([   1,  793,  399, 1173,  929,    3,  755,  266,    3,   77,    3,  655,
         443,  785,   77,    3,  908,  850,  273,   48,  824,    9]), tensor([ 793,  399, 1173,  929,    3,  755,  266,    3,   77,    3,  655,  443,
         785,   77,    3,  908,  850,  273,   48,  824,    9,    2]))


In [11]:
batch_size = 2

# Options shared by both loaders: batches are padded by the dataset's collate
# function and the final, possibly smaller, batch is kept.
_loader_options = dict(collate_fn=TextDataset.text_collate_fn, drop_last=False)

# Training data: shuffled, `batch_size` sentences per batch.
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           **_loader_options)

# Validation data: one sentence per batch, in file order.
valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                           batch_size=1,
                                           shuffle=False,
                                           **_loader_options)

In [12]:
# Peek at a single training batch: the raw id matrix first, then each row
# decoded back to symbols (every symbol is followed by one space).
id_to_symbol = train_dataset.idx_to_class  # looked up with the id as a string

for X, Y in train_loader:
    batch_ids = X.numpy()
    print("X      : {}".format(batch_ids))
    print()

    for row in batch_ids:
        print("".join("{} ".format(id_to_symbol[str(char_idx)]) for char_idx in row))

    # Only the first batch is shown.
    break

X      : [[   1  647  443  830    3  309  856 1117    3 1049  696  464    3  983
   751  577  666  824    9]
 [   1  128  418    3  637  402  345    3  851  130    3  480  751  824
     9    0    0    0    0]]

<sos> 새 로 운   데 이 트   코 스 를   찾 아 보 세 요 . 
<sos> 그 런   사 람 들   은 근   많 아 요 . <pad> <pad> <pad> <pad> 


### 2. Model

In [13]:
class RNNLM(nn.Module):
    """LSTM language model: embedding -> stacked LSTM -> linear projection to the vocabulary."""

    def __init__(self, vocab_size, word_vec_dim=128, hidden_size=128, n_layers=2, dropout_p=0.1):
        """Create the three sub-modules.

        Args:
            vocab_size: Number of symbols; size of both the embedding table
                and the output layer.
            word_vec_dim: Dimension of the symbol embeddings.
            hidden_size: Hidden-state size of every LSTM layer.
            n_layers: Number of stacked LSTM layers.
            dropout_p: Dropout probability applied between LSTM layers.
                Ignored when ``n_layers`` is 1, because there is no
                in-between layer to apply it to.
        """
        super().__init__()

        self.embed = nn.Embedding(num_embeddings=vocab_size,
                                  embedding_dim=word_vec_dim,
                                  padding_idx=PAD_token)

        # nn.LSTM applies dropout to the outputs of every layer except the
        # last, so for a single layer a non-zero value does nothing and only
        # triggers a warning.
        inter_layer_dropout = dropout_p if n_layers > 1 else 0.0

        self.rnn = nn.LSTM(word_vec_dim,
                           hidden_size,
                           num_layers=n_layers,
                           dropout=inter_layer_dropout,
                           bidirectional=False,
                           batch_first=True)

        # A bidirectional LSTM would concatenate both directions.
        rnn_output_dim = 2*hidden_size if self.rnn.bidirectional else hidden_size
        self.output = nn.Linear(rnn_output_dim, vocab_size)

    def forward(self, x):
        """Return unnormalised next-symbol scores for every input position.

        Args:
            x: Integer tensor of symbol ids; the LSTM is batch-first, so a
                batched input is laid out as (batch, sequence).

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size ``vocab_size``.
        """
        embedded = self.embed(x)
        # The final hidden/cell states are not needed here.
        hidden_states, _ = self.rnn(embedded)
        return self.output(hidden_states)

In [14]:
# Hyper-parameters of the network used in this experiment.
_model_config = dict(vocab_size=train_dataset.vocab_size,
                     word_vec_dim=128,
                     hidden_size=256,
                     n_layers=2,
                     dropout_p=0.1)

model = RNNLM(**_model_config)
model = model.to(device)

print(model)

RNNLM(
  (embed): Embedding(1245, 128, padding_idx=0)
  (rnn): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.1)
  (output): Linear(in_features=256, out_features=1245, bias=True)
)


In [15]:
learning_rate = 0.001

# The loss is summed (not averaged) over the positions it receives.
criterion = torch.nn.CrossEntropyLoss(reduction='sum')
criterion = criterion.to(device)

# NOTE(review): weight_decay=0.01 may be too strong a penalty here - the
# logged training loss barely moves after the first epoch. Confirm this value
# is intended.
optimizer = torch.optim.Adam(params=model.parameters(),
                             lr=learning_rate,
                             weight_decay=0.01)
# Optional learning-rate decay, currently disabled:
# lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.98)

### 3. Training

In [16]:
# Demo: a boolean mask removes the zero (padding) entries from a tensor.
Y = torch.tensor([1., 2., 3., 4., 0., 0., 0.])
mask = Y.ne(0)

# Mask followed by one blank line.
print(mask, end="\n\n")

# Boolean indexing keeps only the positions where the mask is True.
Y2 = Y[mask]
print(Y2)

tensor([ True,  True,  True,  True, False, False, False])

tensor([1., 2., 3., 4.])


In [17]:
training_epochs = 10

total_batch = len(train_loader)

# torch.save does not create missing directories, so make sure the
# checkpoint folder exists before the first epoch finishes.
os.makedirs("./model", exist_ok=True)

start_time = time.time()

print('Learning started.')

model.train()

for epoch in range(training_epochs):
    # Summed loss and number of non-padding targets seen in this epoch.
    total_loss = 0.
    total_num = 0

    for i, (X, Y) in enumerate(train_loader):
        X = X.to(device)
        Y = Y.to(device)

        optimizer.zero_grad()
        hypothesis = model(X)

        # Collapse every leading dimension so that each row of y_pred holds
        # the scores for one position and target holds the matching id.
        y_pred = hypothesis.contiguous().view(-1, hypothesis.size(-1))
        target = Y.contiguous().view(-1)

        # Index with the boolean mask itself. Wrapping it in a list relies on
        # the legacy "sequence treated as a tuple" indexing rule.
        real_value_index = target != PAD_token

        loss = criterion(y_pred[real_value_index], target[real_value_index])
        loss.backward()

        # Rescale the gradients when their global norm exceeds 5.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)

        optimizer.step()

        total_loss += loss.item()
        total_num += int(real_value_index.sum().item())

        if epoch == 0:
            # Estimate the time left in the first epoch from the average
            # time of the batches completed so far.
            secs = int(time.time() - start_time)
            et = int((total_batch - (i + 1)) * (secs/(i+1)))
            print("(train) : {:4d}/{:4d} # estimated end time: {:10d} (sec.)".format(i+1, total_batch, et), end="\r", flush=True)

    if epoch == 0:
        # Terminate the "\r" progress line so the epoch summary does not
        # overwrite it only partially.
        print()

    # total_loss / total_num is a per-symbol average only if `criterion`
    # returns a summed loss - presumably reduction='sum'; confirm where it is
    # defined.
    avg_loss = total_loss / total_num
    perplexity  = np.exp(avg_loss)

    # Wall-clock time since training started, not per epoch.
    mins, secs = divmod(int(time.time() - start_time), 60)

    print('Epoch: %d' %(epoch + 1), " | ", "time in %d minutes, %d seconds" %(mins, secs))
    print('Epoch: %d' %(epoch + 1), " | ", f'(train)\tAvg.Loss: {avg_loss:.8f}\t\tPPL: {perplexity:.8f}')

    # One checkpoint per epoch, numbered from 001.
    model_fname = "./model/model.ep_{:03d}.pt".format(epoch+1)
    torch.save(model.state_dict(), model_fname)

    print("model saved as file name: {}".format(model_fname))
    print()

print('Learning finished')

Learning started.
Epoch: 1  |  time in 1 minutes, 42 seconds:          0 (sec.)
Epoch: 1  |  (train)	Avg.Loss: 3.94503719		PPL: 51.67825983
model saved as file name: ./model/model.ep_001.pt

Epoch: 2  |  time in 2 minutes, 44 seconds
Epoch: 2  |  (train)	Avg.Loss: 3.86377718		PPL: 47.64497538
model saved as file name: ./model/model.ep_002.pt

Epoch: 3  |  time in 3 minutes, 47 seconds
Epoch: 3  |  (train)	Avg.Loss: 3.85778780		PPL: 47.36046475
model saved as file name: ./model/model.ep_003.pt

Epoch: 4  |  time in 4 minutes, 49 seconds
Epoch: 4  |  (train)	Avg.Loss: 3.85498315		PPL: 47.22782125
model saved as file name: ./model/model.ep_004.pt

Epoch: 5  |  time in 5 minutes, 51 seconds
Epoch: 5  |  (train)	Avg.Loss: 3.84891557		PPL: 46.94212993
model saved as file name: ./model/model.ep_005.pt

Epoch: 6  |  time in 6 minutes, 52 seconds
Epoch: 6  |  (train)	Avg.Loss: 3.84428895		PPL: 46.72544821
model saved as file name: ./model/model.ep_006.pt

Epoch: 7  |  time in 7 minutes, 54 seco

In [18]:
# Load the checkpoint file numbered 010 onto the CPU and copy its weights
# into the existing model.
model_fname = f"./model/model.ep_{10:03d}.pt"
state = torch.load(model_fname, map_location='cpu')
model.load_state_dict(state)

<All keys matched successfully>

### 4. Evaluate

In [19]:
# Evaluation mode.
model.eval()

with torch.no_grad():
    # Summed loss and number of non-padding targets over the validation set.
    total_loss = 0.
    total_num = 0

    for X, Y in valid_loader:
        X = X.to(device)
        Y = Y.to(device)

        hypothesis = model(X)

        # One row of scores per position, one target id per position.
        y_pred = hypothesis.contiguous().view(-1, hypothesis.size(-1))
        target = Y.contiguous().view(-1)

        # Index with the boolean mask itself. Wrapping it in a list relies on
        # the legacy "sequence treated as a tuple" indexing rule.
        real_value_index = target != PAD_token

        loss = criterion(y_pred[real_value_index], target[real_value_index])

        total_loss += loss.item()
        total_num += int(real_value_index.sum().item())

    avg_loss = total_loss / total_num
    perplexity  = np.exp(avg_loss)

    # NOTE: the label below says "test ppl", but the value is computed on
    # valid_loader.
    print('=' * 89)
    print('| End of training | valid loss {:5.2f} | test ppl {:8.2f}'.format(avg_loss, perplexity))
    print('=' * 89)

| End of training | valid loss  3.83 | test ppl    46.17


In [20]:
# Symbol -> id lookup table shared by the helper below (ids are stored as
# strings, hence the int() casts where it is used).
c2i = train_dataset.class_to_idx

def get_prob_sent(sent=""):
    """Print the model's probability of the last character given the rest.

    Despite the name, this is P(sent[-1] | <sos>, sent[:-1]), i.e. the
    conditional probability of the final character only, not the product
    over the whole sentence. The model is switched to evaluation mode.

    Args:
        sent: Non-empty string whose characters are all in the vocabulary.

    Raises:
        ValueError: If ``sent`` is empty (there is no last character).
        KeyError: If a character is missing from ``c2i``.
    """
    if not sent:
        raise ValueError("sent must contain at least one character")

    # Context fed to the model: <sos> followed by everything but the last
    # character; the last character is the one being scored.
    context_ids = [int(c2i['<sos>'])]
    context_ids += [int(c2i[char]) for char in sent[:-1]]
    last_char_id = int(c2i[sent[-1]])

    # Batch of one sentence.
    X = torch.LongTensor([context_ids]).to(device)

    model.eval()

    with torch.no_grad():
        hypothesis = model(X)

    # Turn the scores over the vocabulary (last dimension) into probabilities.
    hypothesis = torch.nn.functional.softmax(hypothesis, dim=-1)

    # Distribution predicted after the final context position.
    prob = hypothesis[:, -1, last_char_id].item()

    print("{} : {:.8f}".format(sent, prob))
    print()
    print()

In [21]:
# Score the last character of a few example inputs (the third one ends in a space).
for sentence in ("안녕하세요", "여기는 어디인가요", "헤어지고 차단하는 "):
    get_prob_sent(sent=sentence)

안녕하세요 : 0.73909706

여기는 어디인가요 : 0.16988163

헤어지고 차단하는  : 0.78526711

