<a href="https://colab.research.google.com/github/ugonfor/Kr-CharRNN-pytorch/blob/main/Korean_Char_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Korean Char RNN**
Char RNN for **Hangul**(Korean Letter) implemented in Pytorch.

I use [hangul-toolkit](https://github.com/bluedisk/hangul-toolkit) to decompose each Hangul syllable into its jamo (consonant and vowel letters). Then I embed each jamo as a vector.


## Training Data
데이터셋으로는 소나기(황순원)을 사용하였습니다.
구글링을 해보면 쉽게 구할 수 있습니다.



## Module
한글로 문자 단위 RNN을 만들 때는, 영어와 다르게 한글에는 자모가 있기에 자음·모음 단위로 Char RNN을 생성하는 것이 맞다고 생각했습니다.

Github에 찾아보면 한글 텍스트를 자음모음으로 분리해주는 프로젝트를 찾아볼 수 있고, 저는 hangul-toolkit을 사용하였습니다.


In [3]:
!pip install hgtk

Collecting hgtk
  Downloading https://files.pythonhosted.org/packages/79/04/04758ed8c086fb1d9a5a267f90239533d33dbc1646ac32f8bf80e38b0ec7/hgtk-0.1.3.tar.gz
Building wheels for collected packages: hgtk
  Building wheel for hgtk (setup.py) ... [?25l[?25hdone
  Created wheel for hgtk: filename=hgtk-0.1.3-py2.py3-none-any.whl size=6689 sha256=52c37ba60d6a8f5b847b1141f8d2c668ea5af1586d1eea279660e642be26d03d
  Stored in directory: /root/.cache/pip/wheels/73/72/06/6065a57fe68264f35d7e52e37f56831eb3e9ec75656880de20
Successfully built hgtk
Installing collected packages: hgtk
Successfully installed hgtk-0.1.3


In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable


# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

class CharRNN(nn.Module):
    """Character-level recurrent model: embedding -> GRU/LSTM -> linear decoder.

    Args:
        input_size: Number of distinct input symbols (rows of the embedding
            table).
        hidden_size: Width of both the embedding vectors and the recurrent
            hidden state.
        output_size: Number of scores produced per step by the decoder.
        rnn_model: ``"lstm"`` or ``"gru"`` (case-insensitive).
        n_layer: Number of stacked recurrent layers.

    Raises:
        ValueError: If ``rnn_model`` is neither ``"lstm"`` nor ``"gru"``.
    """

    def __init__(self, input_size, hidden_size, output_size, rnn_model='lstm', n_layer=1):
        super(CharRNN, self).__init__()

        self.rnn_model = rnn_model.lower()  # which recurrent cell to build
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layer = n_layer  # depth of the recurrent stack

        # An embedding lookup stands in for a one-hot x weight-matrix product.
        self.encoder = nn.Embedding(input_size, hidden_size)
        if self.rnn_model == "gru":
            self.rnn = nn.GRU(hidden_size, hidden_size, n_layer)
        elif self.rnn_model == "lstm":
            self.rnn = nn.LSTM(hidden_size, hidden_size, n_layer)
        else:
            # Fail here instead of leaving ``self.rnn`` undefined and crashing
            # later inside ``forward`` with an unrelated AttributeError.
            raise ValueError(
                f"unsupported rnn_model {rnn_model!r}; expected 'lstm' or 'gru'"
            )
        self.decoder = nn.Linear(hidden_size, output_size, bias=True)

    def forward(self, _input, hidden):
        """Run the model over ``_input`` starting from ``hidden``.

        Args:
            _input: Integer tensor of symbol indices. The recurrent layer is
                built without ``batch_first``, so a 2-D input is read as
                ``(seq_len, batch)``.
            hidden: Recurrent state as returned by ``init_hidden`` or by a
                previous call (a tensor for GRU, an ``(h, c)`` tuple for LSTM).

        Returns:
            ``(output, hidden)`` where ``output`` holds ``output_size`` scores
            per input position and ``hidden`` is the updated recurrent state.
        """
        # Index tensor -> dense vectors of width ``hidden_size``.
        encoded = self.encoder(_input)

        # GRU carries one state tensor; LSTM carries a (hidden, cell) pair.
        output, hidden = self.rnn(encoded, hidden)

        # Project from the hidden width back to per-symbol scores.
        output = self.decoder(output)

        return output, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero initial state for ``batch_size`` sequences.

        The state is created with the same device and dtype as the model's
        parameters, so it is usable wherever the model currently lives.

        Returns:
            A ``(n_layer, batch_size, hidden_size)`` tensor for GRU, or a
            tuple of two such tensors ``(h0, c0)`` for LSTM.
        """
        reference = next(self.parameters())
        shape = (self.n_layer, batch_size, self.hidden_size)

        if self.rnn_model == "lstm":
            return (reference.new_zeros(shape), reference.new_zeros(shape))

        return reference.new_zeros(shape)

In [32]:
import torch
import torch.nn as nn
import hgtk

# Symbol table used to turn jamo / punctuation / digits into integer ids.
# Position 0 is the empty string; every other entry is one character of the
# literal below, in order.
hangul_set = ['', *"ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㄳㄵㄶㄺㄻㄼㄽㄾㄿㅀㅄㅏㅑㅓㅕㅗㅛㅜㅠㅡㅣㅐㅒㅔㅖㅘㅙㅚㅝㅞㅟㅢ\"\' .?!,\n1234567890"]

# Number of entries in the symbol table.
hangul_num = len(hangul_set)

def hangul_one_hot_tensor(string):
    """Encode ``string`` as one-hot rows over ``hangul_set``, three per character.

    A character that ``hgtk.letter.decompose`` can split contributes one
    one-hot row per returned jamo (an empty jamo maps to index 0 of
    ``hangul_set``). Any other character is looked up directly in
    ``hangul_set`` and marked in the first row only; its other two rows stay
    all-zero.

    Args:
        string: Text to encode. May be empty.

    Returns:
        A ``torch.long`` tensor of shape ``(len(string), 3, hangul_num)``.

    Raises:
        ValueError: If a character can neither be decomposed nor found in
            ``hangul_set``.
    """
    # Allocate with an explicit shape: reshaping a flat buffer with ``-1``
    # is ambiguous (and raises) when ``string`` is empty.
    tensor = torch.zeros(len(string), 3, hangul_num, dtype=torch.long)

    for idx, char in enumerate(string):
        try:
            # Split the syllable into its jamo and mark each slot.
            jamo = list(hgtk.letter.decompose(char))
            for slot in range(3):
                tensor[idx][slot][hangul_set.index(jamo[slot])] = 1
        except Exception:
            # Best-effort fallback for anything hgtk rejects (presumably
            # non-Hangul input): treat it as a single symbol. ``Exception``
            # rather than a bare ``except`` so Ctrl-C still interrupts.
            tensor[idx][0][hangul_set.index(char)] = 1

    return tensor

def hangul_int_tensor(string):
    """Encode ``string`` as ``hangul_set`` indices, three per character.

    A character that ``hgtk.letter.decompose`` can split is stored as the
    indices of its three jamo (an empty jamo maps to index 0). Any other
    character is looked up directly in ``hangul_set`` and stored in the first
    column; the remaining two columns stay 0.

    Args:
        string: Text to encode. May be empty.

    Returns:
        A ``torch.long`` tensor of shape ``(len(string), 3)``.

    Raises:
        ValueError: If a character can neither be decomposed nor found in
            ``hangul_set``.
    """
    # Allocate with an explicit shape: reshaping a flat buffer with ``-1``
    # is ambiguous (and raises) when ``string`` is empty.
    tensor = torch.zeros(len(string), 3, dtype=torch.long)

    for idx, char in enumerate(string):
        try:
            # Split the syllable into its jamo and look each one up.
            jamo = list(hgtk.letter.decompose(char))
            indices = [hangul_set.index(jamo[slot]) for slot in range(3)]
        except Exception:
            # Best-effort fallback for anything hgtk rejects (presumably
            # non-Hangul input): treat it as a single symbol. ``Exception``
            # rather than a bare ``except`` so Ctrl-C still interrupts.
            tensor[idx][0] = hangul_set.index(char)
        else:
            tensor[idx] = torch.tensor(indices, dtype=torch.long)

    return tensor

if __name__=="__main__":
    # Quick demonstration of both encoders on one sentence; each encoding is
    # computed once and reused for the value print and the size print.
    _demo_text = "단발 머리를 나풀거리며 소녀가 막 달린다. 갈밭 사잇길로 들어섰다. 뒤에는 청량한 가을 햇살 아래 빛나는 갈꽃뿐."

    _demo_one_hot = hangul_one_hot_tensor(_demo_text)
    print(_demo_one_hot)
    print(f'size() : {_demo_one_hot.size()}')

    _demo_ints = hangul_int_tensor(_demo_text)
    print(_demo_ints)
    print(f'size() : {_demo_ints.size()}')

    # Tiny embedding table over the symbol set, to show the lookup shapes.
    embeddinglayer = nn.Embedding(hangul_num, 3)
    print(embeddinglayer.weight)
    print(f'size() : {embeddinglayer.weight.size()}')

    _demo_pair = hangul_int_tensor("아아")
    print(_demo_pair.size())
    print(embeddinglayer(_demo_pair))



tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        ...,

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 1,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]])
size() : torch.Size([63, 3, 70])
tensor([[ 3, 31,  2],
        [ 6, 31,  4],
        [54,  0,  0],
        [ 5, 33,  0],
        [ 4, 40,  0],
        [ 4, 39,  4],
        [54,  0,  0],
        [ 2, 31,  0],
        [13, 37,  4],
        [ 1, 33,  0],
        [ 4, 40,  0],
        [ 5, 34,  0],
        [54,  0,  0],
        [ 7, 35,  0],
   

In [45]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.autograd import Variable

# Target device for tensors created in this cell: GPU if available, else CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

def generate_text(_model, start_str, predict_len, cuda=False, temperature=0.8):
    """Sample ``predict_len`` characters from ``_model`` after ``start_str``.

    Every character is fed to the model as a batch of three indices (the three
    slots produced by ``hangul_int_tensor``), and three indices are sampled
    back per step, one per slot.

    Args:
        _model: Model exposing ``init_hidden(batch_size)`` and
            ``__call__(input, hidden) -> (output, hidden)``.
        start_str: Non-empty seed text. It is printed and also prefixes the
            returned string.
        predict_len: Number of characters to generate.
        cuda: Unused; kept so existing callers keep working. Tensors are
            placed on the module-level ``device``.
        temperature: Positive divisor applied to the scores before sampling;
            lower values make sampling more conservative.

    Returns:
        ``start_str`` followed by the generated characters.

    Raises:
        ValueError: If ``start_str`` is empty or ``temperature`` is not
            positive.
    """
    if not start_str:
        raise ValueError("start_str must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    hidden = _model.init_hidden(3)  # one batch entry per jamo slot
    # Shape (1, n_chars, 3): axis 1 walks over the characters of start_str.
    start_input = hangul_int_tensor(start_str).unsqueeze(0).to(device)
    print(start_str)
    pieces = [start_str]

    # Generation needs no gradients.
    with torch.no_grad():
        # Prime the hidden state on every seed character except the last.
        # The character count is size(1); len() would be the leading 1.
        for p in range(start_input.size(1) - 1):
            _, hidden = _model(start_input[:, p], hidden)

        _input = start_input[:, -1]
        for _ in range(predict_len):
            output, hidden = _model(_input, hidden)

            # One distribution per slot. softmax(x / T) is proportional to
            # exp(x / T) but cannot overflow.
            probs = torch.softmax(output.view(3, -1) / temperature, dim=-1)
            sampled = torch.multinomial(probs, 1).view(-1)  # one draw per slot

            first, second, third = (hangul_set[i] for i in sampled.tolist())
            try:
                predicted_char = hgtk.letter.compose(first, second, third)
            except hgtk.exception.NotHangulException:
                # Not a composable syllable: emit the raw symbols instead.
                predicted_char = first + second + third

            pieces.append(predicted_char)

            # The sampled indices become the next step's input, shape (1, 3).
            _input = sampled.unsqueeze(0)

    return "".join(pieces)


In [51]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm_notebook
from torch.autograd import Variable

import random
import time


# Load the Hangul training corpus. The context manager closes the file handle
# as soon as the text has been read.
with open("data.txt", "rt", encoding='utf-8') as _corpus_file:
    hangul_data = _corpus_file.read()

# Run configuration.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 100  # samples per training step
chunk_len = 100   # characters per sample

# Model, loss and optimizer.
model = CharRNN(input_size=hangul_num, hidden_size=10, output_size=hangul_num).to(device=device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

def generate_traning_set(_chunk_len, _batch_size):
    """Draw a random batch of (input, target) index sequences from the corpus.

    Each sample is a window of ``_chunk_len + 1`` consecutive characters of
    ``hangul_data``. The input is the encoding of the first ``_chunk_len``
    characters and the target is the encoding of the last ``_chunk_len``
    characters, i.e. the input shifted by one character.

    Args:
        _chunk_len: Number of characters per sample.
        _batch_size: Number of samples in the batch.

    Returns:
        ``(_input, _target)``, both ``torch.long`` tensors of shape
        ``(_batch_size, 3 * _chunk_len)`` on the module-level ``device``.

    Raises:
        ValueError: If the corpus has fewer than ``_chunk_len + 1`` characters.
    """
    # ``random.randint`` includes its upper bound, and every window needs
    # ``_chunk_len + 1`` characters, so the last valid start is len - chunk - 1.
    last_start = len(hangul_data) - _chunk_len - 1
    if last_start < 0:
        raise ValueError(
            f"corpus has {len(hangul_data)} characters; "
            f"need at least {_chunk_len + 1}"
        )

    _input = torch.empty(_batch_size, 3 * _chunk_len, dtype=torch.long)
    _target = torch.empty_like(_input)

    for idx in range(_batch_size):
        _start_index = random.randint(0, last_start)
        chunk = hangul_data[_start_index:_start_index + _chunk_len + 1]

        # Characters are encoded independently, so encode the window once and
        # slice it into the two views.
        encoded = hangul_int_tensor(chunk)
        _input[idx] = encoded[:-1].reshape(-1)
        _target[idx] = encoded[1:].reshape(-1)

    return _input.to(device), _target.to(device)


def train(_input, _target):
    """Run one optimisation step on a batch and return its mean step loss.

    Args:
        _input: Index tensor whose first axis is the batch and whose second
            axis holds the per-sample sequence.
        _target: Index tensor of the same layout holding the expected ids.

    Returns:
        A detached 0-dim tensor: the summed loss divided by ``chunk_len``.
    """
    # Size the state and the reshape from the batch actually passed in, not
    # from the module-level ``batch_size``.
    n_samples = _input.size(0)
    hidden = model.init_hidden(n_samples)

    # The optimizer was built from model.parameters(), so this one call clears
    # every gradient.
    optimizer.zero_grad()
    loss = 0

    # NOTE(review): only the first ``chunk_len`` columns are consumed, while
    # the batches built in this file have 3 * chunk_len columns - confirm this
    # is intended.
    for idx in range(chunk_len):
        # One step for the whole batch: input shape (seq_len=1, batch).
        output, hidden = model(_input[:, idx].view(1, -1), hidden)
        loss += criterion(output.view(n_samples, -1), _target[:, idx])

    loss.backward()
    optimizer.step()

    return loss.detach() / chunk_len


def main_train(epochs):
    """Train the module-level ``model`` for ``epochs`` randomly drawn batches.

    Prints the device, then one progress line per batch with the elapsed time,
    the step number, the completion percentage and that batch's loss.

    Args:
        epochs: Number of training steps; each step draws a fresh batch.
    """
    # ``tqdm.tqdm_notebook`` is deprecated; tqdm itself points to
    # ``tqdm.notebook.tqdm`` as the replacement.
    from tqdm.notebook import tqdm as notebook_tqdm

    print(f'Device : {device}')
    start = time.time()

    print("Training for %d epochs..." % epochs)
    model.train()
    for epoch in notebook_tqdm(range(1, epochs + 1)):
        loss = train(*generate_traning_set(chunk_len, batch_size))

        print(f'\ntime : {time.time() - start :>10.2f}    epoch : {epoch} ({epoch/epochs * 100:>10.2f} % ) loss : {loss:>10.2f}')
    print('Training Done!')

def main_eval():
    """Switch the model to eval mode and print a 100-character sample."""
    print('Evaluate the model')
    model.eval()
    sample = generate_text(model, "와", 100)
    print(sample)


if __name__ == "__main__":
    # Train first, then print a generated sample from the trained model.
    main_train(epochs=3000)
    main_eval()

Device : cuda
Training for 3000 epochs...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
time :     372.99    epoch : 503 (     16.77 % ) loss :       1.77

time :     373.73    epoch : 504 (     16.80 % ) loss :       1.78

time :     374.47    epoch : 505 (     16.83 % ) loss :       1.77

time :     375.20    epoch : 506 (     16.87 % ) loss :       1.77

time :     375.95    epoch : 507 (     16.90 % ) loss :       1.75

time :     376.66    epoch : 508 (     16.93 % ) loss :       1.79

time :     377.41    epoch : 509 (     16.97 % ) loss :       1.77

time :     378.16    epoch : 510 (     17.00 % ) loss :       1.78

time :     378.89    epoch : 511 (     17.03 % ) loss :       1.78

time :     379.62    epoch : 512 (     17.07 % ) loss :       1.76

time :     380.36    epoch : 513 (     17.10 % ) loss :       1.77

time :     381.11    epoch : 514 (     17.13 % ) loss :       1.76

time :     381.83    epoch : 515 (     17.17 % ) loss :       1.75

time :     382.54    epoch : 516 (     17.20 % ) loss :       1.77