<a href="https://colab.research.google.com/github/sunny0103/DeepLearning_nlp_projects/blob/main/AIhub_translation/aihub_ko_eng_translate_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !sudo apt-get install -y fonts-nanum
# !sudo fc-cache -fv
# !rm ~/.cache/matplotlib -rf

In [2]:
!pip install konlpy evaluate sacrebleu



In [3]:
import pandas as pd
import numpy as np
import os
import random
from tqdm import tqdm
import warnings

import math

from sklearn.model_selection import train_test_split

from konlpy.tag import Hannanum

import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch import Tensor
from torch.nn import Transformer

import evaluate

warnings.filterwarnings('ignore')

In [4]:
def seed_everything(seed):
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)

seed_everything(42)

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
cd "/content/drive/MyDrive/Data/aihub_kor_eng_translation"

/content/drive/MyDrive/Data/aihub_kor_eng_translation


## Load data and concat

AI hub 샘플 데이터에는 6개의 데이터 리스트가 있고 그중에서 문어체로 구성된 데이터로 프로젝트를 진행, 한국어 이름으로 된 데이터를 source 데이터 이름을 영어로 변경
-  '3_문어체_뉴스_190920.xlsx' -> 3_news.xlsx
-  '4_문어체_한국문화_190920.xlsx' -> 4_korean_culture.xlsx
-  '5_문어체_조례_190920.xlsx' -> 5_ordinance.xlsx
-  '6_문어체_지자체웹사이트_190920.xlsx' -> 6_local_govweb.xlsx

In [7]:
# ## concatenate lists of all the data ###
# files = ['3_news.xlsx', '4_korean_culture.xlsx', '5_ordinance.xlsx', '6_local_govweb.xlsx']
# df = pd.DataFrame()
# for idx, file in enumerate(files):
#   data = pd.read_excel('./'+file)
#   data['source'] = idx # data source 추가
#   data.columns = data.columns.str.lower() # 번역문 중에서 review를 사용
#   data= data[['원문','review','source']]
#   df = pd.concat([df, data], axis=0)
# total = pd.DataFrame(df)
# total.columns =['korean','english','source']
# display(total.head())
# print(total.shape)

# total.to_csv('./total.csv', index=False)

In [8]:
data = pd.read_csv('./total.csv')
data.head()

Unnamed: 0,korean,english,source
0,‘ZKZM-500 레이저 공격용 소총’으로 명명된 이 무기는 15㎜ 구경의 소총이며...,"Named the 'ZKZM-500 Laser Attack Rifle', the w...",0
1,“너희는 세상의 소금이니 소금이 만일 그 맛을 잃으면 무엇으로 짜게 하리요 후에는 ...,"""You are the salt of the earth. But if the sal...",0
2,“너희는 이 세대를 본받지 말고 오직 마음을 새롭게 함으로 변화를 받아 하나님의 선...,"""Do not conform to the pattern of this world, ...",0
3,“너희는 주께 받은바 기름 부음이 너희 안에 거하나니 아무도 너희를 가르칠 필요가 ...,"It is written as ""As for you, the anointing yo...",0
4,“너희는 택하신 족속이요 왕 같은 제사장들이요 거룩한 나라요….”(벧전 2:9) 따...,"""But you are a chosen people, a royal priestho...",0


## Making vocabulary

In [9]:
source_dataset = list(data['korean']) # source data
target_dataset = list(data['english']) # target_dataset

In [10]:
kor_tokenizer = get_tokenizer(Hannanum().morphs)
eng_tokenizer = get_tokenizer('spacy', language='en')

In [11]:
def vocab_iterator(strings, tokenizer):
  for _string in tqdm(strings):
    yield tokenizer(_string)

In [12]:
kor_vocab = build_vocab_from_iterator(vocab_iterator(source_dataset, kor_tokenizer), specials=['<PAD>', '<UNK>', '<SOS>', '<EOS>'], min_freq=5)
kor_vocab.set_default_index(kor_vocab['<UNK>'])

100%|██████████| 44507/44507 [04:57<00:00, 149.46it/s]


In [13]:
eng_vocab = build_vocab_from_iterator(vocab_iterator(target_dataset, eng_tokenizer), specials=['<PAD>', '<UNK>', '<SOS>', '<EOS>'], min_freq=5)
eng_vocab.set_default_index(eng_vocab['<UNK>'])

100%|██████████| 44507/44507 [00:06<00:00, 7228.04it/s] 


## Making train, valid,  test dataset

In [14]:
X_train, X_valid, y_train, y_valid = train_test_split(data.index.values,
                                                    data.source.values,
                                                    test_size=0.3,
                                                    stratify=data.source.values,
                                                    random_state = 42)

In [15]:
data['data_type'] =['not_set'] * data.shape[0]

data.loc[X_train, 'data_type'] = 'train'
data.loc[X_valid, 'data_type'] = 'valid'

In [16]:
X_valid, X_test, y_valid, y_test = train_test_split(data[data['data_type']=='valid'].index.values,
                                                    data[data['data_type']=='valid'].source.values,
                                                    test_size=0.5,
                                                    stratify=data[data['data_type']=='valid'].source.values,
                                                    random_state = 42)

In [17]:
data.loc[X_valid, 'data_type'] = 'valid'
data.loc[X_test, 'data_type'] = 'test'
data.head()

Unnamed: 0,korean,english,source,data_type
0,‘ZKZM-500 레이저 공격용 소총’으로 명명된 이 무기는 15㎜ 구경의 소총이며...,"Named the 'ZKZM-500 Laser Attack Rifle', the w...",0,train
1,“너희는 세상의 소금이니 소금이 만일 그 맛을 잃으면 무엇으로 짜게 하리요 후에는 ...,"""You are the salt of the earth. But if the sal...",0,valid
2,“너희는 이 세대를 본받지 말고 오직 마음을 새롭게 함으로 변화를 받아 하나님의 선...,"""Do not conform to the pattern of this world, ...",0,train
3,“너희는 주께 받은바 기름 부음이 너희 안에 거하나니 아무도 너희를 가르칠 필요가 ...,"It is written as ""As for you, the anointing yo...",0,train
4,“너희는 택하신 족속이요 왕 같은 제사장들이요 거룩한 나라요….”(벧전 2:9) 따...,"""But you are a chosen people, a royal priestho...",0,test


In [18]:
# split train valid test dataset
train_set = data[data.data_type == 'train']
valid_set = data[data.data_type == 'valid']
test_set = data[data.data_type == 'test']

In [19]:
# # sampling the set for cheching code
# train_set = train_set.sample(n=2000)
# valid_set = valid_set.sample(n=500)

In [20]:
def data_process(source_dataset, target_dataset):
  data = []
  for (raw_kor, raw_eng) in tqdm(zip(source_dataset, target_dataset)):
    kor_tensor_ = torch.tensor([kor_vocab[token] for token in kor_tokenizer(raw_kor)], dtype=torch.long)
    eng_tensor_ = torch.tensor([eng_vocab[token] for token in eng_tokenizer(raw_eng)], dtype=torch.long)
    data.append((kor_tensor_, eng_tensor_))

  return data

In [21]:
train_dataset = data_process(train_set['korean'], train_set['english'])
valid_dataset = data_process(valid_set['korean'], valid_set['english'])
test_dataset = data_process(test_set['korean'], test_set['english'])

31154it [04:27, 116.47it/s]
6676it [00:57, 115.73it/s]
6677it [00:58, 114.82it/s]


In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu' )

In [23]:
# print(kor_vocab['<SOS>'], kor_vocab['<EOS>'],kor_vocab['<UNK>'],kor_vocab['<PAD>'])
# print(eng_vocab['<SOS>'], eng_vocab['<EOS>'],eng_vocab['<UNK>'],eng_vocab['<PAD>'])

In [24]:
PAD_IDX = kor_vocab['<PAD>']
UNK_IDX = kor_vocab['<UNK>']
SOS_IDX = kor_vocab['<SOS>']
EOS_IDX = kor_vocab['<EOS>']

In [25]:
def collate_fn(dataset):
  kor_batch, eng_batch = [], []
  for (kor_item, eng_item) in dataset:
    kor_batch.append(torch.cat([torch.tensor([SOS_IDX]), kor_item, torch.tensor([EOS_IDX])], dim=0))
    eng_batch.append(torch.cat([torch.tensor([SOS_IDX]), eng_item, torch.tensor([EOS_IDX])], dim=0))

  kor_batch = pad_sequence(kor_batch, padding_value=PAD_IDX)
  eng_batch = pad_sequence(eng_batch, padding_value=PAD_IDX)

  return kor_batch, eng_batch

In [26]:
BATCH_SIZE = 16

trainloader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True, collate_fn= collate_fn )
validloader = DataLoader(valid_dataset, batch_size = BATCH_SIZE, shuffle = False, collate_fn= collate_fn )
testloader = DataLoader(test_dataset, batch_size = BATCH_SIZE, shuffle = False, collate_fn= collate_fn )

## Model class

In [27]:
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, max_len: int = 5000):
        super(PositionalEncoding, self).__init__()

        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)

        pos = torch.arange(0, max_len).reshape(max_len, 1)

        pos_embedding = torch.zeros((max_len, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)

        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

In [28]:
class TokenEmbedding(nn.Module):
  def __init__(self, vocab_size, emb_size):
    super(TokenEmbedding, self).__init__()
    self.embedding = nn.Embedding(vocab_size, emb_size)
    self.emb_size = emb_size

  def forward(self, tokens:Tensor):
    return self.embedding(tokens.long())* math.sqrt(self.emb_size)

In [29]:
class Seq2SeqTransformer(nn.Module):
  def __init__(self,
               num_encoder_layers: int,
               num_decoder_layers: int,
               emb_size: int,
               num_head: int,
               src_vocab_size:int,
               tar_vocab_size:int,
               dim_feedforward:int = 512,
               dropout:float =0.1):
    super(Seq2SeqTransformer, self).__init__()

    self.transformer = Transformer(
        d_model=emb_size,
        nhead=num_head,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        dim_feedforward=dim_feedforward,
        dropout=dropout
        )
    self.generator = nn.Linear(emb_size, tar_vocab_size)
    self.src_token_emb = TokenEmbedding(src_vocab_size, emb_size)
    self.tar_token_emb = TokenEmbedding(tar_vocab_size, emb_size)
    self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

  def forward(self,
              src:Tensor,
              tar:Tensor,
              src_mask: Tensor,
              tar_mask: Tensor,
              src_padding_mask: Tensor,
              tar_padding_mask:Tensor,
              memory_key_padding_mask:Tensor
              ):
    src_emb = self.positional_encoding(self.src_token_emb(src))
    tar_emb = self.positional_encoding(self.tar_token_emb(tar))

    outs = self.transformer(src_emb, tar_emb, src_mask, tar_mask, None, src_padding_mask, tar_padding_mask, memory_key_padding_mask)

    return self.generator(outs)

  def encode(self, src:Tensor, src_mask:Tensor):
    return self.transformer.encoder(self.positional_encoding(self.src_token_emb(src)), src_mask)

  def decode(self, tar:Tensor, memory: Tensor, tar_mask:Tensor):
    return self.transformer.decoder(self.positional_encoding(self.tar_token_emb(tar)), memory, tar_mask)

In [30]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

In [31]:
def create_mask(src, tar):
  src_seq_len = src.shape[0]
  tar_seq_len = tar.shape[0]

  tar_mask = generate_square_subsequent_mask(tar_seq_len)
  src_mask = torch.zeros((src_seq_len, src_seq_len), device=device).type(torch.bool)

  src_padding_mask = (src == PAD_IDX).transpose(0, 1)
  tar_padding_mask = (tar == PAD_IDX).transpose(0, 1)

  return src_mask, tar_mask, src_padding_mask, tar_padding_mask

In [32]:
kor_vocab_size = kor_vocab.__len__()
eng_vocab_size = eng_vocab.__len__()

embedding_dim = 512
hidden_dim = 512
num_heads = 8
batch_size = 128
encoder_layer_num = 3
decoder_layer_num = 3

In [33]:
print(f"kor_vocab_size is {kor_vocab.__len__()}")
print(f"eng_vocab_size is {eng_vocab.__len__()}")

kor_vocab_size is 13827
eng_vocab_size is 12299


In [34]:
model = Seq2SeqTransformer(encoder_layer_num,
                           decoder_layer_num,
                           embedding_dim,
                           num_heads,
                           kor_vocab_size,
                           eng_vocab_size,
                           hidden_dim
                           )

for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

model = model.to(device)

In [35]:
LEARNING_RATE= 2e-5
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.AdamW(model.parameters(), lr = LEARNING_RATE, betas=(0.9, 0.98), eps=1e-9)

In [36]:
from nltk.translate.bleu_score import corpus_bleu

eng_itos = eng_vocab.get_itos()

def compute_metrics(predictions, target_outs):
  special_tokens =[PAD_IDX, EOS_IDX, SOS_IDX, UNK_IDX]
  prediction_sentences, target_sentences = [], []
  for i in range(len(predictions)):
    pred_sentence = [eng_itos[token]  for token in predictions[i] if token not in special_tokens]
    target_sentence = [eng_itos[token] for token in target_outs[i] if token not in special_tokens]
    prediction_sentences.append(pred_sentence)
    target_sentences.append(target_sentence)
  score = corpus_bleu(prediction_sentences, target_sentences,  weights=(1, 0, 0, 0))
  return score


In [40]:
class Trainer():
  def __init__(self, trainloader, validloader, testloader, model, optimizer, criterion, device):
    self.trainloader = trainloader
    self.validloader = validloader
    self.testloader = testloader
    self.model = model
    self.optimizer = optimizer
    self.criterion = criterion
    self.device = device

  def train(self, num_epochs=1):
    self.model.train()

    for epoch in range(1, num_epochs+1):
      print('===========Epoch:{:1d}/{:1d}==================='.format(epoch, num_epochs))
      running_loss = 0.0

      for src, tar in tqdm(self.trainloader):
        src = src.to(self.device)
        tar = tar.to(self.device)

        tar_input = tar[:-1, :]

        src_mask, tar_mask, src_padding_mask, tar_padding_mask = create_mask(src, tar_input)

        logits = self.model(src, tar_input, src_mask, tar_mask,src_padding_mask, tar_padding_mask, src_padding_mask)

        self.optimizer.zero_grad()

        tar_out = tar[1:, :]
        loss = self.criterion(logits.reshape(-1, logits.shape[-1]), tar_out.reshape(-1))
        loss.backward()

        self.optimizer.step()
        running_loss += loss.item()

      trian_loss = running_loss/len(self.trainloader)
      predicted_list, target_outs, valid_loss = self.validate()
      tqdm.write('Train Loss:{:.3f}, Validation Loss:{:.3f}, BLEU Score:{:.3f}'.
                 format(trian_loss, valid_loss, compute_metrics(predicted_list, target_outs)*100))


  def validate(self):
    self.model.eval()
    running_loss = 0.0
    predicted_list, target_outs =[], []

    for src, tar in tqdm(self.validloader):
      src = src.to(self.device)
      tar = tar.to(self.device)

      tar_input = tar[:-1, :]

      src_mask, tar_mask, src_padding_mask, tar_padding_mask = create_mask(src, tar_input)

      logits = self.model(src, tar_input, src_mask, tar_mask, src_padding_mask, tar_padding_mask, src_padding_mask)

      tar_out = tar[1:, :]
      loss = self.criterion(logits.reshape(-1, logits.shape[-1]), tar_out.reshape(-1))
      running_loss += loss.item()

      predicted = torch.argmax(logits, dim=-1).flatten().detach().cpu().numpy().tolist()
      predicted_list.append(predicted)
      tar_out = tar_out.flatten().detach().cpu().numpy().tolist()
      target_outs.append(tar_out)

    valid_loss = running_loss/len(self.validloader)

    return predicted_list, target_outs, valid_loss

  def test(self):
    self.model.eval()
    predicted_list, target_outs =[], []

    for src, tar in tqdm(self.testloader):
      src = src.to(self.device)
      tar = tar.to(self.device)

      tar_input = tar[:-1, :]

      src_mask, tar_mask, src_padding_mask, tar_padding_mask = create_mask(src, tar_input)

      logits = self.model(src, tar_input, src_mask, tar_mask, src_padding_mask, tar_padding_mask, src_padding_mask)

      tar_out = tar[1:, :]

      predicted = torch.argmax(logits, dim=-1).flatten().detach().cpu().numpy().tolist()
      predicted_list.append(predicted)
      tar_out = tar_out.flatten().detach().cpu().numpy().tolist()
      target_outs.append(tar_out)

    return predicted_list, target_outs

In [42]:
trainer = Trainer(trainloader= trainloader,
                  validloader = validloader,
                  testloader = testloader,
                  model = model,
                  optimizer = optimizer,
                  criterion = criterion,
                  device = device)
trainer.train(num_epochs=10)



100%|██████████| 1948/1948 [01:05<00:00, 29.79it/s]
100%|██████████| 418/418 [00:05<00:00, 69.89it/s]


Train Loss:5.149, Validation Loss:4.860, BLEU Score:1.465


100%|██████████| 1948/1948 [01:00<00:00, 32.46it/s]
100%|██████████| 418/418 [00:06<00:00, 63.31it/s]


Train Loss:4.619, Validation Loss:4.515, BLEU Score:1.603


100%|██████████| 1948/1948 [01:00<00:00, 32.27it/s]
100%|██████████| 418/418 [00:07<00:00, 55.18it/s]


Train Loss:4.324, Validation Loss:4.296, BLEU Score:1.699


100%|██████████| 1948/1948 [01:06<00:00, 29.48it/s]
100%|██████████| 418/418 [00:07<00:00, 54.84it/s]


Train Loss:4.104, Validation Loss:4.146, BLEU Score:1.769


100%|██████████| 1948/1948 [01:03<00:00, 30.45it/s]
100%|██████████| 418/418 [00:05<00:00, 72.79it/s]


Train Loss:3.923, Validation Loss:4.020, BLEU Score:1.817


100%|██████████| 1948/1948 [01:02<00:00, 31.32it/s]
100%|██████████| 418/418 [00:07<00:00, 52.28it/s]


Train Loss:3.767, Validation Loss:3.933, BLEU Score:1.870


100%|██████████| 1948/1948 [01:02<00:00, 30.95it/s]
100%|██████████| 418/418 [00:07<00:00, 59.44it/s]


Train Loss:3.622, Validation Loss:3.853, BLEU Score:1.884


100%|██████████| 1948/1948 [01:02<00:00, 31.16it/s]
100%|██████████| 418/418 [00:05<00:00, 74.74it/s]


Train Loss:3.487, Validation Loss:3.795, BLEU Score:1.901


100%|██████████| 1948/1948 [01:00<00:00, 31.95it/s]
100%|██████████| 418/418 [00:05<00:00, 72.52it/s]


Train Loss:3.357, Validation Loss:3.749, BLEU Score:1.925


100%|██████████| 1948/1948 [01:01<00:00, 31.85it/s]
100%|██████████| 418/418 [00:06<00:00, 68.14it/s]


Train Loss:3.231, Validation Loss:3.713, BLEU Score:1.949


In [43]:
predicted_list, target_outs = trainer.test()

100%|██████████| 418/418 [00:06<00:00, 61.64it/s]


In [46]:
# 100 is exact match
print('Test set BLEU Score: {:.3f}% '. format(compute_metrics(predicted_list, target_outs)*100))

Test set BLEU Score: 1.957% 


In [47]:
# special_tokens =[PAD_IDX, EOS_IDX, SOS_IDX, UNK_IDX]
# infer_refer_list = []
# for i in range(len(predicted_list)):
#   pred_sentence = ' '.join([eng_itos[token]  for token in predicted_list[i] if token not in special_tokens])
#   target_sentence = ' '.join([eng_itos[token] for token in target_outs[i] if token not in special_tokens])
#   infer_refer_list.append([pred_sentence, target_sentence])


In [51]:
# pd.DataFrame(infer_refer_list)