<a href="https://colab.research.google.com/github/sunny0103/DeepLearning_nlp_projects/blob/main/AIhub_translation/aihub_ko_eng_translate_huggingface.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install  transformers[sentencepiece] accelerate  sacremoses evaluate sacrebleu --quiet

In [2]:
import pandas as pd
import numpy as np
import os, gc
import random
from tqdm import tqdm

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    pipeline
    )

from datasets import Dataset

import evaluate

import torch

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [3]:
def seed_everything(seed):
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)

seed_everything(42)

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
cd "/content/drive/MyDrive/Data/aihub_kor_eng_translation"

/content/drive/MyDrive/Data/aihub_kor_eng_translation


AI hub 샘플 데이터에는 6개의 데이터 리스트가 있고 그중에서 문어체로 구성된 데이터로 프로젝트를 진행, 한국어 이름으로 된 데이터를 source 데이터 이름을 영어로 변경
-  '3_문어체_뉴스_190920.xlsx' -> 3_news.xlsx
-  '4_문어체_한국문화_190920.xlsx' -> 4_korean_culture.xlsx
-  '5_문어체_조례_190920.xlsx' -> 5_ordinance.xlsx
-  '6_문어체_지자체웹사이트_190920.xlsx' -> 6_local_govweb.xlsx


## Load and Preprocess the Data

In [6]:
data = pd.read_csv('./total.csv')
data.head()

Unnamed: 0,korean,english,source
0,‘ZKZM-500 레이저 공격용 소총’으로 명명된 이 무기는 15㎜ 구경의 소총이며...,"Named the 'ZKZM-500 Laser Attack Rifle', the w...",0
1,“너희는 세상의 소금이니 소금이 만일 그 맛을 잃으면 무엇으로 짜게 하리요 후에는 ...,"""You are the salt of the earth. But if the sal...",0
2,“너희는 이 세대를 본받지 말고 오직 마음을 새롭게 함으로 변화를 받아 하나님의 선...,"""Do not conform to the pattern of this world, ...",0
3,“너희는 주께 받은바 기름 부음이 너희 안에 거하나니 아무도 너희를 가르칠 필요가 ...,"It is written as ""As for you, the anointing yo...",0
4,“너희는 택하신 족속이요 왕 같은 제사장들이요 거룩한 나라요….”(벧전 2:9) 따...,"""But you are a chosen people, a royal priestho...",0


In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(data.index.values,
                                                    data.source.values,
                                                    test_size=0.3,
                                                    stratify=data.source.values,
                                                    random_state = 42)

In [8]:
data['data_type'] =['not_set'] * data.shape[0]

data.loc[X_train, 'data_type'] = 'train'
data.loc[X_valid, 'data_type'] = 'valid'

In [9]:
X_valid, X_test, y_valid, y_test = train_test_split(data[data['data_type']=='valid'].index.values,
                                                    data[data['data_type']=='valid'].source.values,
                                                    test_size=0.5,
                                                    stratify=data[data['data_type']=='valid'].source.values,
                                                    random_state = 42)

In [10]:
data.loc[X_valid, 'data_type'] = 'valid'
data.loc[X_test, 'data_type'] = 'test'
data.head()

Unnamed: 0,korean,english,source,data_type
0,‘ZKZM-500 레이저 공격용 소총’으로 명명된 이 무기는 15㎜ 구경의 소총이며...,"Named the 'ZKZM-500 Laser Attack Rifle', the w...",0,train
1,“너희는 세상의 소금이니 소금이 만일 그 맛을 잃으면 무엇으로 짜게 하리요 후에는 ...,"""You are the salt of the earth. But if the sal...",0,valid
2,“너희는 이 세대를 본받지 말고 오직 마음을 새롭게 함으로 변화를 받아 하나님의 선...,"""Do not conform to the pattern of this world, ...",0,train
3,“너희는 주께 받은바 기름 부음이 너희 안에 거하나니 아무도 너희를 가르칠 필요가 ...,"It is written as ""As for you, the anointing yo...",0,train
4,“너희는 택하신 족속이요 왕 같은 제사장들이요 거룩한 나라요….”(벧전 2:9) 따...,"""But you are a chosen people, a royal priestho...",0,test


In [11]:
# split train valid test dataset
train_set = data[data.data_type == 'train']
valid_set = data[data.data_type == 'valid']
test_set = data[data.data_type == 'test']

In [12]:
# train_set = train_set.sample(n=2000)
# valid_set = valid_set.sample(n=500)

In [13]:
train_set = train_set.reset_index(drop=True)
valid_set = valid_set.reset_index(drop=True)
test_set = test_set.reset_index(drop=True)

In [14]:
# change data from dataframe to data dict
train_set = Dataset.from_pandas(train_set)
valid_set = Dataset.from_pandas(valid_set)

In [15]:
train_set[0]

{'korean': '‘ZKZM-500 레이저 공격용 소총’으로 명명된 이 무기는 15㎜ 구경의 소총이며, 무게는 3㎏, 조준 사거리는 800m다.',
 'english': "Named the 'ZKZM-500 Laser Attack Rifle', the weapon is a 15-millimeter-caliber rifle, weighing 3 kilograms with a target range of 800 meters.",
 'source': 0,
 'data_type': 'train'}

In [16]:
MODEL_NAME = "Helsinki-NLP/opus-mt-ko-en"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [17]:
prefix = "translate Korean to English: "
max_len = 64
def preprocess_function(examples):
    inputs = [prefix + example for example in examples["korean"]]
    targets = [example for example in examples["english"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_len, truncation=True)
    return model_inputs

In [18]:
tokenized_trainset = train_set.map(
    preprocess_function,
    batched=True,
    remove_columns=train_set.column_names)

Map:   0%|          | 0/31154 [00:00<?, ? examples/s]

In [19]:
tokenized_validset = valid_set.map(
    preprocess_function,
    batched=True,
    remove_columns=valid_set.column_names)

Map:   0%|          | 0/6676 [00:00<?, ? examples/s]

In [20]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=MODEL_NAME)

In [21]:
metric = evaluate.load("sacrebleu")

In [22]:
def postprocess_text(preds, labels):
  preds = [pred.strip() for pred in preds]
  labels = [[label.strip()] for label in labels]
  return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # -100은 건너뛴다.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result ={"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

## Train the Model

In [23]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

In [24]:
gc.collect()
torch.cuda.empty_cache()
# torch.cuda.memory_summary(device=None, abbreviated=False)

In [25]:
BATCH_SIZE = 16
LEARNING_RATE= 2e-5
NUM_EPOCHS= 10
WEIGHT_DECAY = 0.1

In [26]:
training_args  = Seq2SeqTrainingArguments(
    output_dir = 'aihub_translation',
    evaluation_strategy="epoch",
    learning_rate = LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    gradient_accumulation_steps = 8,
    predict_with_generate=True,
    fp16=True, # 고속화 loose한 정확도
    gradient_checkpointing=True # 메모리 절약 대신 느려짐
)

trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    train_dataset=tokenized_trainset,
    eval_dataset=tokenized_validset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



In [27]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
0,No log,1.8513,28.2664,40.8
2,2.143400,1.601588,32.3978,40.7569
4,1.675600,1.516072,34.1255,41.0473
6,1.514200,1.476454,34.906,41.1297
8,1.431500,1.460195,35.2945,41.1892
9,1.431500,1.458382,35.304,41.1833


TrainOutput(global_step=2430, training_loss=1.6382193106192129, metrics={'train_runtime': 7772.9726, 'train_samples_per_second': 40.08, 'train_steps_per_second': 0.313, 'total_flos': 5267041077952512.0, 'train_loss': 1.6382193106192129, 'epoch': 9.98})

## Inference

In [28]:
translator = pipeline("translation",  model=model, tokenizer=tokenizer, device= 0)

In [29]:
translated_reference = []
for idx, (kor, eng) in tqdm(enumerate(zip(test_set['korean'], test_set['english'])), total = len(test_set)):
  translated = translator(kor)
  translated_reference.append([translated[0]['translation_text'], eng ])


100%|██████████| 6677/6677 [57:19<00:00,  1.94it/s]


In [30]:
pd.DataFrame(translated_reference, columns =['Translation', 'Reference'])

Unnamed: 0,Translation,Reference
0,"Accordingly, God trains them to have the ""nobl...","""But you are a chosen people, a royal priestho..."
1,The author of Ep 5:33 said that the golden pri...,"""Each one of you also must love his wife as he..."
2,“ Even the hairs of your head are all numbered...,"He says, ""Indeed, the very hairs of your head ..."
3,"1:11, “Do not bring back any vain offering aga...","""The multitude of your sacrifices-what are the..."
4,"( Job 8:7) Lee Chung-sook, a Seoul Gangseo-gu ...","""Your beginnings will seem humble, so prospero..."
...,...,...
6672,"In addition, Jung-gu, Jung-gu District is work...","In addition, Jung-gu District works with Jung-..."
6673,It was first opened at the end of November las...,Under agreement for co-development of traditio...
6674,"Basung-seok, who participated in the program w...","Bae Sung-seok (tentative name, Hwang Hak-dong)..."
6675,"The Mentoling Study Room, operated by Jung-gu ...","The Mentoring Study Room, which Jung-gu Distri..."
