In [1]:
import numpy as np
import pandas as pd
import pymysql
import pymysql.cursors as cursors
import torch
import multiprocessing

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from datasets import load_dataset, load_metric

In [2]:
# !pip install datasets

In [3]:
# 데이터 베이스 연결하기
conn = pymysql.connect(host='1.251.203.204',
                       user='root',
                       password='kdt5',
                       db='Team4',
                       charset='utf8',
                       port=33065)

curs = conn.cursor()

# 검색 명령어 사용 
sql = "SELECT eng.text as en, kor.text as ko FROM language_en eng join language_ko kor on eng.id = kor.id;"
curs.execute(sql)
result = curs.fetchall()
print("현재 테이블의 데이터수는 총 {}개 입니다.".format(len(result)))
endeDF = pd.read_sql(sql, conn)

# 데이터베이스 연결 종료
conn.close()
endeDF.head()

현재 테이블의 데이터수는 총 31102개 입니다.


  endeDF = pd.read_sql(sql, conn)


Unnamed: 0,en,ko
0,In the beginning God created the heaven and th...,태초에 하나님이 천지를 창조하시니라
1,"And the earth was without form, and void; and ...",땅이 혼돈하고 공허하며 흑암이 깊음 위에 있고 하나님의 신은 수면에 운행하시니라
2,"And God said, Let there be light: and there wa...",하나님이 가라사대 빛이 있으라 하시매 빛이 있었고
3,"And God saw the light, that it was good: and G...",그 빛이 하나님의 보시기에 좋았더라 하나님이 빛과 어두움을 나누사
4,"And God called the light Day, and the darkness...",빛을 낮이라 칭하시고 어두움을 밤이라 칭하시니라 저녁이 되며 아침이 되니 이는 첫째...


In [4]:
num_train = 30000
num_valid = 1000
num_test = 1102

bible_trainDF = endeDF.iloc[:num_train]
bible_validDF = endeDF.iloc[num_train : num_train + num_valid]
bible_testDF = endeDF.iloc[num_train + num_valid :]

bible_trainDF.to_csv("./data/train.tsv", sep="\t", index=False)
bible_validDF.to_csv("./data/valid.tsv", sep="\t", index=False)
bible_testDF.to_csv("./data/test.tsv", sep="\t", index=False)

data_files = {
    "train": "./data/train.tsv",
    "valid": "./data/valid.tsv",
    "test": "./data/test.tsv",
}

dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'ko'],
        num_rows: 30000
    })
    valid: Dataset({
        features: ['en', 'ko'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['en', 'ko'],
        num_rows: 102
    })
})

In [6]:
print(dataset["train"][:3]["en"])
print(dataset["train"][:3]["ko"])

['In the beginning God created the heaven and the earth.', 'And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.', 'And God said, Let there be light: and there was light.']
['태초에 하나님이 천지를 창조하시니라', '땅이 혼돈하고 공허하며 흑암이 깊음 위에 있고 하나님의 신은 수면에 운행하시니라', '하나님이 가라사대 빛이 있으라 하시매 빛이 있었고']


In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [8]:
model_ckpt = "KETI-AIR/ke-t5-base" # "dylanmengzhou/kobart-trans-en-ko-v2" # "Reyansh4/NMT_T5_wmt14_en_to_de"
max_token_length = 64

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
def convert_examples_to_features(tokenizer, max_token_length, examples):
    model_inputs = tokenizer(
        examples["en"],
        text_target=examples["ko"],
        max_length=max_token_length,
        truncation=True,
    )

    return model_inputs

In [11]:
NUM_CPU = multiprocessing.cpu_count()
NUM_CPU

20

In [12]:
from functools import partial

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
partial_tokenize_function = partial(
    convert_examples_to_features, tokenizer, max_token_length
)
tokenized_datasets = dataset.map(
    partial_tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    num_proc=NUM_CPU,
)

Map (num_proc=20):   0%|          | 0/30000 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/102 [00:00<?, ? examples/s]

In [13]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 30000
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 102
    })
})

In [14]:
print("원 데이터    :", dataset["train"][0]["en"])
print("처리 후 데이터:", tokenized_datasets["train"][0]["input_ids"])
print(
    "토큰화       :",
    tokenizer.convert_ids_to_tokens(tokenized_datasets["train"][0]["input_ids"]),
)

print("\n")
print("원 데이터    :", dataset["train"][0]["ko"])
print(
    "처리 후 데이터:",
    tokenizer.convert_ids_to_tokens(tokenized_datasets["train"][0]["labels"]),
)
print("토큰화       :", tokenized_datasets["train"][0]["labels"])

원 데이터    : In the beginning God created the heaven and the earth.
처리 후 데이터: [215, 5, 5558, 4412, 3949, 5, 39154, 13, 5, 17937, 3, 1]
토큰화       : ['▁In', '▁the', '▁beginning', '▁God', '▁created', '▁the', '▁heaven', '▁and', '▁the', '▁earth', '.', '</s>']


원 데이터    : 태초에 하나님이 천지를 창조하시니라
처리 후 데이터: ['▁태', '초', '에', '▁하나님이', '▁천지', '를', '▁창조', '하시', '니', '라', '</s>']
토큰화       : [3346, 741, 9, 36812, 27373, 21, 6439, 11366, 434, 222, 1]


In [15]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)

In [16]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
# 콜레이터를 돌리면 알아서 패딩하고 쉬프트 시킨다.
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])

In [18]:
# !pip install sacrebleu
# ! pip install evaluate

In [19]:
import evaluate

metric = evaluate.load("sacrebleu")

In [20]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    return result

In [21]:
# ! pip install transformers[torch]

In [22]:
training_args = Seq2SeqTrainingArguments(
    output_dir="chkpt", # 자동으로 생성
    learning_rate=0.0005,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="epoch",
    logging_strategy="no",
    predict_with_generate=True,
    fp16=False,
    gradient_accumulation_steps=2,
    report_to="none",  # Wandb 로그 끄기
)

In [23]:
# import transformers
# transformers.__version__

In [24]:
trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [25]:
trainer.train()

  0%|          | 0/9370 [00:00<?, ?it/s]



  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 4.305810451507568, 'eval_bleu': 0.8119458078517806, 'eval_runtime': 15.819, 'eval_samples_per_second': 63.215, 'eval_steps_per_second': 3.983, 'epoch': 1.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 3.4602484703063965, 'eval_bleu': 2.1547281563216902, 'eval_runtime': 15.7091, 'eval_samples_per_second': 63.657, 'eval_steps_per_second': 4.01, 'epoch': 2.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 3.152926445007324, 'eval_bleu': 3.162163444677091, 'eval_runtime': 15.5132, 'eval_samples_per_second': 64.461, 'eval_steps_per_second': 4.061, 'epoch': 3.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.980804920196533, 'eval_bleu': 3.6337775167362985, 'eval_runtime': 15.5843, 'eval_samples_per_second': 64.167, 'eval_steps_per_second': 4.043, 'epoch': 4.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.9166676998138428, 'eval_bleu': 4.226295640878786, 'eval_runtime': 15.5701, 'eval_samples_per_second': 64.226, 'eval_steps_per_second': 4.046, 'epoch': 5.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.8608696460723877, 'eval_bleu': 3.947556562472322, 'eval_runtime': 15.9685, 'eval_samples_per_second': 62.623, 'eval_steps_per_second': 3.945, 'epoch': 6.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.8427369594573975, 'eval_bleu': 4.450759305550124, 'eval_runtime': 15.1577, 'eval_samples_per_second': 65.973, 'eval_steps_per_second': 4.156, 'epoch': 7.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.8348772525787354, 'eval_bleu': 4.2885522086047985, 'eval_runtime': 15.3588, 'eval_samples_per_second': 65.109, 'eval_steps_per_second': 4.102, 'epoch': 8.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.837285041809082, 'eval_bleu': 4.434871113431575, 'eval_runtime': 15.8389, 'eval_samples_per_second': 63.136, 'eval_steps_per_second': 3.978, 'epoch': 9.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.85461163520813, 'eval_bleu': 4.55580723032464, 'eval_runtime': 15.9498, 'eval_samples_per_second': 62.697, 'eval_steps_per_second': 3.95, 'epoch': 9.99}
{'train_runtime': 2734.0077, 'train_samples_per_second': 109.729, 'train_steps_per_second': 3.427, 'train_loss': 2.7133204375667024, 'epoch': 9.99}


TrainOutput(global_step=9370, training_loss=2.7133204375667024, metrics={'train_runtime': 2734.0077, 'train_samples_per_second': 109.729, 'train_steps_per_second': 3.427, 'total_flos': 2.192098069536768e+16, 'train_loss': 2.7133204375667024, 'epoch': 9.994666666666667})

In [26]:
trainer.save_model("./model/en2ko_T5")

In [27]:
model_dir = "./model/en2ko_T5"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

model.cpu()

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


T5ForConditionalGeneration(
  (shared): Embedding(64128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(64128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [28]:
input_text = [
    "In the beginning God created the heaven and the earth.",
    "And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.",
    "I'm hungry."
]

In [29]:
inputs = tokenizer(
    input_text, return_tensors="pt", padding=True, max_length=max_token_length
)



In [30]:
koreans = model.generate(
    **inputs,
    max_length=max_token_length,
    num_beams=5,
)

koreans.shape

torch.Size([3, 24])

In [31]:
# [
#     tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(french))
#     for french in frenchs
# ]

[
    tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(korean)) for korean in koreans
]

['<pad> 하나님이 처음부터 천지를 창조하시니라</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<pad> 땅이 공허하고 공허하며 흑암이 깊음에 있음이여 하나님의 신이 물 위에 진동하시니라</s>',
 '<pad> 내가 주리니</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>']

In [32]:
pred_text = tokenizer.batch_decode(koreans, skip_special_tokens=True)
pred_text

['하나님이 처음부터 천지를 창조하시니라',
 '땅이 공허하고 공허하며 흑암이 깊음에 있음이여 하나님의 신이 물 위에 진동하시니라',
 '내가 주리니']

In [33]:
from torch.utils.data import DataLoader

test_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=32, collate_fn=data_collator
)

In [34]:
test_dataloader_iter = iter(test_dataloader)
test_batch = next(test_dataloader_iter)

In [35]:
test_input = {key: test_batch[key] for key in ("input_ids", "attention_mask")}

In [36]:
koreans = model.generate(
    **test_input,
    max_length=max_token_length,
    num_beams=5,
)

In [37]:
labels = np.where(test_batch.labels != -100, test_batch.labels, tokenizer.pad_token_id)
eng_sents = tokenizer.batch_decode(test_batch.input_ids, skip_special_tokens=True)[:3]
references = tokenizer.batch_decode(labels, skip_special_tokens=True)[:3]
preds = tokenizer.batch_decode(koreans, skip_special_tokens=True)[:3]

In [38]:
for s in zip(eng_sents, references, preds):
    print("English   :", s[0])
    print("Reference :", s[1])
    print("Translated:", s[2])
    print("\n")

English   : How much she hath glorified herself, and lived deliciously, so much torment and sorrow give her: for she saith in her heart, I sit a queen, and am no widow, and shall see no sorrow.
Reference : 그가 어떻게 자기를 영화롭게 하였으며 사치하였든지 그만큼 고난과 애통으로 갚아 주라 그가 마음에 말하기를 나는 여황으로 앉은 자요 과부가 아니라 결단코 애통을 당하지 아니하리라 하니
Translated: 그가 자기를 영화롭게 하고 맛보게 하였으므로 고통과 근심이 그에게 더하니 이는 그가 마음에 이르기를 나는 왕후라 과부라 근심이 없느니라 하였음이라


English   : Therefore shall her plagues come in one day, death, and mourning, and famine; and she shall be utterly burned with fire: for strong is the Lord God who judgeth her.
Reference : 그러므로 하루 동안에 그 재앙들이 이르리니 곧 사망과 애통과 흉년이라 그가 또한 불에 살라지리니 그를 심판하신 주 하나님은 강하신 자이심이니라
Translated: 그러므로 그 재앙 곧 사망과 애통과 기근이 하루에 이르러 진멸하리니 이는 저를 판단하시는 주 하나님이 강하시니라


English   : And the kings of the earth, who have committed fornication and lived deliciously with her, shall bewail her, and lament for her, when they shall see the smoke of her burning,
Reference : 그와 함께 음행하고 사치하던 땅의 왕들이 그 불붙는 연기를 보고 위하여 울고 가슴을 치며
T