In [1]:
import numpy as np
import pandas as pd
import pymysql
import pymysql.cursors as cursors
import torch
import multiprocessing

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from datasets import load_dataset, load_metric


In [2]:
conn = pymysql.connect(
    host="1.251.203.204",
    port=33065,
    user="root",
    password="kdt5",
    db="Team4",
    charset="utf8",
)
cur = conn.cursor(cursors.DictCursor)


In [3]:
sql = """
select en.text as en, fr.text as fr
from language_en as en
inner join language_fr as fr
on en.id = fr.id
"""


In [4]:
cur.execute(sql)
langDF_ori = pd.DataFrame(cur.fetchall())
cur.close()
conn.close()


In [5]:
langDF_ori


Unnamed: 0,en,fr
0,In the beginning God created the heaven and th...,"Au commencement, Dieu créa les cieux et la terre."
1,"And the earth was without form, and void; and ...",La terre était informe et vide: il y avait des...
2,"And God said, Let there be light: and there wa...",Dieu dit: Que la lumière soit! Et la lumière fut.
3,"And God saw the light, that it was good: and G...",Dieu vit que la lumière était bonne; et Dieu s...
4,"And God called the light Day, and the darkness...","Dieu appela la lumière jour, et il appela les ..."
...,...,...
31097,"And the Spirit and the bride say, Come. And le...",Et l`Esprit et l`épouse disent: Viens. Et que ...
31098,For I testify unto every man that heareth the ...,Je le déclare à quiconque entend les paroles d...
31099,And if any man shall take away from the words ...,et si quelqu`un retranche quelque chose des pa...
31100,"He which testifieth these things saith, Surely...","Celui qui atteste ces choses dit: Oui, je vien..."


In [6]:
num_train = 30000
num_valid = 1000
num_test = 1102

bible_trainDF = langDF_ori.iloc[:num_train]
bible_validDF = langDF_ori.iloc[num_train : num_train + num_valid]
bible_testDF = langDF_ori.iloc[num_train + num_valid :]

bible_trainDF.to_csv("./date_files/train.tsv", sep="\t", index=False)
bible_validDF.to_csv("./date_files/valid.tsv", sep="\t", index=False)
bible_testDF.to_csv("./date_files/test.tsv", sep="\t", index=False)


In [7]:
data_files = {
    "train": "./date_files/train.tsv",
    "valid": "./date_files/valid.tsv",
    "test": "./date_files/test.tsv",
}
dataset = load_dataset("csv", data_files=data_files, delimiter="\t")


Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [8]:
dataset


DatasetDict({
    train: Dataset({
        features: ['en', 'fr'],
        num_rows: 30000
    })
    valid: Dataset({
        features: ['en', 'fr'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['en', 'fr'],
        num_rows: 102
    })
})

In [9]:
print(dataset["train"][:3]["en"])
print(dataset["train"][:3]["fr"])


['In the beginning God created the heaven and the earth.', 'And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.', 'And God said, Let there be light: and there was light.']
['Au commencement, Dieu créa les cieux et la terre.', 'La terre était informe et vide: il y avait des ténèbres à la surface de l`abîme, et l`esprit de Dieu se mouvait au-dessus des eaux.', 'Dieu dit: Que la lumière soit! Et la lumière fut.']


In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device


'cpu'

In [11]:
model_ckpt = "Demosthene-OR/t5-base-finetuned-en-to-fr"
max_token_length = 64


In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


In [13]:
def convert_examples_to_features(tokenizer, max_token_length, examples):
    model_inputs = tokenizer(
        examples["en"],
        text_target=examples["fr"],
        max_length=max_token_length,
        truncation=True,
    )

    return model_inputs


In [14]:
NUM_CPU = multiprocessing.cpu_count()
NUM_CPU


8

In [15]:
from functools import partial

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
partial_tokenize_function = partial(
    convert_examples_to_features, tokenizer, max_token_length
)
tokenized_datasets = dataset.map(
    partial_tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    num_proc=NUM_CPU,
)


Map (num_proc=8):   0%|          | 0/30000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/102 [00:00<?, ? examples/s]

In [16]:
tokenized_datasets


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 30000
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 102
    })
})

In [17]:
print("원 데이터    :", dataset["train"][0]["en"])
print("처리 후 데이터:", tokenized_datasets["train"][0]["input_ids"])
print(
    "토큰화       :",
    tokenizer.convert_ids_to_tokens(tokenized_datasets["train"][0]["input_ids"]),
)

print("\n")
print("원 데이터    :", dataset["train"][0]["fr"])
print(
    "처리 후 데이터:",
    tokenizer.convert_ids_to_tokens(tokenized_datasets["train"][0]["labels"]),
)
print("토큰화       :", tokenized_datasets["train"][0]["labels"])


원 데이터    : In the beginning God created the heaven and the earth.
처리 후 데이터: [86, 8, 1849, 601, 990, 8, 9922, 11, 8, 3596, 5, 1]
토큰화       : ['▁In', '▁the', '▁beginning', '▁God', '▁created', '▁the', '▁heaven', '▁and', '▁the', '▁earth', '.', '</s>']


원 데이터    : Au commencement, Dieu créa les cieux et la terre.
처리 후 데이터: ['▁Au', '▁commence', 'ment', ',', '▁Dieu', '▁cré', 'a', '▁les', '▁', 'c', 'ieux', '▁', 'e', 't', '▁la', '▁terre', '.', '</s>']
토큰화       : [1957, 9158, 297, 6, 15453, 8261, 9, 110, 3, 75, 11891, 3, 15, 17, 50, 10225, 5, 1]


In [18]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)


config.json:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [19]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [21]:
import evaluate

metric = evaluate.load("sacrebleu")


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [22]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    return result


In [24]:
training_args = Seq2SeqTrainingArguments(
    output_dir="chkpt",
    learning_rate=0.0005,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="epoch",
    logging_strategy="no",
    predict_with_generate=True,
    fp16=False,
    gradient_accumulation_steps=2,
    report_to="none",  # Wandb 로그 끄기
)


In [25]:
trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [26]:
trainer.train()


***** Running training *****
  Num examples = 30000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 937
  Number of trainable parameters = 222903552


  0%|          | 0/937 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [27]:
trainer.save_model("./results")


Saving model checkpoint to ./results
Configuration saved in ./results\config.json
Model weights saved in ./results\pytorch_model.bin
tokenizer config file saved in ./results\tokenizer_config.json
Special tokens file saved in ./results\special_tokens_map.json
Copy vocab file to ./results\spiece.model


In [31]:
model_dir = "./results"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

model.cpu()


loading file spiece.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 958 column 3

In [None]:
input_text = [
    "In the beginning God created the heaven and the earth.",
    "And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.",
]


In [None]:
inputs = tokenizer(
    input_text, return_tensors="pt", padding=True, max_length=max_token_length
)




In [None]:
inputs


{'input_ids': tensor([[   86,     8,  1849,   601,   990,     8,  9922,    11,     8,  3596,
             5,     1,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [  275,     8,  3596,    47,   406,   607,     6,    11,     3, 12186,
           117,    11, 14882,    47,  1286,     8,   522,    13,     8,  1659,
             5,   275,     8,  5876,    13,   601,  2301,  1286,     8,   522,
            13,     8, 13818,     5,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
frenchs = model.generate(
    **inputs,
    max_length=max_token_length,
    num_beams=5,
)

frenchs.shape


torch.Size([2, 48])

In [None]:
[
    tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(french))
    for french in frenchs
]


['<pad> Au commencement, Dieu créa le ciel et la terre.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<pad> La terre était sans forme et vide; Les ténèbres étaient sur la face des eaux. Et l<unk>esprit de Dieu s<unk>éleva sur la face des eaux.</s>']

In [None]:
from torch.utils.data import DataLoader

test_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=32, collate_fn=data_collator
)


In [None]:
test_dataloader_iter = iter(test_dataloader)
test_batch = next(test_dataloader_iter)


In [None]:
test_input = {key: test_batch[key] for key in ("input_ids", "attention_mask")}


In [None]:
frenchs = model.generate(
    **test_input,
    max_length=max_token_length,
    num_beams=5,
)


In [None]:
labels = np.where(test_batch.labels != -100, test_batch.labels, tokenizer.pad_token_id)
eng_sents = tokenizer.batch_decode(test_batch.input_ids, skip_special_tokens=True)[:3]
references = tokenizer.batch_decode(labels, skip_special_tokens=True)[:3]
preds = tokenizer.batch_decode(frenchs, skip_special_tokens=True)[:3]


In [None]:
for s in zip(eng_sents, references, preds):
    print("English   :", s[0])
    print("Reference :", s[1])
    print("Translated:", s[2])
    print("\n")


English   : How much she hath glorified herself, and lived deliciously, so much torment and sorrow give her: for she saith in her heart, I sit a queen, and am no widow, and shall see no sorrow.
Reference : Autant elle sest glorifiée et plongée dans le luxe, autant donnez-lui de tourment et de deuil. Parce quelle dit en son coeur: Je suis assise en reine, je 
Translated: Cest combien elle sest glorifiée et a vécu avec délice, et cest à cause de langoisse et de la douleur quelle a eue; car elle dit dans


English   : Therefore shall her plagues come in one day, death, and mourning, and famine; and she shall be utterly burned with fire: for strong is the Lord God who judgeth her.
Reference : A cause de cela, en un même jour, ses fléaux arriveront, la mort, le deuil et la famine, et elle sera consumée par le feu. Car il est puissant, le Seigneur Dieu qui la
Translated: Cest pourquoi ses plaies arriveront en un seul jour, la mort, le deuil et la famine, et elle sera brûlée par le feu; car l