In [1]:
import numpy as np
import pandas as pd
import pymysql
import pymysql.cursors as cursors
import torch
import multiprocessing

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from datasets import load_dataset, load_metric


In [2]:
# !pip install datasets


In [3]:
# 데이터 베이스 연결하기
conn = pymysql.connect(
    host="1.251.203.204",
    user="root",
    password="kdt5",
    db="Team4",
    charset="utf8",
    port=33065,
)

curs = conn.cursor()

# 검색 명령어 사용
sql = "SELECT en.text as en, de.text as de FROM language_en en join language_de de on en.id = de.id;"
curs.execute(sql)
result = curs.fetchall()
print("현재 테이블의 데이터수는 총 {}개 입니다.".format(len(result)))
endeDF = pd.read_sql(sql, conn)

# 데이터베이스 연결 종료
conn.close()
endeDF.head()


현재 테이블의 데이터수는 총 31102개 입니다.


  endeDF = pd.read_sql(sql, conn)


Unnamed: 0,en,de
0,In the beginning God created the heaven and th...,Am Anfang schuf Gott Himmel und Erde.
1,"And the earth was without form, and void; and ...","Und die Erde war wüst und leer, und es war fin..."
2,"And God said, Let there be light: and there wa...",Und Gott sprach: Es werde Licht! und es ward L...
3,"And God saw the light, that it was good: and G...","Und Gott sah, daß das Licht gut war. Da schied..."
4,"And God called the light Day, and the darkness...",und nannte das Licht Tag und die Finsternis Na...


In [4]:
num_train = 30000
num_valid = 1000
num_test = 1102

bible_trainDF = endeDF.iloc[:num_train]
bible_validDF = endeDF.iloc[num_train : num_train + num_valid]
bible_testDF = endeDF.iloc[num_train + num_valid :]

bible_trainDF.to_csv("./data/train.tsv", sep="\t", index=False)
bible_validDF.to_csv("./data/valid.tsv", sep="\t", index=False)
bible_testDF.to_csv("./data/test.tsv", sep="\t", index=False)

data_files = {
    "train": "./data/train.tsv",
    "valid": "./data/valid.tsv",
    "test": "./data/test.tsv",
}

dataset = load_dataset("csv", data_files=data_files, delimiter="\t")


Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [5]:
dataset


DatasetDict({
    train: Dataset({
        features: ['en', 'de'],
        num_rows: 30000
    })
    valid: Dataset({
        features: ['en', 'de'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['en', 'de'],
        num_rows: 102
    })
})

In [6]:
print(dataset["train"][:3]["en"])
print(dataset["train"][:3]["de"])


['In the beginning God created the heaven and the earth.', 'And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.', 'And God said, Let there be light: and there was light.']
['Am Anfang schuf Gott Himmel und Erde.', 'Und die Erde war wüst und leer, und es war finster auf der Tiefe; und der Geist Gottes schwebte auf dem Wasser.', 'Und Gott sprach: Es werde Licht! und es ward Licht.']


In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device


'cuda'

In [8]:
model_ckpt = "Reyansh4/NMT_T5_wmt14_en_to_de"
max_token_length = 128


In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


In [10]:
def convert_examples_to_features(tokenizer, max_token_length, examples):
    model_inputs = tokenizer(
        examples["en"],
        text_target=examples["de"],
        max_length=max_token_length,
        truncation=True,
    )

    return model_inputs

In [11]:
NUM_CPU = multiprocessing.cpu_count()
NUM_CPU


20

In [12]:
from functools import partial

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
partial_tokenize_function = partial(
    convert_examples_to_features, tokenizer, max_token_length
)
tokenized_datasets = dataset.map(
    partial_tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    num_proc=NUM_CPU,
)

Map (num_proc=20):   0%|          | 0/30000 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/102 [00:00<?, ? examples/s]

In [13]:
tokenized_datasets


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 30000
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 102
    })
})

In [14]:
print("원 데이터    :", dataset["train"][0]["en"])
print("처리 후 데이터:", tokenized_datasets["train"][0]["input_ids"])
print(
    "토큰화       :",
    tokenizer.convert_ids_to_tokens(tokenized_datasets["train"][0]["input_ids"]),
)

print("\n")
print("원 데이터    :", dataset["train"][0]["de"])
print(
    "처리 후 데이터:",
    tokenizer.convert_ids_to_tokens(tokenized_datasets["train"][0]["labels"]),
)
print("토큰화       :", tokenized_datasets["train"][0]["labels"])


원 데이터    : In the beginning God created the heaven and the earth.
처리 후 데이터: [86, 8, 1849, 601, 990, 8, 9922, 11, 8, 3596, 5, 1]
토큰화       : ['▁In', '▁the', '▁beginning', '▁God', '▁created', '▁the', '▁heaven', '▁and', '▁the', '▁earth', '.', '</s>']


원 데이터    : Am Anfang schuf Gott Himmel und Erde.
처리 후 데이터: ['▁Am', '▁Anfang', '▁', 's', 'chu', 'f', '▁Gott', '▁Himmel', '▁und', '▁Erd', 'e', '.', '</s>']
토큰화       : [736, 11302, 3, 7, 8019, 89, 11922, 23537, 64, 11948, 15, 5, 1]


In [15]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)


In [16]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [17]:
# 콜레이터를 돌리면 알아서 패딩하고 쉬프트 시킨다.
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])


In [18]:
# !pip install sacrebleu


In [19]:
import evaluate

metric = evaluate.load("sacrebleu")


In [20]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    return result

In [21]:
# 학습 인자를 정의
training_args = Seq2SeqTrainingArguments(
    output_dir="chkpt",
    learning_rate=0.0005, # 러닝메이트
    weight_decay=0.01, # 가중치 감소 설정
    per_device_train_batch_size=16, # 학습 배치크기 설정
    per_device_eval_batch_size=16, # 평가 배치크기 설정
    num_train_epochs=10, # 학습 에폭 수 설정
    evaluation_strategy="epoch", # 각 에포크마다 평가 진행
    predict_with_generate=True, # 예측 생성
    fp16=False,
    gradient_accumulation_steps=2,
    report_to="none",  
)


In [22]:
import transformers

transformers.__version__


'4.40.0'

In [23]:
trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [24]:
trainer.train()


  0%|          | 0/9370 [00:00<?, ?it/s]



  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.1691265106201172, 'eval_bleu': 10.036239850998216, 'eval_runtime': 14.2799, 'eval_samples_per_second': 70.028, 'eval_steps_per_second': 4.412, 'epoch': 1.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.1003153324127197, 'eval_bleu': 10.534353709215408, 'eval_runtime': 14.2544, 'eval_samples_per_second': 70.154, 'eval_steps_per_second': 4.42, 'epoch': 2.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.0875025987625122, 'eval_bleu': 10.945294999288066, 'eval_runtime': 14.3222, 'eval_samples_per_second': 69.822, 'eval_steps_per_second': 4.399, 'epoch': 3.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.0746161937713623, 'eval_bleu': 11.003590950171388, 'eval_runtime': 14.5909, 'eval_samples_per_second': 68.536, 'eval_steps_per_second': 4.318, 'epoch': 4.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.0909823179244995, 'eval_bleu': 10.74565132149491, 'eval_runtime': 14.517, 'eval_samples_per_second': 68.885, 'eval_steps_per_second': 4.34, 'epoch': 5.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.1062045097351074, 'eval_bleu': 10.879212801197705, 'eval_runtime': 14.826, 'eval_samples_per_second': 67.449, 'eval_steps_per_second': 4.249, 'epoch': 6.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.1418358087539673, 'eval_bleu': 10.696098729136404, 'eval_runtime': 15.0813, 'eval_samples_per_second': 66.307, 'eval_steps_per_second': 4.177, 'epoch': 7.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.1560431718826294, 'eval_bleu': 10.720104560949954, 'eval_runtime': 14.4523, 'eval_samples_per_second': 69.193, 'eval_steps_per_second': 4.359, 'epoch': 8.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.181026816368103, 'eval_bleu': 10.912948966033897, 'eval_runtime': 14.6703, 'eval_samples_per_second': 68.165, 'eval_steps_per_second': 4.294, 'epoch': 9.0}




  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 1.2026101350784302, 'eval_bleu': 10.842495455052237, 'eval_runtime': 14.6568, 'eval_samples_per_second': 68.228, 'eval_steps_per_second': 4.298, 'epoch': 9.99}
{'train_runtime': 2943.0345, 'train_samples_per_second': 101.936, 'train_steps_per_second': 3.184, 'train_loss': 0.7987311461946371, 'epoch': 9.99}


TrainOutput(global_step=9370, training_loss=0.7987311461946371, metrics={'train_runtime': 2943.0345, 'train_samples_per_second': 101.936, 'train_steps_per_second': 3.184, 'total_flos': 2.565766049144832e+16, 'train_loss': 0.7987311461946371, 'epoch': 9.994666666666667})

In [25]:
trainer.save_model("./result")


In [26]:
model_dir = "./result"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

model.cpu()


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [27]:
input_text = [
    "In the beginning God created the heaven and the earth.",
    "And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.",
    "I'm hungry.",
]


In [28]:
inputs = tokenizer(
    input_text, return_tensors="pt", padding=True, max_length=max_token_length
)




In [29]:
frenchs = model.generate(
    **inputs,
    max_length=max_token_length,
    num_beams=5,
)

frenchs.shape


torch.Size([3, 40])

In [30]:
[
    tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(french))
    for french in frenchs
]


['<pad> Am Anfang schuf Gott Himmel und Erde.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '<pad> Und die Erde war wüst und leer, und Finsternis war auf der Tiefe; und der Geist Gottes erhob sich auf dem Wasser.</s>',
 '<pad> Ich hungere.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>']

In [31]:
pred_text = tokenizer.batch_decode(frenchs, skip_special_tokens=True)
pred_text


['Am Anfang schuf Gott Himmel und Erde.',
 'Und die Erde war wüst und leer, und Finsternis war auf der Tiefe; und der Geist Gottes erhob sich auf dem Wasser.',
 'Ich hungere.']

In [32]:
from torch.utils.data import DataLoader

test_dataloader = DataLoader(
    tokenized_datasets["test"], batch_size=32, collate_fn=data_collator
)


In [33]:
test_dataloader_iter = iter(test_dataloader)
test_batch = next(test_dataloader_iter)


In [34]:
test_input = {key: test_batch[key] for key in ("input_ids", "attention_mask")}


In [35]:
frenchs = model.generate(
    **test_input,
    max_length=max_token_length,
    num_beams=5,
)


In [36]:
labels = np.where(test_batch.labels != -100, test_batch.labels, tokenizer.pad_token_id)
eng_sents = tokenizer.batch_decode(test_batch.input_ids, skip_special_tokens=True)[:3]
references = tokenizer.batch_decode(labels, skip_special_tokens=True)[:3]
preds = tokenizer.batch_decode(frenchs, skip_special_tokens=True)[:3]


In [37]:
for s in zip(eng_sents, references, preds):
    print("English   :", s[0])
    print("Reference :", s[1])
    print("Translated:", s[2])
    print("\n")


English   : How much she hath glorified herself, and lived deliciously, so much torment and sorrow give her: for she saith in her heart, I sit a queen, and am no widow, and shall see no sorrow.
Reference : Wieviel sie herrlich gemacht und ihren Mutwillen gehabt hat, so viel schenket ihr Qual und Leid ein! Denn sie spricht in ihrem Herzen: Ich sitze als Königin und bin keine Witwe, und Leid werde ich nicht sehen.
Translated: Wie sehr sie sich rühmt und köstlich lebt, so viel Qual und Traurigkeit geben sie; denn sie spricht in ihrem Herzen: Ich bin eine Königin, und ich bin keine Witwe, und wirst keine Traurigkeit sehen.


English   : Therefore shall her plagues come in one day, death, and mourning, and famine; and she shall be utterly burned with fire: for strong is the Lord God who judgeth her.
Reference : Darum werden ihre Plagen auf einen Tag kommen: Tod, Leid und Hunger; mit Feuer wird sie verbrannt werden; denn stark ist Gott der HERR, der sie richten wird.
Translated: Darum werden