In [2]:
import json
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import evaluate

# 1. 모델과 토크나이저 로드
model_name = "KETI-AIR/ke-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# 2. 데이터 로드
file_path = "C:/Users/asia/Desktop/파이널프로젝트/03_데이터전처리/순화표현모델 데이터_리라이팅완료.jsonl"
with open(file_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# 3. 데이터 전처리
def clean_text(text):
    text = re.sub(r"(ㅋ){3,}", "ㅋㅋ", text)
    text = re.sub(r"(ㅎ){3,}", "ㅎㅎ", text)
    text = re.sub(r"(;){2,}", ";", text)
    text = re.sub(r"(\.{2,})", "...", text)
    text = re.sub(r"(!){2,}", "!!", text)
    text = re.sub(r"(\?){2,}", "??", text)
    text = re.sub(r"([!?.,])", r" \1 ", text)
    text = re.sub(r"([~❤💢💥💬])", r" \1 ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def preprocess(example):
    # 'context'와 'output' 키를 가정하여 수정
    if "context" not in example or "output" not in example:
        return None # 잘못된 형식의 데이터는 건너뛰기
    
    # 마지막 발화 전까지만 context로 사용
    context = "\n".join(example["context"][:-1])
    input_text = f"rephrase politely:\n{context}\n{example['context'][-1]}"
    
    input_text = clean_text(input_text)
    target_text = clean_text(example["output"])
    return {
        "input_text": input_text,
        "target_text": target_text
    }

# 전처리 적용
processed_data = [preprocess(ex) for ex in data if preprocess(ex) is not None]

# 4. Dataset 클래스 정의
class RewriteDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        input_text = item["input_text"]
        target_text = item["target_text"]

        model_inputs = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        with self.tokenizer.as_target_tokenizer():
            labels = self.tokenizer(
                target_text,
                max_length=self.max_length,
                padding="max_length",
                truncation=True,
                return_tensors="pt"
            )

        model_inputs = {k: v.squeeze() for k, v in model_inputs.items()}
        model_inputs["labels"] = labels["input_ids"].squeeze()

        return model_inputs

# 5. 데이터 분할 및 데이터셋 생성
train_data, test_data = train_test_split(processed_data, test_size=0.1, random_state=42)
train_dataset = RewriteDataset(train_data, tokenizer)
test_dataset = RewriteDataset(test_data, tokenizer)

# 6. 평가 지표 로드 및 함수 정의
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

     # -100 패딩 토큰을 ignore_index로 변경 (디코딩 시 무시되도록)
    preds = np.where(preds == -100, tokenizer.pad_token_id, preds)
    labels = np.where(labels == -100, tokenizer.pad_token_id, labels)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # BLEU는 토큰화 필요
    bleu_score = bleu_metric.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels]
    )["bleu"]

    # ROUGE-L 계산
    rouge_result = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )


     for i in range(min(5, len(decoded_preds))):
        print(f"예측: {decoded_preds[i]}")
        print(f"정답: {decoded_labels[i]}")
        print("---")
         
    return {
        "bleu": bleu_score,
        "rouge-L": rouge_result["rougeL"]
    }


# 7. Training Arguments 설정
training_args = Seq2SeqTrainingArguments(
    output_dir="./ke-t5-rewrite-small",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    generation_max_length=64,
    generation_num_beams=4,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=100,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=500,
    eval_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="rouge-L", # ROUGE-L을 최적 모델 선정 기준으로 변경
    greater_is_better=True,
    lr_scheduler_type="linear",
    warmup_steps=100
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 8. Trainer 인스턴스 생성 및 학습
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 9. 학습 시작
trainer.train()
trainer.save_model("./ke-t5-rewrite-small")
tokenizer.save_pretrained("./ke-t5-rewrite-small")

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Bleu,Rouge-l
500,22.8635,18.374266,0.0,0.0
1000,1.4803,1.22901,0.0,0.0




OverflowError: can't convert negative int to unsigned