In [23]:
from datasets import Dataset
import json

file_path = r"C:/Users/asia/Desktop/파이널프로젝트/03_데이터전처리/순화표현모델 데이터_리라이팅완료.jsonl"

# 🔸 빈 줄은 무시하고 JSON만 읽기
with open(file_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# 🔸 context + input → input_text / output → target_text로 변환
def preprocess(example):
    context = "\n".join(example["context"])
    input_text = f"[REWRITE]\n{context}\n{example['input']}"
    return {
        "input_text": input_text,
        "target_text": example["output"]
    }

# 🔸 4. 리스트를 huggingface Dataset으로 변환
dataset = Dataset.from_list([preprocess(d) for d in data])

# 🔸 5. 학습/검증 데이터로 나누기 (9:1 비율)
dataset = dataset.train_test_split(test_size=0.1)

print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 2700
    })
    test: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 300
    })
})
{'input_text': '[REWRITE]\nA: 악성댓글 법으로 더 강하게 처벌해야 해\nB: 너무하다 싶긴 하지\nA: 인격살인이잖아\nB: 인격도 없는데 뭔 인격살임?\nB: 인격도 없는데 뭔 인격살임?', 'target_text': 'B: 감정적으로 이해는 가지만 그래도 표현은 조심해야지'}


In [24]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "KETI-AIR/ke-t5-base"

# 🔹 tokenizer 로딩
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 🔹 모델 로딩 (학습 준비)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [25]:
max_input_length = 256
max_target_length = 64

def tokenize(example):
    model_inputs = tokenizer(
        example["input_text"],
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
    )
    labels = tokenizer(
        example["target_text"],
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [26]:
tokenized_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [27]:
tokenized_dataset["train"][0].keys()

dict_keys(['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'])

In [14]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq
from transformers.training_args_seq2seq import Seq2SeqTrainingArguments

# 🔹 학습 하이퍼파라미터 설정
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-rewrite",                 # 결과 저장 디렉토리
    # evaluation_strategy="epoch",              # 에폭마다 평가
    learning_rate=2e-5,                        # 학습률
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=2,                       # 체크포인트 저장 수 제한
    num_train_epochs=10,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    disable_tqdm=True,                        # ← 위젯 없이 로그만 출력
)

# 🔹 데이터 collator (padding 자동 처리)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 🔹 Trainer 구성
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 🔹 학습 시작
trainer.train()


  trainer = Seq2SeqTrainer(


{'loss': 128.8644, 'grad_norm': 438.81207275390625, 'learning_rate': 1.9986666666666666e-05, 'epoch': 0.007407407407407408}
{'loss': 124.8139, 'grad_norm': 728.2587280273438, 'learning_rate': 1.9971851851851854e-05, 'epoch': 0.014814814814814815}
{'loss': 121.7311, 'grad_norm': 612.7538452148438, 'learning_rate': 1.995703703703704e-05, 'epoch': 0.022222222222222223}
{'loss': 117.2954, 'grad_norm': 315.5053405761719, 'learning_rate': 1.994222222222222e-05, 'epoch': 0.02962962962962963}
{'loss': 114.2013, 'grad_norm': 909.0528564453125, 'learning_rate': 1.992740740740741e-05, 'epoch': 0.037037037037037035}
{'loss': 110.927, 'grad_norm': 264.76104736328125, 'learning_rate': 1.9912592592592596e-05, 'epoch': 0.044444444444444446}
{'loss': 106.1799, 'grad_norm': 344.6583557128906, 'learning_rate': 1.989777777777778e-05, 'epoch': 0.05185185185185185}
{'loss': 103.54, 'grad_norm': 347.8992004394531, 'learning_rate': 1.9882962962962964e-05, 'epoch': 0.05925925925925926}
{'loss': 97.9039, 'grad_



{'loss': 2.1981, 'grad_norm': 108.2158432006836, 'learning_rate': 1.798666666666667e-05, 'epoch': 1.0074074074074073}
{'loss': 2.1776, 'grad_norm': 6.966524600982666, 'learning_rate': 1.7971851851851852e-05, 'epoch': 1.0148148148148148}
{'loss': 2.0988, 'grad_norm': 9.611632347106934, 'learning_rate': 1.795703703703704e-05, 'epoch': 1.0222222222222221}
{'loss': 2.3292, 'grad_norm': 8.89021110534668, 'learning_rate': 1.7942222222222224e-05, 'epoch': 1.0296296296296297}
{'loss': 2.2733, 'grad_norm': 47.61213302612305, 'learning_rate': 1.7927407407407408e-05, 'epoch': 1.037037037037037}
{'loss': 2.0745, 'grad_norm': 30.103609085083008, 'learning_rate': 1.7912592592592595e-05, 'epoch': 1.0444444444444445}
{'loss': 2.0964, 'grad_norm': 4.615966796875, 'learning_rate': 1.789777777777778e-05, 'epoch': 1.0518518518518518}
{'loss': 2.173, 'grad_norm': 5.833898067474365, 'learning_rate': 1.7882962962962963e-05, 'epoch': 1.0592592592592593}
{'loss': 2.0538, 'grad_norm': 7.369731426239014, 'learni



{'loss': 1.3556, 'grad_norm': 0.8150545358657837, 'learning_rate': 1.5986666666666667e-05, 'epoch': 2.0074074074074075}
{'loss': 1.4577, 'grad_norm': 1.1352800130844116, 'learning_rate': 1.5971851851851855e-05, 'epoch': 2.0148148148148146}
{'loss': 1.2662, 'grad_norm': 0.9747287034988403, 'learning_rate': 1.595703703703704e-05, 'epoch': 2.022222222222222}
{'loss': 1.2196, 'grad_norm': 0.5983043313026428, 'learning_rate': 1.5942222222222222e-05, 'epoch': 2.0296296296296297}
{'loss': 1.2736, 'grad_norm': 3.5576331615448, 'learning_rate': 1.592740740740741e-05, 'epoch': 2.037037037037037}
{'loss': 1.3443, 'grad_norm': 4.152231693267822, 'learning_rate': 1.5912592592592594e-05, 'epoch': 2.0444444444444443}
{'loss': 1.2267, 'grad_norm': 0.496318519115448, 'learning_rate': 1.5897777777777778e-05, 'epoch': 2.051851851851852}
{'loss': 1.3381, 'grad_norm': 0.7934865355491638, 'learning_rate': 1.5882962962962965e-05, 'epoch': 2.0592592592592593}
{'loss': 1.3116, 'grad_norm': 0.7918897271156311, 



{'loss': 1.2181, 'grad_norm': 0.6329749226570129, 'learning_rate': 1.3986666666666668e-05, 'epoch': 3.0074074074074075}
{'loss': 1.3707, 'grad_norm': 0.9228760600090027, 'learning_rate': 1.3971851851851852e-05, 'epoch': 3.0148148148148146}
{'loss': 1.2405, 'grad_norm': 0.9384799599647522, 'learning_rate': 1.3957037037037037e-05, 'epoch': 3.022222222222222}
{'loss': 1.1592, 'grad_norm': 0.7891584038734436, 'learning_rate': 1.3942222222222223e-05, 'epoch': 3.0296296296296297}
{'loss': 1.2956, 'grad_norm': 0.8635014295578003, 'learning_rate': 1.392740740740741e-05, 'epoch': 3.037037037037037}
{'loss': 1.1423, 'grad_norm': 0.6152758598327637, 'learning_rate': 1.3912592592592593e-05, 'epoch': 3.0444444444444443}
{'loss': 1.2817, 'grad_norm': 0.8294451832771301, 'learning_rate': 1.3897777777777778e-05, 'epoch': 3.051851851851852}
{'loss': 1.2435, 'grad_norm': 0.7706001400947571, 'learning_rate': 1.3882962962962966e-05, 'epoch': 3.0592592592592593}
{'loss': 1.3042, 'grad_norm': 0.833436965942



{'loss': 1.139, 'grad_norm': 0.6448349356651306, 'learning_rate': 1.1986666666666668e-05, 'epoch': 4.007407407407407}
{'loss': 1.211, 'grad_norm': 0.5490326881408691, 'learning_rate': 1.1971851851851852e-05, 'epoch': 4.014814814814815}
{'loss': 1.2312, 'grad_norm': 0.8575215339660645, 'learning_rate': 1.1957037037037038e-05, 'epoch': 4.022222222222222}
{'loss': 1.0939, 'grad_norm': 0.7238838076591492, 'learning_rate': 1.1942222222222223e-05, 'epoch': 4.029629629629629}
{'loss': 1.1279, 'grad_norm': 0.8476920127868652, 'learning_rate': 1.1927407407407407e-05, 'epoch': 4.037037037037037}
{'loss': 1.2975, 'grad_norm': 0.7029540538787842, 'learning_rate': 1.1912592592592593e-05, 'epoch': 4.044444444444444}
{'loss': 1.2168, 'grad_norm': 0.9589731097221375, 'learning_rate': 1.1897777777777779e-05, 'epoch': 4.051851851851852}
{'loss': 1.2643, 'grad_norm': 0.827985942363739, 'learning_rate': 1.1882962962962964e-05, 'epoch': 4.059259259259259}
{'loss': 1.1457, 'grad_norm': 1.1173791885375977, '



{'loss': 1.2407, 'grad_norm': 0.8201141357421875, 'learning_rate': 9.986666666666667e-06, 'epoch': 5.007407407407407}
{'loss': 1.1698, 'grad_norm': 0.9132041931152344, 'learning_rate': 9.971851851851853e-06, 'epoch': 5.014814814814815}
{'loss': 1.3283, 'grad_norm': 1.1157424449920654, 'learning_rate': 9.957037037037038e-06, 'epoch': 5.022222222222222}
{'loss': 1.1024, 'grad_norm': 0.7864593863487244, 'learning_rate': 9.942222222222222e-06, 'epoch': 5.029629629629629}
{'loss': 1.081, 'grad_norm': 1.0170776844024658, 'learning_rate': 9.927407407407408e-06, 'epoch': 5.037037037037037}
{'loss': 1.2245, 'grad_norm': 0.6992349624633789, 'learning_rate': 9.912592592592594e-06, 'epoch': 5.044444444444444}
{'loss': 1.2122, 'grad_norm': 0.7513065338134766, 'learning_rate': 9.89777777777778e-06, 'epoch': 5.051851851851852}
{'loss': 1.2589, 'grad_norm': 0.7696149945259094, 'learning_rate': 9.882962962962965e-06, 'epoch': 5.059259259259259}
{'loss': 1.3134, 'grad_norm': 0.8016403317451477, 'learnin



{'loss': 1.2366, 'grad_norm': 0.6128619313240051, 'learning_rate': 7.986666666666668e-06, 'epoch': 6.007407407407407}
{'loss': 1.2016, 'grad_norm': 0.7441359162330627, 'learning_rate': 7.971851851851853e-06, 'epoch': 6.014814814814815}
{'loss': 1.1268, 'grad_norm': 0.7277697920799255, 'learning_rate': 7.957037037037037e-06, 'epoch': 6.022222222222222}
{'loss': 1.2549, 'grad_norm': 0.9491977691650391, 'learning_rate': 7.942222222222223e-06, 'epoch': 6.029629629629629}
{'loss': 1.0844, 'grad_norm': 1.1014493703842163, 'learning_rate': 7.927407407407408e-06, 'epoch': 6.037037037037037}
{'loss': 1.246, 'grad_norm': 0.8794242739677429, 'learning_rate': 7.912592592592592e-06, 'epoch': 6.044444444444444}
{'loss': 1.1411, 'grad_norm': 1.1876041889190674, 'learning_rate': 7.897777777777778e-06, 'epoch': 6.051851851851852}
{'loss': 1.1478, 'grad_norm': 0.7620253562927246, 'learning_rate': 7.882962962962964e-06, 'epoch': 6.059259259259259}
{'loss': 1.1979, 'grad_norm': 1.2946045398712158, 'learni



{'loss': 1.1129, 'grad_norm': 1.0591444969177246, 'learning_rate': 5.986666666666667e-06, 'epoch': 7.007407407407407}
{'loss': 1.1179, 'grad_norm': 0.7422751784324646, 'learning_rate': 5.971851851851852e-06, 'epoch': 7.014814814814815}
{'loss': 1.1925, 'grad_norm': 1.2926018238067627, 'learning_rate': 5.957037037037038e-06, 'epoch': 7.022222222222222}
{'loss': 1.1114, 'grad_norm': 1.1115304231643677, 'learning_rate': 5.9422222222222225e-06, 'epoch': 7.029629629629629}
{'loss': 1.1196, 'grad_norm': 1.1383150815963745, 'learning_rate': 5.927407407407408e-06, 'epoch': 7.037037037037037}
{'loss': 1.1057, 'grad_norm': 0.7594655156135559, 'learning_rate': 5.912592592592593e-06, 'epoch': 7.044444444444444}
{'loss': 1.1004, 'grad_norm': 0.848236083984375, 'learning_rate': 5.897777777777778e-06, 'epoch': 7.051851851851852}
{'loss': 1.1451, 'grad_norm': 0.8074780106544495, 'learning_rate': 5.882962962962963e-06, 'epoch': 7.059259259259259}
{'loss': 1.1863, 'grad_norm': 1.2296839952468872, 'learn



{'loss': 1.0836, 'grad_norm': 0.8284139037132263, 'learning_rate': 3.986666666666667e-06, 'epoch': 8.007407407407408}
{'loss': 1.0608, 'grad_norm': 0.9532269835472107, 'learning_rate': 3.9718518518518525e-06, 'epoch': 8.014814814814814}
{'loss': 1.0252, 'grad_norm': 0.937930703163147, 'learning_rate': 3.957037037037037e-06, 'epoch': 8.022222222222222}
{'loss': 1.1333, 'grad_norm': 1.0089513063430786, 'learning_rate': 3.942222222222222e-06, 'epoch': 8.02962962962963}
{'loss': 1.2113, 'grad_norm': 0.8109027743339539, 'learning_rate': 3.927407407407408e-06, 'epoch': 8.037037037037036}
{'loss': 1.0621, 'grad_norm': 0.730042040348053, 'learning_rate': 3.9125925925925926e-06, 'epoch': 8.044444444444444}
{'loss': 1.1055, 'grad_norm': 0.6142730116844177, 'learning_rate': 3.897777777777778e-06, 'epoch': 8.051851851851852}
{'loss': 1.1413, 'grad_norm': 0.8304439783096313, 'learning_rate': 3.882962962962963e-06, 'epoch': 8.059259259259258}
{'loss': 1.1147, 'grad_norm': 0.706796407699585, 'learnin



{'loss': 1.1117, 'grad_norm': 0.8193462491035461, 'learning_rate': 1.9866666666666666e-06, 'epoch': 9.007407407407408}
{'loss': 1.0863, 'grad_norm': 1.0850257873535156, 'learning_rate': 1.971851851851852e-06, 'epoch': 9.014814814814814}
{'loss': 1.0747, 'grad_norm': 0.8376418948173523, 'learning_rate': 1.9570370370370374e-06, 'epoch': 9.022222222222222}
{'loss': 1.0875, 'grad_norm': 0.9985391497612, 'learning_rate': 1.9422222222222222e-06, 'epoch': 9.02962962962963}
{'loss': 1.1268, 'grad_norm': 1.1592857837677002, 'learning_rate': 1.9274074074074074e-06, 'epoch': 9.037037037037036}
{'loss': 1.2911, 'grad_norm': 0.909136950969696, 'learning_rate': 1.9125925925925926e-06, 'epoch': 9.044444444444444}
{'loss': 1.1118, 'grad_norm': 1.06625497341156, 'learning_rate': 1.8977777777777779e-06, 'epoch': 9.051851851851852}
{'loss': 1.1153, 'grad_norm': 1.2935144901275635, 'learning_rate': 1.882962962962963e-06, 'epoch': 9.059259259259258}
{'loss': 1.1072, 'grad_norm': 0.9742804169654846, 'learni

TrainOutput(global_step=13500, training_loss=4.241933167916757, metrics={'train_runtime': 42317.415, 'train_samples_per_second': 0.638, 'train_steps_per_second': 0.319, 'train_loss': 4.241933167916757, 'epoch': 10.0})

In [15]:
model.save_pretrained("./t5-rewrite-final")
tokenizer.save_pretrained("./t5-rewrite-final")

('./t5-rewrite-final\\tokenizer_config.json',
 './t5-rewrite-final\\special_tokens_map.json',
 './t5-rewrite-final\\spiece.model',
 './t5-rewrite-final\\added_tokens.json',
 './t5-rewrite-final\\tokenizer.json')

In [16]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_path = "./t5-rewrite-final"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [30]:
def rewrite(context, input_text):
    context_text = "\n".join(context)
    prompt = f"[REWRITE]\n{context_text}\n{input_text}"

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(model.device)

    output_ids = model.generate(
        inputs["input_ids"],
        max_length=64,
        num_beams=5,
        no_repeat_ngram_size=2,
        repetition_penalty=1.5,
        top_p=0.92,
        top_k=50,
        do_sample=False,
        early_stopping=True
    )

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text

In [31]:
context = [
    "A: 나 배 아파",
    "B: 그래?",
    "A: 어제 많이 먹어서 그런가"
]

input_text = "B: 너가 돼지새끼여서 그런듯"

# 🔸 추론 함수 실행
rewritten_output = rewrite(context, input_text)

# 🔸 결과 출력
print("🟡 INPUT:", input_text)
print("🟢 OUTPUT:", rewritten_output)
    

The following generation flags are not valid and may be ignored: ['top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🟡 INPUT: B: 너가 돼지새끼여서 그런듯
🟢 OUTPUT: 큐브큐브 crossover crossover됩큐브 비일비재 비일비재 crossover 비일비재 베네 베네큐브 Panther 비일비재꽃보다꽃보다묶 비일비재쿨 비일비재핵심큐브CU 베네 비일비재큐브됩 비일비재 Colbert 비일비재ré 비일비재 큐브큐브 Papa 비일비재묶묶꽃보다 비일비재 명희숙 비일비재 Tab큐브 Booker큐브 각하 비일비재젊 비일비재 Panther큐브 베네 crossover 베네 전례 비일비재 주체별 비일비재 기압골 비일비재주신고가
