In [1]:
!pip install -q torchmetrics==1.3.1

In [2]:
!pip install -q evaluate rouge-score

In [3]:
!pip install -U bitsandbytes accelerate



In [4]:
pip install --upgrade transformers



In [7]:
%%writefile hangul_review_restorer.py

from __future__ import annotations
import os
# disable wandb logging
os.environ["WANDB_DISABLED"] = "true"

import re, random, multiprocessing as mp
from dataclasses import dataclass
import numpy as np, pandas as pd, torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torchmetrics.text import CharErrorRate
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
)
import evaluate
import csv
import json

# Single seed shared by Python, NumPy, PyTorch, the train/validation split and
# the Hugging Face training arguments so runs are reproducible.  42 matches
# the random_state the notebook's evaluation cell uses to rebuild the split.
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Matches any character that is neither a precomposed Hangul syllable
# (U+AC00..U+D7A3) nor whitespace.
_H = re.compile(r"[^\uAC00-\uD7A3\s]")
# Matches a run of one or more whitespace characters.
_S = re.compile(r"\s+")


def norm(t: str) -> str:
    """Keep only Hangul syllables, separated by single spaces.

    Every non-Hangul, non-whitespace character is replaced by a space, runs
    of whitespace are collapsed to one space, and the result is stripped.
    """
    hangul_only = _H.sub(" ", t)
    single_spaced = _S.sub(" ", hangul_only)
    return single_spaced.strip()

def load_csv(path: str, need_output: bool) -> pd.DataFrame:
    """Read a CSV whose encoding is unknown and return its text columns.

    Args:
        path: Location of the CSV file.
        need_output: When true both ``input`` and ``output`` columns are
            returned; otherwise only ``input``.

    Returns:
        A DataFrame restricted to the requested columns, cast to ``str``.

    Raises:
        UnicodeDecodeError: If none of the candidate encodings can decode
            the file.
        KeyError: If a requested column is missing from the file.
    """
    cols = ["input", "output"] if need_output else ["input"]
    # Strict UTF-8 goes first: it rejects legacy-encoded Korean bytes, whereas
    # cp949 can "successfully" decode a UTF-8 BOM plus ASCII header into a
    # mangled column name, which would surface as a KeyError instead of
    # falling through to the right codec.
    for enc in ("utf-8-sig", "cp949", "euc-kr"):
        try:
            frame = pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            continue
        return frame[cols].astype(str)
    raise UnicodeDecodeError("csv", b"", 0, 1, "cannot decode with cp949/utf-8-sig/euc-kr")

# ------------------------------------------------------------------
# Jamo tables
# ------------------------------------------------------------------
# Leading consonants, vowels and trailing consonants are taken from the
# conjoining-jamo code point ranges; "∅" stands for "no trailing consonant".
CHO = list(map(chr, range(0x1100, 0x1113)))
JUNG = list(map(chr, range(0x1161, 0x1176)))
JONG = ["∅", *map(chr, range(0x11A8, 0x11C3))]
# Flat vocabulary: leads, then vowels, then real trailing consonants, with the
# "∅" placeholder as the very last entry.
JAMO = [*CHO, *JUNG, *JONG[1:], "∅"]
J2I = {jamo: idx for idx, jamo in enumerate(JAMO)}
I2J = dict(enumerate(JAMO))
# Code point of the first precomposed Hangul syllable ("가").
HBASE = 0xAC00

def decomp(s):
    """Split one precomposed Hangul syllable into (lead, vowel, tail) jamo.

    The tail is the "∅" placeholder when the syllable has no trailing
    consonant.  The caller is expected to pass a character in "가".."힣".
    """
    offset = ord(s) - HBASE
    # 588 = 21 vowels * 28 tails per leading consonant.
    lead_idx, remainder = divmod(offset, 588)
    vowel_idx, tail_idx = divmod(remainder, 28)
    return CHO[lead_idx], JUNG[vowel_idx], JONG[tail_idx]

def comp(c, j, jg="∅"):
    """Compose a precomposed Hangul syllable from its jamo.

    Args:
        c: Leading consonant (a member of ``CHO``).
        j: Vowel (a member of ``JUNG``).
        jg: Trailing consonant (a member of ``JONG``); "∅" means none.

    Raises:
        ValueError: If a jamo is not found in its table.
    """
    tail_idx = JONG.index(jg) if jg != "∅" else 0
    code_point = HBASE + CHO.index(c) * 588 + JUNG.index(j) * 28 + tail_idx
    return chr(code_point)

# ------------------------------------------------------------------
# Stage-1: jamo MLP
# ------------------------------------------------------------------
class JamoDS(Dataset):
    """Character-aligned jamo dataset built from (input, output) text pairs.

    Both texts are normalised with ``norm``; pairs whose normalised lengths
    differ are dropped because characters are aligned by index.  Each sample
    is ``(x, pos, y)`` where ``x``/``y`` hold the three jamo vocabulary ids of
    the source/target syllable and ``pos`` is the relative position of the
    character inside the normalised text (0..1).
    """

    def __init__(self, df):
        xs, ys, positions = [], [], []
        for a, b in zip(df["input"], df["output"]):
            a, b = map(norm, (a, b))
            if len(a) != len(b):
                continue
            denom = max(1, len(a) - 1)
            for i, (ca, cb) in enumerate(zip(a, b)):
                # Both sides must be Hangul syllables: equal lengths do not
                # guarantee that spaces line up, and decomp() on a space
                # raises IndexError.
                if not ("가" <= ca <= "힣" and "가" <= cb <= "힣"):
                    continue
                xs.append([J2I[j] for j in decomp(ca)])
                ys.append([J2I[j] for j in decomp(cb)])
                positions.append([i / denom])
        # Explicit shapes/dtypes keep an empty dataset well-formed as well.
        self.x = torch.tensor(xs, dtype=torch.long).reshape(len(xs), 3)
        self.y = torch.tensor(ys, dtype=torch.long).reshape(len(ys), 3)
        self.pos = torch.tensor(positions, dtype=torch.float32).reshape(len(positions), 1)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        return self.x[i], self.pos[i], self.y[i]

class JamoMLP(nn.Module):
    """MLP that maps a syllable's three jamo ids plus its position to logits.

    The output has shape ``(batch, 3, len(JAMO))``: one distribution over the
    whole jamo vocabulary for each of the lead / vowel / tail slots.
    """

    def __init__(self, emb=96, hid=256):
        super().__init__()
        half = emb // 2
        vocab = len(JAMO)
        # Shared embedding for all three jamo slots.
        self.e = nn.Embedding(vocab, half)
        # Projects the scalar relative position to the embedding width.
        self.p = nn.Sequential(
            nn.Linear(1, half),
            nn.GELU(),
            nn.Linear(half, half),
        )
        # Input is three jamo embeddings plus the position feature.
        self.m = nn.Sequential(
            nn.Linear(half * 4, hid),
            nn.GELU(),
            nn.Linear(hid, hid),
            nn.GELU(),
        )
        self.h = nn.Linear(hid, vocab * 3)

    def forward(self, x, pos):
        jamo_feat = self.e(x).view(x.size(0), -1)
        pos_feat = self.p(pos)
        hidden = self.m(torch.cat([jamo_feat, pos_feat], -1))
        return self.h(hidden).view(-1, 3, len(JAMO))

def train_stage1(model, ds, ep=15, bs=512, lr=1e-3, save="jamo.pth"):
    """Train the Stage-1 jamo model and write its state dict to ``save``.

    Args:
        model: Module called as ``model(x, pos)`` returning per-slot logits.
        ds: Dataset yielding ``(x, pos, y)`` samples.
        ep: Number of epochs.
        bs: Batch size.
        lr: AdamW learning rate.
        save: Destination path for the state dict.

    The model is moved to the CPU before saving and is left there.
    """
    # Pinned memory only helps host-to-GPU copies; skip it on CPU-only hosts.
    dl = DataLoader(ds, bs, shuffle=True, pin_memory=device.type == "cuda")
    opt = AdamW(model.parameters(), lr=lr)
    ce = nn.CrossEntropyLoss()
    model.to(device)
    for e in range(1, ep + 1):
        tot = 0.0
        model.train()
        for x, p, y in dl:
            x, p, y = x.to(device), p.to(device), y.to(device)
            # One forward pass per batch; the three slot losses share it.
            logits = model(x, p)
            loss = sum(ce(logits[:, i], y[:, i]) for i in range(3)) / 3
            opt.zero_grad()
            loss.backward()
            opt.step()
            tot += loss.item() * x.size(0)
        print(f"[Stage-1] {e}/{ep} loss {tot/len(ds):.4f}")
    torch.save(model.cpu().state_dict(), save)

@torch.inference_mode()
def jamo_infer(model, text: str) -> str:
    """Rewrite every Hangul syllable of ``text`` with the Stage-1 model.

    Non-Hangul characters are passed through unchanged.  The position feature
    of each syllable is its index divided by ``len(text) - 1``.

    Args:
        model: Stage-1 module, already placed on ``device``.
        text: Text to restore.

    Returns:
        The text with each Hangul syllable replaced by the model's prediction.
    """
    model.eval()
    ch = list(text)
    hangul_at = [i for i, c in enumerate(ch) if "가" <= c <= "힣"]
    if not hangul_at:
        return "".join(ch)

    denom = max(1, len(ch) - 1)
    # All syllables go through the model in one batch instead of one by one.
    x = torch.tensor(
        [[J2I[j] for j in decomp(ch[i])] for i in hangul_at], device=device
    )
    p = torch.tensor([[i / denom] for i in hangul_at], device=device)
    logits = model(x, p)

    # Each slot is decoded only over its own part of the JAMO vocabulary
    # (leads, then vowels, then tails incl. "∅").  An unrestricted argmax can
    # pick a jamo from another table, which makes comp() raise ValueError.
    vowel_start = len(CHO)
    tail_start = vowel_start + len(JUNG)
    leads = logits[:, 0, :vowel_start].argmax(-1).tolist()
    vowels = logits[:, 1, vowel_start:tail_start].argmax(-1).tolist()
    tails = logits[:, 2, tail_start:].argmax(-1).tolist()

    for i, lead, vowel, tail in zip(hangul_at, leads, vowels, tails):
        ch[i] = comp(CHO[lead], JUNG[vowel], JAMO[tail_start + tail])
    return "".join(ch)

# ------------------------------------------------------------------
# Stage-2 : KoBART Seq2Seq
# ------------------------------------------------------------------
@dataclass
class Cfg:
    """Hyper-parameters for the Stage-2 sequence-to-sequence model."""

    # Hugging Face checkpoint that is fine-tuned.
    model: str = "hyunwoongko/kobart"
    # Token limit used for truncation and for generation length.
    max_len: int = 512
    # Per-device batch size for training and evaluation.
    bs: int = 4
    # Learning rate passed to the trainer.
    lr: float = 2e-5
    # Number of training epochs.
    epochs: int = 1
    # Weight of the error-rate term added to the loss.
    alpha: float = 0.2


# Module-wide configuration instance with the default values.
C = Cfg()

class TrainerED(Seq2SeqTrainer):
    """Seq2SeqTrainer whose loss is cross-entropy plus a weighted error rate.

    The error rate is an edit-distance ratio between the greedy (argmax)
    token predictions and the label tokens.  It is computed from discrete
    token ids, so it carries no gradient: it shifts the reported loss value
    (and therefore ``eval_loss``) but does not change the parameter updates.

    Args:
        alpha: Weight of the error-rate term.
    """

    def __init__(self, *a, alpha=0.2, **k):
        super().__init__(*a, **k)
        self.alpha = alpha
        self.cer = CharErrorRate().to(self.model.device)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        lab = inputs.pop("labels")
        out = model(**inputs, labels=lab)
        # The penalty is non-differentiable, so keep it out of autograd.
        with torch.no_grad():
            pred = out.logits.argmax(-1)
            # -100 marks padded label positions (the collator's default
            # ignore index); counting them would inflate the error rate.
            keep = lab.ne(-100)
            hyp = [row[mask].tolist() for row, mask in zip(pred, keep)]
            ref = [row[mask].tolist() for row, mask in zip(lab, keep)]
            if any(ref):
                # NOTE(review): CharErrorRate is documented for strings; this
                # relies on it also accepting token-id sequences, as the
                # tensor-based call it replaces already did - confirm.
                # Plain Python lists avoid per-element tensor comparisons.
                penalty = self.cer(hyp, ref).to(out.loss.device)
            else:
                # No real label tokens in the batch: avoid a 0/0 error rate.
                penalty = out.loss.new_zeros(())
        loss = out.loss + self.alpha * penalty
        return (loss, out) if return_outputs else loss

def train_stage2(tr_df, va_df, save_dir):
    """Fine-tune the Stage-2 seq2seq model and save it with its tokenizer.

    Args:
        tr_df: Training frame with ``input`` and ``output`` text columns.
        va_df: Validation frame with the same columns.
        save_dir: Directory used for trainer checkpoints and the final
            model/tokenizer files.
    """
    import inspect

    # Aliased so it does not shadow torch's Dataset imported at module level.
    from datasets import Dataset as HFDataset

    tok = AutoTokenizer.from_pretrained(C.model)
    mdl = AutoModelForSeq2SeqLM.from_pretrained(C.model)

    def encode(batch):
        # Tokenise source and target together; targets become ``labels``.
        return tok(text=batch["input"], text_target=batch["output"],
                   max_length=C.max_len, truncation=True)

    def to_dataset(df):
        return HFDataset.from_pandas(df).map(
            encode, batched=True, num_proc=mp.cpu_count()
        )

    tr = to_dataset(tr_df)
    va = to_dataset(va_df)

    coll = DataCollatorForSeq2Seq(tok, mdl, padding="longest")

    # Training configuration
    args = Seq2SeqTrainingArguments(
        output_dir=save_dir,
        per_device_train_batch_size=C.bs,
        per_device_eval_batch_size=C.bs,
        eval_strategy="steps",
        save_strategy="steps",
        eval_steps=500,
        save_steps=500,
        save_total_limit=2,
        num_train_epochs=C.epochs,
        learning_rate=C.lr,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        logging_steps=100,
        seed=SEED,
        predict_with_generate=True,
        report_to="none",
    )

    # Recent transformers releases renamed the trainer's ``tokenizer``
    # argument to ``processing_class``; pick whichever this install exposes,
    # since the notebook upgrades transformers without pinning a version.
    trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
    tok_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = TrainerED(
        model=mdl,
        args=args,
        train_dataset=tr,
        eval_dataset=va,
        data_collator=coll,
        alpha=C.alpha,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        **{tok_kwarg: tok},
    )
    trainer.train()

    mdl.save_pretrained(save_dir)
    tok.save_pretrained(save_dir)

# ------------------------------------------------------------------
# Pipeline
# ------------------------------------------------------------------
def run_pipeline(train_csv, test_csv, sub_csv, work_dir="./work"):
    """Train (or reuse) both stages, restore the test set and save the result.

    Args:
        train_csv: CSV with ``input``/``output`` columns used for training.
        test_csv: CSV with an ``input`` column to restore.
        sub_csv: Destination of the one-column (``output``) submission CSV.
        work_dir: Directory holding the cached Stage-1 and Stage-2 weights.

    NOTE(review): training text is passed through ``norm`` but test text is
    fed to both stages as-is, and Stage-2 is fine-tuned on the raw ``input``
    column rather than on Stage-1 output - confirm both are intended.
    """
    os.makedirs(work_dir, exist_ok=True)

    def _normalized_split():
        # Load the training CSV, normalise both columns, split 90/10.
        frame = load_csv(train_csv, need_output=True)
        for col in ("input", "output"):
            frame[col] = frame[col].apply(norm)
        return train_test_split(frame, test_size=0.1, random_state=SEED)

    # Stage-1 ---------------------------------------------------------
    jamo_ckpt = f"{work_dir}/jamo.pth"
    if not os.path.exists(jamo_ckpt):
        tr_df, _ = _normalized_split()
        stage1 = JamoMLP()
        train_stage1(stage1, JamoDS(tr_df), save=jamo_ckpt)
    else:
        print("🔹 Stage-1 가중치 존재 → 학습 건너뜀")
        stage1 = JamoMLP()
        stage1.load_state_dict(torch.load(jamo_ckpt, map_location="cpu"))

    # Stage-2 ---------------------------------------------------------
    ft_dir = f"{work_dir}/kobart_ft"
    tok_file = os.path.join(ft_dir, "tokenizer_config.json")

    # The saved tokenizer config marks a finished fine-tuning run.
    if os.path.isdir(ft_dir) and os.path.isfile(tok_file):
        print("🔹 Stage-2 이미 학습된 파일 존재 → 건너뜀")
    else:
        tr_df, va_df = _normalized_split()
        train_stage2(tr_df, va_df, ft_dir)

    # Load tokenizer & model
    tok = AutoTokenizer.from_pretrained(ft_dir)
    seq = AutoModelForSeq2SeqLM.from_pretrained(ft_dir).to(device)

    stage1 = stage1.to(device)

    # Inference & submission -------------------------------------------
    def _restore(txt):
        # Stage-1 fixes individual syllables, Stage-2 rewrites the sentence.
        coarse = jamo_infer(stage1, txt)
        enc = tok(coarse, return_tensors="pt", truncation=True,
                  max_length=C.max_len).to(device)
        gen = seq.generate(**enc, max_new_tokens=C.max_len)
        return tok.decode(gen[0], skip_special_tokens=True)

    test_df = load_csv(test_csv, need_output=False)
    outs = [_restore(txt) for txt in tqdm(test_df["input"], desc="Restoring")]

    submission = pd.DataFrame({"output": outs})
    submission.to_csv(sub_csv, index=False, encoding="utf-8-sig")
    print("✅ submission saved →", sub_csv)

Overwriting hangul_review_restorer.py


In [8]:
import importlib
import hangul_review_restorer

# Reload so the module file written by the previous cell is picked up even if
# an older version was already imported in this kernel.
importlib.reload(hangul_review_restorer)

from hangul_review_restorer import run_pipeline

# Input/output locations on the mounted drive plus the local weight cache.
pipeline_paths = {
    "train_csv": "/content/drive/MyDrive/data/train.csv",
    "test_csv": "/content/drive/MyDrive/data/test.csv",
    "sub_csv": "/content/drive/MyDrive/data/submission.csv",
    "work_dir": "/content/work",
}
run_pipeline(**pipeline_paths)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels will be overwritten to 2.


🔹 Stage-1 가중치 존재 → 학습 건너뜀
🔹 Stage-2 이미 학습된 파일 존재 → 건너뜀


Restoring:   0%|          | 0/1689 [00:00<?, ?it/s]

✅ submission saved → /content/drive/MyDrive/data/submission.csv


In [14]:
import pandas as pd

# Predictions written by the pipeline (a single "output" column).
pred_path  = "/content/drive/MyDrive/data/submission.csv"  
pred_df    = pd.read_csv(pred_path, encoding="utf-8-sig")

# The sample submission supplies the ID column in the expected row order.
id_source  = "/content/drive/MyDrive/data/sample_submission.csv"
id_df      = pd.read_csv(id_source, encoding="utf-8-sig")

# Explicit check instead of ``assert`` so it still runs under ``python -O``;
# rows are paired purely by position, so the counts must agree.
if len(pred_df) != len(id_df):
    raise ValueError(
        f"행 개수 불일치! (pred={len(pred_df)}, id={len(id_df)})"
    )

fixed_df = pd.DataFrame({
    "ID":     id_df["ID"],
    "output": pred_df["output"]
})

fixed_df.to_csv("/content/drive/MyDrive/data/submission_fixed.csv",
                index=False, encoding="utf-8-sig")
print("✅ saved submission_fixed.csv")


✅ saved submission_fixed.csv


In [17]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Same import path as hangul_review_restorer.py uses.
from torchmetrics.text import CharErrorRate
import evaluate
import torch.nn as nn
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_dir = "/content/work/kobart_ft"
train_csv = "/content/drive/MyDrive/data/train.csv"

tok   = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir).to(device)
model.eval()

# Rebuild the 10% validation split with the same test_size/random_state.
# NOTE(review): the pipeline normalises text with norm() before splitting and
# training; this cell evaluates on the raw columns and skips Stage-1 - confirm
# that is the intended comparison.
df_full = pd.read_csv(train_csv, encoding="utf-8-sig")
_, val_df = train_test_split(df_full, test_size=0.1, random_state=42)

texts = val_df["input"].tolist()
refs  = val_df["output"].tolist()

# ─── Prepare metric objects ───────────────
bleu_metric  = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
cer_metric   = CharErrorRate().to(device)
ce_loss_fn   = nn.CrossEntropyLoss(ignore_index=-100)

# ─── Generate predictions & compute automatic metrics ──────────
preds = []
for txt in tqdm(texts, desc="Generating preds"):
    inputs  = tok(txt, return_tensors="pt",
                  truncation=True, max_length=512).to(device)
    gen_ids  = model.generate(**inputs, max_new_tokens=512)
    preds.append(tok.decode(gen_ids[0], skip_special_tokens=True))

bleu   = bleu_metric.compute(predictions=preds,
                             references=[[r] for r in refs])["bleu"]
rougeL = rouge_metric.compute(predictions=preds,
                              references=refs)["rougeL"]
cer    = cer_metric(preds, refs).item()

print(f"\nBLEU:    {bleu:.4f}")
print(f"ROUGE-L: {rougeL:.4f}")
print(f"CER:     {cer:.4f}")

# ─── Compute losses (CE / ED loss) ──────────
ce_losses = []
ed_losses = []

# Evaluation only: no_grad keeps the teacher-forced forward passes from
# building autograd graphs.
with torch.no_grad():
    for txt, tgt in tqdm(zip(texts, refs), desc="Computing losses", total=len(texts)):
        enc = tok(txt,  return_tensors="pt",
                  truncation=True, max_length=512).to(device)
        lbl = tok(tgt,  return_tensors="pt",
                  truncation=True, max_length=512).to(device)

        outputs = model(**enc, labels=lbl["input_ids"])
        ce = outputs.loss.item()

        # Greedy teacher-forced prediction decoded back to text for the CER.
        pred_ids  = outputs.logits.argmax(-1)
        pred_str  = tok.decode(pred_ids[0], skip_special_tokens=True)
        ed        = cer_metric([pred_str], [tgt]).item()

        ce_losses.append(ce)
        ed_losses.append(ed)

print(f"\nAvg CE Loss:   {np.mean(ce_losses):.4f}")
print(f"Avg ED Loss:   {np.mean(ed_losses):.4f}")
# 0.2 is the alpha weight of the error-rate term used during training.
print(f"Combined Loss: {np.mean(ce_losses) + 0.2*np.mean(ed_losses):.4f}")


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels will be overwritten to 2.


Generating preds:   0%|          | 0/1127 [00:00<?, ?it/s]


BLEU:    0.1110
ROUGE-L: 0.1112
CER:     0.5060


Computing losses:   0%|          | 0/1127 [00:00<?, ?it/s]


Avg CE Loss:   2.1076
Avg ED Loss:   0.3313
Combined Loss: 2.1739
