In [None]:
# ==== 전맥락 + 화자태깅 학습 스크립트 ====
import os, json, random, numpy as np, torch, re, glob
from typing import List
import pandas as pd
from datasets import Dataset as HFDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback
)

In [None]:
# (1) 혹시 모를 잔여 마운트 해제
!fusermount -u /content/drive 2>/dev/null || true
!umount /content/drive 2>/dev/null || true

# (2) 로컬에 남아있는 가짜 /content/drive 폴더/파일 제거 후 재생성
!rm -rf /content/drive
import os
os.makedirs("/content/drive", exist_ok=True)

# (3) 강제 재마운트
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# (4) 확인
import os
print("is mount?", os.path.ismount("/content/drive"))
!ls -la /content/drive | head -n 20
!ls -la /content/drive/MyDrive | head -n 20

KeyboardInterrupt: 

In [None]:
# -------------------- 0) Common settings --------------------
# NOTE(review): presumably meant to allow the remote tokenizer/model code that
# is loaded with trust_remote_code=True — confirm that the installed
# transformers version actually reads these variables.
os.environ["HF_ALLOW_CODE_EXECUTION"] = "1"
os.environ["TRANSFORMERS_ALLOW_CODE_EXECUTION"] = "1"

def set_seed(seed=42):
    """Seed Python's `random`, NumPy and torch (plus all CUDA devices if present)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)

# Pick the GPU when torch can see one, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# *** Only the paths below need to be changed ***
# DATA_PATH: input JSONL file; BASE_DIR: directory for HPO output, checkpoints
# and the final model (created if it does not exist yet).
DATA_PATH = "/content/drive/MyDrive/metrics/혐오조롱표현탐지용데이터 15400개.jsonl"
BASE_DIR  = "/content/drive/MyDrive/model_backup/results_ctx_tgt_best"
os.makedirs(BASE_DIR, exist_ok=True)


Device: cuda


In [None]:
# Sanity check: report whether the Drive mount point and its MyDrive folder
# exist as directories.
print("after mount -> drive exists?", os.path.isdir("/content/drive"))
print("MyDrive exists?", os.path.isdir("/content/drive/MyDrive"))

after mount -> drive exists? True
MyDrive exists? True


In [None]:
def load_jsonl(path: str):
    """Read a UTF-8 JSON Lines file and return the parsed objects as a list.

    Blank lines are skipped. A line that is not valid JSON is reported on
    stdout together with its 1-based line number and then ignored.
    """
    parsed = []
    with open(path, "r", encoding="utf-8") as handle:
        for lineno, raw_line in enumerate(handle, start=1):
            payload = raw_line.strip()
            if payload:
                try:
                    obj = json.loads(payload)
                except json.JSONDecodeError as e:
                    print(f"[WARN] JSON decode error at line {lineno}: {e}")
                else:
                    parsed.append(obj)
    return parsed

def map_label(lbl: str) -> int:
    """Return 1 when the label reads "toxic" (case/whitespace-insensitive), else 0.

    Falsy labels (None, "", 0) map to 0.
    """
    if lbl:
        normalized = str(lbl).strip().lower()
        return int(normalized == "toxic")
    return 0

def ensure_speaker_prefixes(utts: List[str]) -> List[str]:
    """Return the stripped utterances, each guaranteed to start with a speaker tag.

    Utterances already starting with "A:" or "B:" are kept as they are (after
    stripping). All others get "A: " at even positions and "B: " at odd
    positions — a plain alternation used when the data carries no speaker rule.
    """
    tagged = []
    for position, utterance in enumerate(utts):
        text = utterance.strip()
        if not text.startswith(("A:", "B:")):
            speaker = "B: " if position % 2 else "A: "
            text = speaker + text
        tagged.append(text)
    return tagged

def get_speaker(line: str) -> str:
    """Return "A" or "B" according to the line's leading speaker tag, else "U"."""
    stripped = line.strip()
    for tag in ("A", "B"):
        if stripped.startswith(tag + ":"):
            return tag
    return "U"

def build_input(utterances: List[str], k: int) -> str:
    """Build the model input for the utterance at index `k`.

    Layout (newline-joined):
        [CTX] / <all utterances before k> / [/CTX]   (omitted when k has no context)
        [TGT_SPK=X] <target utterance> [/TGT]
    where X is the target's speaker tag as reported by get_speaker().
    """
    tagged = ensure_speaker_prefixes(utterances)
    context, target = tagged[:k], tagged[k]
    target_line = f"[TGT_SPK={get_speaker(target)}] {target} [/TGT]"
    lines = ["[CTX]", *context, "[/CTX]"] if context else []
    lines.append(target_line)
    return "\n".join(lines)

# Load the raw JSONL rows and turn every usable row into a
# {dialogue_id, text, label} record; unusable rows are counted and skipped.
raw = load_jsonl(DATA_PATH)
print(f"✅ 데이터 로드: {len(raw)} rows")

records = []
bad = 0
for r in raw:
    # A JSONL line may legally hold a non-object value (list, number, ...).
    if not isinstance(r, dict):
        bad += 1
        continue
    utts = r.get("utterances") or r.get("context") or []
    idx = r.get("target_index", None)
    lbl = r.get("label", None)
    # The index must be a real int: strings, floats or null would otherwise
    # raise TypeError in the range check or when indexing. bool is excluded
    # explicitly because it is a subclass of int.
    idx_ok = isinstance(idx, int) and not isinstance(idx, bool)
    utts_ok = isinstance(utts, list) and all(isinstance(u, str) for u in utts)
    if not utts_ok or not idx_ok or not 0 <= idx < len(utts):
        bad += 1
        continue
    records.append({
        "dialogue_id": r.get("dialogue_id", r.get("id", "")),
        "text": build_input(utts, idx),
        "label": map_label(lbl)
    })
if bad:
    print(f"⚠️ 무시된 레코드: {bad}")

df = pd.DataFrame(records)
print(df.head(2))
print("라벨 분포:\n", df["label"].value_counts())

def load_tok_and_model():
    """Load a tokenizer and a 2-label sequence classifier.

    Tries the "monologg/kobert" checkpoint first (with trust_remote_code=True).
    If anything in that attempt raises, the error is printed and
    "bert-base-multilingual-cased" is loaded instead.

    Returns:
        (tokenizer, model) tuple.
    """
    kobert_id = "monologg/kobert"
    mbert_id = "bert-base-multilingual-cased"
    try:
        tok = AutoTokenizer.from_pretrained(kobert_id, trust_remote_code=True)
        mdl = AutoModelForSequenceClassification.from_pretrained(
            kobert_id, trust_remote_code=True, num_labels=2
        )
        print("✅ KoBERT 로드")
    except Exception as e:
        # Best-effort fallback: any KoBERT failure switches to mBERT.
        print("❌ KoBERT 실패 → mBERT로 대체:", e)
        tok = AutoTokenizer.from_pretrained(mbert_id)
        mdl = AutoModelForSequenceClassification.from_pretrained(
            mbert_id, num_labels=2
        )
        print("✅ mBERT 로드")
    return tok, mdl

# Only the tokenizer is kept here; the model returned alongside it is discarded
# (fresh models are created later by model_init()).
# NOTE(review): this loads a full model just to throw it away — consider a
# tokenizer-only loader.
tokenizer, _ = load_tok_and_model()
# Wrap save_vocabulary so that a `filename_prefix` keyword is dropped before
# the original method is called.
# NOTE(review): presumably the custom KoBERT tokenizer's save_vocabulary does
# not accept that keyword — confirm against the tokenizer implementation.
orig_save_vocab = getattr(tokenizer, "save_vocabulary", None)
if callable(orig_save_vocab):
    def _patched_save_vocabulary(save_directory, *args, **kwargs):
        if "filename_prefix" in kwargs: kwargs.pop("filename_prefix")
        return orig_save_vocab(save_directory, *args, **kwargs)
    tokenizer.save_vocabulary = _patched_save_vocabulary
# Register the markup tokens as special tokens (recommended).
# NOTE(review): "[TGT]" is registered but build_input() never emits it (it
# writes "[TGT_SPK=X] ... [/TGT]") — confirm whether it is needed.
SPECIAL_TOKENS = {
    "additional_special_tokens": ["[CTX]", "[/CTX]", "[TGT_SPK=A]", "[TGT_SPK=B]", "[TGT_SPK=U]", "[TGT]", "[/TGT]"]
}
tokenizer.add_special_tokens(SPECIAL_TOKENS)

def model_init():
    """Create a fresh classifier for the Trainer (one call per training run/trial).

    Reuses load_tok_and_model() (KoBERT first, mBERT on failure), resizes the
    embedding matrix to the module-level tokenizer's length so the added
    special tokens have rows, and sets the label mappings on the config.

    NOTE(review): if the fallback model is loaded here while the module-level
    tokenizer came from the other checkpoint, vocabularies would not match —
    confirm this cannot happen in practice.
    """
    mdl = load_tok_and_model()[1]
    # Account for the special tokens added to the tokenizer.
    mdl.resize_token_embeddings(len(tokenizer))
    # Keep the label mapping on the config.
    class_names = ("non_toxic", "toxic")
    mdl.config.label2id = {name: i for i, name in enumerate(class_names)}
    mdl.config.id2label = dict(enumerate(class_names))
    return mdl


# -------------------- 5) Metrics / Trainer --------------------
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall, F1 and ROC-AUC for the Trainer.

    Args:
        eval_pred: pair ``(logits, labels)``; the positive class is column 1
            of ``logits``.

    Returns:
        dict with keys "accuracy", "precision", "recall", "f1" and "roc_auc".
        "roc_auc" is NaN when it cannot be computed.
    """
    logits, labels = eval_pred
    # Work in float64 so the softmax below cannot overflow on half-precision
    # or large-magnitude logits.
    logits = np.asarray(logits, dtype=np.float64)
    preds = np.argmax(logits, axis=-1)
    out = {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, zero_division=0),
        "recall": recall_score(labels, preds, zero_division=0),
        "f1": f1_score(labels, preds, zero_division=0),
    }
    try:
        # Numerically stable softmax: subtracting the row max leaves the
        # result unchanged mathematically but keeps exp() finite.
        shifted = logits - logits.max(axis=-1, keepdims=True)
        exp = np.exp(shifted)
        prob_pos = (exp / exp.sum(axis=-1, keepdims=True))[:, 1]
        out["roc_auc"] = roc_auc_score(labels, prob_pos)
    except (ValueError, IndexError):
        # ROC-AUC is auxiliary: scikit-learn raises ValueError for unusable
        # input (e.g. a single class in `labels`); IndexError covers logits
        # without a column 1. Neither should abort training.
        out["roc_auc"] = float("nan")
    return out


# -------------------- 4) Class weights & Focal Loss --------------------
# The train/validation split is made here because this section runs before the
# Dataset/Tokenize section that also creates `train_df`; without it a fresh
# top-to-bottom run fails with NameError (or silently reuses a stale split from
# earlier kernel state). The parameters match the later split exactly, and
# random_state is fixed, so both calls produce the same partition.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

pos = int((train_df["label"] == 1).sum())
neg = int((train_df["label"] == 0).sum())
ratio = neg / max(1, pos)  # max() guards against division by zero when there are no positives
w_pos = float(min(3.0, max(1.0, ratio)))  # clamp to [1, 3] (avoid over-flagging)
class_weights = torch.tensor([1.0, w_pos], dtype=torch.float32).to(device)

import torch.nn.functional as F
class FocalLoss(torch.nn.Module):
    """Class-weighted focal loss for multi-class logits.

    Per sample: ``(1 - p_t) ** gamma * CE_alpha(logits, target)``, where
    ``p_t`` is the softmax probability of the true class and ``CE_alpha`` is
    cross-entropy weighted per class by ``alpha``.

    Args:
        alpha: optional 1-D tensor of per-class weights (None = unweighted).
        gamma: focusing exponent; 0 reduces the loss to (weighted) cross-entropy.
        reduction: "mean" (plain average over samples), "sum", or "none"
            (per-sample losses).

    Raises:
        ValueError: if `reduction` is not one of the three supported values.
    """

    _REDUCTIONS = ("mean", "sum", "none")

    def __init__(self, alpha=None, gamma=1.5, reduction="mean"):
        super().__init__()
        if reduction not in self._REDUCTIONS:
            raise ValueError(
                f"reduction must be one of {self._REDUCTIONS}, got {reduction!r}"
            )
        self.alpha = alpha  # class-weight tensor (or None)
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, logits, target):
        """Return the focal loss for `logits` (N, C) and class indices `target` (N,)."""
        alpha = self.alpha
        if alpha is not None:
            # Follow the logits' device so the module works wherever the model runs.
            alpha = alpha.to(logits.device)
        ce = F.cross_entropy(logits, target, weight=alpha, reduction="none")
        pt = torch.softmax(logits, dim=-1).gather(1, target.view(-1, 1)).squeeze(1)
        # Out-of-place clamp: keeps p_t away from exactly 0/1 without mutating
        # a tensor that is part of the autograd graph.
        pt = pt.clamp(1e-6, 1 - 1e-6)
        loss = ((1 - pt) ** self.gamma) * ce
        if self.reduction == "mean":
            return loss.mean()
        if self.reduction == "sum":
            return loss.sum()
        return loss

# Shared loss instance used by every Trainer in this notebook.
focal = FocalLoss(alpha=class_weights, gamma=1.5)


def custom_compute_loss(model, inputs, return_outputs=False, **kwargs):
    """Replacement for Trainer.compute_loss that applies the focal loss.

    The "labels" entry is withheld from the forward pass, so the loss is
    computed here from the returned logits instead of by the model. Extra
    keyword arguments passed by the Trainer are accepted and ignored.

    Returns:
        ``loss`` or, when `return_outputs` is true, ``(loss, outputs)``.
    """
    labels = inputs.get("labels")
    features = {name: value for name, value in inputs.items() if name != "labels"}
    outputs = model(**features)
    loss = focal(outputs.logits.view(-1, 2), labels.view(-1))
    if return_outputs:
        return loss, outputs
    return loss

# -------------------- 3) Dataset / Tokenize --------------------
def tokenize_fn(batch):
    """Tokenize the "text" column of a batch; padding is left to the collator.

    NOTE(review): build_input() puts the target utterance at the END of the
    text. If truncation removes tokens from the right (presumably the
    tokenizer default — confirm), over-long dialogues would lose the target;
    consider truncation_side="left" and an explicit max_length.
    """
    texts = batch["text"]
    return tokenizer(texts, padding=False, truncation=True)

# Stratified 80/20 split, then convert each split into a tokenized torch-format
# HF dataset whose label column is named "labels".
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label"]
)


def _to_hf(frame):
    """Wrap a DataFrame (with a fresh 0..n-1 index) as an HF dataset."""
    return HFDataset.from_pandas(frame.reset_index(drop=True))


train_ds, val_ds = _to_hf(train_df), _to_hf(val_df)

# Raw columns that must not reach the model, limited to those actually present.
remove_cols = [
    c for c in ("text", "dialogue_id", "__index_level_0__")
    if c in train_ds.column_names
]


def _encode(ds):
    """Tokenize, drop the raw columns, rename label -> labels, emit torch tensors."""
    ds = ds.map(tokenize_fn, batched=True, remove_columns=remove_cols)
    ds = ds.rename_column("label", "labels")
    ds.set_format(type="torch")
    return ds


train_ds = _encode(train_ds)
val_ds = _encode(val_ds)

# Pads each batch dynamically to its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def hp_space_optuna(trial):
    """Sample one hyperparameter configuration from an Optuna-style `trial`.

    Returns a dict whose keys are the sampled parameter names; values come
    from the trial's suggest_* methods, called in the order listed below.
    """
    real = trial.suggest_float
    pick = trial.suggest_categorical

    space = {}
    # Learning rate: log-scale search over [1e-5, 5e-5].
    space["learning_rate"] = real("learning_rate", 1e-5, 5e-5, log=True)
    # Batch size / gradient accumulation (presumably to balance data size
    # against memory — the original comment is partly garbled).
    space["per_device_train_batch_size"] = pick("per_device_train_batch_size", [8, 16])
    space["per_device_eval_batch_size"] = pick("per_device_eval_batch_size", [16, 32])
    space["gradient_accumulation_steps"] = pick("gradient_accumulation_steps", [1, 2, 4, 8])
    space["weight_decay"] = real("weight_decay", 0.01, 0.12)
    space["warmup_ratio"] = real("warmup_ratio", 0.05, 0.30)
    space["lr_scheduler_type"] = pick(
        "lr_scheduler_type",
        ["linear", "cosine", "constant_with_warmup", "polynomial"],
    )
    space["max_grad_norm"] = real("max_grad_norm", 0.5, 1.0)
    # Epoch count is searched lightly as well (3-5, to limit overfitting).
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 3, 5)
    return space

def compute_objective(metrics):
    """Return the value to maximize during HPO: "eval_f1", or 0.0 if absent."""
    objective = metrics.get("eval_f1", 0.0)
    return objective

# ==================== [HPO] Trainer (search only) ====================
# Note: for HPO the Trainer must be given `model_init` instead of a `model`.
# No checkpoints are written during the search (save_strategy="no").
hpo_args = TrainingArguments(
    output_dir=os.path.join(BASE_DIR, "hpo"),
    logging_dir=os.path.join(BASE_DIR, "hpo_logs"),
    eval_strategy="epoch",
    save_strategy="no",
    logging_strategy="steps",
    logging_steps=50,
    load_best_model_at_end=False,     # best-model reloading is disabled during the search
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    dataloader_pin_memory=False,
    report_to=[]
)

# The run log shows a warning that EarlyStoppingCallback is used without
# load_best_model_at_end=True, i.e. the best model is not reloaded when a
# trial ends.
hpo_trainer = Trainer(
    model_init=model_init,
    args=hpo_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)
# Apply the focal loss by overriding compute_loss on this trainer instance.
hpo_trainer.compute_loss = custom_compute_loss

print("🔎 HPO 시작 (Optuna)")
best_run = hpo_trainer.hyperparameter_search(
    hp_space=hp_space_optuna,
    direction="maximize",
    compute_objective=compute_objective,
    n_trials=20,          # increase as needed (costs more time)
    backend="optuna",     # Optuna backend
)
print("🏁 HPO 완료:", best_run)

# Save the search result. BASE_DIR is (re)created first: the earlier makedirs
# may have been undone by a Drive remount, which is what produced the
# FileNotFoundError on best_params.json in the recorded run.
os.makedirs(BASE_DIR, exist_ok=True)
with open(os.path.join(BASE_DIR, "best_params.json"), "w", encoding="utf-8") as f:
    json.dump(best_run.hyperparameters, f, ensure_ascii=False, indent=2)

# Expose the result for the final training run. The names are assigned here
# rather than updated in place, so this works even when they were never
# defined before.
_best_hp = dict(best_run.hyperparameters)
# num_train_epochs is passed to TrainingArguments separately, so it is removed
# from BEST_PARAMS to avoid a duplicate keyword; a previously defined
# NUM_TRAIN_EPOCHS (or 3) is the fallback when the search did not sample it.
NUM_TRAIN_EPOCHS = int(
    _best_hp.pop("num_train_epochs", globals().get("NUM_TRAIN_EPOCHS", 3))
)
# Every remaining searched value (including max_grad_norm) is kept.
BEST_PARAMS = _best_hp









✅ 데이터 로드: 15211 rows
  dialogue_id                                               text  label
0         001  [CTX]\nA: 부랴부랴 왔는데 아무도 안왔네. 시간개념들이 없네\nB: 맞아. ...      1
1         002  [CTX]\nA: 인방 보는 남자는 거르는게 맞다\nB: 특히 벗방보는 애들은 진짜...      0
라벨 분포:
 label
0    7957
1    7254
Name: count, dtype: int64


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Map:   0%|          | 0/12168 [00:00<?, ? examples/s]

Map:   0%|          | 0/3043 [00:00<?, ? examples/s]

  hpo_trainer = Trainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


[I 2025-08-14 02:09:57,283] A new study created in memory with name: no-name-4a4843db-b59e-43fc-9793-a0b56283c9a0


🔎 HPO 시작 (Optuna)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.059,0.062201,0.957279,0.974838,0.934528,0.954258,0.993421
2,0.0357,0.090378,0.950707,0.977256,0.917988,0.946695,0.993421
3,0.0172,0.08074,0.958265,0.976259,0.935217,0.955297,0.994876


[I 2025-08-14 02:13:53,410] Trial 0 finished with value: 0.9552974304822246 and parameters: {'learning_rate': 1.319378718706203e-05, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 32, 'gradient_accumulation_steps': 2, 'weight_decay': 0.10962598352031049, 'warmup_ratio': 0.051246968699662845, 'lr_scheduler_type': 'linear', 'max_grad_norm': 0.7325620670571218, 'num_train_epochs': 3}. Best is trial 0 with value: 0.9552974304822246.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0762,0.169657,0.882682,0.984071,0.766368,0.861682,0.990776
2,0.0396,0.049267,0.970424,0.959487,0.979325,0.969304,0.994577
3,0.0159,0.047734,0.976011,0.967436,0.982771,0.975043,0.995235
4,0.0011,0.068654,0.969767,0.972203,0.964163,0.968166,0.99475


[I 2025-08-14 02:19:13,904] Trial 1 finished with value: 0.9681660899653979 and parameters: {'learning_rate': 3.133444371761284e-05, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16, 'gradient_accumulation_steps': 2, 'weight_decay': 0.01385857556212863, 'warmup_ratio': 0.23621876893445004, 'lr_scheduler_type': 'linear', 'max_grad_norm': 0.9305827512379382, 'num_train_epochs': 4}. Best is trial 1 with value: 0.9681660899653979.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.2255,0.082072,0.932632,0.978495,0.878015,0.925536,0.989614
2,0.0461,0.042972,0.964837,0.969274,0.956582,0.962886,0.992924
3,0.0286,0.044302,0.96188,0.968421,0.951068,0.959666,0.99367
4,0.0183,0.043638,0.965823,0.969993,0.95796,0.963939,0.993783


[I 2025-08-14 02:21:56,978] Trial 2 finished with value: 0.9639389736477115 and parameters: {'learning_rate': 1.5936149491629827e-05, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'gradient_accumulation_steps': 8, 'weight_decay': 0.0898588124052949, 'warmup_ratio': 0.13607602412400654, 'lr_scheduler_type': 'cosine', 'max_grad_norm': 0.9901711153532169, 'num_train_epochs': 4}. Best is trial 1 with value: 0.9681660899653979.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.2238,0.050645,0.953664,0.945578,0.95796,0.951729,0.987713
2,0.0487,0.048145,0.958265,0.974212,0.937285,0.955392,0.994282
3,0.0245,0.043409,0.967138,0.972048,0.958649,0.965302,0.994534
4,0.0142,0.076807,0.955636,0.972023,0.933839,0.952548,0.993833
5,0.0092,0.068021,0.962537,0.970443,0.950379,0.960306,0.993401


[I 2025-08-14 02:25:19,535] Trial 3 finished with value: 0.9603064066852368 and parameters: {'learning_rate': 3.041536388981852e-05, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'gradient_accumulation_steps': 8, 'weight_decay': 0.08105201872285554, 'warmup_ratio': 0.1961631681881435, 'lr_scheduler_type': 'constant_with_warmup', 'max_grad_norm': 0.8076288957403834, 'num_train_epochs': 5}. Best is trial 1 with value: 0.9681660899653979.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0566,0.069334,0.940519,0.974589,0.898691,0.935102,0.991848
2,0.0427,0.038383,0.970753,0.958895,0.980703,0.969676,0.994255
3,0.0151,0.043761,0.970753,0.971607,0.966919,0.969257,0.995333
4,0.0034,0.05968,0.965823,0.972632,0.955203,0.963839,0.995091


[I 2025-08-14 02:30:27,071] Trial 4 finished with value: 0.9638386648122392 and parameters: {'learning_rate': 2.6424958947429168e-05, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16, 'gradient_accumulation_steps': 4, 'weight_decay': 0.09195722341351406, 'warmup_ratio': 0.28969701629752054, 'lr_scheduler_type': 'cosine', 'max_grad_norm': 0.8963893519373398, 'num_train_epochs': 4}. Best is trial 1 with value: 0.9681660899653979.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0557,0.076038,0.940519,0.981061,0.892488,0.934681,0.993075


[I 2025-08-14 02:31:13,979] Trial 5 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0616,0.072019,0.945777,0.979851,0.904893,0.940881,0.993171
2,0.048,0.049477,0.967466,0.961119,0.971054,0.966061,0.993488
3,0.024,0.063235,0.967795,0.9507,0.98346,0.966802,0.993792
4,0.0115,0.073877,0.966809,0.976023,0.953825,0.964796,0.993867


[I 2025-08-14 02:34:27,797] Trial 6 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0534,0.070717,0.95235,0.973875,0.924879,0.948745,0.993113
2,0.0316,0.080762,0.955307,0.976795,0.928325,0.951943,0.993976


[I 2025-08-14 02:37:09,900] Trial 7 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.2461,0.092493,0.912258,0.975884,0.836664,0.900928,0.984896


[I 2025-08-14 02:37:53,380] Trial 8 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0566,0.050452,0.957936,0.969482,0.94142,0.955245,0.991921
2,0.0393,0.037767,0.967795,0.970118,0.962095,0.96609,0.994177


[I 2025-08-14 02:39:21,370] Trial 9 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0444,0.148934,0.922445,0.984064,0.851137,0.912786,0.992133
2,0.0521,0.045795,0.974039,0.964141,0.982081,0.973028,0.994966
3,0.0084,0.062017,0.970095,0.974198,0.962784,0.968458,0.995374


[I 2025-08-14 02:43:45,188] Trial 10 finished with value: 0.9684575389948007 and parameters: {'learning_rate': 3.3488606479071704e-05, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16, 'gradient_accumulation_steps': 1, 'weight_decay': 0.04475501618361846, 'warmup_ratio': 0.16345827033905022, 'lr_scheduler_type': 'linear', 'max_grad_norm': 0.9850833568331282, 'num_train_epochs': 3}. Best is trial 10 with value: 0.9684575389948007.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0452,0.068181,0.968781,0.964384,0.970365,0.967365,0.992873
2,0.0635,0.064027,0.964837,0.97191,0.953825,0.962783,0.994848


[I 2025-08-14 02:46:40,221] Trial 11 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0634,0.124083,0.930003,0.979845,0.871123,0.922291,0.992113
2,0.0487,0.063653,0.967466,0.975387,0.955892,0.965541,0.995292


[I 2025-08-14 02:49:36,435] Trial 12 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0517,0.05478,0.957608,0.971469,0.938663,0.954784,0.992753
2,0.0325,0.064933,0.94512,0.980539,0.902826,0.940079,0.994898


[I 2025-08-14 02:52:09,890] Trial 13 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0633,0.063657,0.958593,0.975592,0.936595,0.955696,0.993908
2,0.0625,0.069895,0.960237,0.927378,0.994487,0.959761,0.992914


[I 2025-08-14 02:55:05,570] Trial 14 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0661,0.161998,0.915215,0.982215,0.837354,0.904018,0.989351


[I 2025-08-14 02:56:27,716] Trial 15 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0618,0.092014,0.955307,0.977487,0.927636,0.951909,0.993038
2,0.0651,0.052252,0.967138,0.960464,0.971054,0.96573,0.993924


[I 2025-08-14 02:59:23,161] Trial 16 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0461,0.068818,0.950049,0.972364,0.921433,0.946214,0.991679
2,0.0354,0.045026,0.965495,0.958447,0.969676,0.964029,0.993656


[I 2025-08-14 03:01:58,268] Trial 17 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0555,0.079485,0.950707,0.977256,0.917988,0.946695,0.992719
2,0.0594,0.051295,0.962208,0.936601,0.987595,0.961422,0.992585


[I 2025-08-14 03:04:50,338] Trial 18 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0521,0.098292,0.940519,0.974589,0.898691,0.935102,0.99183
2,0.0358,0.077037,0.948735,0.974359,0.916609,0.944602,0.992981


[I 2025-08-14 03:07:33,561] Trial 19 pruned. 


🏁 HPO 완료: BestRun(run_id='10', objective=0.9684575389948007, hyperparameters={'learning_rate': 3.3488606479071704e-05, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16, 'gradient_accumulation_steps': 1, 'weight_decay': 0.04475501618361846, 'warmup_ratio': 0.16345827033905022, 'lr_scheduler_type': 'linear', 'max_grad_norm': 0.9850833568331282, 'num_train_epochs': 3}, run_summary=None)


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/model_backup/results_ctx_tgt_best/best_params.json'

In [None]:
# Hyperparameters of the best HPO trial (run_id '10' in the search log above),
# hard-coded so this cell can run without repeating the search.
BEST_PARAMS = {
    "learning_rate": 3.3488606479071704e-05,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 16,
    "gradient_accumulation_steps": 1,
    "weight_decay": 0.04475501618361846,
    "warmup_ratio": 0.16345827033905022,
    "lr_scheduler_type": "linear",
    "max_grad_norm": 0.9850833568331282
}

NUM_TRAIN_EPOCHS = 3  # taken from the HPO result

# Unlike the HPO arguments, checkpoints are saved every epoch (at most two
# kept) and the best model by F1 is reloaded at the end of training.
final_args = TrainingArguments(
    output_dir=BASE_DIR,
    logging_dir=os.path.join(BASE_DIR, "logs"),
    eval_strategy="epoch",  # `eval_strategy` is the current parameter name; `evaluation_strategy` is the older, deprecated spelling
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    dataloader_pin_memory=False,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    **BEST_PARAMS,
    report_to=[]
)

final_trainer = Trainer(
    model_init=model_init,  # the final run uses the same initialisation path as the search
    args=final_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)
# Apply the focal loss by overriding compute_loss on this trainer instance.
final_trainer.compute_loss = custom_compute_loss

print("🚀 최종 학습 시작 (HPO 최적 파라미터 적용)")
final_trainer.train()
print("✅ 최종 학습 종료")

import numpy as np
def find_best_threshold(trainer, val_ds, target_metric="f1"):
    """Pick the class-1 probability cutoff that maximizes a validation metric.

    Runs ``trainer.predict(val_ds)``, turns the returned logits into class
    probabilities with a numerically stable softmax, and scans 19 evenly
    spaced thresholds from 0.05 to 0.95 applied to the probability of class
    index 1.

    Args:
        trainer: Object whose ``predict(val_ds)`` result exposes
            ``predictions`` (logits) and ``label_ids``.
        val_ds: Dataset handed to ``trainer.predict``.
        target_metric: One of "accuracy", "precision", "recall" or "f1".

    Returns:
        The best threshold as a float. On ties the lowest threshold wins.

    Raises:
        KeyError: If ``target_metric`` is not one of the supported names.
    """
    scorers = {
        "accuracy": accuracy_score,
        "precision": lambda y, p: precision_score(y, p, zero_division=0),
        "recall": lambda y, p: recall_score(y, p, zero_division=0),
        "f1": lambda y, p: f1_score(y, p, zero_division=0),
    }
    # Fail on an unknown metric name before paying for a prediction pass.
    score = scorers[target_metric]

    preds = trainer.predict(val_ds)
    logits = preds.predictions
    # Some models return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    labels = preds.label_ids

    # Subtract the row-wise max before exponentiating so large logits cannot
    # overflow to inf and turn the probabilities into NaN.
    logits = np.asarray(logits, dtype=np.float64)
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp = np.exp(shifted)
    prob_toxic = (exp / exp.sum(axis=-1, keepdims=True))[:, 1]

    best_t, best = 0.5, -1
    for t in np.linspace(0.05, 0.95, 19):
        yhat = (prob_toxic >= t).astype(int)
        m = score(labels, yhat)
        # Strict comparison keeps the first (lowest) threshold on ties.
        if m > best:
            best, best_t = m, t
    return float(best_t)

# Tune the decision threshold on the validation set, optimizing F1.
best_t = find_best_threshold(final_trainer, val_ds, target_metric="f1")

# Persist the threshold next to the model so inference can pick it up.
os.makedirs(BASE_DIR, exist_ok=True)
threshold_path = os.path.join(BASE_DIR, "threshold.json")
threshold_json = json.dumps({"threshold": best_t}, ensure_ascii=False, indent=2)
with open(threshold_path, "w", encoding="utf-8") as f:
    f.write(threshold_json)
print("✅ 최적 threshold 저장:", best_t)


# Save the model weights and the tokenizer into the same directory.
final_trainer.save_model(BASE_DIR)
tokenizer.save_pretrained(BASE_DIR)
print("✅ 모델과 토크나이저 저장 완료:", BASE_DIR)

  final_trainer = Trainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드
🚀 최종 학습 시작 (HPO 최적 파라미터 적용)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.0685,0.148934,0.922445,0.984064,0.851137,0.912786,0.992133
2,0.0419,0.045795,0.974039,0.964141,0.982081,0.973028,0.994966
3,0.0141,0.062914,0.969767,0.974843,0.961406,0.968078,0.995384


✅ 최종 학습 종료


✅ 최적 threshold 저장: 0.44999999999999996
✅ 모델과 토크나이저 저장 완료: /content/drive/MyDrive/model_backup/results_ctx_tgt_best


In [None]:
# Mount Google Drive at /content/drive; the model directory used by the
# following cells lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ==== Inference script: full context + speaker tagging ====
import os, json, torch, re
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Same path as used for training: the model, tokenizer and threshold.json
# are read from this directory.
BASE_DIR = "/content/drive/MyDrive/model_backup/results_ctx_tgt_best"
# Run on the GPU when CUDA is available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def load_threshold(model_dir, default=0.1):
    """Read the decision threshold stored alongside a saved model.

    Looks for ``threshold.json`` inside ``model_dir`` and returns the value
    under its ``"threshold"`` key converted to ``float``.

    Args:
        model_dir: Directory expected to contain ``threshold.json``.
        default: Value returned when the file is missing, unreadable, not
            valid JSON, or does not hold a usable ``"threshold"`` entry.

    Returns:
        The stored threshold as a float, or ``default``.
    """
    p = os.path.join(model_dir, "threshold.json")
    try:
        # The context manager closes the handle even when parsing fails.
        with open(p, "r", encoding="utf-8") as f:
            return float(json.load(f)["threshold"])
    except (OSError, ValueError, KeyError, TypeError, OverflowError):
        # Best-effort lookup: a missing file (OSError), malformed JSON or a
        # non-numeric value (ValueError), a missing key (KeyError) or an
        # unexpected JSON shape (TypeError) all fall back to the default.
        return default

def ensure_speaker_prefixes(utts):
    """Return the utterances stripped and each carrying a speaker prefix.

    An utterance that already starts with "A:" or "B:" (after stripping) is
    kept as is. Any other utterance gets "A: " when its position is even and
    "B: " when it is odd.
    """
    def _tag(position, utterance):
        text = utterance.strip()
        if text.startswith(("A:", "B:")):
            return text
        speaker = "B: " if position % 2 else "A: "
        return speaker + text

    return [_tag(position, utterance) for position, utterance in enumerate(utts)]

def get_speaker(line: str) -> str:
    """Return "A" or "B" for a line with that speaker prefix, else "U"."""
    stripped = line.strip()
    for tag in ("A", "B"):
        if stripped.startswith(tag + ":"):
            return tag
    return "U"

def build_input_for_infer(utterances):
    """Format a dialogue into the tagged text the classifier receives.

    Every utterance except the last is listed between "[CTX]" and "[/CTX]".
    The last utterance is the target and is wrapped as
    "[TGT_SPK=<speaker>] <line> [/TGT]". All parts are joined with newlines.
    At least two utterances are required.
    """
    assert len(utterances) >= 2, "최소 2줄 이상 필요"
    tagged = ensure_speaker_prefixes(utterances)
    context = tagged[:-1]
    target = tagged[-1]

    segments = ["[CTX]", *context, "[/CTX]"] if context else []
    segments.append(f"[TGT_SPK={get_speaker(target)}] {target} [/TGT]")
    return "\n".join(segments)

# (Optional) emergency guard: reduce false positives on "~줄래?" style requests
# that occur in a confession / romance context.
LOVE_SAFE = re.compile(
    r"(사랑|좋아|결혼|연애|고백|프로포즈).*(줄래|주겠니|줄\s*수\s*있어)\??$",
    re.IGNORECASE
)
# Closing target marker that build_input_for_infer appends to the text.
_TGT_CLOSE = re.compile(r"\s*\[/TGT\]\s*$")


def apply_guards(formatted_text: str, pred_label: str, prob_toxic: float, threshold: float) -> str:
    """Downgrade a weakly confident "toxic" prediction in a romance context.

    Args:
        formatted_text: Model input text, with or without the trailing
            "[/TGT]" marker.
        pred_label: Label chosen from the threshold, "toxic" or "non_toxic".
        prob_toxic: Model probability for the toxic class.
        threshold: Decision threshold that produced ``pred_label``.

    Returns:
        "non_toxic" when ``pred_label`` is "toxic", the text matches
        ``LOVE_SAFE`` and ``prob_toxic`` is below ``max(0.9, threshold + 0.3)``;
        otherwise ``pred_label`` unchanged.
    """
    if pred_label != "toxic":
        return pred_label

    flat = formatted_text.replace("\n", " ")
    # LOVE_SAFE is anchored at the end of the string. Without removing the
    # trailing "[/TGT]" marker the pattern could never match formatted input,
    # which made the guard a no-op.
    flat = _TGT_CLOSE.sub("", flat).rstrip()

    if LOVE_SAFE.search(flat) and prob_toxic < max(0.9, threshold + 0.3):
        return "non_toxic"
    return pred_label

# Load the saved tokenizer and classifier, move the model to the target
# device and switch it to eval mode, then read the stored threshold.
tokenizer = AutoTokenizer.from_pretrained(BASE_DIR, trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained(BASE_DIR, trust_remote_code=True)
model = model.to(DEVICE)
model = model.eval()
TH = load_threshold(BASE_DIR, default=0.5)

@torch.inference_mode()
def predict_toxic_with_context(utterances, threshold=None, use_guard=True):
    """Classify the last utterance of a dialogue given all earlier turns.

    Args:
        utterances: Dialogue turns such as ["A: ...", "B: ...", "A: ...",
            "B: ..."]; any length of two or more.
        threshold: Cutoff applied to the toxic probability. When None, the
            stored best threshold (``TH``) is used.
        use_guard: When true, the label is passed through ``apply_guards``.

    Returns:
        A dict with "use_context", "text" (the exact string fed to the
        model), "prob_toxic", "threshold" and "pred_label".
    """
    cutoff = TH if threshold is None else threshold

    formatted = build_input_for_infer(utterances)
    encoded = tokenizer(formatted, truncation=True, return_tensors="pt")
    batch = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    class_probs = torch.softmax(model(**batch).logits, dim=-1).squeeze(0).tolist()
    # Index 1 is treated as the toxic class.
    prob_toxic = float(class_probs[1])

    pred_label = "toxic" if prob_toxic >= cutoff else "non_toxic"
    if use_guard:
        pred_label = apply_guards(formatted, pred_label, prob_toxic, cutoff)

    result = {}
    result["use_context"] = "ALL"
    result["text"] = formatted
    result["prob_toxic"] = prob_toxic
    result["threshold"] = cutoff
    result["pred_label"] = pred_label
    return result

# ---- Console input example ----
if __name__ == "__main__":
    print("A→B→A→B 순서로 최대 4줄(혹은 그 이상) 입력하세요. (빈 줄로 종료)")
    lines = []
    # Reads at most 8 turns; raise the bound to allow longer dialogues.
    for i in range(8):
        s = input(f"{i+1}번째 줄: ").strip()
        if s:
            lines.append(s)
        else:
            # An empty line ends the input.
            break

    if len(lines) < 2:
        raise ValueError("최소 2줄 이상 입력해야 합니다.")

    out = predict_toxic_with_context(lines)
    summary = {key: out[key] for key in ("prob_toxic", "threshold", "pred_label")}

    print("\n=== 결과 ===")
    print("입력(모델에 투입된 실제 문자열):")
    print(out["text"])
    print("\n예측:")
    print(summary)


A→B→A→B 순서로 최대 4줄(혹은 그 이상) 입력하세요. (빈 줄로 종료)
1번째 줄: A: 오늘 비 오니까 국물 있는거 먹자
2번째 줄: B: 칼국수 어때?
3번째 줄: A: 좋지!
4번째 줄: B: 넌 면보다 질척거림 ㅋㅋ
5번째 줄: 

=== 결과 ===
입력(모델에 투입된 실제 문자열):
[CTX]
A: 오늘 비 오니까 국물 있는거 먹자
B: 칼국수 어때?
A: 좋지!
[/CTX]
[TGT_SPK=B] B: 넌 면보다 질척거림 ㅋㅋ [/TGT]

예측:
{'prob_toxic': 0.012123801745474339, 'threshold': 0.44999999999999996, 'pred_label': 'non_toxic'}


In [None]:
import json

# Parse the additional JSONL file one record per line, ignoring blank lines.
data_new = []
with open("/content/drive/MyDrive/metrics/추가데이터 150개#1.jsonl", "r", encoding="utf-8") as f:
    for raw in f:
        line = raw.strip()
        if line:
            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                # Show the offending line, then let the error propagate.
                print("❌ JSONDecodeError 발생 줄:", line)
                raise e
            data_new.append(record)

print(len(data_new), "개 로드 완료")

150 개 로드 완료


In [None]:
NEW_DATA_PATH = "/content/drive/MyDrive/metrics/추가데이터 150개#1.jsonl"
BASE_DIR = "/content/drive/MyDrive/model_backup/results_ctx_tgt_best"
# Largest validation-F1 drop tolerated before the continued model is refused
# as a replacement for the model already stored in BASE_DIR.
MAX_F1_DROP = 0.01

# ===== 2. Load the JSONL =====
data_new = load_jsonl(NEW_DATA_PATH)
print(f"📂 추가 데이터 로드: {len(data_new)} rows")

# ===== 3. Reuse the existing preprocessing =====
records_new = []
bad = 0
for r in data_new:
    utts = r.get("utterances") or r.get("context") or []
    idx = r.get("target_index", None)
    lbl = r.get("label", None)
    # Skip rows whose utterances are not a list or whose target index is
    # missing, not an integer, or out of range.
    if not isinstance(utts, list) or not isinstance(idx, int) or not 0 <= idx < len(utts):
        bad += 1
        continue
    records_new.append({
        "dialogue_id": r.get("dialogue_id", r.get("id", "")),
        "text": build_input(utts, idx),
        "label": map_label(lbl)
    })
if bad:
    print(f"⚠️ 무시된 레코드: {bad}")
if not records_new:
    # Without this check the failure would surface later as a KeyError on
    # the missing "label" column.
    raise ValueError("추가 데이터에서 유효한 레코드를 찾지 못했습니다.")

# ===== 4. Convert to a Dataset =====
df_new = pd.DataFrame(records_new)
print("라벨 분포:\n", df_new["label"].value_counts())
if df_new["label"].nunique() < 2:
    # Fine-tuning on one class only can push the classifier toward always
    # predicting that class.
    # NOTE(review): consider mixing these rows with the original training
    # set instead of training on them alone.
    print("⚠️ 추가 데이터에 라벨이 한 종류뿐입니다. 이 데이터만으로 학습하면 모델이 한쪽 클래스로 쏠릴 수 있습니다.")

new_ds = HFDataset.from_pandas(df_new.rename(columns={"label": "labels"}))

def tokenize(batch):
    """Tokenize the "text" column, padded/truncated to 256 tokens."""
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)

new_ds = new_ds.map(tokenize, batched=True)
keep_cols = ["input_ids", "attention_mask", "labels"]
new_ds.set_format(type="torch", columns=keep_cols)

# ===== 5. Load the existing model =====
model = AutoModelForSequenceClassification.from_pretrained(BASE_DIR, trust_remote_code=True)

# ===== 5-1. Patch for the KoBERT tokenizer save bug =====
# Drops the `filename_prefix` keyword before delegating; presumably this
# tokenizer's save_vocabulary does not accept it.
orig_save_vocab = getattr(tokenizer, "save_vocabulary", None)
if callable(orig_save_vocab):
    def _patched_save_vocabulary(save_directory, *args, **kwargs):
        kwargs.pop("filename_prefix", None)
        return orig_save_vocab(save_directory, *args, **kwargs)
    tokenizer.save_vocabulary = _patched_save_vocabulary

# ===== 6. Continue training =====
continue_args = TrainingArguments(
    output_dir=BASE_DIR,
    logging_dir=os.path.join(BASE_DIR, "logs_continue"),
    # `eval_strategy` is the name used here; older transformers releases
    # call it `evaluation_strategy`.
    eval_strategy="epoch",
    logging_strategy="steps",        # log per step
    logging_steps=10,                 # emit a log line every 10 steps
    disable_tqdm=False,               # keep the tqdm progress bar
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=3e-5,
    num_train_epochs=2,
    fp16=torch.cuda.is_available(),
    report_to=[]
)

# NOTE(review): the original training run replaced compute_loss with
# custom_compute_loss; this trainer uses the default loss. Confirm intended.
continue_trainer = Trainer(
    model=model,
    args=continue_args,
    train_dataset=new_ds,
    eval_dataset=val_ds,  # existing validation data
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Score the stored model first so the continued model can be compared with it.
baseline_f1 = continue_trainer.evaluate()["eval_f1"]
print(f"📊 추가 학습 전 검증 F1: {baseline_f1:.4f}")

print("🚀 추가 데이터로 이어서 학습 시작")
continue_trainer.train()
print("✅ 추가 학습 완료")

# load_best_model_at_end=True, so this scores the best checkpoint of the run.
continued_f1 = continue_trainer.evaluate()["eval_f1"]
print(f"📊 추가 학습 후 검증 F1: {continued_f1:.4f}")

# ===== 7. Save the model =====
# Only replace the model in BASE_DIR when validation F1 did not regress;
# otherwise the previously validated weights would be lost.
if continued_f1 >= baseline_f1 - MAX_F1_DROP:
    continue_trainer.save_model(BASE_DIR)
    tokenizer.save_pretrained(BASE_DIR)
    print("💾 모델과 토크나이저 저장 완료:", BASE_DIR)
else:
    print(
        f"⛔ 검증 F1 하락 ({baseline_f1:.4f} → {continued_f1:.4f}): "
        "기존 모델을 덮어쓰지 않습니다:",
        BASE_DIR,
    )

📂 추가 데이터 로드: 150 rows
라벨 분포:
 label
0    150
Name: count, dtype: int64


Map:   0%|          | 0/150 [00:00<?, ? examples/s]

  continue_trainer = Trainer(


🚀 추가 데이터로 이어서 학습 시작


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.2732,1.987557,0.524811,1.0,0.003446,0.006868,0.847923
2,0.0113,2.213041,0.523168,0.0,0.0,0.0,0.895848


✅ 추가 학습 완료
💾 모델과 토크나이저 저장 완료: /content/drive/MyDrive/model_backup/results_ctx_tgt_best
