In [None]:
import os, json, random, numpy as np, torch
# Opt in to executing model code shipped with Hugging Face checkpoints.
# NOTE(review): this notebook does not show that transformers reads these two
# variables or the config file below -- confirm. The loading cells pass
# trust_remote_code=True to from_pretrained() explicitly.
os.environ["HF_ALLOW_CODE_EXECUTION"] = "1"
os.environ["TRANSFORMERS_ALLOW_CODE_EXECUTION"] = "1"
cfg_path = os.path.expanduser("~/.huggingface/config.json")
try:
    # Directory creation is part of the same best-effort step, so it lives inside
    # the try block: a read-only home directory must not abort the notebook.
    os.makedirs(os.path.dirname(cfg_path), exist_ok=True)
    with open(cfg_path, "w", encoding="utf-8") as f:
        json.dump({"transformers": {"allow_code_execution": True}}, f)
except OSError as e:
    # Still best effort, but report the failure instead of hiding it.
    print(f"[WARN] could not write {cfg_path}: {e}")

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to every generator (default 42).

    Returns:
        None.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    # Seed every CUDA device as well when CUDA is available.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
# Seed every RNG once before any model or data work happens.
set_seed(42)
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


In [None]:
!pip -q install optuna
from typing import List, Dict
import re, pandas as pd, optuna
from datasets import Dataset as HFDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback
)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/395.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m389.1/395.9 kB[0m [31m12.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/247.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Absolute path of the JSONL dataset under /content/drive/MyDrive.
# NOTE(review): presumably requires Google Drive to be mounted first -- confirm.
# NOTE(review): the file name mentions 5400 items, while the recorded load output
# reports 5200 rows -- confirm which count is expected.
DATA_PATH = "/content/drive/MyDrive/metrics/혐오조롱표현탐지용데이터 5400개.jsonl"

In [None]:
def load_jsonl(path: str):
    """Read a JSON Lines file into a list of decoded objects.

    Blank (whitespace-only) lines are skipped. A line that is not valid JSON is
    reported with a warning that includes its 1-based line number and is then
    skipped; it does not abort the load.

    Args:
        path: Location of the UTF-8 encoded JSONL file.

    Returns:
        A list holding one decoded object per valid, non-blank line, in file order.
    """
    parsed = []
    with open(path, "r", encoding="utf-8") as handle:
        lineno = 0
        for raw_line in handle:
            lineno += 1
            payload = raw_line.strip()
            if payload:
                try:
                    obj = json.loads(payload)
                except json.JSONDecodeError as e:
                    print(f"[WARN] JSON decode error at line {lineno}: {e}")
                else:
                    parsed.append(obj)
    return parsed

# Load the whole dataset into memory and report how many rows were parsed.
raw = load_jsonl(DATA_PATH)
print(f"✅ 로드 완료: {len(raw)} 행")

✅ 로드 완료: 5200 행


In [None]:
def map_label(lbl: str) -> int:
    """Convert a textual label into a binary class id.

    Args:
        lbl: Label string; may also be None or empty.

    Returns:
        1 when the label equals "toxic" after trimming whitespace and
        lower-casing; 0 for anything else, including a missing or empty label.
    """
    if lbl and lbl.strip().lower() == "toxic":
        return 1
    return 0

In [None]:
SEP = " [SEP] "
def build_input(utterances: List[str], k: int, use_prof_feature: bool = False) -> str:
    """Render a dialogue as model input with the k-th utterance marked as the target.

    Utterances before the target are joined with SEP into one "[CTX] ..." line,
    the target becomes "[TGT] ... [/TGT]", and utterances after it form a second
    "[CTX] ..." line. Empty context sides are left out.

    Args:
        utterances: Dialogue turns in order.
        k: Index of the target utterance.
        use_prof_feature: Accepted for interface compatibility; not used here.

    Returns:
        The newline-joined input string.
    """
    target = utterances[k]
    preceding = SEP.join(utterances[:k]) if k > 0 else ""
    following = SEP.join(utterances[k+1:]) if (k + 1) < len(utterances) else ""

    segments = ["[TGT] " + target + " [/TGT]"]
    if preceding:
        segments.insert(0, "[CTX] " + preceding)
    if following:
        segments.append("[CTX] " + following)
    return "\n".join(segments)

# Turn every raw row into a {dialogue_id, text, label} record and count the rows
# whose utterance list or target index cannot be used.
records = []
bad = 0
for row in raw:
    utterances = row.get("utterances", [])
    target_idx = row.get("target_index", None)
    raw_label = row.get("label", None)
    # Guard 1: the utterances must be a list and a target index must be present.
    if not isinstance(utterances, list) or target_idx is None:
        bad += 1
        continue
    # Guard 2: the target index must point inside the utterance list.
    if target_idx < 0 or target_idx >= len(utterances):
        bad += 1
        continue
    record = {
        "dialogue_id": row.get("dialogue_id", ""),
        "text": build_input(utterances, target_idx, use_prof_feature=False),
        "label": map_label(raw_label),
    }
    records.append(record)
if bad:
    print(f"⚠️ 무시된 레코드: {bad}")

df = pd.DataFrame(records)
print(df.head(2))
print("라벨 분포:\n", df["label"].value_counts())

  dialogue_id                                               text  label
0         001  [CTX] 부랴부랴 왔는데 아무도 안왔네. 시간개념들이 없네 [SEP] 맞아. 사람...      1
1         002  [CTX] 인방 보는 남자는 거르는게 맞다 [SEP] 특히 벗방보는 애들은 진짜 거...      0
라벨 분포:
 label
0    2946
1    2254
Name: count, dtype: int64


In [None]:
def load_tok_and_model():
    """Load a tokenizer and a 2-label sequence classifier.

    "monologg/kobert" is tried first with trust_remote_code=True. If anything
    raises during that attempt, the error is printed and
    "bert-base-multilingual-cased" is loaded instead.

    Returns:
        A (tokenizer, model) tuple.
    """
    try:
        tok = AutoTokenizer.from_pretrained("monologg/kobert", trust_remote_code=True)
        mdl = AutoModelForSequenceClassification.from_pretrained(
            "monologg/kobert", trust_remote_code=True, num_labels=2
        )
    except Exception as e:
        print("❌ KoBERT 실패 → mBERT로 대체:", e)
        fallback_name = "bert-base-multilingual-cased"
        tok = AutoTokenizer.from_pretrained(fallback_name)
        mdl = AutoModelForSequenceClassification.from_pretrained(
            fallback_name, num_labels=2
        )
        print("✅ mBERT 로드")
    else:
        print("✅ KoBERT 로드")
    return tok, mdl

# Load the tokenizer and a first copy of the classifier (KoBERT, or the mBERT fallback).
tokenizer, base_model = load_tok_and_model()

# Compatibility patch for saving the KoBERT tokenizer.
# The wrapper forwards only `save_directory` and discards any extra positional or
# keyword arguments it receives.
# NOTE(review): presumably the remote KoBERT `save_vocabulary` accepts a single
# argument while `save_pretrained` passes more -- confirm.
orig_save_vocabulary = getattr(tokenizer, "save_vocabulary", None)
if callable(orig_save_vocabulary):
    def _patched_save_vocabulary(save_directory, *args, **kwargs):
        return orig_save_vocabulary(save_directory)
    tokenizer.save_vocabulary = _patched_save_vocabulary

# Move the model to the selected device; as the cell's last expression this also
# displays the model's repr.
base_model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ KoBERT 로드


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-

In [None]:
MAX_LEN = 320  # Kept fixed during HPO: searching over sequence length would require re-tokenizing, so it is not recommended.
def tokenize_fn(batch):
    """Tokenize the "text" column of a batch, truncating to MAX_LEN tokens.

    No padding is applied here (padding=False); batches are padded later by the
    data collator.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, padding=False, max_length=MAX_LEN)

# Stratified 80/20 train/validation split with a fixed random state.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

def _prepare_split(frame):
    """Convert a DataFrame split into a tokenized, torch-formatted HF dataset."""
    ds = HFDataset.from_pandas(frame.reset_index(drop=True))
    ds = ds.map(tokenize_fn, batched=True, remove_columns=["text", "dialogue_id"])
    ds = ds.rename_column("label", "labels")
    ds.set_format(type="torch")
    return ds

train_ds = _prepare_split(train_df)
val_ds   = _prepare_split(val_df)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/4160 [00:00<?, ? examples/s]

Map:   0%|          | 0/1040 [00:00<?, ? examples/s]

In [None]:
def compute_metrics(eval_pred):
    """Compute classification metrics from evaluation logits and labels.

    Args:
        eval_pred: A (logits, labels) pair.

    Returns:
        A dict with "accuracy", "precision", "recall", "f1" and "roc_auc".
        "roc_auc" is NaN when computing it raises any exception.
    """
    logits, labels = eval_pred
    predicted = logits.argmax(axis=-1)

    scores = {"accuracy": accuracy_score(labels, predicted)}
    for key, scorer in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        scores[key] = scorer(labels, predicted, zero_division=0)

    try:
        # Probability of class index 1 from a softmax over the logits.
        class_probs = torch.softmax(torch.tensor(logits), dim=-1)
        scores["roc_auc"] = roc_auc_score(labels, class_probs[:, 1].numpy())
    except Exception:
        scores["roc_auc"] = float("nan")
    return scores

In [None]:
def hp_space(trial):
    """Sample one hyperparameter configuration from the search space.

    Args:
        trial: Object exposing suggest_float(name, low, high, log=...) and
            suggest_categorical(name, choices).

    Returns:
        A dict of sampled values keyed by parameter name, plus a fixed
        "num_train_epochs" of 2.
    """
    space = {}
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)
    # The three integer-valued categorical parameters are sampled in this order.
    for name, choices in (
        ("per_device_train_batch_size", [8, 16, 32]),
        ("per_device_eval_batch_size", [8, 16, 32]),
        ("gradient_accumulation_steps", [1, 2, 4]),
    ):
        space[name] = trial.suggest_categorical(name, choices)
    space["weight_decay"] = trial.suggest_float("weight_decay", 0.0, 0.05)
    space["warmup_ratio"] = trial.suggest_float("warmup_ratio", 0.0, 0.2)
    space["lr_scheduler_type"] = trial.suggest_categorical(
        "lr_scheduler_type", ["linear", "cosine", "cosine_with_restarts", "polynomial"]
    )
    space["num_train_epochs"] = 2  # keep the search cheap
    return space

In [None]:
def compute_objective(metrics):
    """Return the value the search maximizes: the "eval_f1" entry of `metrics`."""
    objective = metrics["eval_f1"]
    return objective

In [None]:
def model_init():
    """Build a freshly initialised 2-label classifier for each HPO trial.

    The checkpoint is taken from `base_model` rather than hard-coded, so the
    model always matches `tokenizer`: load_tok_and_model() may have fallen back
    from KoBERT to mBERT, and pairing a KoBERT model with the mBERT tokenizer
    would feed it token ids outside its vocabulary.

    Returns:
        A new AutoModelForSequenceClassification instance.
    """
    # Fall back to the original hard-coded name if the attribute is unavailable.
    checkpoint = getattr(base_model, "name_or_path", None) or "monologg/kobert"
    return AutoModelForSequenceClassification.from_pretrained(
        checkpoint, trust_remote_code=True, num_labels=2
    )

# Training configuration shared by every HPO trial: evaluate and log once per epoch.
args_hpo = TrainingArguments(
    output_dir="./hpo_tmp",
    logging_dir="./hpo_tmp/logs",
    eval_strategy="epoch",
    # No checkpoints during the search: nothing reloads them
    # (load_best_model_at_end is False), and writing one per epoch for every
    # trial only fills the disk.
    save_strategy="no",
    logging_strategy="epoch",
    logging_steps=1,
    logging_first_step=True,
    report_to=[],
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=False,
    metric_for_best_model="f1",
    greater_is_better=True,
    disable_tqdm=False,
)

# model_init (instead of model=) lets the Trainer build a new model for every trial.
trainer_hpo = Trainer(
    model_init=model_init,
    args=args_hpo,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer_hpo = Trainer(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
print("🔎 HPO 시작 (n_trials=20)")
# Run 20 Optuna trials, maximizing the value returned by compute_objective (eval F1).
best_run = trainer_hpo.hyperparameter_search(
    direction="maximize",
    hp_space=hp_space,
    compute_objective=compute_objective,
    n_trials=20,
    backend="optuna",
    # Extra keyword arguments are forwarded to optuna.create_study(). A seeded
    # sampler makes the sequence of sampled configurations repeatable, in line
    # with the fixed seed (42) used everywhere else in this notebook.
    sampler=optuna.samplers.TPESampler(seed=42),
)
print("✅ Best trial:", best_run)
print("➡️ Best params:", best_run.hyperparameters)


[I 2025-08-13 06:58:06,550] A new study created in memory with name: no-name-3598d132-29f4-4ed9-8633-47300f0a374b
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔎 HPO 시작 (n_trials=20)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.2724,0.184468,0.935577,0.870656,1.0,0.930857,0.981831
2,0.1413,0.141703,0.950962,0.914938,0.977827,0.945338,0.985497


[I 2025-08-13 06:58:39,479] Trial 0 finished with value: 0.9453376205787781 and parameters: {'learning_rate': 6.456603442172581e-05, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'gradient_accumulation_steps': 2, 'weight_decay': 0.015790742975545096, 'warmup_ratio': 0.030977973524302805, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 0 with value: 0.9453376205787781.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.6229,0.688558,0.566346,0.0,0.0,0.0,0.505827
2,0.696,0.684475,0.566346,0.0,0.0,0.0,0.510106


[I 2025-08-13 06:59:14,461] Trial 1 finished with value: 0.0 and parameters: {'learning_rate': 0.00036171387440645046, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 32, 'gradient_accumulation_steps': 1, 'weight_decay': 0.0026924270846941213, 'warmup_ratio': 0.15443344903804587, 'lr_scheduler_type': 'linear'}. Best is trial 0 with value: 0.9453376205787781.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.3288,0.19397,0.945192,0.90535,0.97561,0.939168,0.970957
2,0.1916,0.159057,0.952885,0.910204,0.988914,0.947928,0.971804


[I 2025-08-13 07:00:09,388] Trial 2 finished with value: 0.9479277364505845 and parameters: {'learning_rate': 0.00013960832795509835, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 32, 'gradient_accumulation_steps': 4, 'weight_decay': 0.019846607846760223, 'warmup_ratio': 0.11385657098516477, 'lr_scheduler_type': 'linear'}. Best is trial 2 with value: 0.9479277364505845.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.3064,0.163696,0.952885,0.905242,0.995565,0.948258,0.974012
2,0.1677,0.144425,0.953846,0.908722,0.993348,0.949153,0.980374


[I 2025-08-13 07:00:32,263] Trial 3 finished with value: 0.9491525423728814 and parameters: {'learning_rate': 1.8066216260075222e-05, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 16, 'gradient_accumulation_steps': 1, 'weight_decay': 0.017515971750933928, 'warmup_ratio': 0.018798604786228523, 'lr_scheduler_type': 'polynomial'}. Best is trial 3 with value: 0.9491525423728814.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.6572,0.688273,0.566346,0.0,0.0,0.0,0.5829
2,0.6939,0.685354,0.566346,0.0,0.0,0.0,0.611817


[I 2025-08-13 07:01:37,438] Trial 4 finished with value: 0.0 and parameters: {'learning_rate': 0.00023147169359311382, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'weight_decay': 0.022720502145482126, 'warmup_ratio': 0.10697183713977375, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 3 with value: 0.9491525423728814.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.6099,0.696232,0.433654,0.433654,1.0,0.604963,0.422058


[I 2025-08-13 07:02:03,344] Trial 5 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.678,0.692292,0.4375,0.435328,1.0,0.60659,0.922193
2,0.697,0.684686,0.566346,0.0,0.0,0.0,0.58584


[I 2025-08-13 07:03:08,211] Trial 6 finished with value: 0.0 and parameters: {'learning_rate': 0.00034095415290848067, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'weight_decay': 0.019779108701849936, 'warmup_ratio': 0.1556521493749824, 'lr_scheduler_type': 'polynomial'}. Best is trial 3 with value: 0.9491525423728814.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.3011,0.193222,0.9375,0.874031,1.0,0.932782,0.980549
2,0.1697,0.155555,0.957692,0.917864,0.991131,0.953092,0.98447


[I 2025-08-13 07:04:10,219] Trial 7 finished with value: 0.9530916844349681 and parameters: {'learning_rate': 3.371231230528883e-05, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 32, 'gradient_accumulation_steps': 1, 'weight_decay': 0.028357515375541816, 'warmup_ratio': 0.10904626443293292, 'lr_scheduler_type': 'linear'}. Best is trial 7 with value: 0.9530916844349681.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.476,0.484088,0.853846,0.874687,0.773836,0.821176,0.856128
2,0.44,0.415385,0.853846,0.874687,0.773836,0.821176,0.844473


[I 2025-08-13 07:05:15,138] Trial 8 finished with value: 0.8211764705882353 and parameters: {'learning_rate': 0.0002821978534124849, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'gradient_accumulation_steps': 1, 'weight_decay': 0.03689766552256707, 'warmup_ratio': 0.10685811187878776, 'lr_scheduler_type': 'cosine'}. Best is trial 7 with value: 0.9530916844349681.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.3145,0.222168,0.938462,0.90566,0.957871,0.931034,0.976557
2,0.1693,0.143074,0.950962,0.909836,0.984479,0.945687,0.979679


[I 2025-08-13 07:05:36,738] Trial 9 finished with value: 0.9456869009584664 and parameters: {'learning_rate': 9.299420349578542e-05, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 16, 'gradient_accumulation_steps': 2, 'weight_decay': 0.04784454975986293, 'warmup_ratio': 0.13036066165982632, 'lr_scheduler_type': 'cosine_with_restarts'}. Best is trial 7 with value: 0.9530916844349681.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.3605,0.184566,0.935577,0.912017,0.94235,0.926936,0.974712


[I 2025-08-13 07:05:51,933] Trial 10 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.3514,0.188399,0.948077,0.896208,0.995565,0.943277,0.967439
2,0.1878,0.164067,0.952885,0.903614,0.997783,0.948367,0.974546


[I 2025-08-13 07:06:14,913] Trial 11 finished with value: 0.9483667017913593 and parameters: {'learning_rate': 1.0671851380287005e-05, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 16, 'gradient_accumulation_steps': 1, 'weight_decay': 0.02955851045847388, 'warmup_ratio': 0.0055462790409018525, 'lr_scheduler_type': 'polynomial'}. Best is trial 7 with value: 0.9530916844349681.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.3062,0.181126,0.943269,0.913502,0.960089,0.936216,0.977236


[I 2025-08-13 07:06:25,061] Trial 12 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.3191,0.178023,0.945192,0.887795,1.0,0.940563,0.976114


[I 2025-08-13 07:06:35,059] Trial 13 pruned. 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.4138,0.201495,0.945192,0.894,0.991131,0.940063,0.966214
2,0.1897,0.16458,0.95,0.90303,0.991131,0.945032,0.970705


[I 2025-08-13 07:06:56,250] Trial 14 finished with value: 0.945031712473573 and parameters: {'learning_rate': 4.091488397527725e-05, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 16, 'gradient_accumulation_steps': 4, 'weight_decay': 0.04286827415881622, 'warmup_ratio': 0.030374957672054226, 'lr_scheduler_type': 'polynomial'}. Best is trial 7 with value: 0.9530916844349681.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.3096,0.233646,0.924038,0.850943,1.0,0.91947,0.974763
2,0.1868,0.169828,0.951923,0.905051,0.993348,0.947146,0.97899


[I 2025-08-13 07:07:58,907] Trial 15 finished with value: 0.9471458773784355 and parameters: {'learning_rate': 1.0627587115877893e-05, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 32, 'gradient_accumulation_steps': 1, 'weight_decay': 0.028145722365509745, 'warmup_ratio': 0.0905156257741506, 'lr_scheduler_type': 'linear'}. Best is trial 7 with value: 0.9530916844349681.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.3043,0.169548,0.950962,0.901606,0.995565,0.946259,0.969858
2,0.1742,0.15542,0.953846,0.905433,0.997783,0.949367,0.976628


[I 2025-08-13 07:08:21,699] Trial 16 finished with value: 0.9493670886075949 and parameters: {'learning_rate': 1.5379143973514842e-05, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 16, 'gradient_accumulation_steps': 1, 'weight_decay': 0.0361629662567273, 'warmup_ratio': 0.0004946233867199751, 'lr_scheduler_type': 'polynomial'}. Best is trial 7 with value: 0.9530916844349681.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.3195,0.171146,0.943269,0.885827,0.997783,0.938478,0.973387
2,0.1781,0.163205,0.953846,0.913758,0.986696,0.948827,0.977187


[I 2025-08-13 07:09:24,828] Trial 17 finished with value: 0.9488272921108742 and parameters: {'learning_rate': 5.150036651993096e-05, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 16, 'gradient_accumulation_steps': 1, 'weight_decay': 0.03851189210778395, 'warmup_ratio': 0.1441542230467301, 'lr_scheduler_type': 'cosine'}. Best is trial 7 with value: 0.9530916844349681.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.4605,0.226962,0.944231,0.89697,0.984479,0.938689,0.967064
2,0.2126,0.178286,0.950962,0.904858,0.991131,0.946032,0.972747


[I 2025-08-13 07:09:45,626] Trial 18 finished with value: 0.946031746031746 and parameters: {'learning_rate': 1.6350311192495442e-05, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'gradient_accumulation_steps': 2, 'weight_decay': 0.0305409740457476, 'warmup_ratio': 0.05133250986092022, 'lr_scheduler_type': 'linear'}. Best is trial 7 with value: 0.9530916844349681.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.3661,0.223445,0.920192,0.844569,1.0,0.915736,0.967601


[I 2025-08-13 07:10:01,631] Trial 19 pruned. 


✅ Best trial: BestRun(run_id='7', objective=0.9530916844349681, hyperparameters={'learning_rate': 3.371231230528883e-05, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 32, 'gradient_accumulation_steps': 1, 'weight_decay': 0.028357515375541816, 'warmup_ratio': 0.10904626443293292, 'lr_scheduler_type': 'linear'}, run_summary=None)
➡️ Best params: {'learning_rate': 3.371231230528883e-05, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 32, 'gradient_accumulation_steps': 1, 'weight_decay': 0.028357515375541816, 'warmup_ratio': 0.10904626443293292, 'lr_scheduler_type': 'linear'}


In [None]:
best = best_run.hyperparameters

# Class weights from the training split: class 0 keeps weight 1.0 and class 1 is
# weighted by the negative/positive count ratio (1.0 when there are no positives).
train_labels = train_df["label"]
pos = int((train_labels == 1).sum())
neg = int((train_labels == 0).sum())
w_neg = 1.0
if pos > 0:
    w_pos = float(neg / pos)
else:
    w_pos = 1.0
class_weights = torch.tensor([w_neg, w_pos], dtype=torch.float32)
class_weights = class_weights.cuda() if torch.cuda.is_available() else class_weights

def custom_compute_loss(model, inputs, return_outputs=False, **kwargs):
    """Compute class-weighted cross-entropy for a batch.

    Args:
        model: Callable invoked with every entry of `inputs` except "labels";
            its result must expose `.logits`.
        inputs: Mapping of batch tensors, including "labels".
        return_outputs: When True, return (loss, outputs) instead of the loss alone.
        **kwargs: Accepted and ignored.

    Returns:
        The loss tensor, or a (loss, outputs) tuple when `return_outputs` is True.
    """
    labels = inputs.get("labels")
    model_inputs = {name: value for name, value in inputs.items() if name != "labels"}
    outputs = model(**model_inputs)
    # Weighted by the module-level `class_weights`; logits are flattened to two columns.
    criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
    loss = criterion(outputs.logits.view(-1, 2), labels.view(-1))
    if return_outputs:
        return (loss, outputs)
    return loss

# Training configuration for the final run: evaluate and checkpoint every epoch,
# keep at most two checkpoints, and reload the checkpoint with the best "f1" at the end.
args_final = TrainingArguments(
    output_dir="./results_ctx_tgt",
    logging_dir="./results_ctx_tgt/logs",
    num_train_epochs=4,  # train longer than the HPO trials (adjust if needed)
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to=[],
    fp16=torch.cuda.is_available(),
    dataloader_pin_memory=False,
    **best,  # inject the best values found by HPO (lr, batch size, grad accumulation, weight decay, warmup, scheduler, ...)
)


In [None]:
# Start from the checkpoint `base_model` was loaded from, so the model always
# matches `tokenizer` (load_tok_and_model() may have fallen back to mBERT).
# Falls back to the original hard-coded name if the attribute is unavailable.
final_checkpoint = getattr(base_model, "name_or_path", None) or "monologg/kobert"
final_model = AutoModelForSequenceClassification.from_pretrained(
    final_checkpoint, trust_remote_code=True, num_labels=2
).to(device)


class WeightedLossTrainer(Trainer):
    """Trainer whose loss is the class-weighted cross-entropy of custom_compute_loss.

    Overriding the method in a subclass replaces assigning a plain function to
    the instance attribute `compute_loss`.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Any extra keyword arguments supplied by the Trainer are passed through.
        return custom_compute_loss(model, inputs, return_outputs=return_outputs, **kwargs)


trainer_final = WeightedLossTrainer(
    model=final_model,
    args=args_final,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop when the tracked metric has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print("🚀 최종 재학습 시작")
trainer_final.train()
print("✅ 재학습 완료")

eval_out = trainer_final.evaluate()
print("📊 최종 검증 지표:", eval_out)

# Save the model and the tokenizer into the same directory.
trainer_final.save_model("./results_ctx_tgt/best")
tokenizer.save_pretrained("./results_ctx_tgt/best")
print("💾 저장 완료: ./results_ctx_tgt/best")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_final = Trainer(


🚀 최종 재학습 시작


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Roc Auc
1,0.3086,0.154942,0.951923,0.901804,0.997783,0.947368,0.981699
2,0.2008,0.189484,0.948077,0.894632,0.997783,0.943396,0.981616
3,0.1523,0.139709,0.955769,0.91411,0.991131,0.951064,0.985367
4,0.0965,0.25944,0.940385,0.927473,0.935698,0.931567,0.984968


✅ 재학습 완료


📊 최종 검증 지표: {'eval_loss': 0.13970865309238434, 'eval_accuracy': 0.9557692307692308, 'eval_precision': 0.9141104294478528, 'eval_recall': 0.991130820399113, 'eval_f1': 0.951063829787234, 'eval_roc_auc': 0.985367359461525, 'eval_runtime': 0.7863, 'eval_samples_per_second': 1322.71, 'eval_steps_per_second': 41.971, 'epoch': 4.0}
💾 저장 완료: ./results_ctx_tgt/best


In [None]:
import shutil

# Google Drive destination (Drive must already be mounted)
drive_path = "/content/drive/MyDrive/model_backup/results_ctx_tgt_best"

# Copy the saved model folder; dirs_exist_ok=True allows the destination to exist already
shutil.copytree("./results_ctx_tgt/best", drive_path, dirs_exist_ok=True)

print(f"📦 모델을 Google Drive로 복사 완료: {drive_path}")

📦 모델을 Google Drive로 복사 완료: /content/drive/MyDrive/model_backup/results_ctx_tgt_best


In [None]:
import torch

SEP = " [SEP] "

def build_input_from_lines(lines):
    """Format chat lines as model input with the last line as the target.

    All lines before the last are joined with SEP into one "[CTX] ..." line
    (omitted when that joined text is empty); the last line is wrapped as
    "[TGT] ... [/TGT]".

    Args:
        lines: List of entered sentences (A -> B -> A -> ... -> B); at least two
            entries are required, which is checked with an assertion.

    Returns:
        A (input_text, target_text) tuple.
    """
    assert len(lines) >= 2, "최소 2줄 이상 입력하세요."
    context_text = SEP.join(lines[:-1])
    target = lines[-1]
    target_part = "[TGT] " + target + " [/TGT]"
    if context_text:
        input_text = "[CTX] " + context_text + "\n" + target_part
    else:
        input_text = target_part
    return input_text, target

@torch.no_grad()
def predict_last_line(
    lines,
    model,
    tokenizer,
    max_len=320,
    device=None,
    return_probs=True
):
    """Classify the last sentence of `lines`, using the earlier ones as context.

    The model is switched to eval mode as a side effect.

    Args:
        lines: Sentences in order; the last one is the target.
        model: Classifier whose output exposes `.logits`.
        tokenizer: Tokenizer called on the formatted input text.
        max_len: Truncation length passed to the tokenizer.
        device: Device for the encoded tensors; defaults to the device of the
            model's first parameter.
        return_probs: When False, the two probability entries are left out.

    Returns:
        dict with "label" (0 = non-toxic, 1 = toxic), optionally
        "prob_non_toxic" and "prob_toxic", plus "target_text" and "input_text".
    """
    target_device = device if device is not None else next(model.parameters()).device
    model.eval()

    input_text, target_text = build_input_from_lines(lines)
    encoded = tokenizer(
        input_text,
        truncation=True,
        padding=True,
        max_length=max_len,
        return_tensors="pt"
    )
    batch = {name: tensor.to(target_device) for name, tensor in encoded.items()}

    logits = model(**batch).logits
    probs = torch.softmax(logits, dim=-1)[0].tolist()
    pred = int(torch.argmax(logits, dim=-1).item())
    prob_non_toxic = float(probs[0])
    prob_toxic = float(probs[1])

    # Insert keys in a fixed order: label, optional probabilities, then the texts.
    result = {"label": pred}  # 0 = non-toxic, 1 = toxic
    if return_probs:
        result["prob_non_toxic"] = prob_non_toxic
        result["prob_toxic"] = prob_toxic
    result["target_text"] = target_text
    result["input_text"] = input_text
    return result

def interactive_predict_last_line(model, tokenizer, max_len=320):
    """Read chat lines from the console and classify the last one.

    Lines are read one at a time until a blank line is entered; each kept line
    is stripped of surrounding whitespace.

    Args:
        model: Classifier passed on to predict_last_line.
        tokenizer: Tokenizer passed on to predict_last_line.
        max_len: Truncation length passed on to predict_last_line.

    Returns:
        The result dict produced by predict_last_line.

    Raises:
        ValueError: If fewer than two lines were entered.
    """
    print("채팅을 한 줄씩 입력하세요. (빈 줄 입력 시 종료)\n예: A: ... → B: ... → A: ... → B: ...")
    lines = []
    # A blank (whitespace-only) entry ends the input phase.
    while (entered := input()).strip():
        lines.append(entered.strip())

    if len(lines) < 2:
        raise ValueError("최소 2줄 이상 입력해야 합니다.")

    res = predict_last_line(lines, model, tokenizer, max_len=max_len)
    print("\n🎯 판정 결과")
    print(f"- 타깃 문장: {res['target_text']}")
    verdict = "유해(1)" if res["label"] == 1 else "비유해(0)"
    print(f"- 라벨: {verdict}")
    print(f"- prob_toxic: {res['prob_toxic']:.3f} | prob_non_toxic: {res['prob_non_toxic']:.3f}")
    return res


In [None]:
# 1) Environment variables (take effect immediately in the current session)
import os, json
os.environ["HF_ALLOW_CODE_EXECUTION"] = "1"            # described by the author as the legacy key
os.environ["TRANSFORMERS_ALLOW_CODE_EXECUTION"] = "1"  # described by the author as the current key
# NOTE(review): this notebook does not show that transformers reads these
# variables or the file written below -- confirm. The loading cells pass
# trust_remote_code=True to from_pretrained() explicitly.

# 2) Create the global settings file by hand (intended to keep the opt-in for later sessions)
cfg_path = os.path.expanduser("~/.huggingface/config.json")
os.makedirs(os.path.dirname(cfg_path), exist_ok=True)
cfg = {"transformers": {"allow_code_execution": True}}
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(cfg, f)
print("Wrote:", cfg_path)
# Read the file back inside a context manager so the handle is closed promptly
# instead of being left to the garbage collector.
with open(cfg_path, "r", encoding="utf-8") as f:
    print(f.read())

Wrote: /root/.huggingface/config.json
{"transformers": {"allow_code_execution": true}}


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Directory the final training cell saved the model and tokenizer into.
SAVE_DIR = "./results_ctx_tgt/best"

# Reload both artifacts strictly from disk (local_files_only=True).
# NOTE(review): this rebinds the global `tokenizer` used by the training cells above.
tokenizer = AutoTokenizer.from_pretrained(
    SAVE_DIR, trust_remote_code=True, local_files_only=True
)
model = AutoModelForSequenceClassification.from_pretrained(
    SAVE_DIR, trust_remote_code=True, local_files_only=True
)
# Place the model on the GPU when available, otherwise on the CPU.
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Read a conversation from stdin and classify its last line; the returned dict is
# displayed as the cell output.
interactive_predict_last_line(model, tokenizer)

채팅을 한 줄씩 입력하세요. (빈 줄 입력 시 종료)
예: A: ... → B: ... → A: ... → B: ...
ㅋㅋㅋㅋㅋ개웃기다
이거 머임ㅋㅋㅋㅋㅋ 나도 보내줘


🎯 판정 결과
- 타깃 문장: 이거 머임ㅋㅋㅋㅋㅋ 나도 보내줘
- 라벨: 유해(1)
- prob_toxic: 0.905 | prob_non_toxic: 0.095


{'label': 1,
 'prob_non_toxic': 0.09483423084020615,
 'prob_toxic': 0.9051657319068909,
 'target_text': '이거 머임ㅋㅋㅋㅋㅋ 나도 보내줘',
 'input_text': '[CTX] ㅋㅋㅋㅋㅋ개웃기다\n[TGT] 이거 머임ㅋㅋㅋㅋㅋ 나도 보내줘 [/TGT]'}