# KcELECTRA + KOTE + KPoEM v3 모델 구현

## 1. Basic setup and library imports
- Import Libraries and Set Configuration
  - Installs `optuna` and imports the required libraries (e.g., optuna, pytorch_lightning, transformers).
  - Sets random seed for reproducibility and configures model/data directories.
  - Defines constants such as number of epochs and input max length.



In [None]:
# ===================================================================
# 1. 기본 설정 및 라이브러리 임포트
# ===================================================================
!pip install -q optuna

import os
import ast
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import pytorch_lightning as pl
import optuna
from datetime import datetime
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, matthews_corrcoef, precision_score, recall_score
from torchmetrics.functional.classification import multilabel_accuracy
from tqdm.auto import tqdm
from IPython.display import display

# PyTorch Lightning 로그 줄이기
import logging
logging.getLogger("pytorch_lightning").setLevel(logging.WARNING)


# Fix the random seed and configure the runtime environment
RANDOM_SEED = 42
pl.seed_everything(RANDOM_SEED, workers=True)
torch.set_float32_matmul_precision('medium') # speeds up matmul on Tensor Core GPUs such as the A100

# Paths
DATA_DIR = '../data/'        # adjust to your environment
MODEL_SAVE_DIR = './model/'  # adjust to your environment
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

# Training constants
N_EPOCHS_OPTUNA = 3    # epochs per trial during the Optuna search
N_EPOCHS_TRAIN = 10    # epochs for the final training runs
THRESHOLD = 0.3        # fixed decision threshold
MAX_LENGTH = 512       # maximum tokenizer sequence length

[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/bitsandbytes-0.45.4.dev0-py3.12-linux-x86_64.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/looseversion-1.3.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/lightning_thunder-0.2.0.dev0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/py

Seed set to 42


## 2. Data preparation (loading, preprocessing, splitting)
- Define Labels and Preprocessing Functions
  - Declares `LABELS`, a list of 44 Korean emotion categories.
  - `preprocess_by_paper_method()`: For KPoEM, aggregates labels from 5 annotators, applies min-max scaling to label agreement counts, and binarizes using a threshold.
  - `preprocess_kote()`: For KOTE, parses label indices from string to list and one-hot encodes them.
- Load and Split Datasets
  - Loads line-level and poem-level KPoEM data (`*.tsv` files), applies preprocessing, and splits into train/val/test sets.
  - Loads KOTE train/val/test splits, preprocesses them, and filters out rows with no labels.

In [None]:
# ===================================================================
# 2. 데이터 준비 (로드, 전처리, 분할) - KOTE 형식 반영 최종 수정
# ===================================================================

# 2.1. 공통 전처리 함수 및 라벨 정의
LABELS = ['불평/불만', '환영/호의', '감동/감탄', '지긋지긋', '고마움', '슬픔', '화남/분노', '존경', '기대감', '우쭐댐/무시함', '안타까움/실망', '비장함', '의심/불신', '뿌듯함', '편안/쾌적', '신기함/관심', '아껴주는', '부끄러움', '공포/무서움', '절망', '한심함', '역겨움/징그러움', '짜증', '어이없음', '없음', '패배/자기혐오', '귀찮음', '힘듦/지침', '즐거움/신남', '깨달음', '죄책감', '증오/혐오', '흐뭇함(귀여움/예쁨)', '당황/난처', '경악', '부담/안_내킴', '서러움', '재미없음', '불쌍함/연민', '놀람', '행복', '불안/걱정', '기쁨', '안심/신뢰']

def preprocess_by_paper_method(df: pd.DataFrame, threshold: float = 0.2) -> pd.DataFrame:
    """
    Attach a binary ``label_vector`` column to a KPoEM dataframe.

    Processing steps:
    1. Pool the comma-separated labels of every available annotator column
       (``annotator01`` .. ``annotator05``) for each row.
    2. Use the number of times each entry of ``LABELS`` occurs in that pool
       as the label's score.
    3. Min-max scale the scores per row and mark a label as 1 when its scaled
       score is strictly greater than ``threshold``.

    Args:
        df: Frame with at least one ``annotator0N`` column. It is modified in
            place (the ``label_vector`` column is added) and returned.
        threshold: Cut-off applied to the scaled scores.

    Returns:
        The same ``df`` object with the added ``label_vector`` column, one
        list of ``len(LABELS)`` 0/1 values per row.

    Raises:
        ValueError: If none of the annotator columns is present.
    """
    annotator_cols = [f'annotator0{i}' for i in range(1, 6) if f'annotator0{i}' in df.columns]
    if not annotator_cols:
        raise ValueError("KPoEM 데이터에 annotator 컬럼이 없습니다.")

    # 1. Pool every annotator's labels for each row.
    label_lists = []
    for cells in df[annotator_cols].itertuples(index=False, name=None):
        pooled = []
        for cell in cells:
            # A missing annotation contributes nothing (instead of the text 'nan').
            if pd.isna(cell):
                continue
            pooled.extend(part.strip() for part in str(cell).split(','))
        # Strip first, then drop empties, so whitespace-only fragments vanish too.
        label_lists.append([label for label in pooled if label])

    # 2. Agreement count per label; the Counter is built once per row.
    score_vectors = []
    for pooled in label_lists:
        counts = Counter(pooled)
        score_vectors.append([counts[label] for label in LABELS])
    # reshape keeps the array two-dimensional even when the frame has no rows.
    scores = np.array(score_vectors, dtype=float).reshape(-1, len(LABELS))

    # 3. Row-wise min-max scaling followed by thresholding.
    min_scores = scores.min(axis=1, keepdims=True)
    max_scores = scores.max(axis=1, keepdims=True)
    score_range = max_scores - min_scores
    # Divide only where the range is non-zero; constant rows stay at 0 and no
    # divide-by-zero RuntimeWarning is emitted.
    scaled_scores = np.divide(
        scores - min_scores,
        score_range,
        out=np.zeros_like(scores),
        where=score_range != 0,
    )
    df['label_vector'] = [list(row) for row in (scaled_scores > threshold).astype(int)]
    return df

# KOTE 전용 함수
def indices_to_vector(indices_data):
    """Turn label indices into a multi-hot list of length ``len(LABELS)``.

    ``indices_data`` may be a single ``int`` or an iterable of indices.
    Indices outside ``[0, len(LABELS))`` are ignored, and an empty / falsy
    input yields an all-zero vector.
    """
    n_labels = len(LABELS)
    multi_hot = [0] * n_labels

    # A lone integer is treated as a one-element list.
    candidates = [indices_data] if isinstance(indices_data, int) else indices_data

    # Nothing to mark: return the all-zero vector as is.
    if not candidates:
        return multi_hot

    for position in candidates:
        if not (0 <= position < n_labels):
            continue
        multi_hot[position] = 1
    return multi_hot

def preprocess_kote(df):
    """Parse KOTE label strings and attach multi-hot label vectors.

    Adds ``label_indices`` (the ``labels`` string parsed with
    ``ast.literal_eval``, e.g. "[5, 8]" -> [5, 8]) and ``label_vector``
    (built by ``indices_to_vector``) to ``df``, then returns a re-indexed
    frame containing only the rows that have at least one active label.
    """
    parsed_indices = df['labels'].map(ast.literal_eval)
    df['label_indices'] = parsed_indices
    df['label_vector'] = parsed_indices.map(indices_to_vector)

    # Keep rows whose vector has at least one positive entry.
    has_any_label = df['label_vector'].map(sum) > 0
    return df[has_any_label].reset_index(drop=True)


# 2.2. Load and split the KPoEM data
print(">>> KPoEM 데이터셋 준비 중...")
try:
    # Line-level and poem-level annotations live in separate TSV files.
    line_df = pd.read_csv(os.path.join(DATA_DIR, "KPoEM_line_dataset_v3.tsv"), sep='\t')
    poem_df = pd.read_csv(os.path.join(DATA_DIR, "KPoEM_poem_dataset_v3.tsv"), sep='\t')

    # Normalize the text column name to the 'text' key PoetryDataset reads.
    line_df.rename(columns={'본문': 'text'}, inplace=True)
    poem_df.rename(columns={'본문': 'text'}, inplace=True)

    line_df = preprocess_by_paper_method(line_df.copy(), threshold=0.2)  # threshold adjusted based on the score distribution
    poem_df = preprocess_by_paper_method(poem_df.copy(), threshold=0.2)  # threshold adjusted based on the score distribution

    # Hold out 10% for test, then 1/9 of the remaining 90% (= 10% of the
    # total) for validation, giving an 80/10/10 split per source.
    line_train_val, line_test = train_test_split(line_df, test_size=0.1, random_state=RANDOM_SEED)
    line_train, line_val = train_test_split(line_train_val, test_size=1/9, random_state=RANDOM_SEED)
    poem_train_val, poem_test = train_test_split(poem_df, test_size=0.1, random_state=RANDOM_SEED)
    poem_train, poem_val = train_test_split(poem_train_val, test_size=1/9, random_state=RANDOM_SEED)

    # Line-level and poem-level rows are merged within each split.
    kpoem_train_df = pd.concat([line_train, poem_train], ignore_index=True)
    kpoem_val_df = pd.concat([line_val, poem_val], ignore_index=True)
    kpoem_test_df = pd.concat([line_test, poem_test], ignore_index=True)

    print(f"KPoEM 데이터셋: Train {len(kpoem_train_df)}, Val {len(kpoem_val_df)}, Test {len(kpoem_test_df)}")
except FileNotFoundError:
    # Missing files are reported and the splits are set to None; later cells
    # that use the KPoEM splits without a None check will fail on them.
    print(f"오류: KPoEM 데이터 파일을 '{os.path.join(DATA_DIR, 'KPoEM_...tsv')}' 경로에서 찾을 수 없습니다.")
    kpoem_train_df, kpoem_val_df, kpoem_test_df = None, None, None


# 2.3. Load the KOTE data (pre-split train/val/test TSV files)
print("\n>>> KOTE 데이터셋 준비 중 (로컬 tsv 파일 로드)...")
try:
    kote_data_path = os.path.join(DATA_DIR, 'KOTE_dataset')

    # The TSV files have no header row, so column names are supplied here
    kote_train_df = pd.read_csv(os.path.join(kote_data_path, 'train.tsv'), sep='\t', header=None, names=['text', 'labels'])
    kote_val_df = pd.read_csv(os.path.join(kote_data_path, 'val.tsv'), sep='\t', header=None, names=['text', 'labels'])
    kote_test_df = pd.read_csv(os.path.join(kote_data_path, 'test.tsv'), sep='\t', header=None, names=['text', 'labels'])

    # Apply the KOTE-specific preprocessing function to each dataframe
    kote_train_df = preprocess_kote(kote_train_df.copy())  # revised
    kote_val_df = preprocess_kote(kote_val_df.copy())      # revised
    kote_test_df = preprocess_kote(kote_test_df.copy())    # revised

    print(f"KOTE 데이터셋 로드 및 전처리 완료: Train {len(kote_train_df)}, Val {len(kote_val_df)}, Test {len(kote_test_df)}")

except FileNotFoundError:
    # Without the KOTE files the splits are set to None; the experiment B
    # cell checks kote_train_df for None before running.
    print(f"경고: KOTE tsv 파일을 '{os.path.join(DATA_DIR, 'KOTE_dataset')}' 디렉토리에서 찾을 수 없습니다.")
    print("모델 B 학습을 건너뜁니다.")
    kote_train_df, kote_val_df, kote_test_df = None, None, None

>>> KPoEM 데이터셋 준비 중...
KPoEM 데이터셋: Train 6096, Val 763, Test 763

>>> KOTE 데이터셋 준비 중 (로컬 tsv 파일 로드)...
KOTE 데이터셋 로드 및 전처리 완료: Train 40000, Val 5000, Test 5000


## 3. Core components definition (Dataset, DataModule, LightningModule)
- Define Core Components (Model Pipeline)
  - Loads the tokenizer for KcELECTRA.
  - Defines `PoetryDataset` (PyTorch Dataset) and `PoetryDataModule` (Lightning DataModule).
  - Defines `BaseTagger`, a PyTorch Lightning module using KcELECTRA and a linear classifier with sigmoid for multi-label classification.

In [None]:
# ===================================================================
# 3. 코어 컴포넌트 정의 (Dataset, DataModule, LightningModule)
# ===================================================================

# 3.1. Load the tokenizer that matches the pretrained KcELECTRA checkpoint
MODEL_NAME = "beomi/KcELECTRA-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


# 3.2. Pytorch Dataset 정의
class PoetryDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Reads the ``text`` and ``label_vector`` columns of ``df``. Each item is a
    dict with the flattened ``input_ids`` and ``attention_mask`` tensors
    produced by the tokenizer (padded / truncated to ``max_length``) and the
    label vector as a float tensor under ``labels``.
    """

    def __init__(self, df, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Columns are copied into plain lists for positional indexing.
        self.texts = df['text'].to_list()
        self.labels = df['label_vector'].to_list()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer returns tensors with a leading batch axis; flatten
        # them so each item is one-dimensional.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.FloatTensor(self.labels[idx])
        return sample


# 3.3. Pytorch Lightning DataModule 정의
class PoetryDataModule(pl.LightningDataModule):
    """LightningDataModule wrapping train/val/test dataframes as PoetryDatasets.

    Args:
        train_df, val_df, test_df: Frames with ``text`` and ``label_vector``
            columns (the columns ``PoetryDataset`` reads).
        tokenizer: Tokenizer passed through to ``PoetryDataset``.
        batch_size: Batch size used by all three dataloaders.
        max_length: Tokenizer max length passed through to ``PoetryDataset``.
        num_workers: DataLoader worker processes. ``None`` (the default) keeps
            the previous behaviour: 8 when CUDA is available, otherwise 0.
    """

    def __init__(self, train_df, val_df, test_df, tokenizer, batch_size=16, max_length=MAX_LENGTH,
                 num_workers=None):
        super().__init__()
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_length = max_length
        if num_workers is None:
            # Author's note: 0 or 2 workers tends to be more stable inside
            # Jupyter; 8 was chosen as half of the 16-core training server.
            # Pass num_workers explicitly to override on other machines.
            num_workers = 8 if torch.cuda.is_available() else 0
        self.num_workers = num_workers

    def setup(self, stage=None):
        """Build the datasets required by ``stage`` (all of them when None)."""
        if stage in ('fit', None):
            self.train_dataset = PoetryDataset(self.train_df, self.tokenizer, self.max_length)
        # 'validate' is included so a standalone trainer.validate() call also
        # finds a validation dataset.
        if stage in ('fit', 'validate', None):
            self.val_dataset = PoetryDataset(self.val_df, self.tokenizer, self.max_length)
        if stage in ('test', None):
            self.test_dataset = PoetryDataset(self.test_df, self.tokenizer, self.max_length)

    def train_dataloader(self):
        # Only the training loader shuffles.
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

# 3.4. Pytorch Lightning 모델(BaseTagger) 정의
class BaseTagger(pl.LightningModule):
    """Multi-label emotion tagger: pretrained encoder + dropout + linear head.

    Args:
        model_name: Hugging Face model id loaded with ``AutoModel``.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay (applied to all parameters).
        n_training_steps: Total scheduler steps. When ``None`` it is taken
            from ``trainer.estimated_stepping_batches``.
        n_warmup_steps: Linear warm-up steps. When ``None`` it defaults to
            10% of the training steps, the ratio used by the callers in this
            notebook.
        dropout_rate: Dropout probability in front of the linear head.

    ``forward`` returns ``(loss, probs)``; ``loss`` is ``None`` when no labels
    are given and ``probs`` are per-label sigmoid probabilities.
    """

    def __init__(self, model_name=MODEL_NAME, lr=2e-5, weight_decay=0.01,
                 n_training_steps=None, n_warmup_steps=None, dropout_rate=0.1):
        super().__init__()
        self.save_hyperparameters()
        self.electra = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Sequential(
            nn.Dropout(p=self.hparams.dropout_rate),
            nn.Linear(self.electra.config.hidden_size, len(LABELS))
        )
        # The loss is computed from raw logits: this fuses sigmoid and BCE in
        # a numerically stable way instead of taking log() of saturated
        # probabilities. The criterion has no parameters, so existing
        # checkpoints still load.
        self.criterion = nn.BCEWithLogitsLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.electra(input_ids, attention_mask=attention_mask)
        # The hidden state at position 0 is the sequence representation.
        logits = self.classifier(output.last_hidden_state[:, 0, :])
        probs = torch.sigmoid(logits)

        loss = self.criterion(logits, labels) if labels is not None else None
        return loss, probs

    def training_step(self, batch, batch_idx):
        loss, _ = self(**batch)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, _ = self(**batch)
        # 'val_loss' is the metric monitored by the checkpoint callbacks and
        # returned by the Optuna objectives.
        self.log("val_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=self.hparams.lr,
            weight_decay=self.hparams.weight_decay
        )

        # Fall back to trainer-derived values so the constructor defaults
        # (None) no longer break the scheduler.
        n_training_steps = self.hparams.n_training_steps
        if n_training_steps is None:
            n_training_steps = self.trainer.estimated_stepping_batches
        n_warmup_steps = self.hparams.n_warmup_steps
        if n_warmup_steps is None:
            n_warmup_steps = int(n_training_steps * 0.1)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=n_warmup_steps,
            num_training_steps=n_training_steps
        )
        # The scheduler advances once per optimizer step, not once per epoch.
        return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "interval": "step"}}

## 4. Finetuning KcELECTRA <- KPoEM
- Experiment A: Fine-Tuning KcELECTRA on KPoEM
  - Uses Optuna to search for optimal hyperparameters (batch size, LR, dropout, etc.) using the KPoEM train/val split.
  - Fine-tunes the model using the best hyperparameters and saves the best checkpoint based on validation loss.

In [None]:
# ===================================================================
# 4. KcELECTRA + KPoEM
# ===================================================================
# Banner marking the start of experiment A in the cell output
print("="*50)
print("실험 A: KcELECTRA + KPoEM 시작")
print("="*50)

# 4.1. Search for the best KPoEM hyperparameters (Optuna)
def objective_kpoem(trial):
    """Optuna objective: train a fresh BaseTagger on KPoEM and return val_loss.

    Samples batch size, learning rate, weight decay and dropout rate, trains
    for ``N_EPOCHS_OPTUNA`` epochs on the KPoEM train split and returns the
    last logged ``val_loss`` (the study minimizes it).
    """
    batch_size = trial.suggest_categorical("batch_size", [8, 16])
    lr = trial.suggest_float("lr", 1e-6, 5e-5, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-1, log=True)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)

    data_module = PoetryDataModule(kpoem_train_df, kpoem_val_df, kpoem_test_df, tokenizer, batch_size=batch_size)
    # Ceiling division: the train loader keeps the final partial batch, so it
    # counts as an optimizer step and must be inside the scheduler horizon.
    steps_per_epoch = (len(kpoem_train_df) + batch_size - 1) // batch_size
    n_training_steps = steps_per_epoch * N_EPOCHS_OPTUNA
    n_warmup_steps = int(n_training_steps * 0.1)  # 10% linear warm-up

    model = BaseTagger(
        lr=lr, weight_decay=weight_decay, dropout_rate=dropout_rate,
        n_training_steps=n_training_steps, n_warmup_steps=n_warmup_steps
    )
    # Search trials write no checkpoints or logs and show no progress bar.
    trainer = pl.Trainer(
        max_epochs=N_EPOCHS_OPTUNA, accelerator='gpu', devices=1,
        enable_checkpointing=False, logger=False, enable_progress_bar=False
    )
    trainer.fit(model, datamodule=data_module)
    return trainer.callback_metrics['val_loss'].item()

# Run the search (minimizing the objective's val_loss) and keep the best parameters
study_kpoem = optuna.create_study(direction='minimize')
study_kpoem.optimize(objective_kpoem, n_trials=15) # adjust n_trials as needed
best_hparams_A = study_kpoem.best_params
print(f"모델 A (KPoEM) 최적 하이퍼파라미터: {best_hparams_A}")


# 4.2. Train and save model A with the best hyperparameters found above
batch_size_A = best_hparams_A['batch_size']
dm_A = PoetryDataModule(kpoem_train_df, kpoem_val_df, kpoem_test_df, tokenizer, batch_size=batch_size_A)
# Ceiling division: the train loader keeps the final partial batch, so it
# counts as an optimizer step and must be inside the scheduler horizon.
steps_per_epoch_A = (len(kpoem_train_df) + batch_size_A - 1) // batch_size_A
n_training_steps_A = steps_per_epoch_A * N_EPOCHS_TRAIN
n_warmup_steps_A = int(n_training_steps_A * 0.1)  # 10% linear warm-up

model_A = BaseTagger(
    lr=best_hparams_A['lr'], weight_decay=best_hparams_A['weight_decay'], dropout_rate=best_hparams_A['dropout_rate'],
    n_training_steps=n_training_steps_A, n_warmup_steps=n_warmup_steps_A
)

# Keep only the single checkpoint with the lowest validation loss.
checkpoint_callback_A = pl.callbacks.ModelCheckpoint(
    dirpath=os.path.join(MODEL_SAVE_DIR, "model_A"), filename='best_model_A_minmax_0.2',
    save_top_k=1, verbose=False, monitor='val_loss', mode='min'
)
trainer_A = pl.Trainer(
    max_epochs=N_EPOCHS_TRAIN, accelerator='gpu', devices=1,
    callbacks=[checkpoint_callback_A], logger=False
)

print("\n>>> 모델 A 학습 시작...")
trainer_A.fit(model_A, datamodule=dm_A)
best_model_A_path = checkpoint_callback_A.best_model_path
print(f"모델 A 학습 완료 및 저장: {best_model_A_path}")

[I 2025-07-08 19:20:26,135] A new study created in memory with name: no-name-0e7424ca-e846-46ec-b0c2-94b1472319ea


실험 A: KcELECTRA + KPoEM 시작 (minmax scaling 0.2)


[I 2025-07-08 19:22:45,495] Trial 0 finished with value: 0.43985217809677124 and parameters: {'batch_size': 16, 'lr': 8.19085401122133e-06, 'weight_decay': 0.025082516844889866, 'dropout_rate': 0.2912337705470732}. Best is trial 0 with value: 0.43985217809677124.
[I 2025-07-08 19:25:05,044] Trial 1 finished with value: 0.3913053572177887 and parameters: {'batch_size': 16, 'lr': 4.583795544332515e-05, 'weight_decay': 4.726670962642425e-05, 'dropout_rate': 0.36805277239630296}. Best is trial 1 with value: 0.3913053572177887.
[I 2025-07-08 19:27:25,413] Trial 2 finished with value: 0.4588925540447235 and parameters: {'batch_size': 16, 'lr': 4.8793873560036606e-06, 'weight_decay': 0.00022971535906732196, 'dropout_rate': 0.18124480633556128}. Best is trial 1 with value: 0.3913053572177887.
[I 2025-07-08 19:30:19,967] Trial 3 finished with value: 0.47228384017944336 and parameters: {'batch_size': 8, 'lr': 2.6018763178450856e-06, 'weight_decay': 0.0001854715651303505, 'dropout_rate': 0.238358

모델 A (KPoEM) 최적 하이퍼파라미터 (minmax scaling 0.2): {'batch_size': 16, 'lr': 4.583795544332515e-05, 'weight_decay': 4.726670962642425e-05, 'dropout_rate': 0.36805277239630296}

>>> 모델 A 학습 시작...


/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/work/KPoEM/code/model/model_A exists and is not empty.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

모델 A 학습 완료 및 저장: /home/work/KPoEM/code/model/model_A/best_model_A_minmax_0.2.ckpt


## 5. Finetuning KcELECTRA <- KOTE <- KPoEM
- Experiment B: Sequential Fine-Tuning (KOTE → KPoEM)
  - Stage 1: Fine-tunes on KOTE dataset using Optuna hyperparameter search.
    - Saves best model and optionally resumes from `last.ckpt` if it exists.
  - Stage 2: Loads the fine-tuned KOTE model and further fine-tunes it on KPoEM using new hyperparameters via Optuna.
    - Saves the best checkpoint after fine-tuning on KPoEM.

In [None]:
# ===================================================================
# 5. KcELECTRA -> KOTE -> KPoEM
# ===================================================================
# Run experiment B only when both the KOTE and the KPoEM splits are available
if kote_train_df is not None and 'kpoem_train_df' in locals() and kpoem_train_df is not None:
    print("\n" + "="*50)
    print("실험 B: KcELECTRA -> KOTE -> KPoEM 시작")
    print("="*50)


    # 5.1. First-stage fine-tuning (KOTE data)
    print("\n5.1. 1차 파인튜닝 (on KOTE)")
    def objective_kote(trial):
        """Optuna objective: train a fresh BaseTagger on KOTE and return val_loss.

        Samples batch size, learning rate, weight decay and dropout rate,
        trains for ``N_EPOCHS_OPTUNA`` epochs on the KOTE train split and
        returns the last logged ``val_loss`` (the study minimizes it).
        """
        batch_size = trial.suggest_categorical("batch_size", [8, 16])
        lr = trial.suggest_float("lr", 1e-6, 5e-5, log=True)
        weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-1, log=True)
        dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)

        data_module = PoetryDataModule(kote_train_df, kote_val_df, kote_test_df, tokenizer, batch_size=batch_size)
        # Ceiling division: the train loader keeps the final partial batch, so
        # it counts as an optimizer step and must be inside the scheduler horizon.
        steps_per_epoch = (len(kote_train_df) + batch_size - 1) // batch_size
        n_training_steps = steps_per_epoch * N_EPOCHS_OPTUNA
        n_warmup_steps = int(n_training_steps * 0.1)  # 10% linear warm-up

        model = BaseTagger(
            lr=lr, weight_decay=weight_decay, dropout_rate=dropout_rate,
            n_training_steps=n_training_steps, n_warmup_steps=n_warmup_steps
        )
        # Search trials write no checkpoints or logs and show no progress bar.
        trainer = pl.Trainer(max_epochs=N_EPOCHS_OPTUNA, accelerator='gpu', devices=1, enable_checkpointing=False, logger=False, enable_progress_bar=False)
        trainer.fit(model, datamodule=data_module)
        return trainer.callback_metrics['val_loss'].item()

    # Case: search from scratch with Optuna (in-memory study, not resumable)
    study_kote = optuna.create_study(direction='minimize')
    study_kote.optimize(objective_kote, n_trials=15)
    best_hparams_kote = study_kote.best_params


    # Train the first-stage (KOTE) model with the best hyperparameters
    batch_size_kote = best_hparams_kote['batch_size']
    dm_kote = PoetryDataModule(kote_train_df, kote_val_df, kote_test_df, tokenizer, batch_size=batch_size_kote)
    # Ceiling division: the train loader keeps the final partial batch, so it
    # counts as an optimizer step and must be inside the scheduler horizon.
    steps_per_epoch_kote = (len(kote_train_df) + batch_size_kote - 1) // batch_size_kote
    n_training_steps_kote = steps_per_epoch_kote * N_EPOCHS_TRAIN
    n_warmup_steps_kote = int(n_training_steps_kote * 0.1)  # 10% linear warm-up

    model_kote_tuned = BaseTagger(
        lr=best_hparams_kote['lr'],
        weight_decay=best_hparams_kote['weight_decay'],
        dropout_rate=best_hparams_kote['dropout_rate'],
        n_training_steps=n_training_steps_kote,
        n_warmup_steps=n_warmup_steps_kote
    )

    kote_ckpt_dir = os.path.join(MODEL_SAVE_DIR, "kote_tuned")
    # NOTE(review): Lightning inserts the metric names itself, so this template
    # yields names like "epochepoch=02-val_lossval_loss=0.28". It is left
    # unchanged because the evaluation cell hard-codes the resulting filename.
    checkpoint_callback_kote = pl.callbacks.ModelCheckpoint(
        dirpath=kote_ckpt_dir,
        filename='kote_finetuned-epoch{epoch:02d}-val_loss{val_loss:.2f}_minmax_0.2',  # val_loss is part of the filename too
        save_top_k=1,
        monitor='val_loss',
        mode='min',
        save_last=True
    )
    # save_last=True writes "last.ckpt" by default, which never matched the
    # "last_minmax.ckpt" file checked below. Renaming the last-checkpoint stem
    # makes the resume branch reachable.
    checkpoint_callback_kote.CHECKPOINT_NAME_LAST = "last_minmax"
    trainer_kote = pl.Trainer(max_epochs=N_EPOCHS_TRAIN, accelerator='gpu', devices=1, callbacks=[checkpoint_callback_kote], logger=False)

    # Resume from the last checkpoint of a previous run when one exists.
    last_ckpt_path = os.path.join(kote_ckpt_dir, "last_minmax.ckpt")
    resume_path = last_ckpt_path if os.path.exists(last_ckpt_path) else None

    if resume_path:
        print(f"기존 KOTE 학습 체크포인트({resume_path})를 발견하여 이어서 학습합니다.")
    else:
        print("\n>>> 1차(KOTE) 모델 학습 시작...")

    trainer_kote.fit(model_kote_tuned, datamodule=dm_kote, ckpt_path=resume_path)
    kote_finetuned_ckpt_path = checkpoint_callback_kote.best_model_path
    print(f"1차(KOTE) 모델 학습 완료 및 저장: {kote_finetuned_ckpt_path}")


    # 5.2. Second-stage fine-tuning (KPoEM data)
    print("\n5.2. 2차 파인튜닝 (on KPoEM)")

    # The objective below searches batch_size anew as well
    ## The KOTE-tuned KcELECTRA needs its own hyperparameter search
    def objective_kote_kpoem(trial):
        """Optuna objective: continue from the KOTE checkpoint on KPoEM, return val_loss.

        Samples batch size, learning rate and weight decay; the dropout rate
        is not searched and comes from the hyperparameters stored in the
        checkpoint. Trains for ``N_EPOCHS_OPTUNA`` epochs on the KPoEM train
        split and returns the last logged ``val_loss``.
        """
        # batch_size is searched again for this stage
        batch_size = trial.suggest_categorical("batch_size", [8, 16])
        lr = trial.suggest_float("lr", 1e-7, 2e-5, log=True)
        weight_decay = trial.suggest_float("weight_decay", 1e-5, 1e-1, log=True)

        # Rebuild the data module and step counts for this trial's batch_size
        data_module = PoetryDataModule(kpoem_train_df, kpoem_val_df, kpoem_test_df, tokenizer, batch_size=batch_size)
        # Ceiling division: the train loader keeps the final partial batch, so
        # it counts as an optimizer step and must be inside the scheduler horizon.
        steps_per_epoch = (len(kpoem_train_df) + batch_size - 1) // batch_size
        n_training_steps = steps_per_epoch * N_EPOCHS_OPTUNA
        n_warmup_steps = int(n_training_steps * 0.1)  # 10% linear warm-up

        # Load the first-stage weights and override the optimizer/scheduler
        # hyperparameters with this trial's values
        model = BaseTagger.load_from_checkpoint(
            kote_finetuned_ckpt_path,
            lr=lr, weight_decay=weight_decay,
            n_training_steps=n_training_steps, # newly computed step count
            n_warmup_steps=n_warmup_steps      # newly computed step count
        )
        # Search trials write no checkpoints or logs and show no progress bar.
        trainer = pl.Trainer(max_epochs=N_EPOCHS_OPTUNA, accelerator='gpu', devices=1, enable_checkpointing=False, logger=False, enable_progress_bar=False)

        # Train with the data module built for this trial
        trainer.fit(model, datamodule=data_module)
        return trainer.callback_metrics['val_loss'].item()

    # Resumable Optuna search: the study is persisted in a SQLite file
    study_db_path_b = os.path.join(MODEL_SAVE_DIR, "study_model_B_minmax_0.2.db")
    study_name_b = "kote-kpoem-hyper-tuning-minmax-0.2"
    n_trials_b = 15

    study_kote_kpoem = optuna.create_study(
        study_name=study_name_b,
        storage=f"sqlite:///{study_db_path_b}",
        direction='minimize',
        load_if_exists=True
    )

    # Count only trials that actually finished. Failed or interrupted trials
    # are stored in the study too and must not use up the search budget.
    n_completed_b = len(study_kote_kpoem.get_trials(
        deepcopy=False, states=(optuna.trial.TrialState.COMPLETE,)
    ))

    if n_completed_b < n_trials_b:
        print(f"모델 B Optuna 탐색을 이어합니다. (현재 {n_completed_b} / 목표 {n_trials_b})")
        study_kote_kpoem.optimize(objective_kote_kpoem, n_trials=(n_trials_b - n_completed_b))
    else:
        print("모델 B Optuna 탐색이 이미 완료되었습니다.")

    best_hparams_B = study_kote_kpoem.best_params
    print(f"모델 B (KOTE->KPoEM) 최적 하이퍼파라미터: {best_hparams_B}")


    # Train and save the final model B
    batch_size_B = best_hparams_B['batch_size']
    dm_B_final = PoetryDataModule(kpoem_train_df, kpoem_val_df, kpoem_test_df, tokenizer, batch_size=batch_size_B)
    # Ceiling division: the train loader keeps the final partial batch, so it
    # counts as an optimizer step and must be inside the scheduler horizon.
    steps_per_epoch_B = (len(kpoem_train_df) + batch_size_B - 1) // batch_size_B
    n_training_steps_B = steps_per_epoch_B * N_EPOCHS_TRAIN
    n_warmup_steps_B = int(n_training_steps_B * 0.1)  # 10% linear warm-up

    # Start from the first-stage (KOTE) weights with the stage-2 hyperparameters
    model_B_final = BaseTagger.load_from_checkpoint(
        kote_finetuned_ckpt_path,
        lr=best_hparams_B['lr'], weight_decay=best_hparams_B['weight_decay'],
        n_training_steps=n_training_steps_B,  # step count computed for model B
        n_warmup_steps=n_warmup_steps_B       # step count computed for model B
    )
    # The final training run is resumable as well
    model_b_ckpt_dir = os.path.join(MODEL_SAVE_DIR, "model_B")
    checkpoint_callback_B = pl.callbacks.ModelCheckpoint(
        dirpath=model_b_ckpt_dir,
        filename='best_model_B_minmax_0.2',
        save_top_k=1,
        verbose=False,
        monitor='val_loss',
        mode='min',
        save_last=True
    )
    # save_last=True writes "last.ckpt" by default, which never matched the
    # "last_minmax_0.2.ckpt" file checked below. Renaming the last-checkpoint
    # stem makes the resume branch reachable.
    checkpoint_callback_B.CHECKPOINT_NAME_LAST = "last_minmax_0.2"
    trainer_B = pl.Trainer(
        max_epochs=N_EPOCHS_TRAIN, accelerator='gpu', devices=1,
        callbacks=[checkpoint_callback_B], logger=False
    )

    # Resume from the last checkpoint of a previous run when one exists.
    last_ckpt_path_B = os.path.join(model_b_ckpt_dir, "last_minmax_0.2.ckpt")
    resume_path_B = last_ckpt_path_B if os.path.exists(last_ckpt_path_B) else None

    if resume_path_B:
        print(f"기존 모델 B 학습 체크포인트({resume_path_B})를 발견하여 이어서 학습합니다.")
    else:
        print("\n>>> 모델 B 최종 학습 시작...")

    # Uses the data module built for model B (not dm_A) and passes ckpt_path for resuming
    trainer_B.fit(model_B_final, datamodule=dm_B_final, ckpt_path=resume_path_B)
    best_model_B_path = checkpoint_callback_B.best_model_path
    print(f"모델 B 학습 완료 및 저장: {best_model_B_path}")

[I 2025-07-08 20:42:38,955] A new study created in memory with name: no-name-c8e2290c-6c22-4f6e-9693-9641c591adcf



실험 B: KcELECTRA -> KOTE -> KPoEM 시작 (minmax scaling 0.2)

5.1. 1차 파인튜닝 (on KOTE) (minmax scaling 0.2)


[I 2025-07-08 20:57:28,541] Trial 0 finished with value: 0.35989177227020264 and parameters: {'batch_size': 16, 'lr': 1.6636523243319953e-06, 'weight_decay': 0.04049257499328559, 'dropout_rate': 0.20913140887845133}. Best is trial 0 with value: 0.35989177227020264.
[I 2025-07-08 21:15:48,461] Trial 1 finished with value: 0.32227081060409546 and parameters: {'batch_size': 8, 'lr': 2.5937953170333734e-06, 'weight_decay': 0.0006172932751761212, 'dropout_rate': 0.23870797722576376}. Best is trial 1 with value: 0.32227081060409546.
[I 2025-07-08 21:34:17,640] Trial 2 finished with value: 0.3384235203266144 and parameters: {'batch_size': 8, 'lr': 1.6242310238629837e-06, 'weight_decay': 0.011536109777098756, 'dropout_rate': 0.10892700356790126}. Best is trial 1 with value: 0.32227081060409546.
[I 2025-07-08 21:52:50,604] Trial 3 finished with value: 0.3076414465904236 and parameters: {'batch_size': 8, 'lr': 3.996845164695625e-06, 'weight_decay': 0.025304545439822922, 'dropout_rate': 0.2348506


>>> 1차(KOTE) 모델 학습 시작 (minmax scaling 0.2)...


/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/work/KPoEM/code/model/kote_tuned exists and is not empty.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

1차(KOTE) 모델 (minmax scaling 0.2) 학습 완료 및 저장: /home/work/KPoEM/code/model/kote_tuned/kote_finetuned-epochepoch=02-val_lossval_loss=0.28_minmax_0.2.ckpt

5.2. 2차 파인튜닝 (on KPoEM) (minmax scaling 0.2)


[I 2025-07-09 01:35:08,206] A new study created in RDB with name: kote-kpoem-hyper-tuning-minmax-0.2


모델 B Optuna 탐색을 이어합니다. (현재 0 / 목표 15)


[I 2025-07-09 01:38:22,566] Trial 0 finished with value: 0.4162595272064209 and parameters: {'batch_size': 8, 'lr': 3.732677906286875e-07, 'weight_decay': 0.0033507224632032787}. Best is trial 0 with value: 0.4162595272064209.
[I 2025-07-09 01:41:01,072] Trial 1 finished with value: 0.38702791929244995 and parameters: {'batch_size': 16, 'lr': 4.816803068102869e-06, 'weight_decay': 0.0006527326958403604}. Best is trial 1 with value: 0.38702791929244995.
[I 2025-07-09 01:43:39,826] Trial 2 finished with value: 0.4549036920070648 and parameters: {'batch_size': 16, 'lr': 1.2878408603689354e-07, 'weight_decay': 0.05048351948295225}. Best is trial 1 with value: 0.38702791929244995.
[I 2025-07-09 01:46:54,138] Trial 3 finished with value: 0.4349229335784912 and parameters: {'batch_size': 8, 'lr': 1.5400178504984476e-07, 'weight_decay': 2.5241785586075894e-05}. Best is trial 1 with value: 0.38702791929244995.
[I 2025-07-09 01:49:31,739] Trial 4 finished with value: 0.4547939896583557 and param

모델 B (KOTE->KPoEM) 최적 하이퍼파라미터: {'batch_size': 16, 'lr': 1.8726765536618316e-05, 'weight_decay': 7.304905716726998e-05} (minmax scaling 0.2)

>>> 모델 B 최종 학습 시작 (minmax scaling 0.2)...


/usr/local/lib/python3.12/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/work/KPoEM/code/model/model_B exists and is not empty.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

모델 B (minmax scaling 0.2) 학습 완료 및 저장: /home/work/KPoEM/code/model/model_B/best_model_B_minmax_0.2.ckpt


## 6. Final performance evaluation
- Final Evaluation
  - Loads all three best models:
    - A: KcELECTRA + KPoEM
    - B: KcELECTRA + KOTE → KPoEM
    - C: KcELECTRA + KOTE
  - Evaluates all models on the same KPoEM test set.
  - Computes and reports metrics: micro/macro precision, recall, F1, accuracy, and MCC.
  - Saves the results in a timestamped `.tsv` file.

In [None]:
# ===================================================================
# 6. 최종 성능 비교
# ===================================================================
import os
from sklearn.metrics import precision_score, recall_score
from torchmetrics.functional.classification import multilabel_accuracy

print("\n" + "="*50)
print("최종 성능 비교")
print("="*50)

# Checkpoint paths are specified by hand here; these assignments overwrite
# any values the training cells stored under the same variable names.

# 'best' checkpoint of model A
best_model_A_path = "./model/model_A/best_model_A_minmax_0.2.ckpt"

# 'best' checkpoint of model C (the intermediate model trained on KOTE only)
kote_finetuned_ckpt_path = "./model/kote_tuned/kote_finetuned-epochepoch=02-val_lossval_loss=0.28_minmax_0.2.ckpt"

# 'best' checkpoint of model B
best_model_B_path = "./model/model_B/best_model_B_minmax_0.2.ckpt"

# Evaluate on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 모델 평가 함수
def evaluate_model(model, test_dataset, device, threshold=THRESHOLD):
    """Score a multi-label tagger on a dataset at a fixed decision threshold.

    The model is moved to ``device``, switched to eval mode, and run on every
    dataset item individually (batch size 1). Scores above ``threshold`` count
    as positive predictions.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; it must
            return a pair whose second element holds the per-label scores.
            NOTE(review): scores are presumably probabilities in [0, 1] --
            confirm against the tagger's forward().
        test_dataset: Iterable of dicts with "input_ids", "attention_mask"
            and "labels" tensors (one sample each, no batch dimension).
        device: Torch device used for inference.
        threshold: Score cut-off used to binarize predictions.

    Returns:
        dict with keys "Threshold", "Accuracy", "Precision_micro",
        "Recall_micro", "F1_micro", "Precision_macro", "Recall_macro",
        "F1_macro" and "MCC".

    Raises:
        ValueError: If ``test_dataset`` yields no items.
    """
    model.to(device)
    model.eval()
    predictions_list = []
    labels_list = []

    # One no_grad context for the whole pass instead of re-entering it per item.
    with torch.no_grad():
        for item in tqdm(test_dataset, desc="Testing"):
            input_ids = item["input_ids"].to(device).unsqueeze(0)
            attention_mask = item["attention_mask"].to(device).unsqueeze(0)
            _, preds = model(input_ids, attention_mask)
            predictions_list.append(preds.flatten().cpu())
            # Labels are only needed for CPU-side metrics, so they are never
            # sent to the inference device.
            labels_list.append(item["labels"].cpu())

    if not predictions_list:
        # torch.stack on an empty list fails with an opaque error; be explicit.
        raise ValueError("test_dataset is empty; nothing to evaluate.")

    predictions_tensor = torch.stack(predictions_list)
    labels_tensor = torch.stack(labels_list)
    y_true = labels_tensor.numpy().astype(int)
    predictions_np = predictions_tensor.numpy()
    y_pred_bin = (predictions_np > threshold).astype(int)

    # Metrics. The label count is taken from the data itself so the function
    # does not depend on a module-level label list.
    accuracy = multilabel_accuracy(
        predictions_tensor,
        torch.from_numpy(y_true),
        num_labels=y_true.shape[1],
        threshold=threshold,
        average="micro",
    ).item()
    f1_micro = f1_score(y_true, y_pred_bin, average='micro', zero_division=0)
    f1_macro = f1_score(y_true, y_pred_bin, average='macro', zero_division=0)
    # MCC is computed over all (sample, label) decisions flattened into one
    # binary problem.
    mcc = matthews_corrcoef(y_true.flatten(), y_pred_bin.flatten())
    precision_micro = precision_score(y_true, y_pred_bin, average='micro', zero_division=0)
    recall_micro = recall_score(y_true, y_pred_bin, average='micro', zero_division=0)
    precision_macro = precision_score(y_true, y_pred_bin, average='macro', zero_division=0)
    recall_macro = recall_score(y_true, y_pred_bin, average='macro', zero_division=0)

    return {
        "Threshold": threshold,
        "Accuracy": accuracy,
        "Precision_micro": precision_micro, "Recall_micro": recall_micro, "F1_micro": f1_micro,
        "Precision_macro": precision_macro, "Recall_macro": recall_macro, "F1_macro": f1_macro,
        "MCC": mcc
    }

# 평가용 데이터셋 준비
# Run the final comparison only when the KPoEM test split was prepared earlier.
if 'kpoem_test_df' in locals() and kpoem_test_df is not None:
    test_dataset = PoetryDataset(kpoem_test_df, tokenizer, MAX_LENGTH)

    def _evaluate_checkpoint(ckpt_path, model_label):
        """Load a checkpoint, evaluate it on ``test_dataset``, and tabulate it.

        Returns the loaded model, the metrics dict from ``evaluate_model``,
        and a one-row DataFrame of those metrics tagged with ``model_label``.
        """
        model = BaseTagger.load_from_checkpoint(ckpt_path)
        results = evaluate_model(model, test_dataset, device)
        row_df = pd.DataFrame([results])
        row_df["Model"] = model_label
        return model, results, row_df

    # Model A is always evaluated.
    print("\n>>> 모델 A_minmax_0.2 평가 중...")
    model_A_eval, results_A, df_A = _evaluate_checkpoint(
        best_model_A_path, "A (KcELECTRA + KPoEM)"
    )
    df_list = [df_A]

    # Model C is gated on its own checkpoint path (it was previously gated on
    # model B's path, which it does not use).
    if 'kote_finetuned_ckpt_path' in locals() and kote_finetuned_ckpt_path:
        print("\n>>> 모델 C_minmax_0.2 (KcELECTRA + KOTE) 평가 중...")
        model_C_eval, results_C, df_C = _evaluate_checkpoint(
            kote_finetuned_ckpt_path, "C (KcELECTRA + KOTE)"
        )
        df_list.append(df_C)

    if 'best_model_B_path' in locals() and best_model_B_path:
        print("\n>>> 모델 B_minmax_0.2 (KOTE -> KPoEM) 평가 중...")
        model_B_eval, results_B, df_B = _evaluate_checkpoint(
            best_model_B_path, "B (KOTE -> KPoEM)"
        )
        df_list.append(df_B)

    final_results_df = pd.concat(df_list, ignore_index=True)

    # Show and persist the comparison table.
    print("\n최종 비교 결과")

    # Column order used for the on-screen table.
    display_cols = [
        "Model", "Threshold",
        "Accuracy",
        "Precision_micro", "Precision_macro",
        "Recall_micro", "Recall_macro",
        "F1_micro", "F1_macro",
        "MCC"
    ]
    display(final_results_df[display_cols])

    # Timestamped file name so repeated runs do not overwrite earlier results.
    timestamp = datetime.now().strftime("%y%m%d_%H%M%S")
    save_path = os.path.join(MODEL_SAVE_DIR, f"final_minmax_0.2_comparison_{timestamp}.tsv")
    final_results_df.to_csv(save_path, sep='\t', index=False)
    print(f"\n결과 파일 저장 완료: {save_path}")

else:
    print("KPoEM 테스트 데이터셋이 준비되지 않아 최종 평가를 건너뜁니다.")


최종 성능 비교

>>> 모델 A_minmax_0.2 평가 중...


Testing:   0%|          | 0/763 [00:00<?, ?it/s]


>>> 모델 C_minmax_0.2 (KcELECTRA + KOTE) 평가 중...


Testing:   0%|          | 0/763 [00:00<?, ?it/s]


>>> 모델 B_minmax_0.2 (KOTE -> KPoEM) 평가 중...


Testing:   0%|          | 0/763 [00:00<?, ?it/s]


최종 비교 결과


Unnamed: 0,Model,Threshold,Accuracy,Precision_micro,Precision_macro,Recall_micro,Recall_macro,F1_micro,F1_macro,MCC
0,A (KcELECTRA + KPoEM),0.3,0.799089,0.545235,0.486193,0.655391,0.481596,0.59526,0.45258,0.4666
1,C (KcELECTRA + KOTE),0.3,0.769957,0.48692,0.461387,0.38121,0.330282,0.427629,0.343058,0.2897
2,B (KOTE -> KPoEM),0.3,0.797629,0.540987,0.484663,0.674947,0.528997,0.600588,0.482592,0.472393



결과 파일 저장 완료: ./model/final_minmax_0.2_comparison_250709_195408.tsv
