document 단위의 classification

In [13]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import random
import numpy as np
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, LongformerForSequenceClassification, LongformerTokenizer, DebertaV2ForSequenceClassification
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers import Trainer, TrainingArguments, PreTrainedTokenizer
from sklearn.model_selection import train_test_split
from collections import Counter
from transformers import BertModel
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from tqdm import tqdm
import torch.nn.functional as F
from types import SimpleNamespace
import re
from torch.optim import AdamW
from sklearn.model_selection import KFold
import os
from libauc.losses import AUCMLoss
from libauc.optimizers import PESG

In [None]:
CONFIG = {
    "data_base": "../data"
}

In [None]:
train_csv = pd.read_csv(f"{CONFIG['data_base']}/final_aug_train.csv")
test_csv = pd.read_csv(f"{CONFIG['data_base']}/test.csv")

test_csv = test_csv.rename(columns={
    'paragraph_text': 'full_text'
    })

In [None]:
# 기존 데이터에서 클래스별 분리
label_0 = train_csv[train_csv['generated'] == 0]
label_1 = train_csv[train_csv['generated'] == 1]

# 두 클래스 중 작은 수로 균형 맞추기
count = min(len(label_0), len(label_1))

# 무작위 샘플링
sampled_0 = label_0.sample(n=6*count, random_state=42)
sampled_1 = label_1.sample(n=count, random_state=42)

# 균형 잡힌 데이터셋 생성
train_csv = pd.concat([sampled_0, sampled_1]).sample(frac=1, random_state=42).reset_index(drop=True)

# 확인
print(f"✅ 샘플링 완료: 총 {len(train_csv)}개 (각 클래스 {count}개)")
print(train_csv["generated"].value_counts())



train_csv.head()

✅ 샘플링 완료: 총 397103개 (각 클래스 56729개)
generated
0    340374
1     56729
Name: count, dtype: int64


Unnamed: 0,title,full_text,generated
0,이해찬,"국민의 정부 출범 이후 입각, 이해찬은 1998년부터 1999년까지 교육부 장관 재...",0
1,제2차 세계 대전 후 미국 통치 지역의 일본 반환,미국은 샌프란시스코 강화조약 발효 이후 어차피 이 지역에 미군 기지도 많이 두지 않...,0
2,주나라,중화문명은 주나라 시대때 많은 발전이 이루어졌다. 대표적으로 소가 끄는 쟁기와 철기...,0
3,말버러 공작부인 사라 처칠,"의지가 강했던 사라는 정치나 법, 또는 교회 임명권 등에 있어서 앤 여왕과 대립할 ...",0
4,육상산전집,왕양명은 성인의 학(學)을 심학(心學)이라고 규정하여 요순우(堯舜禹)는 서로 이어서...,0


In [17]:
# 각 라벨 개수 출력
print(train_csv['generated'].value_counts())

# 전체 개수
total = len(train_csv)

# 각 비율 계산
label_0_ratio = (train_csv['generated'] == 0).sum() / total
label_1_ratio = (train_csv['generated'] == 1).sum() / total

print(f"\n라벨 0 비율: {label_0_ratio:.4f}")
print(f"라벨 1 비율: {label_1_ratio:.4f}")


generated
0    340374
1     56729
Name: count, dtype: int64

라벨 0 비율: 0.8571
라벨 1 비율: 0.1429


In [18]:
train_df, val_df = train_test_split(
    train_csv,
    test_size=0.01,
    random_state=42,
    stratify=train_csv['generated']  # label 분포 유지
)

In [19]:
class CustomDataset(Dataset):
    def __init__(self, data_df, tokenizer, mode='train'):
        self.data = data_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.mode = mode

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = row['full_text']

        # 슬라이딩 윈도우 기반 tokenization
        inputs = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=512,
            stride=256,  # ✅ 겹치게 자름
            return_overflowing_tokens=True,
            return_tensors="pt"
        )

        # segment 중 하나 랜덤 선택
        n_segments = inputs["input_ids"].size(0)
        seg_idx = random.randint(0, n_segments - 1)

        item = {
            k: v[seg_idx] for k, v in inputs.items() if k != "overflow_to_sample_mapping"
        }

        # Longformer는 token_type_ids 없음
        item.pop("token_type_ids", None)

        if self.mode == 'train':
            item["labels"] = int(row["generated"])  # binary classification

        return item


In [None]:
tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')

train_dataset = CustomDataset(train_df, tokenizer, mode='train')
val_dataset = CustomDataset(val_df, tokenizer, mode='train')  # 선택적

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
base_model = AutoModelForSequenceClassification.from_pretrained(
    "klue/roberta-large",
    num_labels=2  # AI vs HUMAN → 이진 분류
)

base_model = prepare_model_for_kbit_training(base_model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query", "value"],  # BERT 계열은 attention 부분 지정
    # target_modules=["query_proj", "value_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

model = get_peft_model(base_model, lora_config)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(32000, 1024, padding_idx=1)
          (position_embeddings): Embedding(514, 1024, padding_idx=1)
          (token_type_embeddings): Embedding(1, 1024)
          (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-23): 24 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A

In [22]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'auroc': auc
    }

In [23]:
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")  # shape: (B,)
        outputs = model(**inputs)
        logits = outputs.logits  # shape: (B, C)

        # Label Smoothing
        smoothing = 0.0
        num_classes = logits.size(1)
        smoothed_labels = F.one_hot(labels, num_classes=num_classes).float()  # (B, C)
        smoothed_labels = smoothed_labels * (1 - smoothing) + smoothing / num_classes

        probs = torch.softmax(logits, dim=-1)
        ce_loss = - (smoothed_labels * torch.log(probs + 1e-8)).sum(dim=-1)  # shape: (B,)

        # # Focal Loss (gamma=2)
        # gamma = 2.0
        # pt = probs.gather(dim=-1, index=labels.unsqueeze(-1)).squeeze(-1)  # p_t
        # # pt = (probs * smoothed_labels).sum(dim=-1)  # soft p_t
        # focal_weight = (1 - pt) ** gamma  # shape: (B,)

        # 클래스별 weight 적용 (0: 1.0, 1: 12.0)
        class_weights = torch.tensor([1.0, 6.0], device=logits.device)
        example_weights = class_weights[labels]  # shape: (B,)

        loss = (ce_loss * example_weights).mean()

        return (loss, outputs) if return_outputs else loss

In [None]:
training_args = TrainingArguments(
    output_dir="./",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    max_steps=300000,
    learning_rate=5e-5,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=10000,
    logging_dir="./",
    logging_steps=10000,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    fp16=True,
    report_to="none"
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset, 
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = WeightedTrainer(
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Auroc
10000,1.5789,0.982608,0.942095,0.77451,0.871965,0.696649,0.839808
20000,1.2332,0.993067,0.856244,0.649478,0.498117,0.932981,0.888223
30000,0.9959,0.558862,0.93429,0.79465,0.71733,0.890653,0.916105
40000,0.8433,0.509838,0.946878,0.830249,0.763314,0.910053,0.931532
50000,0.7885,0.504366,0.923212,0.778182,0.662129,0.943563,0.931693
60000,0.776,0.392949,0.943605,0.824726,0.74121,0.929453,0.937708
70000,0.7272,0.443462,0.950906,0.842361,0.777612,0.918871,0.937556
80000,0.7161,0.431325,0.95292,0.846847,0.79052,0.911817,0.935791
90000,0.6678,0.468924,0.946626,0.833071,0.752489,0.932981,0.94094
100000,0.63,0.45532,0.936052,0.809309,0.704575,0.950617,0.942122


KeyboardInterrupt: 

In [None]:
test_dataset = CustomDataset(test_csv, tokenizer, mode='eval')

# 모델 준비
checkpoint_path = "./checkpoint-200000"  # 원하는 checkpoint
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# 예측 저장 리스트
all_preds = []
all_probs = []

valid_keys = {"input_ids", "attention_mask"}

# ✅ 4. 추론 루프 (DataLoader 없이)
with torch.no_grad():
    for i in tqdm(range(len(test_dataset)), desc="Running inference without DataLoader"):
        batch = test_dataset[i]  # 단일 샘플 꺼내기
        # 모델이 받는 키만 선택
        inputs = {k: v.unsqueeze(0).to(model.device) for k, v in batch.items() if k in valid_keys}
        inputs.pop("token_type_ids", None)  # 혹시 있을 경우 제거

        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=-1)
        preds = torch.argmax(probs, dim=-1)

        all_preds.extend(preds.cpu().tolist())
        all_probs.extend(probs.cpu().tolist())

sample_submission = pd.read_csv(f"{CONFIG['data_base']}/sample_submission.csv", encoding='utf-8-sig')
all_AI = [i[1] for i in all_probs]
sample_submission['generated'] = all_AI

sample_submission.to_csv(f"./roberta-large-aug.csv", index=False)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Running inference without DataLoader: 100%|██████████| 1962/1962 [00:51<00:00, 37.76it/s]
