In [16]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import random
import numpy as np
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, LongformerForSequenceClassification, LongformerTokenizer, DebertaV2ForSequenceClassification
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from peft import PeftModel, PeftConfig
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import torch.nn.functional as F
import os
from tqdm import tqdm

In [2]:
CONFIG = {
    "data_base": "../data",
}

In [4]:
train_csv = pd.read_csv(f"{CONFIG['data_base']}/final_aug_train.csv")
test_csv = pd.read_csv(f"{CONFIG['data_base']}/test.csv")

test_csv = test_csv.rename(columns={
    'paragraph_text': 'full_text'
    })

train_csv.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,paragraph_idx,full_text,generated,document_label
0,0.0,0.0,0.0,카호올라웨섬,0.0,카호올라웨섬은 하와이 제도를 구성하는 8개의 화산섬 가운데 하나로 면적은 115.5...,0,0.0
1,1.0,1.0,1.0,카호올라웨섬,1.0,마우이섬에서 남서쪽으로 약 11km 정도 떨어진 곳에 위치하며 라나이섬의 남동쪽에 ...,0,0.0
2,2.0,2.0,2.0,카호올라웨섬,2.0,1000년경부터 사람이 거주했으며 해안 지대에는 소규모 임시 어촌이 형성되었다. 섬...,0,0.0
3,3.0,3.0,3.0,카호올라웨섬,3.0,1830년대에는 하와이 왕국의 카메하메하 3세 국왕에 의해 남자 죄수들의 유형지로 ...,0,0.0
4,4.0,4.0,4.0,카호올라웨섬,4.0,1910년부터 1918년까지 하와이 준주가 섬의 원래 모습을 복원하기 위해 이 섬을...,0,0.0


In [5]:
label_0 = train_csv[train_csv['generated'] == 0]
label_1 = train_csv[train_csv['generated'] == 1]
count = min(len(label_0), len(label_1))
sampled_0 = label_0.sample(n=6*count, random_state=42)
sampled_1 = label_1.sample(n=count, random_state=42)
train_csv = pd.concat([sampled_0, sampled_1]).sample(frac=1, random_state=42).reset_index(drop=True)
print(f"✅ 샘플링 완료: 총 {len(train_csv)}개")
print(train_csv["generated"].value_counts())

✅ 샘플링 완료: 총 382508개
generated
0    327864
1     54644
Name: count, dtype: int64


In [6]:
train_df, val_df = train_test_split(
    train_csv,
    test_size=0.01,
    random_state=42,
    stratify=train_csv['generated']
)

In [7]:
class CustomDataset(Dataset):
    def __init__(self, data_df, tokenizer, mode='train'):
        self.data = data_df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.mode = mode

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        text = row['full_text']
        inputs = self.tokenizer(
            text, truncation=True, padding='max_length', max_length=512,
            stride=256, return_overflowing_tokens=True, return_tensors="pt"
        )
        n_segments = inputs["input_ids"].size(0)
        seg_idx = random.randint(0, n_segments - 1)
        item = {k: v[seg_idx] for k, v in inputs.items() if k != "overflow_to_sample_mapping"}
        item.pop("token_type_ids", None)
        if self.mode == 'train':
            item["labels"] = int(row["generated"])
        return item

In [8]:
tokenizer = AutoTokenizer.from_pretrained('vaiv/kobigbird-roberta-large')
train_dataset = CustomDataset(train_df, tokenizer, mode='train')
val_dataset = CustomDataset(val_df, tokenizer, mode='train')
test_dataset = CustomDataset(test_csv, tokenizer, mode='eval')

In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"
base_model = AutoModelForSequenceClassification.from_pretrained(
    "vaiv/kobigbird-roberta-large",
    num_labels=2
)
base_model = prepare_model_for_kbit_training(base_model)
lora_config = LoraConfig(
    r=8, lora_alpha=32, target_modules=["query", "value"],
    lora_dropout=0.1, bias="none", task_type="SEQ_CLS"
)
model = get_peft_model(base_model, lora_config)
model

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at vaiv/kobigbird-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BigBirdForSequenceClassification(
      (bert): BigBirdModel(
        (embeddings): BigBirdEmbeddings(
          (word_embeddings): Embedding(32000, 1024, padding_idx=1)
          (position_embeddings): Embedding(4096, 1024)
          (token_type_embeddings): Embedding(2, 1024)
          (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BigBirdEncoder(
          (layer): ModuleList(
            (0-23): 24 x BigBirdLayer(
              (attention): BigBirdAttention(
                (self): BigBirdBlockSparseAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(

In [10]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall, 'auroc': auc}


In [11]:
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        class_weights = torch.tensor([1.0, 6.0], device=logits.device)
        example_weights = class_weights[labels]
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        # 이 부분은 WeightedTrainer를 사용하신다면 원래 로직을 따르셔도 됩니다.
        # 위 코드는 더 간결한 가중치 적용 방식입니다.
        return (loss, outputs) if return_outputs else loss


In [12]:
training_args = TrainingArguments(
    output_dir="./",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    max_steps=300000,
    learning_rate=5e-5,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=10000,
    logging_dir="./",
    logging_steps=10000,
    fp16=True,
    report_to="none"
)

trainer = WeightedTrainer(
    model=model, args=training_args, train_dataset=train_dataset,
    eval_dataset=val_dataset, tokenizer=tokenizer, compute_metrics=compute_metrics,
)

trainer.train()

  trainer = WeightedTrainer(
max_steps is given, it will override any value given in num_train_epochs
Attention type 'block_sparse' is not possible if sequence_length: 512 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
checkpoint_path = "./checkpoint-280000"
print(f"✅ {checkpoint_path}에서 모델을 불러옵니다.")
peft_config = PeftConfig.from_pretrained(checkpoint_path)
base_model = AutoModelForSequenceClassification.from_pretrained(peft_config.base_model_name_or_path)
model = PeftModel.from_pretrained(base_model, checkpoint_path)
model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

all_probs = []
valid_keys = {"input_ids", "attention_mask"}

print("✅ 추론을 시작합니다.")
with torch.no_grad():
    for i in tqdm(range(len(test_dataset)), desc="Running Inference"):
        batch = test_dataset[i]
        inputs = {k: v.unsqueeze(0).to(model.device) for k, v in batch.items() if k in valid_keys}
        
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=-1)
        all_probs.extend(probs.cpu().tolist())

# --- 5. 제출 파일 생성 ---
sample_submission = pd.read_csv(f"{CONFIG['data_base']}/sample_submission.csv")
# 'AI가 생성한 글'일 확률은 두 번째 컬럼(인덱스 1)
all_AI_probs = [p[1] for p in all_probs]
sample_submission['generated'] = all_AI_probs
sample_submission.to_csv(f"submit.csv", index=False)

print("✅ 제출 파일 생성이 완료되었습니다.")

✅ ./checkpoint-280000에서 모델을 불러옵니다.


Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at vaiv/kobigbird-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ 추론을 시작합니다.


Running Inference:   0%|          | 0/1962 [00:00<?, ?it/s]Attention type 'block_sparse' is not possible if sequence_length: 512 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
Running Inference:   4%|▍         | 81/1962 [00:02<00:57, 32.73it/s]


KeyboardInterrupt: 