# KorNLI - Natural Language Inference Baseline

**Self-contained Colab notebook** - 모든 코드가 노트북에 포함되어 있어 Colab에서 바로 실행 가능합니다.

## 사용법
1. **런타임 설정**: 런타임 → 런타임 유형 변경 → GPU 선택
2. **Setup 실행**: Cell 1 실행 (패키지 설치 및 GPU 확인)
3. **데이터 로드**: Cell 3 또는 Cell 4 중 **하나만** 선택하여 실행
4. **경로 설정**: Cell 5 실행
5. **코드 정의**: Cell 6~11 순서대로 실행
6. **학습 및 평가**: Cell 12~14 순서대로 실행

## 예상 학습 시간
- **T4 GPU 기준**: ~5분 (1 epoch, default 설정)
- **CPU**: ~30분 이상

In [None]:
# =============================================================================
# Cell 1: Setup - install packages and check for a GPU
# =============================================================================
# IPython shell escape (only valid inside a notebook cell): quietly installs
# the third-party packages listed on the command line.
!pip install -q torch transformers pandas scikit-learn tqdm sentencepiece

import torch
# Report the installed PyTorch version and whether CUDA is usable, so the user
# can confirm the runtime type before running the training cells.
print(f"PyTorch version: {torch.__version__}")
print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Name of the CUDA device at index 0.
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")

## 데이터 로드

아래 **Option A** 또는 **Option B** 중 **하나만** 선택하여 실행하세요.

- **Option A**: 로컬 파일 업로드 (파일을 직접 업로드)
- **Option B**: Google Drive 연결 (Drive에 데이터가 있는 경우)

In [None]:
# =============================================================================
# Cell 3: Data Option A - file upload (optional)
# =============================================================================
# Run this cell only if you want to provide the data by uploading files.

from google.colab import files
import os

# Uploaded files are collected under ./data.
os.makedirs("data", exist_ok=True)
print("train.tsv, val.tsv, test_unlabeled.tsv 파일을 업로드하세요")
uploaded = files.upload()

# Move every uploaded file into data/.
# NOTE(review): this assumes files.upload() writes each file into the current
# working directory under the key returned in `uploaded` - confirm against
# the Colab documentation.
for filename in uploaded.keys():
    os.rename(filename, f"data/{filename}")
    print(f"  -> data/{filename} 저장 완료")

# DATA_DIR is the notebook-level variable read by the path-setup and
# data-loading cells further down.
DATA_DIR = "data"
print(f"\nDATA_DIR = '{DATA_DIR}'")

In [None]:
# =============================================================================
# Cell 4: Data Option B - Google Drive (optional)
# =============================================================================
# Run this cell only if you want to read the data from Google Drive.

from google.colab import drive
drive.mount('/content/drive')

# Change the path below to your own Drive folder.
# DATA_DIR is the notebook-level variable read by the path-setup and
# data-loading cells further down.
DATA_DIR = "/content/drive/MyDrive/kor-nlu-datasets/data"
print(f"DATA_DIR = '{DATA_DIR}'")

In [None]:
# =============================================================================
# Cell 5: Path setup (must be run)
# =============================================================================
# DATA_DIR must already have been set by Option A or Option B above;
# otherwise the prints below raise NameError.

# OUTPUT_DIR is where the submission CSV is written; CHECKPOINT_DIR receives
# the best model weights, the tokenizer files and config.json.
OUTPUT_DIR = "."
CHECKPOINT_DIR = "./checkpoints"

# Show the configured paths
import os
print(f"Data directory: {DATA_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Checkpoint directory: {CHECKPOINT_DIR}")

# List the data files so a wrong DATA_DIR is noticed before loading.
# This only warns; it does not stop execution when the directory is missing.
if os.path.exists(DATA_DIR):
    print(f"\nFiles in {DATA_DIR}:")
    for f in os.listdir(DATA_DIR):
        print(f"  - {f}")
else:
    print(f"\n[WARNING] {DATA_DIR} 디렉토리가 존재하지 않습니다!")

In [None]:
# =============================================================================
# Cell 6: Config 정의
# =============================================================================

import torch

# Label mapping: class name -> integer id, its inverse, and the class count
# handed to the model config as num_labels.
LABEL2ID = {"entailment": 0, "neutral": 1, "contradiction": 2}
ID2LABEL = {v: k for k, v in LABEL2ID.items()}
NUM_LABELS = len(LABEL2ID)

# Device: CUDA when available, otherwise CPU. Used as the default device by
# the model, training and evaluation helpers.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Configuration presets, keyed by preset name and read through get_config().
CONFIGS = {
    "default": {
        "model_name": "monologg/distilkobert",
        "max_length": 64,  # tokenizer truncation/padding length
        "batch_size": 64,
        "learning_rate": 5e-4,
        "epochs": 1,
        "warmup_ratio": 0.0,  # <= 0 makes create_scheduler() return None
        "weight_decay": 0.0,
        "classifier_dropout": None,  # None leaves the model config's dropout untouched
        "early_stopping_patience": None,  # None disables early stopping in train()
        "preprocess": False,  # True runs preprocess_text() on both sentences
        # File names below are resolved relative to DATA_DIR.
        "train_file": "train.tsv",
        "dev_file": "val.tsv",
        "test_file": "test_unlabeled.tsv",
    },
}


def get_config(preset: str = "default", **overrides) -> dict:
    """Return a fresh config dict for ``preset`` with ``overrides`` applied.

    Args:
        preset: Key into the module-level ``CONFIGS`` mapping.
        **overrides: Individual settings that replace (or extend) the preset's
            values in the returned dict.

    Returns:
        A new dict; the preset stored in ``CONFIGS`` is not modified (the copy
        is shallow).

    Raises:
        ValueError: If ``preset`` is not a key of ``CONFIGS``.
    """
    if preset in CONFIGS:
        # Later entries win, so overrides take precedence over preset values.
        return {**CONFIGS[preset], **overrides}
    raise ValueError(f"Unknown preset: {preset}. Available: {list(CONFIGS.keys())}")


print(f"Device: {DEVICE}")
print(f"Labels: {LABEL2ID}")

In [None]:
# =============================================================================
# Cell 7: Preprocessing 정의
# =============================================================================

import re
import unicodedata


def preprocess_text(text: str) -> str:
    """Normalize a sentence: Unicode NFC, then collapse and trim whitespace.

    Args:
        text: Raw sentence. Any non-string value (e.g. ``None`` or a float
            NaN) is treated as missing.

    Returns:
        The NFC-normalized text with every whitespace run replaced by a single
        space and no leading/trailing whitespace; ``""`` for non-string input.
    """
    if isinstance(text, str):
        normalized = unicodedata.normalize("NFC", text)
        # split() with no separator breaks on runs of Unicode whitespace and
        # drops leading/trailing runs, so re-joining with single spaces both
        # collapses and trims in one pass.
        return " ".join(normalized.split())
    return ""


print("Preprocessing function defined.")

In [None]:
# =============================================================================
# Cell 8: Dataset 정의
# =============================================================================

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import PreTrainedTokenizer


def load_data(file_path: str, preprocess: bool = False) -> pd.DataFrame:
    """Read a tab-separated NLI file and drop rows with a missing sentence.

    Args:
        file_path: Path to a TSV file that has ``sentence1`` and ``sentence2``
            columns.
        preprocess: When true, both sentence columns are passed through
            ``preprocess_text``.

    Returns:
        The loaded frame. Rows keep their original index labels, so the index
        may have gaps where rows were dropped.
    """
    sentence_columns = ["sentence1", "sentence2"]

    # quoting=3 is csv.QUOTE_NONE: quote characters inside sentences are kept
    # as literal text. Malformed lines are skipped rather than raising.
    frame = pd.read_csv(file_path, sep="\t", quoting=3, on_bad_lines="skip")
    frame = frame.dropna(subset=sentence_columns)

    if not preprocess:
        return frame

    for column in sentence_columns:
        frame[column] = frame[column].apply(preprocess_text)
    return frame


class NLIDataset(Dataset):
    """PyTorch ``Dataset`` that tokenizes sentence pairs for NLI on demand.

    Each item is a dict with ``input_ids`` and ``attention_mask`` tensors, plus
    a ``labels`` tensor when a label mapping was supplied and the frame has a
    ``gold_label`` column.
    """

    def __init__(self, df: pd.DataFrame, tokenizer: PreTrainedTokenizer, max_length: int, label2id: dict = None):
        """Store the data frame, tokenizer and settings.

        Args:
            df: Frame with ``sentence1`` / ``sentence2`` columns and optionally
                ``gold_label``.
            tokenizer: Callable tokenizer applied to each sentence pair.
            max_length: Length every example is padded or truncated to.
            label2id: Mapping from label string to class id; ``None`` (or an
                empty mapping) yields items without ``labels``.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label2id = label2id
        # A fresh 0..n-1 index so positional access is independent of any rows
        # dropped while loading.
        self.df = df.reset_index(drop=True)

    def __len__(self) -> int:
        """Return the number of examples."""
        return len(self.df)

    def __getitem__(self, idx: int) -> dict:
        """Tokenize the ``idx``-th sentence pair and return its tensors.

        Raises:
            KeyError: If labels are requested and the row's ``gold_label`` is
                not a key of ``label2id``.
        """
        example = self.df.iloc[idx]
        encoded = self.tokenizer(
            example["sentence1"],
            example["sentence2"],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a leading batch dimension of size 1; drop it
        # so the DataLoader can stack examples itself.
        features = {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}

        wants_label = bool(self.label2id) and "gold_label" in example
        if wants_label:
            class_id = self.label2id[example["gold_label"]]
            features["labels"] = torch.tensor(class_id)
        return features


def create_dataloaders(
    train_df: pd.DataFrame,
    dev_df: pd.DataFrame,
    tokenizer: PreTrainedTokenizer,
    max_length: int,
    batch_size: int,
    test_df: pd.DataFrame = None,
) -> tuple:
    """Build the train/dev (and optionally test) DataLoaders in one call.

    Args:
        train_df: Training frame; its loader is shuffled.
        dev_df: Validation frame; loaded in file order.
        tokenizer: Tokenizer shared by all datasets.
        max_length: Padding/truncation length for every example.
        batch_size: Batch size used by every loader.
        test_df: Optional test frame; loaded in file order.

    Returns:
        ``(train_loader, dev_loader)`` when ``test_df`` is ``None``, otherwise
        ``(train_loader, dev_loader, test_loader)``.
    """

    def _as_dataset(frame: pd.DataFrame) -> NLIDataset:
        # Every split gets LABEL2ID; NLIDataset only emits labels when the
        # frame actually has a gold_label column.
        return NLIDataset(frame, tokenizer, max_length, LABEL2ID)

    loaders = [
        DataLoader(_as_dataset(train_df), batch_size=batch_size, shuffle=True),
        DataLoader(_as_dataset(dev_df), batch_size=batch_size),
    ]
    if test_df is not None:
        loaders.append(DataLoader(_as_dataset(test_df), batch_size=batch_size))
    return tuple(loaders)


print("Dataset classes defined.")

In [None]:
# =============================================================================
# Cell 9: Model 정의
# =============================================================================

import os
import torch
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


def load_tokenizer(model_name: str) -> PreTrainedTokenizer:
    """Load the tokenizer published under ``model_name``.

    Args:
        model_name: Hub id or local path passed to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The instantiated tokenizer.
    """
    # NOTE(review): trust_remote_code=True lets the model repository run its
    # own tokenizer code - presumably required by the KoBERT tokenizer; only
    # use it with repositories you trust.
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    return tokenizer


def load_model(model_name: str, classifier_dropout: float = None, device: torch.device = None) -> PreTrainedModel:
    """Load a sequence-classification model with ``NUM_LABELS`` outputs.

    Args:
        model_name: Hub id or local path of the pretrained model.
        classifier_dropout: Optional dropout probability to write into the
            model config. ``None`` leaves the config untouched.
        device: Device the model is moved to; defaults to the module-level
            ``DEVICE``.

    Returns:
        The model, already moved to ``device``.
    """
    if device is None:
        device = DEVICE

    model_config = AutoConfig.from_pretrained(model_name, num_labels=NUM_LABELS, trust_remote_code=True)

    if classifier_dropout is not None:
        # Config classes name the classification-head dropout differently:
        # BERT-style configs use `classifier_dropout`, DistilBERT-style configs
        # (including the default preset's model) use `seq_classif_dropout`.
        # `hidden_dropout_prob` is kept as the last-resort fallback of the
        # original code; note that it affects the whole encoder, not just the
        # classification head.
        dropout_attributes = ("classifier_dropout", "seq_classif_dropout", "hidden_dropout_prob")
        for attribute in dropout_attributes:
            if hasattr(model_config, attribute):
                setattr(model_config, attribute, classifier_dropout)
                break
        else:
            # Make it visible that the requested value had no effect instead
            # of silently ignoring it.
            print(
                f"[WARNING] classifier_dropout={classifier_dropout} was not applied: "
                f"{type(model_config).__name__} has none of {dropout_attributes}"
            )

    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=model_config, trust_remote_code=True)
    model.to(device)
    return model


def create_optimizer(model: PreTrainedModel, learning_rate: float, weight_decay: float = 0.0) -> torch.optim.Optimizer:
    """Create an AdamW optimizer over all of the model's parameters.

    Args:
        model: Model whose parameters are optimized.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay, applied uniformly to every parameter.

    Returns:
        The configured ``torch.optim.AdamW`` instance.
    """
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay,
    )
    return optimizer


def create_scheduler(optimizer: torch.optim.Optimizer, num_training_steps: int, warmup_ratio: float = 0.0):
    """Create a linear warmup/decay LR scheduler, or ``None`` without warmup.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_training_steps: Total number of optimizer steps in the run.
        warmup_ratio: Fraction of ``num_training_steps`` spent warming up.

    Returns:
        ``None`` when ``warmup_ratio <= 0`` (the learning rate then stays
        constant), otherwise the scheduler returned by
        ``get_linear_schedule_with_warmup``.
    """
    if warmup_ratio <= 0:
        return None

    # The warmup length is truncated toward zero to a whole number of steps.
    schedule_kwargs = {
        "num_warmup_steps": int(num_training_steps * warmup_ratio),
        "num_training_steps": num_training_steps,
    }
    return get_linear_schedule_with_warmup(optimizer, **schedule_kwargs)


def save_model(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, save_dir: str) -> None:
    """Write the model weights and tokenizer files into ``save_dir``.

    Args:
        model: Model whose ``state_dict`` is saved as ``best_model.pt``.
        tokenizer: Tokenizer saved through its own ``save_pretrained``.
        save_dir: Target directory; created if it does not exist.
    """
    os.makedirs(save_dir, exist_ok=True)

    weights_path = f"{save_dir}/best_model.pt"
    weights = model.state_dict()
    torch.save(weights, weights_path)

    tokenizer.save_pretrained(save_dir)


def load_checkpoint(model: PreTrainedModel, checkpoint_path: str, device: torch.device = None) -> PreTrainedModel:
    """Load saved weights from ``checkpoint_path`` into ``model``.

    Args:
        model: Model instance whose parameters are overwritten in place.
        checkpoint_path: Path to a file written with
            ``torch.save(model.state_dict(), ...)``.
        device: Device the stored tensors are mapped to while loading;
            defaults to the module-level ``DEVICE``.

    Returns:
        The same ``model`` object, now holding the checkpoint's weights.
    """
    target_device = DEVICE if device is None else device
    # NOTE(review): torch.load unpickles the file - only load checkpoints
    # produced by this notebook or another trusted source.
    state_dict = torch.load(checkpoint_path, map_location=target_device)
    model.load_state_dict(state_dict)
    return model


print("Model functions defined.")

In [None]:
# =============================================================================
# Cell 10: Training 정의
# =============================================================================

import os
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import PreTrainedModel


def train_epoch(model: PreTrainedModel, dataloader: DataLoader, optimizer, scheduler=None, device=None) -> float:
    """Run one training pass over ``dataloader`` and return the mean loss.

    Args:
        model: Model to train; switched to train mode.
        dataloader: Yields batches with ``input_ids``, ``attention_mask`` and
            ``labels`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional LR scheduler, stepped once per batch after the
            optimizer.
        device: Device the batch tensors are moved to; defaults to ``DEVICE``.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    device = DEVICE if device is None else device
    tensor_keys = ("input_ids", "attention_mask", "labels")

    model.train()
    running_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        optimizer.zero_grad()

        inputs = {key: batch[key].to(device) for key in tensor_keys}
        # Passing labels makes the model compute the loss itself.
        loss = model(**inputs).loss
        running_loss += loss.item()

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

    return running_loss / len(dataloader)


def train(
    model: PreTrainedModel,
    train_loader: DataLoader,
    dev_loader: DataLoader,
    optimizer,
    epochs: int,
    scheduler=None,
    early_stopping_patience: int = None,
    checkpoint_dir: str = None,
    device=None,
) -> dict:
    """Train for up to ``epochs`` epochs, tracking the best dev accuracy.

    After every epoch the model is evaluated on ``dev_loader`` (via
    ``evaluate``, defined in a later cell). Whenever dev accuracy strictly
    improves, the weights are saved to ``{checkpoint_dir}/best_model.pt`` if a
    checkpoint directory was given.

    Args:
        model: Model to train.
        train_loader: Training batches.
        dev_loader: Validation batches.
        optimizer: Optimizer passed to ``train_epoch``.
        epochs: Maximum number of epochs.
        scheduler: Optional LR scheduler passed to ``train_epoch``.
        early_stopping_patience: Stop after this many consecutive epochs
            without improvement; ``None`` or ``0`` disables early stopping.
        checkpoint_dir: Directory for the best weights; nothing is saved when
            falsy.
        device: Device used for training and evaluation; defaults to
            ``DEVICE``.

    Returns:
        ``{"best_accuracy": ..., "best_epoch": ...}`` where ``best_epoch`` is
        1-based, and both are ``0`` if no epoch improved on the initial value.
    """
    device = DEVICE if device is None else device

    best_accuracy, best_epoch = 0, 0
    epochs_without_improvement = 0

    for epoch in range(epochs):
        train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
        dev_accuracy, _dev_preds, _dev_labels = evaluate(model, dev_loader, device)

        print(f"Epoch {epoch + 1}/{epochs} - Train Loss: {train_loss:.4f}, Dev Accuracy: {dev_accuracy:.4f}")

        if not dev_accuracy > best_accuracy:
            epochs_without_improvement += 1
        else:
            best_accuracy, best_epoch = dev_accuracy, epoch + 1
            epochs_without_improvement = 0

            if checkpoint_dir:
                os.makedirs(checkpoint_dir, exist_ok=True)
                torch.save(model.state_dict(), f"{checkpoint_dir}/best_model.pt")

        out_of_patience = early_stopping_patience and epochs_without_improvement >= early_stopping_patience
        if out_of_patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    return {"best_accuracy": best_accuracy, "best_epoch": best_epoch}


print("Training functions defined.")

In [None]:
# =============================================================================
# Cell 11: Evaluation & Prediction 정의
# =============================================================================

import os
import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report
from transformers import PreTrainedModel


# --- Evaluation ---

def evaluate(model: PreTrainedModel, dataloader: DataLoader, device: torch.device = None) -> tuple:
    """Score the model on a labeled dataloader.

    Args:
        model: Model to evaluate; switched to eval mode.
        dataloader: Yields batches with ``input_ids``, ``attention_mask`` and
            ``labels`` tensors.
        device: Device the inputs are moved to; defaults to ``DEVICE``.

    Returns:
        ``(accuracy, predictions, true_labels)`` where the two lists hold one
        class id per example in dataloader order.
    """
    device = DEVICE if device is None else device

    model.eval()
    predictions, true_labels = [], []

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            gold = batch["labels"]  # only read on the CPU side, so not moved

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            batch_preds = logits.argmax(dim=-1).cpu().numpy()

            predictions.extend(batch_preds)
            true_labels.extend(gold.numpy())

    return accuracy_score(true_labels, predictions), predictions, true_labels


def get_classification_report(true_labels: list, predictions: list) -> str:
    """Return scikit-learn's text classification report for the NLI classes.

    The class ids and their display names are both derived from ``LABEL2ID``
    (ordered by id) and passed explicitly. Without the explicit ``labels``
    argument scikit-learn infers the classes from the data and raises when
    their count differs from ``target_names`` - e.g. when a class appears in
    neither ``true_labels`` nor ``predictions``.

    Args:
        true_labels: Gold class ids.
        predictions: Predicted class ids, aligned with ``true_labels``.

    Returns:
        The formatted report with one row per class in ``LABEL2ID``.
    """
    # Sort by id so names line up with ids regardless of dict insertion order.
    ordered = sorted(LABEL2ID.items(), key=lambda item: item[1])
    label_ids = [label_id for _, label_id in ordered]
    label_names = [name for name, _ in ordered]
    return classification_report(true_labels, predictions, labels=label_ids, target_names=label_names)


# --- Prediction ---

def predict(model: PreTrainedModel, dataloader: DataLoader, device: torch.device = None) -> list:
    """Predict a class id for every example in ``dataloader``.

    Args:
        model: Model used for inference; switched to eval mode.
        dataloader: Yields batches with ``input_ids`` and ``attention_mask``
            tensors (labels, if present, are ignored).
        device: Device the inputs are moved to; defaults to ``DEVICE``.

    Returns:
        List of predicted class ids in dataloader order.
    """
    device = DEVICE if device is None else device

    model.eval()
    predicted_ids = []

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits
            predicted_ids.extend(logits.argmax(dim=-1).cpu().numpy())

    return predicted_ids


def create_submission(predictions: list, id2label: dict = None) -> pd.DataFrame:
    """Build the submission table from predicted class ids.

    Args:
        predictions: Predicted class ids, one per test example.
        id2label: Mapping from class id to label string; defaults to the
            module-level ``ID2LABEL``.

    Returns:
        A frame with an ``id`` column (0-based position of the prediction)
        and a ``label`` column (the label string).

    Raises:
        KeyError: If a prediction is not a key of ``id2label``.
    """
    mapping = ID2LABEL if id2label is None else id2label
    label_names = []
    for class_id in predictions:
        label_names.append(mapping[class_id])
    row_ids = range(len(label_names))
    return pd.DataFrame({"id": row_ids, "label": label_names})


def save_submission(submission_df: pd.DataFrame, output_path: str) -> None:
    """Write the submission frame to ``output_path`` as CSV without the index.

    Args:
        submission_df: Frame to write.
        output_path: Destination file; its parent directory is created when
            the path contains one.
    """
    parent_dir, _file_name = os.path.split(output_path)
    # A bare file name has no parent part, and makedirs("") would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    submission_df.to_csv(output_path, index=False)


print("Evaluation & Prediction functions defined.")

In [None]:
# =============================================================================
# Cell 12: Run - Config, Data, Model, DataLoader
# =============================================================================

# Load the config preset (pass keyword overrides to get_config to change
# individual settings).
PRESET = "default"
config = get_config(PRESET)
print(f"Preset: {PRESET}")
print(f"Config: {config}")
print(f"Device: {DEVICE}")

# Load the data splits from DATA_DIR.
# NOTE(review): load_data drops rows with a missing sentence and skips
# malformed lines for every split, including the test split, while
# create_submission numbers predictions by position - verify that the test
# file loses no rows if submission ids must match the original file order.
train_df = load_data(f"{DATA_DIR}/{config['train_file']}", preprocess=config["preprocess"])
dev_df = load_data(f"{DATA_DIR}/{config['dev_file']}", preprocess=config["preprocess"])
test_df = load_data(f"{DATA_DIR}/{config['test_file']}", preprocess=config["preprocess"])
print(f"\nData loaded - Train: {len(train_df)}, Dev: {len(dev_df)}, Test: {len(test_df)}")

# Load the tokenizer
tokenizer = load_tokenizer(config["model_name"])
print(f"Tokenizer loaded: {config['model_name']}")

# Load the model (moved to DEVICE inside load_model)
model = load_model(config["model_name"], classifier_dropout=config["classifier_dropout"])
print(f"Model loaded: {config['model_name']}")

# Create the DataLoaders; passing test_df makes the call return three loaders.
train_loader, dev_loader, test_loader = create_dataloaders(
    train_df, dev_df, tokenizer, config["max_length"], config["batch_size"], test_df
)
print(f"\nDataLoaders created - Train batches: {len(train_loader)}, Dev batches: {len(dev_loader)}, Test batches: {len(test_loader)}")

In [None]:
# =============================================================================
# Cell 13: Run - Training & Save
# =============================================================================

import json
import os

# Create the optimizer and scheduler. The scheduler is None when the config's
# warmup_ratio is <= 0; its step budget is batches-per-epoch times epochs.
optimizer = create_optimizer(model, config["learning_rate"], config["weight_decay"])
scheduler = create_scheduler(optimizer, len(train_loader) * config["epochs"], config["warmup_ratio"])

# Training
print("=" * 50)
print("Training started...")
print("=" * 50)

# train() saves the best weights to CHECKPOINT_DIR/best_model.pt whenever the
# dev accuracy improves.
result = train(
    model, train_loader, dev_loader, optimizer, config["epochs"],
    scheduler=scheduler,
    early_stopping_patience=config["early_stopping_patience"],
    checkpoint_dir=CHECKPOINT_DIR
)

print("=" * 50)
print(f"Training completed! Best Accuracy: {result['best_accuracy']:.4f} (Epoch {result['best_epoch']})")
print("=" * 50)

# Save the tokenizer and config next to the checkpoint. The directory is
# created explicitly so writing config.json cannot fail just because no
# checkpoint was written during training.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

try:
    tokenizer.save_pretrained(CHECKPOINT_DIR)
except TypeError as exc:
    # KoBertTokenizer doesn't support filename_prefix argument. Saving the
    # tokenizer is best-effort, but the failure is reported instead of being
    # swallowed so the final message stays truthful.
    print(f"[WARNING] Tokenizer could not be saved to {CHECKPOINT_DIR}: {exc}")
    saved_items = "Config"
else:
    saved_items = "Tokenizer & Config"

with open(f"{CHECKPOINT_DIR}/config.json", "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)

print(f"{saved_items} saved to: {CHECKPOINT_DIR}")

In [None]:
# =============================================================================
# Cell 14: Run - Evaluate, Predict & save the submission
# =============================================================================

# Load the best weights written during training back into the model.
model = load_checkpoint(model, f"{CHECKPOINT_DIR}/best_model.pt")
print("Best model loaded from checkpoint.")

# Evaluate on the dev set
print("\n" + "=" * 50)
print("Evaluation on Dev set")
print("=" * 50)
accuracy, preds, labels = evaluate(model, dev_loader)
print(f"\nDev Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(get_classification_report(labels, preds))

# Predict on the test set
print("\n" + "=" * 50)
print("Prediction on Test set")
print("=" * 50)
test_preds = predict(model, test_loader)

# Save the submission (ids are the 0-based positions of the predictions)
submission_df = create_submission(test_preds)
submission_path = f"{OUTPUT_DIR}/submission.csv"
save_submission(submission_df, submission_path)
print(f"\nSubmission saved: {submission_path}")

# Preview the result. `display` is not imported in this notebook; it is
# presumably the helper injected by the IPython/Colab environment.
print("\nSubmission preview:")
display(submission_df.head(10))

# Download the file when running in Colab; outside Colab the import fails and
# only the file location is printed.
try:
    from google.colab import files
    files.download(submission_path)
    print(f"\n[Download] {submission_path} 다운로드가 시작됩니다.")
except ImportError:
    print(f"\n[Info] Colab 환경이 아닙니다. {submission_path}에서 파일을 확인하세요.")