# FEVER Baseline Training v2 (3-class DistilBERT)
- **Dataset:** FEVER (145,449 train, augmented with evidence)
- **Task:** 3-class claim verification
- **Labels:** SUPPORTS, REFUTES, NOT ENOUGH INFO

### v2 개선사항
- cosine LR scheduler + warmup ratio
- 최대 4 epoch + early stopping (patience=2, 기준: validation macro-F1)
- label smoothing 0.05
- confusion matrix 시각화

> Runtime → GPU (T4) 설정 필수

## 1. Setup

In [None]:
# Install the libraries this notebook relies on (transformers for the model/Trainer,
# scikit-learn for the evaluation metrics). `-q` keeps pip's output short.
!pip install -q transformers datasets accelerate scikit-learn safetensors

In [None]:
# Mount Google Drive; every project path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

import os

# Project root on Drive. Later cells derive the data and model paths from it.
PROJECT_DIR = '/content/drive/MyDrive/Fakenews-detect'

# Create the model output folder and the data folder if they do not exist yet.
os.makedirs(f'{PROJECT_DIR}/models/fever_baseline', exist_ok=True)
os.makedirs(f'{PROJECT_DIR}/data/fever', exist_ok=True)
print('Project dir:', PROJECT_DIR)

# data/fever was created just above, so it is guaranteed to exist here and no
# existence check is needed. Sorting makes the listing order deterministic.
print('Files in data/fever:', sorted(f for f in os.listdir(f'{PROJECT_DIR}/data/fever') if f.endswith('.jsonl')))

## 2. Upload Data
`data/fever/train_normalized.jsonl`을 Google Drive의 `Fakenews-detect/data/fever/`에 업로드하세요.

같은 폴더에 augmented 버전(`train_augmented.jsonl`)이 있으면 그 파일을 우선 사용하고, 두 파일이 모두 없으면 아래 셀에서 업로드 창이 열립니다.

In [None]:
from google.colab import files

data_dir = f'{PROJECT_DIR}/data/fever'
aug_path = os.path.join(data_dir, 'train_augmented.jsonl')
norm_path = os.path.join(data_dir, 'train_normalized.jsonl')

# Choose the training file. Priority: augmented > normalized > manual upload.
if os.path.exists(aug_path):
    TRAIN_PATH = aug_path
    print(f'Using augmented data: {aug_path}')
elif os.path.exists(norm_path):
    TRAIN_PATH = norm_path
    print(f'Using normalized data: {norm_path}')
else:
    print('No FEVER data found. Upload train_normalized.jsonl:')
    uploaded = files.upload()
    if not uploaded:
        # Without this guard an empty upload would fail further down with an
        # IndexError that says nothing about the actual problem.
        raise FileNotFoundError(
            f'No file was uploaded; expected a FEVER .jsonl training file for {data_dir}'
        )
    # `uploaded` maps each file name to its raw bytes; persist them to Drive.
    for fname, content in uploaded.items():
        dest = os.path.join(data_dir, fname)
        with open(dest, 'wb') as f:
            f.write(content)
        print(f'  Saved: {dest}')
    # The first uploaded file is taken as the training set.
    TRAIN_PATH = os.path.join(data_dir, next(iter(uploaded)))

# Count records (one JSON object per non-blank line). UTF-8 is given explicitly
# to match how the dataset class reads the same file.
with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    n = sum(1 for line in f if line.strip())
print(f'Train samples: {n:,}')

## 3. Dataset & Model

In [None]:
import torch
import json
import numpy as np
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

LABEL_NAMES = ['SUPPORTS', 'REFUTES', 'NOT ENOUGH INFO']
# Class name -> integer id; the id is the position of the name in LABEL_NAMES.
LABEL_MAP = {name: i for i, name in enumerate(LABEL_NAMES)}


class FEVERDataset(Dataset):
    """Claim-verification dataset backed by a JSONL file.

    Each non-blank line of the file is parsed as one JSON record. The record
    fields read by this class are:

    * ``text`` (required): the claim.
    * ``evidence_texts`` (optional): a list of evidence strings; at most the
      first three are appended to the claim, separated by `` [SEP] ``.
    * ``label_class`` (optional): one of ``LABEL_NAMES``. A missing or
      unrecognised value falls back to ``'NOT ENOUGH INFO'``.

    Args:
        data_path: Path to the JSONL file.
        tokenizer: Callable used to encode the input text. It is called with
            ``max_length``, ``padding='max_length'``, ``truncation=True`` and
            ``return_tensors='pt'``.
        max_len: Length every encoded sample is padded/truncated to.
    """

    def __init__(self, data_path, tokenizer, max_len=256):
        from collections import Counter

        self.samples = []
        self.tokenizer = tokenizer
        self.max_len = max_len
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines (e.g. an extra newline at the end of the file)
                # are not records, and json.loads would raise on them.
                if not line.strip():
                    continue
                self.samples.append(json.loads(line))
        print(f'Loaded {len(self.samples)} samples from {data_path}')

        dist = Counter(s.get('label_class', '?') for s in self.samples)
        print(f'Label distribution: {dict(dist)}')

        # __getitem__ maps every unknown label to NOT ENOUGH INFO. Report how
        # many samples that affects so a mislabelled file does not go unnoticed.
        unknown = sum(count for label, count in dist.items() if label not in LABEL_MAP)
        if unknown:
            print(f'Warning: {unknown} samples have a missing/unknown label_class '
                  f'and will be treated as NOT ENOUGH INFO')

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Encode record ``idx`` and return its tensors.

        Returns:
            A dict with ``input_ids`` and ``attention_mask`` (both flattened
            to 1-D) and ``labels`` (a scalar ``torch.long`` tensor).
        """
        item = self.samples[idx]
        claim = str(item['text'])

        evidence_texts = item.get('evidence_texts', [])
        if evidence_texts:
            # Only the first three evidence passages are used.
            input_text = claim + ' [SEP] ' + ' [SEP] '.join(evidence_texts[:3])
        else:
            input_text = claim

        # The fallback id is derived from LABEL_MAP rather than hard-coded, so
        # it stays correct if the order of LABEL_NAMES ever changes.
        label_str = item.get('label_class', 'NOT ENOUGH INFO')
        label_id = LABEL_MAP.get(label_str, LABEL_MAP['NOT ENOUGH INFO'])

        encoding = self.tokenizer(
            input_text, max_length=self.max_len, padding='max_length',
            truncation=True, return_tensors='pt'
        )
        return {
            # The tokenizer output carries a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label_id, dtype=torch.long)
        }

def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and macro-averaged F1.

    Args:
        eval_pred: A pair that unpacks to ``(logits, labels)``.

    Returns:
        A dict with the keys ``'accuracy'`` and ``'f1_macro'``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the highest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics['accuracy'] = accuracy_score(gold, predicted)
    metrics['f1_macro'] = f1_score(gold, predicted, average='macro')
    return metrics

# Report whether CUDA is usable in this session and, if so, which GPU is device 0.
if torch.cuda.is_available():
    print('Device:', 'cuda')
    print('GPU:', torch.cuda.get_device_name(0))
else:
    print('Device:', 'cpu')

## 4. Train

In [None]:
# Base checkpoint, and the Drive folder that receives checkpoints and the final model.
MODEL_NAME = 'distilbert-base-uncased'
OUTPUT_DIR = f'{PROJECT_DIR}/models/fever_baseline'

# Tokenizer plus a 3-label sequence-classification model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

# Training data. A dedicated validation file is used when one exists in the
# data folder; otherwise 10% of the training data is held out with a fixed seed.
full_dataset = FEVERDataset(TRAIN_PATH, tokenizer, max_len=256)
val_path = os.path.join(data_dir, 'validation_normalized.jsonl')

if not os.path.exists(val_path):
    total = len(full_dataset)
    train_size = int(0.9 * total)
    val_size = total - train_size
    train_dataset, valid_dataset = random_split(
        full_dataset,
        [train_size, val_size],
        generator=torch.Generator().manual_seed(42),
    )
    print(f'Split: train={train_size:,}, valid={val_size:,}')
else:
    train_dataset = full_dataset
    valid_dataset = FEVERDataset(val_path, tokenizer, max_len=256)
    print(f'Using separate validation file: {len(valid_dataset)} samples')

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    report_to='none',
    # Optimisation schedule: cosine decay after a warmup phase.
    num_train_epochs=4,
    learning_rate=3e-5,
    lr_scheduler_type='cosine',
    warmup_ratio=0.06,
    weight_decay=0.01,
    label_smoothing_factor=0.05,
    # Batching and throughput.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    dataloader_num_workers=2,
    fp16=torch.cuda.is_available(),
    # Evaluate and checkpoint once per epoch; keep the best model by macro-F1.
    logging_steps=200,
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

print(f'Training: {len(train_dataset):,} samples, 4 epochs (with early stopping)')
print(f'v2 improvements: cosine LR, label smoothing 0.05')
trainer.train()

## 5. Evaluate

In [None]:
# Aggregate metrics on the validation set (keys are prefixed with "eval_").
results = trainer.evaluate(valid_dataset)
print('Validation Results:', results)

# Detailed report: per-sample predictions for the same validation set.
preds_output = trainer.predict(valid_dataset)
preds = np.argmax(preds_output.predictions, axis=-1)
labels = preds_output.label_ids

# Pin the class ids explicitly. Without `labels=`, scikit-learn infers the
# classes from the data, so a class that appears in neither the labels nor the
# predictions would make classification_report reject `target_names` and would
# shrink the confusion matrix so it no longer lines up with LABEL_NAMES.
class_ids = list(range(len(LABEL_NAMES)))

print('\n=== Classification Report ===')
# zero_division=0 reports 0 for a class with no predictions or no samples
# instead of emitting a warning.
print(classification_report(labels, preds, labels=class_ids,
                            target_names=LABEL_NAMES, zero_division=0))

cm = confusion_matrix(labels, preds, labels=class_ids)
print('=== Confusion Matrix ===')
print(cm)

# Visualize: rows are true classes, columns are predicted classes.
import matplotlib.pyplot as plt
import seaborn as sns
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=LABEL_NAMES, yticklabels=LABEL_NAMES, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title(f'FEVER Eval - Acc: {results["eval_accuracy"]:.3f} / F1: {results["eval_f1_macro"]:.3f}')
plt.tight_layout()
# Save before show() so the figure written to disk is the fully drawn one.
plt.savefig(f'{OUTPUT_DIR}/confusion_matrix.png', dpi=150)
plt.show()

## 6. Save Model

In [None]:
# Write the model and the tokenizer into the output folder.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Store the label-name -> id mapping next to the model.
with open(f'{OUTPUT_DIR}/label_map.json', 'w') as label_file:
    label_file.write(json.dumps(LABEL_MAP))

# Store the validation metrics from the evaluation cell, pretty-printed.
with open(f'{OUTPUT_DIR}/eval_results.json', 'w') as results_file:
    results_file.write(json.dumps(results, indent=2))

print(f'Model saved to {OUTPUT_DIR}')
print('Files:', os.listdir(OUTPUT_DIR))

## 7. Download Model (Optional)

In [None]:
# Zip the listed model artifacts from OUTPUT_DIR into /content/fever_baseline.zip.
# Error output is discarded (2>/dev/null); the echo runs only if the cd/zip chain
# exits with a non-zero status.
# NOTE(review): confusion_matrix.png is written to OUTPUT_DIR by the evaluation cell
# but is not in this file list - confirm whether it should be included.
!cd "{OUTPUT_DIR}" && zip -r /content/fever_baseline.zip config.json model.safetensors tokenizer.json tokenizer_config.json special_tokens_map.json vocab.txt label_map.json eval_results.json 2>/dev/null || echo 'Some files may not exist yet'
from google.colab import files
# Hand the archive to Colab's download helper.
files.download('/content/fever_baseline.zip')