# LIAR Baseline Training v2 (6-class DistilBERT)
**Dataset:** LIAR (10,269 train / 1,284 valid / 1,283 test)
**Task:** 6-class political claim classification
**Labels:** pants-fire, false, barely-true, half-true, mostly-true, true

### v2 개선사항
- metadata(speaker, subject, context) 활용하여 입력 강화
- learning rate scheduler 개선 (cosine)
- epoch 5로 증가 + early stopping (patience 2, 검증 세트 f1_macro 기준)
- 클래스 불균형 대응 (label smoothing 0.1 — 과신을 완화하는 정규화 기법이며, 클래스별 가중치를 직접 조정하지는 않음)

> Runtime → GPU (T4) 설정 필수

## 1. Setup

In [None]:
!pip install -q transformers datasets accelerate scikit-learn safetensors

In [None]:
# Mount Google Drive so the project folder is reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

import os
PROJECT_DIR = '/content/drive/MyDrive/Fakenews-detect'

# Create the model and data folders up front; exist_ok makes re-running the cell harmless.
for subdir in ('models/liar_baseline', 'data/liar'):
    os.makedirs(f'{PROJECT_DIR}/{subdir}', exist_ok=True)
print('Project dir:', PROJECT_DIR)

# Show what is already in the data folder (or flag that the folder is absent).
liar_data_path = f'{PROJECT_DIR}/data/liar'
if os.path.exists(liar_data_path):
    data_listing = os.listdir(liar_data_path)
else:
    data_listing = 'NOT FOUND'
print('Files in data/liar:', data_listing)

## 2. Upload Data
`data/liar/train.jsonl`, `valid.jsonl`, `test.jsonl`을 Google Drive의 `MyDrive/Fakenews-detect/data/liar/`에 업로드하세요.

또는 아래 셀로 직접 업로드:

In [None]:
# Option A: upload the data files directly from the local machine.
from google.colab import files
import shutil

data_dir = f'{PROJECT_DIR}/data/liar'
needed = ['train.jsonl', 'valid.jsonl', 'test.jsonl']
missing = [name for name in needed if not os.path.exists(os.path.join(data_dir, name))]

if missing:
    print(f'Missing files: {missing}')
    print('Upload them now:')
    uploaded = files.upload()
    # Every uploaded file is written into the data folder under its uploaded name.
    for fname, content in uploaded.items():
        dest = os.path.join(data_dir, fname)
        with open(dest, 'wb') as out_fh:
            out_fh.write(content)
        print(f'  Saved: {dest}')
else:
    print('All data files present!')

# Summarize each expected split. Files are read as UTF-8 (the same encoding the
# dataset loader uses) and blank lines are not counted as samples.
for name in needed:
    path = os.path.join(data_dir, name)
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as fh:
            n = sum(1 for line in fh if line.strip())
        print(f'  {name}: {n:,} samples')
    else:
        # Make a still-missing split visible here instead of failing later at load time.
        print(f'  {name}: NOT FOUND')

## 3. Dataset & Model

In [None]:
import torch
import json
import ast
import numpy as np
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Class names in index order; LABEL_MAP maps each name to its integer class id.
LABEL_NAMES = ['pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true']
LABEL_MAP = {name: i for i, name in enumerate(LABEL_NAMES)}

class LIARDataset(Dataset):
    """Torch dataset over a JSONL file of claims.

    Each non-blank line of the file is parsed as one JSON object. The fields
    read from a record are ``text`` (required), ``label_class`` and
    ``metadata`` (both optional).

    Args:
        data_path: Path to the JSONL file (read as UTF-8).
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` and
            expected to return a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: Value passed to the tokenizer as ``max_length``.
        use_metadata: When true, speaker/subject/context taken from the
            record's metadata are appended to the claim text.
    """

    # Metadata keys appended to the claim, in order, with their text prefix.
    _META_FIELDS = (('Speaker', 'speaker'), ('Subject', 'subject'), ('Context', 'context'))

    def __init__(self, data_path, tokenizer, max_len=128, use_metadata=True):
        self.samples = []
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.use_metadata = use_metadata
        with open(data_path, 'r', encoding='utf-8') as f:
            for line in f:
                # A blank line (e.g. a trailing newline) is not a record; skip it
                # rather than letting json.loads raise.
                if line.strip():
                    self.samples.append(json.loads(line))
        print(f'Loaded {len(self.samples)} samples from {data_path}')

        # Label distribution ('?' marks records without a label_class field)
        from collections import Counter
        dist = Counter(s.get('label_class', '?') for s in self.samples)
        print(f'Label distribution: {dict(dist)}')

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.samples)

    @staticmethod
    def _parse_metadata(meta):
        """Return the record's metadata as a dict, or ``{}`` if it is unusable.

        A string is tried first as a Python literal and then as JSON. Anything
        that does not end up being a dict (``None``, a list, an unparsable
        string, ...) yields an empty dict so the claim is used on its own.
        """
        if isinstance(meta, str):
            try:
                meta = ast.literal_eval(meta)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # Not a Python literal (e.g. JSON with null/true/false): try JSON.
                try:
                    meta = json.loads(meta)
                except (ValueError, RecursionError):
                    meta = {}
        return meta if isinstance(meta, dict) else {}

    def __getitem__(self, idx):
        """Tokenize record ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer
            output flattened to 1-D) and ``labels`` (a scalar long tensor).
        """
        item = self.samples[idx]
        claim = str(item['text'])

        # v2: use metadata (speaker, subject, context)
        if self.use_metadata:
            meta = self._parse_metadata(item.get('metadata', '{}'))
            parts = [claim]
            for prefix, key in self._META_FIELDS:
                value = meta.get(key, '')
                if value:  # missing, empty or None fields are left out
                    parts.append(f'{prefix}: {value}')
            text = ' [SEP] '.join(parts)
        else:
            text = claim

        # A missing or unrecognized label falls back to the 'false' class.
        label_str = item.get('label_class', 'false')
        label_id = LABEL_MAP.get(label_str, LABEL_MAP['false'])

        encoding = self.tokenizer(
            text, max_length=self.max_len, padding='max_length',
            truncation=True, return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label_id, dtype=torch.long)
        }

def compute_metrics(eval_pred):
    """Score predictions with accuracy and macro-averaged F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the argmax over the last axis of ``logits``.

    Returns:
        dict with the keys ``accuracy`` and ``f1_macro``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predicted_ids)
    macro_f1 = f1_score(labels, predicted_ids, average='macro')
    return {'accuracy': accuracy, 'f1_macro': macro_f1}

# Report which device is available, and the GPU model when CUDA is present.
has_cuda = torch.cuda.is_available()
print('Device:', 'cuda' if has_cuda else 'cpu')
if has_cuda:
    print('GPU:', torch.cuda.get_device_name(0))

## 4. Train

In [None]:
MODEL_NAME = 'distilbert-base-uncased'
DATA_DIR = f'{PROJECT_DIR}/data/liar'
OUTPUT_DIR = f'{PROJECT_DIR}/models/liar_baseline'

# Pretrained tokenizer plus a sequence-classification model with 6 output labels.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=6)

# Train and validation splits share the same preprocessing settings.
dataset_options = dict(max_len=192, use_metadata=True)
train_dataset = LIARDataset(f'{DATA_DIR}/train.jsonl', tokenizer, **dataset_options)
valid_dataset = LIARDataset(f'{DATA_DIR}/valid.jsonl', tokenizer, **dataset_options)

# Hyperparameters, grouped by concern.
training_config = {
    'output_dir': OUTPUT_DIR,
    # schedule / optimization
    'num_train_epochs': 5,
    'per_device_train_batch_size': 32,
    'per_device_eval_batch_size': 64,
    'warmup_ratio': 0.1,
    'weight_decay': 0.01,
    'learning_rate': 3e-5,
    'lr_scheduler_type': 'cosine',
    'label_smoothing_factor': 0.1,
    # logging / evaluation / checkpointing
    'logging_steps': 50,
    'eval_strategy': 'epoch',
    'save_strategy': 'epoch',
    'load_best_model_at_end': True,
    'metric_for_best_model': 'f1_macro',
    'greater_is_better': True,
    # mixed precision is only requested when a CUDA device is available
    'fp16': torch.cuda.is_available(),
    'report_to': 'none',
    'save_total_limit': 2,
}
training_args = TrainingArguments(**training_config)

early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

print(f'Training: {len(train_dataset):,} samples, 5 epochs (with early stopping)')
print('v2 improvements: metadata input, cosine LR, label smoothing 0.1')
trainer.train()

## 5. Evaluate on Test Set

In [None]:
# Score the held-out test split with the trainer's metrics.
test_dataset = LIARDataset(f'{DATA_DIR}/test.jsonl', tokenizer, max_len=192, use_metadata=True)
results = trainer.evaluate(test_dataset)
print('Test Results:', results)

# Per-example predictions for the detailed report
preds_output = trainer.predict(test_dataset)
labels = preds_output.label_ids
preds = np.argmax(preds_output.predictions, axis=-1)

print('\n=== Classification Report ===')
report_text = classification_report(labels, preds, target_names=LABEL_NAMES)
print(report_text)

cm = confusion_matrix(labels, preds)
print('=== Confusion Matrix ===')
print(cm)

# Draw the confusion matrix as a heatmap and save it next to the model files.
import matplotlib.pyplot as plt
import seaborn as sns
fig, ax = plt.subplots(figsize=(8, 6))
heatmap_options = dict(
    annot=True, fmt='d', cmap='Blues',
    xticklabels=LABEL_NAMES, yticklabels=LABEL_NAMES, ax=ax,
)
sns.heatmap(cm, **heatmap_options)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plot_title = f'LIAR Test - Acc: {results["eval_accuracy"]:.3f} / F1: {results["eval_f1_macro"]:.3f}'
ax.set_title(plot_title)
plt.tight_layout()
figure_path = f'{OUTPUT_DIR}/confusion_matrix.png'
plt.savefig(figure_path, dpi=150)
plt.show()

## 6. Save Model

In [None]:
# Write the model and tokenizer into the output folder.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Label name -> class id mapping, stored alongside the model.
label_map_path = f'{OUTPUT_DIR}/label_map.json'
with open(label_map_path, 'w') as label_fh:
    label_fh.write(json.dumps(LABEL_MAP))

# Metrics returned by the test-set evaluation, pretty-printed.
results_path = f'{OUTPUT_DIR}/test_results.json'
with open(results_path, 'w') as results_fh:
    results_fh.write(json.dumps(results, indent=2))

print(f'Model saved to {OUTPUT_DIR}')
saved_files = os.listdir(OUTPUT_DIR)
print('Files:', saved_files)

## 7. Download Model (Optional)
모델은 Google Drive의 `MyDrive/Fakenews-detect/models/liar_baseline/`에 이미 저장되어 있으므로, 그 폴더를 로컬로 복사하면 됩니다.

또는 zip으로 다운로드:

In [None]:
# Zip the listed artifacts from OUTPUT_DIR into /content/liar_baseline.zip.
# zip's stderr is discarded (2>/dev/null); if the cd or the zip step fails,
# the fallback message is echoed instead.
!cd "{OUTPUT_DIR}" && zip -r /content/liar_baseline.zip config.json model.safetensors tokenizer.json tokenizer_config.json special_tokens_map.json vocab.txt label_map.json test_results.json 2>/dev/null || echo 'Some files may not exist yet'
from google.colab import files
# Hand the archive to Colab's download helper.
files.download('/content/liar_baseline.zip')