# FakeNewsNet / WELFake Training v2 (2-class DistilBERT)
**Dataset:** WELFake (57,707 train / 14,427 test)
**Task:** 2-class fake news detection
**Labels:** fake (0), real (1)

### v2 개선사항
- cosine LR scheduler
- early stopping (patience=2)
- confusion matrix 시각화

> 현재 모델 성능: Acc 99.2% / F1 99.2% (이미 우수)
> Colab 메뉴의 Runtime → Change runtime type에서 GPU (T4)를 반드시 선택하세요.

## 1. Setup

In [None]:
# Install the libraries this notebook relies on; -q keeps pip's output quiet.
!pip install -q transformers datasets accelerate scikit-learn safetensors

In [None]:
# Mount Google Drive and make sure the project's working folders exist.
import os

from google.colab import drive

drive.mount('/content/drive')

# Root of the project inside the mounted Drive; later cells build paths on it.
PROJECT_DIR = '/content/drive/MyDrive/Fakenews-detect'

# Model-output and dataset folders (creation is a no-op when already present).
required_dirs = (
    f'{PROJECT_DIR}/models/fakenewsnet_baseline',
    f'{PROJECT_DIR}/data/welfake',
)
for required_dir in required_dirs:
    os.makedirs(required_dir, exist_ok=True)

print('Project dir:', PROJECT_DIR)

## 2. Upload Data
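`train.jsonl`과 `test.jsonl`을 Google Drive의 `MyDrive/Fakenews-detect/data/welfake/`에 업로드하세요. 파일이 없으면 아래 셀이 업로드 창을 띄웁니다. 각 줄은 `text`(없으면 `title`)와 `label_class`(`fake` 또는 `real`) 필드를 가진 JSON 객체여야 합니다.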

In [None]:
# Make sure the train/test JSONL files are on Drive, prompting for an upload
# when any are missing, then report how many samples each file holds.
from google.colab import files

data_dir = f'{PROJECT_DIR}/data/welfake'
needed = ['train.jsonl', 'test.jsonl']
missing = [
    name for name in needed
    if not os.path.exists(os.path.join(data_dir, name))
]

if missing:
    print(f'Missing: {missing}. Upload now:')
    # The upload result is iterated as {filename: raw bytes}; each entry is
    # written into the dataset folder under the name it was uploaded with.
    uploaded = files.upload()
    for fname, content in uploaded.items():
        dest = os.path.join(data_dir, fname)
        with open(dest, 'wb') as out:
            out.write(content)
        print(f'  Saved: {dest}')
else:
    print('All data files present!')

for name in needed:
    path = os.path.join(data_dir, name)
    if not os.path.exists(path):
        # The upload may have been cancelled or used a different file name;
        # say so explicitly instead of silently skipping the file.
        print(f'  {name}: still missing in {data_dir}')
        continue
    # Read as UTF-8 (the encoding the dataset loader uses) and ignore blank
    # lines so the figure reflects actual JSONL records.
    with open(path, 'r', encoding='utf-8') as fh:
        n = sum(1 for line in fh if line.strip())
    print(f'  {name}: {n:,} samples')

## 3. Dataset & Model

In [None]:
import torch
import json
import numpy as np
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Class names indexed by label id, and the name -> id lookup used for encoding.
LABEL_NAMES = ['fake', 'real']
LABEL_MAP = {'fake': 0, 'real': 1}


class FNNDataset(Dataset):
    """Map-style dataset over a JSONL file of news samples.

    Each non-blank line of ``data_path`` is parsed as one JSON object. The
    sample text is read from ``text`` (falling back to ``title``) and the
    class from ``label_class``, which is mapped to an id via ``LABEL_MAP``.

    Args:
        data_path: Path to the UTF-8 encoded JSONL file.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``
            whose result exposes ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every sample is padded or truncated to.
    """

    def __init__(self, data_path, tokenizer, max_len=256):
        from collections import Counter

        self.tokenizer = tokenizer
        self.max_len = max_len
        with open(data_path, 'r', encoding='utf-8') as f:
            # Skip blank lines (e.g. a trailing empty line) rather than
            # letting json.loads raise on them.
            self.samples = [json.loads(line) for line in f if line.strip()]
        print(f'Loaded {len(self.samples)} samples from {data_path}')

        dist = Counter(s.get('label_class', '?') for s in self.samples)
        print(f'Label distribution: {dict(dist)}')

        # __getitem__ maps unrecognized labels to id 0; surface that here so
        # mislabeled rows do not slip into training unnoticed.
        unknown = sum(n for label, n in dist.items() if label not in LABEL_MAP)
        if unknown:
            print(f'WARNING: {unknown} samples have an unknown label_class '
                  f'and default to label 0')

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return model-ready tensors.

        Returns:
            Dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar long ``labels`` tensor.
        """
        item = self.samples[idx]
        # Chained `or` treats None/empty values as missing, so a null title is
        # no longer stringified into the literal text 'None'.
        text = str(item.get('text') or item.get('title') or '')
        label_str = item.get('label_class', 'fake')
        label_id = LABEL_MAP.get(label_str, 0)

        encoding = self.tokenizer(
            text, max_length=self.max_len, padding='max_length',
            truncation=True, return_tensors='pt'
        )
        return {
            # Flatten to per-sample 1-D tensors.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label_id, dtype=torch.long)
        }

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    Args:
        eval_pred: Two-item iterable unpacked as ``(logits, labels)``.

    Returns:
        Dict with the keys ``accuracy`` and ``f1_macro``.
    """
    scores, targets = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_score(targets, predicted)
    macro_f1 = f1_score(targets, predicted, average='macro')
    return {'accuracy': accuracy, 'f1_macro': macro_f1}

# Report which device is available (and the GPU model, when there is one).
cuda_ready = torch.cuda.is_available()
print('Device:', 'cuda' if cuda_ready else 'cpu')
if cuda_ready:
    print('GPU:', torch.cuda.get_device_name(0))

## 4. Train

In [None]:
# Fine-tune DistilBERT on WELFake. Best-checkpoint selection and early
# stopping are driven by a validation split carved out of train.jsonl, so the
# test set is not used for model selection and its metrics stay unbiased.
from torch.utils.data import random_split

MODEL_NAME = 'distilbert-base-uncased'
DATA_DIR = f'{PROJECT_DIR}/data/welfake'
OUTPUT_DIR = f'{PROJECT_DIR}/models/fakenewsnet_baseline'
MAX_LEN = 256        # tokens per sample after padding/truncation
NUM_EPOCHS = 3
VAL_FRACTION = 0.1   # share of train.jsonl held out for model selection
SPLIT_SEED = 42      # fixed seed keeps the train/validation split reproducible

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

full_train_dataset = FNNDataset(f'{DATA_DIR}/train.jsonl', tokenizer, max_len=MAX_LEN)
test_dataset = FNNDataset(f'{DATA_DIR}/test.jsonl', tokenizer, max_len=MAX_LEN)

# Hold out a slice of the training data for per-epoch evaluation.
n_val = max(1, int(len(full_train_dataset) * VAL_FRACTION))
train_dataset, val_dataset = random_split(
    full_train_dataset,
    [len(full_train_dataset) - n_val, n_val],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_ratio=0.06,
    weight_decay=0.01,
    learning_rate=2e-5,
    lr_scheduler_type='cosine',
    logging_steps=100,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',  # key returned by compute_metrics
    greater_is_better=True,
    fp16=torch.cuda.is_available(),    # mixed precision only when a GPU exists
    dataloader_num_workers=2,
    report_to='none',
    save_total_limit=2,
)

# NOTE(review): with only NUM_EPOCHS evaluations, patience=2 can presumably
# trigger no earlier than the final epoch; raise NUM_EPOCHS if early stopping
# is meant to cut training short. Confirm intent.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

print(f'Training: {len(train_dataset):,} samples, {NUM_EPOCHS} epochs (with early stopping)')
print(f'Validation: {len(val_dataset):,} samples (held out from train.jsonl)')
trainer.train()

## 5. Evaluate on Test Set

In [None]:
# Score the model on the test dataset, print per-class metrics, and save/show
# a confusion-matrix heatmap.
import matplotlib.pyplot as plt
import seaborn as sns

results = trainer.evaluate(test_dataset)
print('Test Results:', results)

# Raw predictions are needed for the per-class report and confusion matrix.
preds_output = trainer.predict(test_dataset)
labels = preds_output.label_ids
preds = np.argmax(preds_output.predictions, axis=-1)

print('\n=== Classification Report ===')
print(classification_report(labels, preds, target_names=LABEL_NAMES))

cm = confusion_matrix(labels, preds)
print('=== Confusion Matrix ===')
print(cm)

# Visualize
heatmap_title = f'FNN Test - Acc: {results["eval_accuracy"]:.3f} / F1: {results["eval_f1_macro"]:.3f}'
figure_path = f'{OUTPUT_DIR}/confusion_matrix.png'

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    ax=ax,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=LABEL_NAMES,
    yticklabels=LABEL_NAMES,
)
ax.set_title(heatmap_title)
ax.set_ylabel('True')
ax.set_xlabel('Predicted')
plt.tight_layout()
# Write the image to disk before showing the figure.
plt.savefig(figure_path, dpi=150)
plt.show()

## 6. Save Model

In [None]:
# Persist the model, tokenizer, label mapping and test metrics side by side
# in OUTPUT_DIR.
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)

label_map_path = f'{OUTPUT_DIR}/label_map.json'
with open(label_map_path, 'w') as f:
    json.dump(LABEL_MAP, f)

test_results_path = f'{OUTPUT_DIR}/test_results.json'
with open(test_results_path, 'w') as f:
    json.dump(results, f, indent=2)

print(f'Model saved to {OUTPUT_DIR}')
saved_files = os.listdir(OUTPUT_DIR)
print('Files:', saved_files)

## 7. Download Model (Optional)

In [None]:
# Bundle the listed model artifacts from OUTPUT_DIR into
# /content/fakenewsnet_baseline.zip. stderr is discarded, and a hint is echoed
# if the command chain exits with a failure status.
!cd "{OUTPUT_DIR}" && zip -r /content/fakenewsnet_baseline.zip config.json model.safetensors tokenizer.json tokenizer_config.json special_tokens_map.json vocab.txt label_map.json test_results.json 2>/dev/null || echo 'Some files may not exist yet'
# Hand the archive to Colab's download helper.
from google.colab import files
files.download('/content/fakenewsnet_baseline.zip')