# Fine-tune PhoBERT cho Fact Checking
Dataset: ise-dsc01 (Vietnamese Fact Verification)

## 1. Cài đặt thư viện

In [None]:
# Install the Hugging Face packages used by this notebook (-q: quiet pip output).
!pip install transformers datasets accelerate -q
# NOTE(review): sentencepiece is not imported directly anywhere in the visible
# cells — confirm it is still required before keeping this install.
!pip install sentencepiece -q

## 2. Import libraries

In [None]:
import json
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Report the runtime: PyTorch build, CUDA availability and (if any) the GPU name
_cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {_cuda_ready}")
if _cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 3. Cấu hình

In [None]:
class Config:
    """Shared settings: checkpoint, hyper-parameters, data paths and labels."""

    # Pretrained checkpoint and the token length samples are padded/truncated to
    MODEL_NAME = "vinai/phobert-base"
    MAX_LENGTH = 256

    # Optimisation hyper-parameters
    BATCH_SIZE = 16
    LEARNING_RATE = 2e-5
    NUM_EPOCHS = 5
    WARMUP_STEPS = 500
    WEIGHT_DECAY = 0.01

    # Update these dataset paths to match your own environment
    TRAIN_PATH = "/kaggle/input/your-dataset/ise-dsc01-train.json"
    WARMUP_PATH = "/kaggle/input/your-dataset/ise-dsc01-warmup.json"

    # Verdict name <-> class id; the inverse map and class count are derived
    # from LABEL2ID so the three values cannot drift apart
    LABEL2ID = {"SUPPORTED": 0, "REFUTED": 1, "NEI": 2}
    ID2LABEL = {class_id: name for name, class_id in LABEL2ID.items()}
    NUM_LABELS = len(LABEL2ID)

## 4. Load và khám phá dữ liệu

In [None]:
def load_data(file_path):
    """Load a fact-checking JSON file into a DataFrame.

    The file must contain one JSON object mapping sample ids to dicts with
    'context', 'claim' and 'verdict' entries; 'evidence' and 'domain' are
    optional and default to an empty string.

    Args:
        file_path: Path to the UTF-8 encoded JSON file.

    Returns:
        pandas.DataFrame with one row per sample and the columns
        id, context, claim, verdict, evidence, domain. The columns are
        present even when the file holds no samples.

    Raises:
        KeyError: If a sample lacks 'context', 'claim' or 'verdict'.
    """
    columns = ['id', 'context', 'claim', 'verdict', 'evidence', 'domain']

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    records = [
        {
            'id': key,
            'context': value['context'],
            'claim': value['claim'],
            'verdict': value['verdict'],
            'evidence': value.get('evidence', ''),
            'domain': value.get('domain', ''),
        }
        for key, value in data.items()
    ]

    # Naming the columns keeps the schema stable for an empty file, so
    # downstream code such as df['verdict'] does not fail with KeyError.
    return pd.DataFrame(records, columns=columns)

# Load the training file and summarise it (messages are user-facing Vietnamese;
# the previously garbled characters are restored to proper UTF-8 text)
df_train = load_data(Config.TRAIN_PATH)
print(f"Tổng số mẫu: {len(df_train)}")
print("\nPhân bố nhãn:")
print(df_train['verdict'].value_counts())
print("\nPhân bố domain:")
print(df_train['domain'].value_counts())

# Preview the first rows (rendered as the cell output)
df_train.head()

## 5. Chuẩn bị Dataset

In [None]:
def prepare_input_text(row):
    """Combine a sample's claim and context into one input string.

    Args:
        row: Object indexable by 'claim' and 'context' (for example a
            DataFrame row or a dict).

    Returns:
        The string "Claim: <claim> Context: <context>".
    """
    return "Claim: {} Context: {}".format(row['claim'], row['context'])

class FactCheckDataset(Dataset):
    """Dataset that tokenizes claim/context rows each time one is requested.

    Args:
        dataframe: DataFrame whose rows provide 'claim', 'context' and
            'verdict'.
        tokenizer: Callable returning a mapping with 'input_ids' and
            'attention_mask' tensors.
        max_length: Length each sample is padded or truncated to.
        label2id: Mapping from verdict string to integer class id.
    """

    def __init__(self, dataframe, tokenizer, max_length, label2id):
        # Reset the index so positions 0..len-1 match what .iloc is asked for
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label2id = label2id

    def __len__(self):
        """Return the number of rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return 'input_ids', 'attention_mask' and 'labels' for row idx.

        Raises:
            KeyError: If the row's verdict is not a key of label2id.
        """
        sample = self.data.iloc[idx]

        encoded = self.tokenizer(
            prepare_input_text(sample),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        class_id = self.label2id[sample['verdict']]

        # flatten() turns the tokenizer's batched tensors into 1-D tensors
        item = {
            name: encoded[name].flatten()
            for name in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(class_id, dtype=torch.long)
        return item

## 6. Chia train/validation

In [None]:
# Hold out 15% for validation; stratifying on the verdict keeps the label
# proportions the same in both parts, and the fixed seed makes it repeatable
_split_options = dict(test_size=0.15, random_state=42, stratify=df_train['verdict'])
train_df, val_df = train_test_split(df_train, **_split_options)

# Sizes of the two parts
print(f"Train set: {len(train_df)} samples")
print(f"Validation set: {len(val_df)} samples")

# Label counts per part
print(f"\nTrain label distribution:")
print(train_df['verdict'].value_counts())
print(f"\nValidation label distribution:")
print(val_df['verdict'].value_counts())

## 7. Load PhoBERT

In [None]:
# Tokenizer and classifier are both loaded from the checkpoint named in Config
tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
# The label maps are handed to the model so it knows the verdict names
model = AutoModelForSequenceClassification.from_pretrained(
    Config.MODEL_NAME,
    num_labels=Config.NUM_LABELS,
    id2label=Config.ID2LABEL,
    label2id=Config.LABEL2ID
)

# The check mark was garbled by a bad encoding round-trip; restored here
print("✓ PhoBERT loaded successfully")
print(f"Model parameters: {model.num_parameters():,}")

## 8. Tạo datasets

In [None]:
# Both splits share the same tokenizer, max length and label mapping
_dataset_settings = (tokenizer, Config.MAX_LENGTH, Config.LABEL2ID)
train_dataset = FactCheckDataset(train_df, *_dataset_settings)
val_dataset = FactCheckDataset(val_df, *_dataset_settings)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

# Sanity check: fetch one item and show the tensor shapes and its label
sample = train_dataset[0]
print(f"\nSample input_ids shape: {sample['input_ids'].shape}")
print(f"Sample attention_mask shape: {sample['attention_mask'].shape}")
print(f"Sample label: {sample['labels']}")

## 9. Định nghĩa metrics

In [None]:
def compute_metrics(pred):
    """Compute accuracy and F1 scores for one evaluation pass.

    Args:
        pred: Object exposing `label_ids` (true class ids) and `predictions`
            (scores whose argmax over the last axis is the predicted class).

    Returns:
        Dict with 'accuracy', 'f1_macro' and 'f1_weighted'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1_macro': f1_score(y_true, y_pred, average='macro'),
        'f1_weighted': f1_score(y_true, y_pred, average='weighted'),
    }

## 10. Training arguments

In [None]:
training_args = TrainingArguments(
    # Output locations and logging cadence; no external reporting integration
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=100,
    report_to="none",
    # Optimisation settings, all taken from Config
    num_train_epochs=Config.NUM_EPOCHS,
    learning_rate=Config.LEARNING_RATE,
    weight_decay=Config.WEIGHT_DECAY,
    warmup_steps=Config.WARMUP_STEPS,
    per_device_train_batch_size=Config.BATCH_SIZE,
    per_device_eval_batch_size=Config.BATCH_SIZE,
    # Mixed precision is switched on only when CUDA is available
    fp16=torch.cuda.is_available(),
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints,
    # and reload the one with the highest f1_macro when training ends
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
)

## 11. Khởi tạo Trainer

In [None]:
# Wire model, arguments, datasets and metrics together; the early-stopping
# callback is configured with a patience of 2
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# The check mark was garbled by a bad encoding round-trip; restored here
print("✓ Trainer initialized")

## 12. Training

In [None]:
# Start message restored from mojibake to the intended Vietnamese text
print("Bắt đầu training...")
# Last expression of the cell, so its return value is shown as the cell output
trainer.train()

## 13. Evaluation

In [None]:
# Evaluate on the trainer's eval dataset and print every metric to 4 decimals
eval_results = trainer.evaluate()
# Header restored from mojibake (chart emoji + Vietnamese text)
print("\n📊 Kết quả trên validation set:")
for key, value in eval_results.items():
    print(f"  {key}: {value:.4f}")

## 14. Chi tiết classification report

In [None]:
# Predict on the validation split and take the highest-scoring class per sample
predictions = trainer.predict(val_dataset)
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids

# Pass the class ids explicitly so every name is paired with its own id
# (rather than relying on dict ordering) and the report still works when a
# class does not occur in the validation labels.
_class_ids = sorted(Config.ID2LABEL)
print(classification_report(
    labels,
    preds,
    labels=_class_ids,
    target_names=[Config.ID2LABEL[class_id] for class_id in _class_ids]
))

## 15. Lưu model

In [None]:
# Store model and tokenizer in the same directory so they can be reloaded together
model.save_pretrained('./phobert_factcheck_model')
tokenizer.save_pretrained('./phobert_factcheck_model')
# Confirmation message restored from mojibake to the intended Vietnamese text
print("✓ Model đã được lưu tại: ./phobert_factcheck_model")

## 16. Test inference

In [None]:
def predict_claim(claim, context, model, tokenizer, device=None):
    """Predict the verdict for a single claim/context pair.

    Args:
        claim: Claim text.
        context: Context text the claim is checked against.
        model: Sequence-classification model; it is switched to eval mode.
        tokenizer: Tokenizer matching the model.
        device: Device the inputs are moved to. When None (default) the
            device of the model's parameters is used, so the call works on
            CPU-only machines as well as on GPU. An explicit value is used
            as given and must match the device the model is on.

    Returns:
        Tuple (label, probabilities): the predicted verdict name from
        Config.ID2LABEL and a NumPy array of per-class softmax probabilities.
    """
    if device is None:
        # Follow the model instead of assuming that CUDA is present
        device = next(model.parameters()).device

    model.eval()
    # Same text layout as the training inputs: claim first, then context
    text = f"Claim: {claim} Context: {context}"

    encoding = tokenizer(
        text,
        max_length=Config.MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference only, so gradient tracking is disabled
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.softmax(outputs.logits, dim=1)
        pred_label = torch.argmax(probs, dim=1).item()

    return Config.ID2LABEL[pred_label], probs[0].cpu().numpy()

# Try the helper on one validation sample
test_sample = val_df.iloc[0]
pred_label, probs = predict_claim(
    test_sample['claim'],
    test_sample['context'],
    model,
    tokenizer,
    device=model.device  # run where the model is, instead of assuming CUDA
)

print(f"Claim: {test_sample['claim'][:100]}...")
print(f"\nTrue label: {test_sample['verdict']}")
print(f"Predicted: {pred_label}")
print(f"\nProbabilities:")
# Index i of probs is class id i, so take each name from ID2LABEL rather
# than relying on the key order of LABEL2ID
for class_id, prob in enumerate(probs):
    print(f"  {Config.ID2LABEL[class_id]}: {prob:.4f}")