In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class FactCheckDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        statement = self.data.iloc[index]["Statement"]
        evidence = self.data.iloc[index]["Evidence"]
        label = self.data.iloc[index]["labels"]

        # Tokenize statement and evidence
        inputs = self.tokenizer(
            text=statement,
            text_pair=evidence,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }


In [None]:
# Load data
train = pd.read_csv('data/train_data.csv')
dev = pd.read_csv('data/dev_data.csv')
test = pd.read_csv('data/test_data.csv')

# Pre-trained model and tokenizer (e.g., PhoBERT or XLM-Roberta)
MODEL_NAME = "Fsoft-AIC/videberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Create datasets
train_dataset = FactCheckDataset(train, tokenizer, max_len=128)
dev_dataset = FactCheckDataset(dev, tokenizer, max_len=128)
test_dataset = FactCheckDataset(test, tokenizer, max_len=128)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev, batch_size=16)
test_loader = DataLoader(test, batch_size=16)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [5]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

# Optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at Fsoft-AIC/videberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128000, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine

In [6]:
def train_model(model, train_loader, dev_loader, optimizer, loss_fn, epochs=5):
    for epoch in range(epochs):
        model.train()
        train_loss = 0

        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs.loss
            train_loss += loss.item()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}, Train Loss: {train_loss / len(train_loader)}")

        # Validation
        model.eval()
        val_loss = 0
        correct = 0
        total = 0

        with torch.no_grad():
            for batch in dev_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["label"].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

                preds = torch.argmax(outputs.logits, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        val_acc = correct / total
        print(f"Validation Loss: {val_loss / len(dev_loader)}, Validation Accuracy: {val_acc}")


In [7]:
train_model(model, train_loader, dev_loader, optimizer, loss_fn, epochs=3)

Epoch 1, Train Loss: 1.0976256874833574


KeyError: 0

In [None]:
def predict(model, tokenizer, statement, evidence, max_len=128):
    model.eval()
    inputs = tokenizer(
        text=statement,
        text_pair=evidence,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        prediction = torch.argmax(probs, dim=1).item()

    return prediction, probs

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

def evaluate_model(model, data_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            # Model predictions
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Classification report
    print("Classification Report:")
    print(classification_report(all_labels, all_preds))

    # Confusion matrix
    cm = confusion_matrix(all_labels, all_preds)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Label 0', 'Label 1', 'Label 2'], yticklabels=['Label 0', 'Label 1', 'Label 2'])
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.show()

    # Accuracy
    accuracy = accuracy_score(all_labels, all_preds)
    print(f"Accuracy: {accuracy:.4f}")

    return all_preds, all_labels


In [None]:
test_loader = DataLoader(test, batch_size=16)

In [None]:
# Evaluate the model
test_preds, test_labels = evaluate_model(model, test_loader)

In [None]:
def visualize_predictions(data, predictions, labels, n=5):
    for i in range(n):
        print(f"Statement: {data.iloc[i]['Statement']}")
        print(f"Evidence: {data.iloc[i]['Evidence']}")
        print(f"True Label: {labels[i]}, Predicted Label: {predictions[i]}")
        print("-" * 50)

visualize_predictions(test, test_preds, test_labels)