In [1]:
import sys

# Core libraries
!{sys.executable} -m pip install torch==2.6 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!{sys.executable} -m pip install transformers==4.55.2
!{sys.executable} -m pip install pandas scikit-learn tqdm
!{sys.executable} -m pip install kagglehub
!{sys.executable} -m pip install gdown
!{sys.executable} -m pip install SentencePiece
!{sys.executable} -m pip install tiktoken protobuf


Looking in indexes: https://download.pytorch.org/whl/cu121
INFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
[31mERROR: Could not find a version that satisfies the requirement nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64" (from torch) (from versions: 12.1.105, 12.9.41, 12.9.86)[0m[31m
[0m[31mERROR: No matching distribution found for nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64"[0m[31m
[0mCollecting transformers==4.55.2
  Downloading transformers-4.55.2-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers==4.55.2)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Downloading transformers-4.55.2-py3-none-any.whl (11.3

In [2]:
import logging
import warnings

# Keep the notebook output readable: hide every Python warning ...
warnings.filterwarnings(action="ignore")

# ... and suppress all log records of severity WARNING and below, for every logger.
logging.disable(level=logging.WARNING)

In [3]:
# Sanity check: show which transformers release the kernel actually imports,
# so it can be compared with the version pinned in the install cell (4.55.2).
import transformers
print(transformers.__version__)

4.55.2


In [None]:
import gdown

# Google Drive file IDs of the files to download, one entry per file.
# The IDs are blank in the shared notebook; fill them in before running this cell.
# NOTE(review): presumably these are the CSV splits read later by load_data() - confirm.
GDRIVE_FILE_IDS = ["", "", ""]

# Fail fast with a clear message instead of sending requests for an empty ID.
blank_positions = [pos for pos, file_id in enumerate(GDRIVE_FILE_IDS) if not file_id.strip()]
if blank_positions:
    raise ValueError(
        f"GDRIVE_FILE_IDS has blank entries at positions {blank_positions}; "
        "set the Google Drive file IDs before downloading."
    )

for file_id in GDRIVE_FILE_IDS:
    # No `output` is given, so gdown chooses the local file name itself.
    downloaded_path = gdown.download(id=file_id)
    if downloaded_path is None:
        # NOTE(review): assumes gdown signals a failed download by returning None - confirm
        # against the installed gdown version (newer releases may raise instead).
        raise RuntimeError(f"gdown could not download Google Drive file id {file_id!r}")

In [3]:
import os
import shutil
import json
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
from sklearn.metrics import classification_report

# -----------------------------
# 1) DATASET DEFINITION
# -----------------------------
class SentencePairDataset(Dataset):
    """Dataset of labelled sentence pairs that are tokenized lazily, one item per access.

    Each pair is encoded as a single paired sequence; the tokenizer is asked to pad to
    ``max_length`` and to truncate anything longer.
    """

    def __init__(self, sentence_pairs, labels, tokenizer, max_length):
        """
        Store the raw data and the tokenization settings; nothing is encoded here.

        Args:
            sentence_pairs: Indexable collection of (sentence1, sentence2) tuples.
            labels: Indexable collection of integer class ids, aligned with sentence_pairs.
            tokenizer: Tokenizer object exposing ``encode_plus`` (Hugging Face style).
            max_length: Sequence length passed to the tokenizer for padding/truncation.
        """
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of sentence pairs held by the dataset."""
        return len(self.sentence_pairs)

    def __getitem__(self, idx):
        """
        Encode the pair at position ``idx``.

        Returns:
            dict with 1-D tensors "input_ids" and "attention_mask" (the tokenizer output
            flattened) and a scalar ``torch.long`` tensor "label".
        """
        first_text, second_text = self.sentence_pairs[idx]
        target = self.labels[idx]

        tokenizer_options = {
            "text_pair": second_text,
            "add_special_tokens": True,
            "max_length": self.max_length,
            "truncation": True,
            "padding": "max_length",
            "return_attention_mask": True,
            "return_token_type_ids": False,
            "return_tensors": "pt",
        }
        encoded = self.tokenizer.encode_plus(first_text, **tokenizer_options)

        # The tokenizer returns batched tensors; collapse them to one dimension so the
        # DataLoader can stack items into (batch, max_length).
        item = {key: encoded[key].reshape(-1) for key in ("input_ids", "attention_mask")}
        item["label"] = torch.tensor(target, dtype=torch.long)
        return item

# -----------------------------
# 2) DATA LOADING FUNCTION (train / dev / test CSV splits)
# -----------------------------
def load_data(train_path="train_viwiki.csv", dev_path="test_viwiki.csv", test_path="test_viwiki.csv"):
    """
    Read the train/dev/test CSV splits and convert them to sentence pairs and integer labels.

    Every CSV must contain the columns ``claim``, ``evidence`` and ``gold_label``. Rows whose
    ``gold_label`` is not "Support", "Refute" or "NEI" are dropped, and the remaining labels
    are encoded as Support -> 0, Refute -> 1, NEI -> 2.

    NOTE: ``dev_path`` defaults to the same file as ``test_path`` (the behaviour of the
    original hard-coded version). With the defaults, dev metrics - and therefore any
    checkpoint selection based on them - are computed on the test set. Pass a held-out
    dev file to avoid that.

    Args:
        train_path: CSV file with the training split.
        dev_path: CSV file with the development split.
        test_path: CSV file with the test split.

    Returns:
        ((X_train, y_train), (X_dev, y_dev), (X_test, y_test)), where each X is a list of
        (claim, evidence) tuples and each y is the list of matching integer labels.

    Raises:
        KeyError: if a split lacks one of the required columns.
    """
    # Single source of truth for both the row filter and the label encoding.
    label_mapping = {'Support': 0, 'Refute': 1, 'NEI': 2}
    required_columns = ("claim", "evidence", "gold_label")

    def read_split(path):
        """Load one CSV, keep rows with a known label and encode that label as an int."""
        raw_df = pd.read_csv(path)
        missing = [col for col in required_columns if col not in raw_df.columns]
        if missing:
            raise KeyError(f"{path} is missing required column(s): {missing}")
        kept_df = raw_df[raw_df["gold_label"].isin(list(label_mapping))].reset_index(drop=True)
        # assign() builds a new frame instead of mutating a filtered one in place.
        return kept_df.assign(gold_label=kept_df["gold_label"].map(label_mapping))

    def prepare_df(df):
        """Split a cleaned frame into (claim, evidence) pairs and the label list."""
        sentence_pairs = list(zip(df["claim"], df["evidence"]))
        labels = list(df["gold_label"])
        return sentence_pairs, labels

    train_df = read_split(train_path)
    dev_df = read_split(dev_path)
    test_df = read_split(test_path)

    print("training dataset sizes:")
    print(f"Train: {len(train_df)} | Dev: {len(dev_df)} | Test: {len(test_df)}")

    return prepare_df(train_df), prepare_df(dev_df), prepare_df(test_df)

# -----------------------------
# 3) DATASET PREPARATION
# -----------------------------
def prepare_datasets(data_tuple, tokenizer, max_length):
    """
    Wrap one data split in a SentencePairDataset.

    Args:
        data_tuple: Two-element ``(sentence_pairs, labels)`` tuple.
        tokenizer: Tokenizer handed through to the dataset.
        max_length: Sequence length handed through to the dataset.

    Returns:
        The SentencePairDataset built from the split.
    """
    pairs, targets = data_tuple
    dataset = SentencePairDataset(pairs, targets, tokenizer, max_length)
    return dataset

# -----------------------------
# Checkpoint helper (keep top-k = 2)
# -----------------------------
def save_checkpoint(model, tokenizer, save_root, epoch, val_acc, top_models, k=2):
    """
    Write a new checkpoint directory and prune the leaderboard down to the k best entries.

    The checkpoint is stored in ``<save_root>/checkpoint-epoch<epoch>-acc<val_acc:.4f>`` and
    holds whatever ``model.save_pretrained`` and ``tokenizer.save_pretrained`` write there,
    plus a ``meta.json`` file with the accuracy and epoch.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        save_root: Parent directory for all checkpoints (created when absent).
        epoch: Epoch number recorded for this checkpoint.
        val_acc: Validation accuracy used for ranking.
        top_models: List of (val_acc, epoch, path) tuples; modified in place and kept
            sorted by val_acc in descending order.
        k: Maximum number of checkpoints to keep on disk and in ``top_models``.

    Returns:
        The same ``top_models`` list object, after insertion and pruning.
    """
    # The parent directory has to exist before the checkpoint folder is written
    if not os.path.exists(save_root):
        os.makedirs(save_root, exist_ok=True)

    ckpt_path = os.path.join(save_root, f"checkpoint-epoch{epoch}-acc{val_acc:.4f}")

    # Weights first, then the tokenizer files, into the same folder
    for artifact in (model, tokenizer):
        artifact.save_pretrained(ckpt_path)

    # Small sidecar file so the score can be recovered without parsing the folder name
    meta = {"val_acc": float(val_acc), "epoch": int(epoch)}
    with open(os.path.join(ckpt_path, "meta.json"), "w") as meta_file:
        meta_file.write(json.dumps(meta))

    # Register the new entry and restore the best-first ordering
    top_models.append((float(val_acc), int(epoch), ckpt_path))
    top_models.sort(key=lambda entry: entry[0], reverse=True)

    # Evict from the tail (lowest accuracy) until only k entries are left,
    # deleting each evicted checkpoint folder from disk
    while len(top_models) > k:
        _acc, _epoch, stale_dir = top_models.pop()
        if os.path.exists(stale_dir):
            shutil.rmtree(stale_dir)

    return top_models

# -----------------------------
# 4) TRAINING LOOP (now returns top_models)
# -----------------------------
def train(model, train_loader, dev_loader, criterion, optimizer, device, epochs, tokenizer, checkpoint_root="checkpoints", top_k=2):
    """
    Fine-tune ``model`` for ``epochs`` epochs, evaluating on the dev set after each one.

    After every epoch the dev accuracy and a classification report are printed, and the
    model/tokenizer are checkpointed via ``save_checkpoint`` when the epoch ranks among the
    ``top_k`` best dev accuracies seen so far.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...)``.
        train_loader: Iterable of batches with keys "input_ids", "attention_mask", "label".
        dev_loader: Iterable of batches with the same keys, used for evaluation.
        criterion: Loss callable applied to (logits, labels).
        optimizer: Optimizer stepping the model parameters.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.
        tokenizer: Tokenizer saved next to each checkpoint.
        checkpoint_root: Directory that receives the checkpoint folders.
        top_k: How many checkpoints to keep.

    Returns:
        List of (val_acc, epoch, path) tuples sorted by val_acc in descending order.
    """
    def forward_logits(batch):
        """Move the batch inputs to the device, run the model and return its logits."""
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )
        # Works for both ModelOutput objects and plain tuples (return_dict True/False)
        return outputs.logits if hasattr(outputs, "logits") else outputs[0]

    best_checkpoints = []  # entries are (val_acc, epoch, path)

    for epoch_idx in range(epochs):
        epoch_no = epoch_idx + 1

        # ---- Optimisation pass over the training data ----
        model.train()
        loss_total = 0.0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch_no}"):
            targets = batch["label"].to(device)

            optimizer.zero_grad()
            loss = criterion(forward_logits(batch), targets)
            loss.backward()
            optimizer.step()

            loss_total += loss.item()

        mean_loss = loss_total / len(train_loader)
        print(f"\nEpoch {epoch_no}/{epochs} - Average Training Loss: {mean_loss:.4f}")

        # ---- Dev evaluation, gradients switched off ----
        model.eval()
        predictions, true_labels = [], []
        with torch.no_grad():
            for batch in tqdm(dev_loader, desc=f"Evaluating Epoch {epoch_no}"):
                targets = batch["label"].to(device)
                predicted = torch.max(forward_logits(batch), 1)[1]
                predictions.extend(predicted.cpu().tolist())
                true_labels.extend(targets.cpu().tolist())

        # Accuracy drives the checkpoint decision; an empty dev set counts as 0.0
        hits = sum(pred == gold for pred, gold in zip(predictions, true_labels))
        val_acc = hits / len(true_labels) if true_labels else 0.0

        print(f"\nEpoch {epoch_no} - Dev accuracy: {val_acc:.4f}")
        print(f"\nClassification Report for Epoch {epoch_no}:")
        print(classification_report(true_labels, predictions, digits=4))

        # Save while there is still room, or when this epoch beats the weakest kept entry
        has_room = len(best_checkpoints) < top_k
        if not (has_room or val_acc > best_checkpoints[-1][0]):
            print(f"Not saving checkpoint for epoch {epoch_no} (dev acc {val_acc:.4f})")
            continue

        print(f"Saving checkpoint for epoch {epoch_no} with dev acc {val_acc:.4f}")
        # save_checkpoint updates best_checkpoints in place and keeps it sorted
        save_checkpoint(model, tokenizer, checkpoint_root, epoch_no, val_acc, best_checkpoints, k=top_k)

    return best_checkpoints

# -----------------------------
# 5) EXAMPLE MAIN TRAINING SCRIPT
# -----------------------------
if __name__ == "__main__":
    # ---- Run configuration (everything a reader might want to tune) ----
    seed = 42
    model_name = 'microsoft/infoxlm-large'
    max_length = 512       # sequence length every claim/evidence pair is padded or truncated to
    num_classes = 3        # Support / Refute / NEI
    batch_size = 8
    learning_rate = 1e-5
    num_epochs = 10

    # Seed torch BEFORE the model is built: the classification head is newly initialised
    # (it is not part of the pretrained checkpoint) and the training DataLoader shuffles,
    # so without a seed two runs of this cell are not comparable.
    torch.manual_seed(seed)

    # Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the InfoXLM-large tokenizer and backbone with a sequence-classification head
    # sized from num_classes (previously a separate hard-coded 3).
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    xlmroberta_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

    # Load the train, dev and test splits
    (X_train, y_train), (X_dev, y_dev), (X_test, y_test) = load_data()

    # Prepare datasets
    train_dataset = prepare_datasets((X_train, y_train), tokenizer, max_length)
    dev_dataset = prepare_datasets((X_dev, y_dev), tokenizer, max_length)
    test_dataset = prepare_datasets((X_test, y_test), tokenizer, max_length)

    # Move the model to the target device
    model = xlmroberta_model
    model.to(device)

    # Create DataLoaders (only the training data is shuffled)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
    # test_loader is built for later evaluation; it is not used during training.
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Check one batch
    batch = next(iter(train_loader))
    print("Input IDs shape:", batch["input_ids"].shape)
    print("Attention mask shape:", batch["attention_mask"].shape)
    print("Labels shape:", batch["label"].shape)

    # Define loss function & optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    print('start training')

    # Train and capture the top models (val_acc, epoch, path)
    top_models = train(model, train_loader, dev_loader, criterion, optimizer, device, epochs=num_epochs, tokenizer=tokenizer, checkpoint_root="checkpoints", top_k=2)

    # After training, top_models contains up to 2 tuples: (val_acc, epoch, path)
    print("Top checkpoints saved (val_acc, epoch, path):")
    for item in top_models:
        print(item)


config.json:   0%|          | 0.00/513 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

2025-11-06 11:31:25.631955: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762428685.816536      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762428685.869865      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/infoxlm-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


training dataset sizes:
Train: 16581 | Dev: 2091 | Test: 2091
Input IDs shape: torch.Size([8, 512])
Attention mask shape: torch.Size([8, 512])
Labels shape: torch.Size([8])
start training



Training Epoch 1:   0%|          | 0/2073 [00:00<?, ?it/s][A
Training Epoch 1:   0%|          | 1/2073 [00:02<1:28:57,  2.58s/it][A
Training Epoch 1:   0%|          | 2/2073 [00:04<1:12:05,  2.09s/it][A
Training Epoch 1:   0%|          | 3/2073 [00:06<1:06:27,  1.93s/it][A
Training Epoch 1:   0%|          | 4/2073 [00:07<1:03:45,  1.85s/it][A
Training Epoch 1:   0%|          | 5/2073 [00:09<1:02:07,  1.80s/it][A
Training Epoch 1:   0%|          | 6/2073 [00:11<1:01:05,  1.77s/it][A
Training Epoch 1:   0%|          | 7/2073 [00:12<1:00:26,  1.76s/it][A
Training Epoch 1:   0%|          | 8/2073 [00:14<59:57,  1.74s/it]  [A
Training Epoch 1:   0%|          | 9/2073 [00:16<59:44,  1.74s/it][A
Training Epoch 1:   0%|          | 10/2073 [00:18<59:33,  1.73s/it][A
Training Epoch 1:   1%|          | 11/2073 [00:19<1:02:01,  1.80s/it]A


KeyboardInterrupt: 

In [None]:

import os

from huggingface_hub import HfApi, upload_folder

# NOTE(review): the repo name mentions "CafeBERT-QA", while the training cell fine-tunes
# microsoft/infoxlm-large - confirm this is the intended destination.
repo_id = "ICTuniverse/CafeBERT-QA-viwikifc-9321-EM"

# Never hardcode an access token in the notebook. Read it from the HF_TOKEN environment
# variable; a missing or blank value becomes None so that huggingface_hub falls back to the
# locally saved login instead of sending an empty token.
hf_token = os.environ.get("HF_TOKEN") or None

if not top_models:
    raise ValueError("top_models is empty. Nothing to push.")

# top_models is sorted best-first, so the first entry is the best checkpoint.
# The score is the dev accuracy computed in train(); it is printed under the label "EM".
best_acc, best_epoch, best_path = top_models[0]
print(f"Pushing best checkpoint: EM={best_acc:.4f}, epoch={best_epoch}, path={best_path}")

api = HfApi()

# Create the repo if it doesn't exist (pass private=True for a private repo)
api.create_repo(repo_id=repo_id, token=hf_token, exist_ok=True)

# Upload the entire checkpoint folder to the root of the repo
upload_folder(
    folder_path=best_path,
    path_in_repo="",
    repo_id=repo_id,
    token=hf_token,
)

print("Upload finished.")
