In [None]:
%pip install transformers

In [14]:
import torch
import torch.nn.functional as F
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForCausalLM.from_pretrained('gpt2').to(device)
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [16]:
dataset = load_dataset('Rowan/hellaswag', split='validation', trust_remote_code=True)

In [None]:
@torch.no_grad()
def eval_sample(sample):
    """HellaSwag completion-style like src/evals"""
    ctx = sample['ctx']
    candidates = sample['endings']
    label = int(sample['label'])

    ctx_ids = tokenizer(ctx, add_special_tokens=False).input_ids
    candidates_ids = [tokenizer(c, add_special_tokens=False).input_ids for c in candidates]
    candidates_len = [len(ids) for ids in candidates_ids]

    sequences = [ctx_ids + candidate_ids for candidate_ids in candidates_ids]
    max_len = max(len(seq) for seq in sequences)

    padded_sequences = []
    candidate_masks = []
    for seq, cand_len in zip(sequences, candidates_len):
        pad_len = max_len - len(seq)
        padded_seq = seq + [tokenizer.eos_token_id] * pad_len
        padded_sequences.append(padded_seq)

        mask = [0] * len(ctx_ids) + [1] * cand_len + [0] * pad_len
        candidate_masks.append(mask)

    x = torch.tensor(padded_sequences, device=device)  # (4, max_len)
    mask = torch.tensor(candidate_masks, device=device)  # (4, max_len)

    out = model(x)
    logits = out.logits if hasattr(out, "logits") else out[0]

    shifted_logits = logits[:, :-1, :]  # (batch, max_len-1, vocab_size)
    shifted_targets = x[:, 1:]          # (batch, max_len-1)
    shifted_mask = mask[:, 1:]          # (batch, max_len-1)

    log_probs = F.log_softmax(shifted_logits, dim=-1)
    token_log_probs = torch.gather(log_probs, dim=-1, index=shifted_targets.unsqueeze(-1)).squeeze(-1)
    candidate_scores = (token_log_probs * shifted_mask).sum(dim=1)

    most_likely = torch.argmax(candidate_scores).item()
    return most_likely == label

In [4]:
acc = 0.0
for sample in dataset:
    acc += eval_sample(sample)

accuracy = acc / len(dataset)
print(f"HellaSwag accuracy: {accuracy:.4f}")

HellaSwag accuracy: 0.2821
