In [None]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from accelerate.test_utils.testing import get_backend

# Pick the compute device: get_backend() automatically detects the underlying
# device type (CUDA, CPU, XPU, MPS, etc.); its other two return values are unused here.
device, _, _ = get_backend()

# One checkpoint id is shared by the tokenizer and the model so they stay in sync.
model_id = "openai-community/gpt2-large"
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

# Load the language model, then move its weights onto the detected device.
model = GPT2LMHeadModel.from_pretrained(model_id)
model = model.to(device)

In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset

# WikiText-2 (raw) test split; each row holds one piece of text in its "text" column.
test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

# Concatenate every row into one long document (rows separated by a blank line)
# and tokenize it in a single call, producing one very long sequence of token ids.
# The tokenizer warns that this exceeds the model's maximum length; the text is
# only fed to the model in windows later on.
corpus_text = "\n\n".join(test["text"])
encodings = tokenizer(corpus_text, return_tensors="pt")

Token indices sequence length is longer than the specified maximum sequence length for this model (287644 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# Inspect the loaded split (features and number of rows).
test

Dataset({
    features: ['text'],
    num_rows: 4358
})

In [None]:
# Inspect the tokenizer output: input_ids and attention_mask for the whole corpus.
encodings

{'input_ids': tensor([[ 628,  796, 5199,  ...,  220,  628,  198]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [None]:
import torch
from tqdm import tqdm

# Sliding-window perplexity evaluation.
#
# The corpus is far longer than the model's context, so it is scored in
# overlapping windows of at most `max_length` tokens that advance by `stride`.
# In each window only the tokens that were not scored by the previous window
# (the last `trg_len` positions) keep their labels; the earlier positions serve
# purely as context and are masked with -100 so the loss ignores them.
max_length = model.config.n_positions # 1024
stride = 512
seq_len = encodings.input_ids.size(1) # 287644

nll_sum = 0.0  # running sum of per-token negative log-likelihoods
n_tokens = 0   # number of tokens that contributed to nll_sum
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100  # context-only positions: excluded from the loss

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # The loss is a cross-entropy averaged over the valid (non -100) labels.
        # N.B. the model internally shifts the labels to the left by 1, so the
        # label at position 0 of every sequence never contributes to the loss.
        neg_log_likelihood = outputs.loss

    # Number of labels the averaged loss actually covers: the valid labels that
    # survive the internal shift, i.e. everything except position 0.
    # Subtracting batch_size from the count of valid labels is only right when
    # position 0 itself is valid (the first window). Once a context prefix is
    # masked, position 0 is already -100 and the subtraction under-counts by one
    # per sequence, giving that window too little weight in the average.
    num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()

    # Undo the per-window averaging so windows are weighted by their token count.
    nll_sum += neg_log_likelihood * num_loss_tokens
    n_tokens += num_loss_tokens

    prev_end_loc = end_loc
    # The window has reached the end of the corpus; any remaining range() steps
    # would only re-score tokens, so stop here (the progress bar ends early).
    if end_loc == seq_len:
        break

avg_nll = nll_sum / n_tokens  # average negative log-likelihood per token
ppl = torch.exp(avg_nll)

100%|█████████▉| 560/562 [04:50<00:01,  1.93it/s]


In [None]:
# Display the perplexity computed by the sliding-window evaluation.
ppl

tensor(16.4443, device='cuda:0')

In [None]:
# Compare how many tokens contributed to the loss (first line) with the total
# length of the tokenized corpus (second line).
print(n_tokens, seq_len, sep="\n")

例えば、ある位置で次に来るトークンの予測確率分布が[語彙1, 語彙2, 語彙3]=[0.7, 0.2, 0.1]で、正解トークンが語彙1だったら、

そのnegative_log_likelihood(負の対数尤度)は、

-log(p) (pは正解トークンに対してモデルが出力した確率/尤度) = -log(0.7)

その予測トークンのクロスエントロピー損失は、正解ラベルをone-hotの[1, 0, 0]として、

-1×log(0.7) + -0×log(0.2) + -0×log(0.1) = -log(0.7)

だから、1つのトークンに関しては、

負の対数尤度 = クロスエントロピー損失 = 自信のなさ

そして、perplexityは、

PPL(X) = exp(負の対数尤度の平均) = exp(クロスエントロピー損失の平均)

損失計算の正解ラベルに使っているのは、LLM自身の出力ではなく入力した文章そのもの(target_ids = input_ids.clone()、ここではWikiText-2のテストデータ)だから、

計算した損失は、実際の文章で次に来るトークンに対してLLMがどれくらい自信がないか、つまりその文章にどれくらい困惑しているかを表しているということ！