In [1]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL = "yhavinga/Bor-1B"
model_kwargs = {
    "device_map": "auto",
    "torch_dtype": torch.bfloat16,
    "trust_remote_code": True,
    "use_flash_attention_2": False,
}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = AutoModelForCausalLM.from_pretrained(MODEL, **model_kwargs)

In [3]:
# Be sure to add the eos token to the tokenizer
tokenizer = AutoTokenizer.from_pretrained("yhavinga/dutch-llama-tokenizer", add_eos_token=True)
tokenizer("Aap")


{'input_ids': [1, 330, 377, 2], 'attention_mask': [1, 1, 1, 1]}

Let's explore language model evaluation metrics grounded in information theory. We'll focus on:
1. Token-level perplexity = exp(-1/T ∑log P(token_i|token_{<i}, θ))
2. Bits per Word (BPW) = -1/W ∑log₂ P(token_i|token_{<i}, θ)

These metrics help us understand how well our model predicts text sequences.

In [4]:
sentence = "Een grote zak beukennoten."

inputs = tokenizer(sentence, return_tensors="pt").to(model.device)
input_ids = inputs.input_ids[0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)

with torch.no_grad():
    outputs = model(**inputs)

# Get conditional log probabilities for each token
logits = outputs.logits[0]  # shape: [sequence_length, vocab_size]
conditional_probs = torch.nn.functional.softmax(logits, dim=1)

# conditional_probs[i] contains predictions for token i+1
token_metrics = {}
for i in range(len(tokens)-1):  # -1 because we don't predict after EOS
    next_token_id = input_ids[i + 1]
    conditional_prob = conditional_probs[i, next_token_id].item()
    conditional_log_prob = torch.log(torch.tensor(conditional_prob)).item()
    token_metrics[tokens[i+1]] = {
        "conditional_prob": conditional_prob,
        "conditional_log_prob": conditional_log_prob
    }

print("Token-level conditional probabilities and log probabilities:")
for token, metrics in token_metrics.items():
    print(f"{token:12} → P(token|context)={metrics['conditional_prob']:.6f} "
          f"(log P={metrics['conditional_log_prob']:.4f})")

  return F.linear(input, self.weight, self.bias)


Token-level conditional probabilities and log probabilities:
▁Een         → P(token|context)=0.017578 (log P=-4.0411)
▁grote       → P(token|context)=0.002365 (log P=-6.0469)
▁zak         → P(token|context)=0.000362 (log P=-7.9228)
▁be          → P(token|context)=0.000083 (log P=-9.3913)
uken         → P(token|context)=0.233398 (log P=-1.4550)
n            → P(token|context)=0.388672 (log P=-0.9450)
oten         → P(token|context)=0.164062 (log P=-1.8075)
.            → P(token|context)=0.022827 (log P=-3.7798)
</s>         → P(token|context)=0.000122 (log P=-9.0148)


  attn_output = torch.nn.functional.scaled_dot_product_attention(
  attn_output = torch.nn.functional.scaled_dot_product_attention(



Let's quickly recap the core idea from Shannon's information theory. The information content of an event is inversely proportional to its probability. A highly probable event, like a fair coin landing heads, carries little information - just 1 bit. A rare event, like a specific word with a probability of 0.001, carries much more information, around 10 bits in this example. This is quantified using the negative base-2 logarithm of the probability.

In [5]:
def information_content(probability):
  """Calculates the information content of an event.

  Args:
    probability: The probability of the event (0 < probability <= 1).

  Returns:
    The information content in bits.
  """
  return -np.log2(probability)

# Example
probability_of_heads = 0.5
ic_heads = information_content(probability_of_heads)
print(f"Information content of a fair coin flip (heads): {ic_heads:.2f} bits")

probability_of_rare_word = 0.001
ic_rare_word = information_content(probability_of_rare_word)
print(f"Information content of a rare word (p=0.001): {ic_rare_word:.2f} bits")

Information content of a fair coin flip (heads): 1.00 bits
Information content of a rare word (p=0.001): 9.97 bits


For language models:
- Each token prediction has an information content based on its conditional probability
- BPW measures average information content per word
- Lower BPW indicates better predictions (less "surprise" per word)

To calculate BPW:
1. Sum conditional log probabilities (in base e) to get the log likelihood - ∑log P(token_i|token_{<i}, θ)
2. Convert to base 2 by dividing by log(2) 
3. Normalize by number of words

This gives us the average number of bits needed per word according to our model.

In [6]:
def calculate_sequence_metrics(text, token_metrics):
    """
    Calculates sequence-level evaluation metrics.
    
    Returns:
        tuple: (log_likelihood, num_tokens, log_likelihood_bits, num_words)
    """
    # Sum conditional log probabilities for sequence log-likelihood
    log_likelihood = sum(m['conditional_log_prob'] for m in token_metrics.values())
    
    # Convert to bits (log₂(P) = log_e(P)/log_e(2))
    log_likelihood_bits = -log_likelihood / np.log(2.0)
    
    num_tokens = len(token_metrics)
    num_words = len(text.split())
    
    return log_likelihood, num_tokens, log_likelihood_bits, num_words

log_likelihood, num_tokens, log_likelihood_bits, num_words = calculate_sequence_metrics(
    sentence, token_metrics
)

# Calculate metrics
token_perplexity = np.exp(-log_likelihood / num_tokens)
bits_per_word = log_likelihood_bits / num_words

print(f"Sequence Metrics:")
print(f"Token-level perplexity: {token_perplexity:.2f}")
print(f"Bits per word: {bits_per_word:.2f}")

Sequence Metrics:
Token-level perplexity: 138.91
Bits per word: 16.02
