In [3]:
from transformers import AutoTokenizer

# Tokenizer under inspection, loaded from a path relative to the notebook
tokenizer = AutoTokenizer.from_pretrained("models/opt-babylm-100m-bpe")  # or unigram tokenizer path

# Probe words whose subword segmentation we want to eyeball
words = [
    "kiwis",
    "wolverines",
    "wug",
    "wugs",
    "blurgidy",
]

# Print each word next to the pieces the tokenizer splits it into
for w in words:
    pieces = tokenizer.tokenize(w)
    print(w, "→", pieces)


kiwis → ['ki', 'w', 'is']
wolverines → ['w', 'ol', 'ver', 'ines']
wug → ['w', 'ug']
wugs → ['w', 'ugs']
blurgidy → ['bl', 'ur', 'g', 'id', 'y']


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Checkpoint to inspect; the same identifier is used for tokenizer and weights
model_name = "znhoughton/opt-babylm-125m-seed42"  # or your local path
tokenizer = AutoTokenizer.from_pretrained(model_name)
# eval() returns the module itself, so loading and the mode switch can be chained
model = AutoModelForCausalLM.from_pretrained(model_name).eval()

# Prompt shared by every check in this cell
text = "The boy went outside to fly his"

banner = "="*50  # horizontal rule framing each section title

print(banner)
print("TOKENIZATION TEST")
print(banner)

encoded = tokenizer(text, return_tensors="pt")
prompt_ids = encoded['input_ids'][0]  # only one prompt in the batch, so take row 0

print(f"Input text: '{text}'")
print(f"Token IDs: {prompt_ids.tolist()}")
print(f"Tokens: {tokenizer.convert_ids_to_tokens(prompt_ids)}")
print(f"Decoded: '{tokenizer.decode(prompt_ids)}'")

print("\n" + banner)
print("SPECIAL TOKENS CHECK")
print(banner)

# Report each special token as the tokenizer exposes it, with its vocabulary id
print(f"BOS: '{tokenizer.bos_token}' (ID: {tokenizer.bos_token_id})")
print(f"EOS: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
print(f"PAD: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
print(f"UNK: '{tokenizer.unk_token}' (ID: {tokenizer.unk_token_id})")
print(f"Vocab size: {len(tokenizer)}")

print("\n" + "="*50)
print("NEXT WORD PREDICTION")
print("="*50)

# A single forward pass feeds both this section and the per-position report
# below: the prompt does not change in between, so running the model a second
# time would only repeat the same computation.
with torch.no_grad():
    outputs = model(**encoded)
logits = outputs.logits  # Shape: [batch_size, sequence_length, vocab_size]

# Logits at the final position score the token that would follow the prompt
next_token_logits = logits[0, -1, :]

# Get top 10 predictions
top_k = 10
probs = torch.softmax(next_token_logits, dim=-1)
top_probs, top_indices = torch.topk(probs, top_k)

print(f"Top {top_k} next word predictions:")
# .tolist() hands decode() plain Python ints rather than 0-dim tensors
for i, (prob, idx) in enumerate(zip(top_probs.tolist(), top_indices.tolist())):
    token = tokenizer.decode([idx])
    print(f"  {i+1}. '{token}' (ID: {idx}, prob: {prob:.4f})")

print("\n" + "="*50)
print("LOGITS FOR EACH INPUT TOKEN")
print("="*50)

print(f"Logits shape: {logits.shape}")
print(f"(batch_size={logits.shape[0]}, sequence_length={logits.shape[1]}, vocab_size={logits.shape[2]})")

for pos in range(logits.shape[1]):
    # The input token sitting at this position
    token_id = encoded['input_ids'][0, pos].item()
    token_str = tokenizer.decode([token_id])

    # Get logits for this position
    position_logits = logits[0, pos, :]

    # Get top prediction at this position
    predicted_id = torch.argmax(position_logits).item()
    predicted_token = tokenizer.decode([predicted_id])
    predicted_logit = position_logits[predicted_id].item()

    print(f"\nPosition {pos}: '{token_str}' (ID: {token_id})")
    print(f"  Top prediction: '{predicted_token}' (ID: {predicted_id}, logit: {predicted_logit:.4f})")
    print(f"  Logit range: [{position_logits.min().item():.2f}, {position_logits.max().item():.2f}]")
    print(f"  Mean logit: {position_logits.mean().item():.4f}")

print("\n" + "="*50)
print("GENERATION TEST")
print("="*50)

# Sampling (do_sample=True) draws from torch's global RNG; seeding it here
# makes Restart & Run All reproduce the same continuation.
torch.manual_seed(42)

# Generate continuation
prompt_inputs = tokenizer(text, return_tensors="pt")
input_ids = prompt_inputs.input_ids
generated = model.generate(
    input_ids,
    # Forward the tokenizer's mask explicitly when it produced one; None keeps
    # generate()'s default behaviour for tokenizers that do not return a mask.
    attention_mask=prompt_inputs.get("attention_mask"),
    max_new_tokens=20,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id
)

# generated[0] is the only sequence in the batch; special tokens are stripped for display
generated_text = tokenizer.decode(generated[0], skip_special_tokens=True)
print(f"Input: '{text}'")
print(f"Generated: '{generated_text}'")

print("\n" + "="*50)
print("PERPLEXITY CHECK (on input)")
print("="*50)

# Score the prompt against itself: passing the input ids as labels makes the
# model return a loss alongside the logits.
with torch.no_grad():
    outputs = model(labels=encoded["input_ids"], **encoded)

loss = outputs.loss
perplexity = loss.exp()  # perplexity is reported as exp(loss)

print(f"Loss: {loss.item():.4f}")
print(f"Perplexity: {perplexity.item():.4f}")

TOKENIZATION TEST
Input text: 'The boy went outside to fly his'
Token IDs: [1, 357, 652, 716, 1662, 192, 2024, 305]
Tokens: ['<s>', 'the', 'Ġboy', 'Ġwent', 'Ġoutside', 'Ġto', 'Ġfly', 'Ġhis']
Decoded: '<s>the boy went outside to fly his'

SPECIAL TOKENS CHECK
BOS: '<s>' (ID: 1)
EOS: '</s>' (ID: 2)
PAD: '<pad>' (ID: 0)
UNK: '<unk>' (ID: 3)
Vocab size: 8192

NEXT WORD PREDICTION
Top 10 next word predictions:
  1. ' head' (ID: 752, prob: 0.0219)
  2. ' hand' (ID: 625, prob: 0.0183)
  3. ' own' (ID: 881, prob: 0.0174)
  4. ' eyes' (ID: 1099, prob: 0.0165)
  5. ' father' (ID: 938, prob: 0.0124)
  6. ' hands' (ID: 1172, prob: 0.0116)
  7. ' mother' (ID: 826, prob: 0.0115)
  8. ' face' (ID: 1106, prob: 0.0097)
  9. ' feet' (ID: 1476, prob: 0.0076)
  10. ' way' (ID: 582, prob: 0.0072)

LOGITS FOR EACH INPUT TOKEN
Logits shape: torch.Size([1, 8, 8192])
(batch_size=1, sequence_length=8, vocab_size=8192)

Position 0: '<s>' (ID: 1)
  Top prediction: '*' (ID: 13, logit: 10.2652)
  Logit range: [-3.6

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import math

# Checkpoint scored in this cell; tokenizer and weights share the identifier
model_name = "znhoughton/opt-babylm-125m-seed42"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# eval() returns the module itself, so loading and the mode switch can be chained
model = AutoModelForCausalLM.from_pretrained(model_name).eval()

def get_phrase_probability(text, model, tokenizer, verbose=False):
    """
    Score a phrase under a causal language model.

    Every token after the first is scored with the logits produced at the
    position before it, so the first token of the encoded sequence only
    serves as context and is never scored itself.

    Args:
        text: Phrase to score.
        model: Called as ``model(**encoded, labels=input_ids)``; the result
            must expose ``.logits`` of shape ``[1, seq_len, vocab_size]`` and
            a scalar ``.loss``.
        tokenizer: Called as ``tokenizer(text, return_tensors="pt")``; the
            result must contain ``"input_ids"``. ``tokenizer.decode`` is used
            to render each scored token.
        verbose: When True, print one line per scored token.

    Returns:
        ``(total_log_prob, perplexity, token_probs)`` where ``total_log_prob``
        is the sum of the per-token natural-log probabilities, ``perplexity``
        is ``exp(loss)`` using the loss reported by the model, and
        ``token_probs`` is a list of dicts with keys ``token``, ``token_id``,
        ``log_prob`` and ``prob`` (one dict per scored token, in order).
    """
    encoded = tokenizer(text, return_tensors="pt")

    # The tokenizer output is not placed on any particular device here; move it
    # next to the model so the forward pass also works after e.g.
    # model.to("cuda"). Objects without .device / .to are left untouched.
    device = getattr(model, "device", None)
    if device is not None and hasattr(encoded, "to"):
        encoded = encoded.to(device)

    with torch.no_grad():
        outputs = model(**encoded, labels=encoded["input_ids"])

    # Logits at position i predict token i + 1: drop the last position (it has
    # no next token) and the first token (nothing precedes it).
    next_token_ids = encoded["input_ids"][0, 1:]
    log_probs = torch.log_softmax(outputs.logits[0, :-1, :], dim=-1)

    # Pick the log-probability of each actual next token in one vectorised step
    token_log_probs = (
        log_probs.gather(1, next_token_ids.unsqueeze(1)).squeeze(1).tolist()
    )

    token_probs_list = []
    for next_token_id, token_log_prob in zip(next_token_ids.tolist(), token_log_probs):
        token_str = tokenizer.decode([next_token_id])  # decoded once, reused below
        token_prob = math.exp(token_log_prob)

        token_probs_list.append({
            'token': token_str,
            'token_id': next_token_id,
            'log_prob': token_log_prob,
            'prob': token_prob
        })

        if verbose:
            print(f"  Token: '{token_str}' | "
                  f"Prob: {token_prob:.6f} | Log-prob: {token_log_prob:.4f}")

    # Total log probability (sum of log probs)
    total_log_prob = sum(token_log_probs)

    # Perplexity (from the model's loss)
    perplexity = math.exp(outputs.loss.item())

    return total_log_prob, perplexity, token_probs_list

# Phrases to score: coordinated pairs in both orders, plus one plain sentence
phrases = [
    "he likes bread and butter",
    "he likes butter and bread",
    "the cat slept on the mat",
    "it's not all black and white",
    "it's not all white and black",
    "I turned on the radios and televisions",
    "I turned on the televisions and radios"
]

print("="*70)
print("COMPARING PHRASE PROBABILITIES")
print("="*70)

results = []
for phrase in phrases:
    print(f"\nPhrase: '{phrase}'")
    log_prob, perplexity, token_probs = get_phrase_probability(
        phrase, model, tokenizer, verbose=True
    )

    # exp of the summed log-probs gives the joint probability (a very small number)
    probability = math.exp(log_prob)

    # exp of the mean log-prob, i.e. the geometric mean of the token
    # probabilities; falls back to 0 when no token was scored
    if token_probs:
        avg_token_prob = math.exp(log_prob / len(token_probs))
    else:
        avg_token_prob = 0

    results.append({
        'phrase': phrase,
        'log_prob': log_prob,
        'probability': probability,
        'perplexity': perplexity,
        'avg_token_prob': avg_token_prob,
    })

    print(f"  → Total log-probability: {log_prob:.4f}")
    print(f"  → Probability: {probability:.2e}")
    print(f"  → Perplexity: {perplexity:.4f}")
    print(f"  → Avg token probability: {avg_token_prob:.6f}")

# Rank by probability
print("\n" + "="*70)
print("RANKING (Best to Worst)")
print("="*70)

# Highest total log-probability first; sorted() is stable, like list.sort()
results = sorted(results, key=lambda entry: entry['log_prob'], reverse=True)
for rank, entry in enumerate(results, start=1):
    print(f"{rank}. '{entry['phrase']}'")
    print(f"   Log-prob: {entry['log_prob']:.4f} | Perplexity: {entry['perplexity']:.4f}")

Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

COMPARING PHRASE PROBABILITIES

Phrase: 'he likes bread and butter'
  Token: 'he' | Prob: 0.005582 | Log-prob: -5.1883
  Token: ' likes' | Prob: 0.000898 | Log-prob: -7.0155
  Token: ' bread' | Prob: 0.000114 | Log-prob: -9.0836
  Token: ' and' | Prob: 0.127637 | Log-prob: -2.0586
  Token: ' butter' | Prob: 0.146677 | Log-prob: -1.9195
  → Total log-probability: -25.2655
  → Probability: 1.06e-11
  → Perplexity: 156.5062
  → Avg token probability: 0.006390

Phrase: 'he likes butter and bread'
  Token: 'he' | Prob: 0.005582 | Log-prob: -5.1883
  Token: ' likes' | Prob: 0.000898 | Log-prob: -7.0155
  Token: ' butter' | Prob: 0.000091 | Log-prob: -9.3060
  Token: ' and' | Prob: 0.026030 | Log-prob: -3.6485
  Token: ' bread' | Prob: 0.020235 | Log-prob: -3.9004
  → Total log-probability: -29.0586
  → Probability: 2.40e-13
  → Perplexity: 334.1950
  → Avg token probability: 0.002992

Phrase: 'the cat slept on the mat'
  Token: 'the' | Prob: 0.011653 | Log-prob: -4.4522
  Token: ' cat' | Pro

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import math

# Checkpoint scored in this cell; tokenizer and weights share the identifier
model_name = "znhoughton/opt-babylm-1.3b-seed42"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# eval() returns the module itself, so loading and the mode switch can be chained
model = AutoModelForCausalLM.from_pretrained(model_name).eval()

def get_phrase_probability(text, model, tokenizer, verbose=False):
    """
    Score a phrase under a causal language model.

    Every token after the first is scored with the logits produced at the
    position before it, so the first token of the encoded sequence only
    serves as context and is never scored itself.

    Args:
        text: Phrase to score.
        model: Called as ``model(**encoded, labels=input_ids)``; the result
            must expose ``.logits`` of shape ``[1, seq_len, vocab_size]`` and
            a scalar ``.loss``.
        tokenizer: Called as ``tokenizer(text, return_tensors="pt")``; the
            result must contain ``"input_ids"``. ``tokenizer.decode`` is used
            to render each scored token.
        verbose: When True, print one line per scored token.

    Returns:
        ``(total_log_prob, perplexity, token_probs)`` where ``total_log_prob``
        is the sum of the per-token natural-log probabilities, ``perplexity``
        is ``exp(loss)`` using the loss reported by the model, and
        ``token_probs`` is a list of dicts with keys ``token``, ``token_id``,
        ``log_prob`` and ``prob`` (one dict per scored token, in order).
    """
    encoded = tokenizer(text, return_tensors="pt")

    # The tokenizer output is not placed on any particular device here; move it
    # next to the model so the forward pass also works after e.g.
    # model.to("cuda"). Objects without .device / .to are left untouched.
    device = getattr(model, "device", None)
    if device is not None and hasattr(encoded, "to"):
        encoded = encoded.to(device)

    with torch.no_grad():
        outputs = model(**encoded, labels=encoded["input_ids"])

    # Logits at position i predict token i + 1: drop the last position (it has
    # no next token) and the first token (nothing precedes it).
    next_token_ids = encoded["input_ids"][0, 1:]
    log_probs = torch.log_softmax(outputs.logits[0, :-1, :], dim=-1)

    # Pick the log-probability of each actual next token in one vectorised step
    token_log_probs = (
        log_probs.gather(1, next_token_ids.unsqueeze(1)).squeeze(1).tolist()
    )

    token_probs_list = []
    for next_token_id, token_log_prob in zip(next_token_ids.tolist(), token_log_probs):
        token_str = tokenizer.decode([next_token_id])  # decoded once, reused below
        token_prob = math.exp(token_log_prob)

        token_probs_list.append({
            'token': token_str,
            'token_id': next_token_id,
            'log_prob': token_log_prob,
            'prob': token_prob
        })

        if verbose:
            print(f"  Token: '{token_str}' | "
                  f"Prob: {token_prob:.6f} | Log-prob: {token_log_prob:.4f}")

    # Total log probability (sum of log probs)
    total_log_prob = sum(token_log_probs)

    # Perplexity (from the model's loss)
    perplexity = math.exp(outputs.loss.item())

    return total_log_prob, perplexity, token_probs_list

# Phrases to score: coordinated pairs in both orders, plus one plain sentence
phrases = [
    "he likes bread and butter",
    "he likes butter and bread",
    "the cat slept on the mat",
    "it's not all black and white",
    "it's not all white and black",
    "I turned on the radios and televisions",
    "I turned on the televisions and radios"
]

print("="*70)
print("COMPARING PHRASE PROBABILITIES")
print("="*70)

results = []
for phrase in phrases:
    print(f"\nPhrase: '{phrase}'")
    log_prob, perplexity, token_probs = get_phrase_probability(
        phrase, model, tokenizer, verbose=True
    )

    # exp of the summed log-probs gives the joint probability (a very small number)
    probability = math.exp(log_prob)

    # exp of the mean log-prob, i.e. the geometric mean of the token
    # probabilities; falls back to 0 when no token was scored
    if token_probs:
        avg_token_prob = math.exp(log_prob / len(token_probs))
    else:
        avg_token_prob = 0

    results.append({
        'phrase': phrase,
        'log_prob': log_prob,
        'probability': probability,
        'perplexity': perplexity,
        'avg_token_prob': avg_token_prob,
    })

    print(f"  → Total log-probability: {log_prob:.4f}")
    print(f"  → Probability: {probability:.2e}")
    print(f"  → Perplexity: {perplexity:.4f}")
    print(f"  → Avg token probability: {avg_token_prob:.6f}")

# Rank by probability
print("\n" + "="*70)
print("RANKING (Best to Worst)")
print("="*70)

# Highest total log-probability first; sorted() is stable, like list.sort()
results = sorted(results, key=lambda entry: entry['log_prob'], reverse=True)
for rank, entry in enumerate(results, start=1):
    print(f"{rank}. '{entry['phrase']}'")
    print(f"   Log-prob: {entry['log_prob']:.4f} | Perplexity: {entry['perplexity']:.4f}")

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/4.91G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/388 [00:00<?, ?it/s]

COMPARING PHRASE PROBABILITIES

Phrase: 'he likes bread and butter'
  Token: 'he' | Prob: 0.003994 | Log-prob: -5.5230
  Token: ' likes' | Prob: 0.001919 | Log-prob: -6.2558
  Token: ' bread' | Prob: 0.000122 | Log-prob: -9.0086
  Token: ' and' | Prob: 0.217836 | Log-prob: -1.5240
  Token: ' butter' | Prob: 0.246719 | Log-prob: -1.3995
  → Total log-probability: -23.7109
  → Probability: 5.04e-11
  → Perplexity: 114.6844
  → Avg token probability: 0.008720

Phrase: 'he likes butter and bread'
  Token: 'he' | Prob: 0.003994 | Log-prob: -5.5230
  Token: ' likes' | Prob: 0.001919 | Log-prob: -6.2558
  Token: ' butter' | Prob: 0.000163 | Log-prob: -8.7243
  Token: ' and' | Prob: 0.059628 | Log-prob: -2.8196
  Token: ' bread' | Prob: 0.013843 | Log-prob: -4.2800
  → Total log-probability: -27.6027
  → Probability: 1.03e-12
  → Perplexity: 249.7692
  → Avg token probability: 0.004004

Phrase: 'the cat slept on the mat'
  Token: 'the' | Prob: 0.010527 | Log-prob: -4.5538
  Token: ' cat' | Pro