# Pythia 160m

In [42]:
import torch
import transformer_lens.utils as utils
from transformer_lens import HookedTransformer
# Load the Pythia 160m checkpoint into a HookedTransformer.
model = HookedTransformer.from_pretrained("EleutherAI/pythia-160m")
print("Loaded: Pythia 160m")

# Headline config values, printed one "label: value" line each.
for label, value in (
    ("Layers", model.cfg.n_layers),
    ("Heads", model.cfg.n_heads),
    ("Hidden size", model.cfg.d_model),
):
    print(f"{label}: {value}")

# Element count summed over every parameter tensor, shown in millions.
param_count = sum(weight.numel() for weight in model.parameters())
print(f"Params: {param_count / 1e6:.1f}M")

Loaded pretrained model EleutherAI/pythia-160m into HookedTransformer
Loaded: Pythia 160m
Layers: 12
Heads: 12
Hidden size: 768
Params: 162.3M


In [17]:
prompt = "The capital of France is"

# Generate one token at temperature=0. verbose=False keeps the tqdm progress
# bar out of the cell output so only the two printed lines remain.
output = model.generate(prompt, max_new_tokens=1, temperature=0, verbose=False)
print(f"Input: {prompt}")
print(f"Output: {output}")

# Cache internal states. The later cells only read forward activations, so
# run without autograd; otherwise each cached tensor also keeps graph state.
with torch.no_grad():
    logits, cache = model.run_with_cache(prompt)
print(f"\nCached {len(cache)} different activation points!")

  0%|          | 0/1 [00:00<?, ?it/s]

Input: The capital of France is
Output: The capital of France is in

Cached 219 different activation points!


In [18]:
import torch
# Next-token distribution: logits at the last prompt position of the first
# (only) batch row, normalised into probabilities.
logits_for_next = logits[0, -1, :]
probs = torch.softmax(logits_for_next, dim=0)

# Ten largest probabilities together with their token ids.
top_probs, top_tokens = torch.topk(probs, 10)

print("\nTop 10 predictions:")
for token_id, prob in zip(top_tokens, top_probs):
    token_str = model.tokenizer.decode(token_id)
    print(f"{token_str:15} {prob:.3%}")


Top 10 predictions:
 in             5.605%
 the            5.600%
 being          5.279%
 a              4.515%
 located        3.621%
 under          3.160%
 known          2.357%
 at             1.989%
 not            1.692%
 now            1.424%


In [19]:
# Test if it knows facts about France
test_prompts = [
    "The capital of France is",
    "Paris is the capital of",
    "France is a country in",
    "The Eiffel Tower is in",
]

# Three-token continuation per prompt at temperature=0.
# Dedicated loop names leave the earlier `prompt` / `output` (the ones that
# `logits` and `cache` belong to) untouched, and verbose=False stops a tqdm
# bar from being printed between every result row.
for test_prompt in test_prompts:
    generated = model.generate(
        test_prompt, max_new_tokens=3, temperature=0, verbose=False
    )
    # The returned text starts with the prompt; show only what was added.
    print(f"{test_prompt:30} → {generated[len(test_prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital of France is       →  in the hands


  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the capital of        →  France, and


  0%|          | 0/3 [00:00<?, ?it/s]

France is a country in         →  the world with


  0%|          | 0/3 [00:00<?, ?it/s]

The Eiffel Tower is in         →  the middle of


In [20]:
# Short factual prompts outside the France theme.
simple_facts = [
    "Water freezes at",
    "The sun is a",
    "Two plus two equals",
    "The color of the sky is",
    "Dogs are",
]

# Three-token continuation per prompt at temperature=0.
# Dedicated loop names avoid overwriting the notebook-level `prompt` /
# `output`; verbose=False keeps tqdm bars out of the results table.
for fact_prompt in simple_facts:
    generated = model.generate(
        fact_prompt, max_new_tokens=3, temperature=0, verbose=False
    )
    # The returned text starts with the prompt; show only what was added.
    print(f"{fact_prompt:30} → {generated[len(fact_prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

Water freezes at               →  the end of


  0%|          | 0/3 [00:00<?, ?it/s]

The sun is a                   →  great source of


  0%|          | 0/3 [00:00<?, ?it/s]

Two plus two equals            →  two




  0%|          | 0/3 [00:00<?, ?it/s]

The color of the sky is        →  the color of


  0%|          | 0/3 [00:00<?, ?it/s]

Dogs are                       →  a great way


In [21]:
# Test variations of the same fact
variations = [
    "The capital city of France is",
    "France's capital is",
    "What is the capital of France?",
    "Capital of France:",
    "Paris is the",
]

# Three-token continuation per phrasing at temperature=0.
# Dedicated loop names avoid overwriting the notebook-level `prompt` /
# `output`; verbose=False keeps tqdm bars out of the results table.
for variation in variations:
    generated = model.generate(
        variation, max_new_tokens=3, temperature=0, verbose=False
    )
    # The returned text starts with the prompt; show only what was added.
    print(f"{variation:35} → {generated[len(variation):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital city of France is       →  the capital of


  0%|          | 0/3 [00:00<?, ?it/s]

France's capital is                 →  expected to be


  0%|          | 0/3 [00:00<?, ?it/s]

What is the capital of France?      → 

The


  0%|          | 0/3 [00:00<?, ?it/s]

Capital of France:                  → 

The


  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the                        →  capital of France


In [22]:
# Run this between models
import gc

# Deleting `model` alone does not release the weights: `cache` (an
# ActivationCache, which stores the model it was built from) and `logits`
# are still alive. Drop all three. Using pop() instead of `del` keeps the
# cell safe to re-run when some of the names are already gone.
for stale_name in ("model", "logits", "cache"):
    globals().pop(stale_name, None)
gc.collect()

# Return cached allocator blocks on whichever accelerator backend is present.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
elif torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Memory cleared")

Memory cleared


# Pythia 410m

In [24]:
# Load the Pythia 410m checkpoint into a HookedTransformer.
model = HookedTransformer.from_pretrained("pythia-410m")
print("Loaded: Pythia 410m")

# Headline config values, printed one "label: value" line each.
for label, value in (
    ("Layers", model.cfg.n_layers),
    ("Heads", model.cfg.n_heads),
    ("Hidden size", model.cfg.d_model),
):
    print(f"{label}: {value}")

# Element count summed over every parameter tensor, shown in millions.
param_count = sum(weight.numel() for weight in model.parameters())
print(f"Params: {param_count / 1e6:.1f}M")

Loaded pretrained model pythia-410m into HookedTransformer
Loaded: Pythia 410m
Layers: 24
Heads: 16
Hidden size: 1024
Params: 405.3M


In [25]:
prompt = "The capital of France is"

# Generate one token at temperature=0. verbose=False keeps the tqdm progress
# bar out of the cell output so only the two printed lines remain.
output = model.generate(prompt, max_new_tokens=1, temperature=0, verbose=False)
print(f"Input: {prompt}")
print(f"Output: {output}")

# Cache internal states. The later cells only read forward activations, so
# run without autograd; otherwise each cached tensor also keeps graph state.
with torch.no_grad():
    logits, cache = model.run_with_cache(prompt)
print(f"\nCached {len(cache)} different activation points!")

  0%|          | 0/1 [00:00<?, ?it/s]

Input: The capital of France is
Output: The capital of France is the

Cached 435 different activation points!


In [26]:
import torch

# Next-token distribution: logits at the last prompt position of the first
# (only) batch row, normalised into probabilities.
logits_for_next = logits[0, -1, :]
probs = torch.softmax(logits_for_next, dim=0)

# Ten largest probabilities together with their token ids.
top_probs, top_tokens = torch.topk(probs, 10)

print("\nTop 10 predictions:")
for token_id, prob in zip(top_tokens, top_probs):
    token_str = model.tokenizer.decode(token_id)
    print(f"{token_str:15} {prob:.3%}")


Top 10 predictions:
 the            12.407%
 a              8.014%
 located        5.446%
 Paris          5.010%
 in             3.163%
 now            2.583%
 one            2.498%
 not            2.203%
 home           1.830%
 at             1.382%


In [4]:
# Test if it knows facts about France
test_prompts = [
    "The capital of France is",
    "Paris is the capital of",
    "France is a country in",
    "The Eiffel Tower is in",
]

# Three-token continuation per prompt at temperature=0.
# Dedicated loop names leave the earlier `prompt` / `output` (the ones that
# `logits` and `cache` belong to) untouched, and verbose=False stops a tqdm
# bar from being printed between every result row.
for test_prompt in test_prompts:
    generated = model.generate(
        test_prompt, max_new_tokens=3, temperature=0, verbose=False
    )
    # The returned text starts with the prompt; show only what was added.
    print(f"{test_prompt:30} → {generated[len(test_prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital of France is       →  the capital of


  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the capital of        →  France, and


  0%|          | 0/3 [00:00<?, ?it/s]

France is a country in         →  which the right


  0%|          | 0/3 [00:00<?, ?it/s]

The Eiffel Tower is in         →  Paris, France


In [27]:
# Short factual prompts outside the France theme.
simple_facts = [
    "Water freezes at",
    "The sun is a",
    "Two plus two equals",
    "The color of the sky is",
    "Dogs are",
]

# Three-token continuation per prompt at temperature=0.
# Dedicated loop names avoid overwriting the notebook-level `prompt` /
# `output`; verbose=False keeps tqdm bars out of the results table.
for fact_prompt in simple_facts:
    generated = model.generate(
        fact_prompt, max_new_tokens=3, temperature=0, verbose=False
    )
    # The returned text starts with the prompt; show only what was added.
    print(f"{fact_prompt:30} → {generated[len(fact_prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

Water freezes at               →  the surface of


  0%|          | 0/3 [00:00<?, ?it/s]

The sun is a                   →  powerful force in


  0%|          | 0/3 [00:00<?, ?it/s]

Two plus two equals            →  four.



  0%|          | 0/3 [00:00<?, ?it/s]

The color of the sky is        →  a very important


  0%|          | 0/3 [00:00<?, ?it/s]

Dogs are                       →  a great companion


In [28]:
# Test variations of the same fact
variations = [
    "The capital city of France is",
    "France's capital is",
    "What is the capital of France?",
    "Capital of France:",
    "Paris is the",
]

# Three-token continuation per phrasing at temperature=0.
# Dedicated loop names avoid overwriting the notebook-level `prompt` /
# `output`; verbose=False keeps tqdm bars out of the results table.
for variation in variations:
    generated = model.generate(
        variation, max_new_tokens=3, temperature=0, verbose=False
    )
    # The returned text starts with the prompt; show only what was added.
    print(f"{variation:35} → {generated[len(variation):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital city of France is       →  a city of


  0%|          | 0/3 [00:00<?, ?it/s]

France's capital is                 →  the most expensive


  0%|          | 0/3 [00:00<?, ?it/s]

What is the capital of France?      → 

The


  0%|          | 0/3 [00:00<?, ?it/s]

Capital of France:                  →  The French government


  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the                        →  capital of France


In [29]:
# Run this between models
import gc

# Deleting `model` alone does not release the weights: `cache` (an
# ActivationCache, which stores the model it was built from) and `logits`
# are still alive. Drop all three. Using pop() instead of `del` keeps the
# cell safe to re-run when some of the names are already gone.
for stale_name in ("model", "logits", "cache"):
    globals().pop(stale_name, None)
gc.collect()

# Return cached allocator blocks on whichever accelerator backend is present.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
elif torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Memory cleared")

Memory cleared


# Pythia 1b

In [43]:
from transformer_lens import HookedTransformer
import transformer_lens.utils as utils

# Load Pythia 1b
model = HookedTransformer.from_pretrained("pythia-1b")
print("Loaded: Pythia 1b")

# Headline config values, printed one "label: value" line each.
for label, value in (
    ("Layers", model.cfg.n_layers),
    ("Heads", model.cfg.n_heads),
    ("Hidden size", model.cfg.d_model),
):
    print(f"{label}: {value}")

# Element count summed over every parameter tensor, shown in millions.
param_count = sum(weight.numel() for weight in model.parameters())
print(f"Params: {param_count / 1e6:.1f}M")

Loaded pretrained model pythia-1b into HookedTransformer
Loaded: Pythia 1b
Layers: 16
Heads: 8
Hidden size: 2048
Params: 1011.7M


In [31]:
prompt = "The capital of France is"

# Generate one token at temperature=0. verbose=False keeps the tqdm progress
# bar out of the cell output so only the two printed lines remain.
output = model.generate(prompt, max_new_tokens=1, temperature=0, verbose=False)
print(f"Input: {prompt}")
print(f"Output: {output}")

# Cache internal states. The later cells only read forward activations, so
# run without autograd; otherwise each cached tensor also keeps graph state.
with torch.no_grad():
    logits, cache = model.run_with_cache(prompt)
print(f"\nCached {len(cache)} different activation points!")

  0%|          | 0/1 [00:00<?, ?it/s]

Input: The capital of France is
Output: The capital of France is the

Cached 291 different activation points!


In [32]:
import torch

# Next-token distribution: logits at the last prompt position of the first
# (only) batch row, normalised into probabilities.
logits_for_next = logits[0, -1, :]
probs = torch.softmax(logits_for_next, dim=0)

# Ten largest probabilities together with their token ids.
top_probs, top_tokens = torch.topk(probs, 10)

print("\nTop 10 predictions:")
for token_id, prob in zip(top_tokens, top_probs):
    token_str = model.tokenizer.decode(token_id)
    print(f"{token_str:15} {prob:.3%}")


Top 10 predictions:
 the            6.310%
 a              4.804%
 in             2.552%
 being          2.138%
 set            2.095%
 now            1.990%
 not            1.802%
 one            1.775%
 under          1.657%
 on             1.444%


In [22]:
# Test if it knows facts about France
test_prompts = [
    "The capital of France is",
    "Paris is the capital of",
    "France is a country in",
    "The Eiffel Tower is in",
]

# Three-token continuation per prompt at temperature=0.
# Dedicated loop names leave the earlier `prompt` / `output` (the ones that
# `logits` and `cache` belong to) untouched, and verbose=False stops a tqdm
# bar from being printed between every result row.
for test_prompt in test_prompts:
    generated = model.generate(
        test_prompt, max_new_tokens=3, temperature=0, verbose=False
    )
    # The returned text starts with the prompt; show only what was added.
    print(f"{test_prompt:30} → {generated[len(test_prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital of France is       →  a city of


  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the capital of        →  France, and


  0%|          | 0/3 [00:00<?, ?it/s]

France is a country in         →  which the French


  0%|          | 0/3 [00:00<?, ?it/s]

The Eiffel Tower is in         →  the midst of


In [33]:
# Short factual prompts outside the France theme.
simple_facts = [
    "Water freezes at",
    "The sun is a",
    "Two plus two equals",
    "The color of the sky is",
    "Dogs are",
]

# Three-token continuation per prompt at temperature=0.
# Dedicated loop names avoid overwriting the notebook-level `prompt` /
# `output`; verbose=False keeps tqdm bars out of the results table.
for fact_prompt in simple_facts:
    generated = model.generate(
        fact_prompt, max_new_tokens=3, temperature=0, verbose=False
    )
    # The returned text starts with the prompt; show only what was added.
    print(f"{fact_prompt:30} → {generated[len(fact_prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

Water freezes at               →  a lower temperature


  0%|          | 0/3 [00:00<?, ?it/s]

The sun is a                   →  powerful source of


  0%|          | 0/3 [00:00<?, ?it/s]

Two plus two equals            →  four.



  0%|          | 0/3 [00:00<?, ?it/s]

The color of the sky is        →  changing. The


  0%|          | 0/3 [00:00<?, ?it/s]

Dogs are                       →  the most popular


In [34]:
# Different phrasings of the "capital of France" fact.
variations = [
    "The capital city of France is",
    "France's capital is",
    "What is the capital of France?",
    "Capital of France:",
    "Paris is the",
]

# Three-token continuation per phrasing at temperature=0.
# Dedicated loop names avoid overwriting the notebook-level `prompt` /
# `output`; verbose=False keeps tqdm bars out of the results table.
for variation in variations:
    generated = model.generate(
        variation, max_new_tokens=3, temperature=0, verbose=False
    )
    # The returned text starts with the prompt; show only what was added.
    print(f"{variation:35} → {generated[len(variation):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital city of France is       →  the most expensive


  0%|          | 0/3 [00:00<?, ?it/s]

France's capital is                 →  set to become


  0%|          | 0/3 [00:00<?, ?it/s]

What is the capital of France?      → 

The


  0%|          | 0/3 [00:00<?, ?it/s]

Capital of France:                  →  Paris




  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the                        →  capital of France


In [35]:
# Run this between models
import gc

# Deleting `model` alone does not release the weights: `cache` (an
# ActivationCache, which stores the model it was built from) and `logits`
# are still alive. Drop all three. Using pop() instead of `del` keeps the
# cell safe to re-run when some of the names are already gone.
for stale_name in ("model", "logits", "cache"):
    globals().pop(stale_name, None)
gc.collect()

# Return cached allocator blocks on whichever accelerator backend is present.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
elif torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Memory cleared")

Memory cleared


# Pythia 2.8b

In [36]:
from transformer_lens import HookedTransformer
import transformer_lens.utils as utils

# Load Pythia 2.8b
model = HookedTransformer.from_pretrained("pythia-2.8b")
print("Loaded: Pythia 2.8b")

# Headline config values, printed one "label: value" line each.
for label, value in (
    ("Layers", model.cfg.n_layers),
    ("Heads", model.cfg.n_heads),
    ("Hidden size", model.cfg.d_model),
):
    print(f"{label}: {value}")

# Element count summed over every parameter tensor, shown in millions.
param_count = sum(weight.numel() for weight in model.parameters())
print(f"Params: {param_count / 1e6:.1f}M")

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Loaded pretrained model pythia-2.8b into HookedTransformer
Loaded: Pythia 2.8b
Layers: 32
Heads: 32
Hidden size: 2560
Params: 2774.9M


In [37]:
prompt = "The capital of France is"

# Generate one token at temperature=0. verbose=False keeps the tqdm progress
# bar out of the cell output so only the two printed lines remain.
output = model.generate(prompt, max_new_tokens=1, temperature=0, verbose=False)
print(f"Input: {prompt}")
print(f"Output: {output}")

# Cache internal states. The later cells only read forward activations, so
# run without autograd; otherwise each cached tensor also keeps graph state.
with torch.no_grad():
    logits, cache = model.run_with_cache(prompt)
print(f"\nCached {len(cache)} different activation points!")

  0%|          | 0/1 [00:00<?, ?it/s]

Input: The capital of France is
Output: The capital of France is Paris

Cached 579 different activation points!


In [38]:
import torch

# Next-token distribution: logits at the last prompt position of the first
# (only) batch row, normalised into probabilities.
logits_for_next = logits[0, -1, :]
probs = torch.softmax(logits_for_next, dim=0)

# Ten largest probabilities together with their token ids.
top_probs, top_tokens = torch.topk(probs, 10)

print("\nTop 10 predictions:")
for token_id, prob in zip(top_tokens, top_probs):
    token_str = model.tokenizer.decode(token_id)
    print(f"{token_str:15} {prob:.3%}")


Top 10 predictions:
 Paris          13.080%
 a              12.051%
 the            8.909%
 one            5.757%
 located        2.245%
 in             1.929%
 an             1.678%
 known          1.583%
 also           1.556%
 not            1.547%


In [28]:
# Test if it knows facts about France
test_prompts = [
    "The capital of France is",
    "Paris is the capital of",
    "France is a country in",
    "The Eiffel Tower is in",
]

# Three-token continuation per prompt at temperature=0.
# Dedicated loop names leave the earlier `prompt` / `output` (the ones that
# `logits` and `cache` belong to) untouched, and verbose=False stops a tqdm
# bar from being printed between every result row.
for test_prompt in test_prompts:
    generated = model.generate(
        test_prompt, max_new_tokens=3, temperature=0, verbose=False
    )
    # The returned text starts with the prompt; show only what was added.
    print(f"{test_prompt:30} → {generated[len(test_prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital of France is       →  a city of


  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the capital of        →  France and the


  0%|          | 0/3 [00:00<?, ?it/s]

France is a country in         →  Europe, located


  0%|          | 0/3 [00:00<?, ?it/s]

The Eiffel Tower is in         →  the news again


In [39]:
# Short factual prompts outside the France theme.
simple_facts = [
    "Water freezes at",
    "The sun is a",
    "Two plus two equals",
    "The color of the sky is",
    "Dogs are",
]

# Three-token continuation per prompt at temperature=0.
# Dedicated loop names avoid overwriting the notebook-level `prompt` /
# `output`; verbose=False keeps tqdm bars out of the results table.
for fact_prompt in simple_facts:
    generated = model.generate(
        fact_prompt, max_new_tokens=3, temperature=0, verbose=False
    )
    # The returned text starts with the prompt; show only what was added.
    print(f"{fact_prompt:30} → {generated[len(fact_prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

Water freezes at               →  -18.


  0%|          | 0/3 [00:00<?, ?it/s]

The sun is a                   →  powerful force,


  0%|          | 0/3 [00:00<?, ?it/s]

Two plus two equals            →  four.



  0%|          | 0/3 [00:00<?, ?it/s]

The color of the sky is        →  a very important


  0%|          | 0/3 [00:00<?, ?it/s]

Dogs are                       →  man's best


In [40]:
# Different phrasings of the "capital of France" fact.
variations = [
    "The capital city of France is",
    "France's capital is",
    "What is the capital of France?",
    "Capital of France:",
    "Paris is the",
]

# Three-token continuation per phrasing at temperature=0.
# Dedicated loop names avoid overwriting the notebook-level `prompt` /
# `output`; verbose=False keeps tqdm bars out of the results table.
for variation in variations:
    generated = model.generate(
        variation, max_new_tokens=3, temperature=0, verbose=False
    )
    # The returned text starts with the prompt; show only what was added.
    print(f"{variation:35} → {generated[len(variation):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital city of France is       →  Paris, which


  0%|          | 0/3 [00:00<?, ?it/s]

France's capital is                 →  a city of


  0%|          | 0/3 [00:00<?, ?it/s]

What is the capital of France?      → 

The


  0%|          | 0/3 [00:00<?, ?it/s]

Capital of France:                  →  Paris




  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the                        →  capital of France


In [41]:
# Run this between models
import gc

# Deleting `model` alone does not release the weights: `cache` (an
# ActivationCache, which stores the model it was built from) and `logits`
# are still alive. Drop all three. Using pop() instead of `del` keeps the
# cell safe to re-run when some of the names are already gone.
for stale_name in ("model", "logits", "cache"):
    globals().pop(stale_name, None)
gc.collect()

# Return cached allocator blocks on whichever accelerator backend is present.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
elif torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Memory cleared")

Memory cleared
