# GPT2 Small

In [1]:
from transformer_lens import HookedTransformer
import transformer_lens.utils as utils

# Load GPT-2 small
model = HookedTransformer.from_pretrained("gpt2")
print(f"Loaded: GPT-2 small")
print(f"Layers: {model.cfg.n_layers}")
print(f"Heads: {model.cfg.n_heads}")
print(f"Hidden size: {model.cfg.d_model}")
print(f"Params: {sum(p.numel() for p in model.parameters())/1e6:.1f}M")

`torch_dtype` is deprecated! Use `dtype` instead!


Loaded pretrained model gpt2 into HookedTransformer
Loaded: GPT-2 small
Layers: 12
Heads: 12
Hidden size: 768
Params: 163.0M


In [2]:
prompt = "The capital of France is"
output = model.generate(prompt, max_new_tokens=1, temperature=0)
print(f"Input: {prompt}")
print(f"Output: {output}")

# Cache internal states
logits, cache = model.run_with_cache(prompt)
print(f"\nCached {len(cache)} different activation points!")

  0%|          | 0/1 [00:00<?, ?it/s]

Input: The capital of France is
Output: The capital of France is now

Cached 208 different activation points!


In [3]:
import torch

# Look at top 10 predictions for next token
logits_for_next = logits[0, -1, :]  # Last position's predictions
probs = torch.softmax(logits_for_next, dim=0)
top_probs, top_tokens = torch.topk(probs, 10)

print("\nTop 10 predictions:")
for i in range(10):
    token_str = model.tokenizer.decode(top_tokens[i])
    print(f"{token_str:15} {top_probs[i]:.3%}")


Top 10 predictions:
 now            4.751%
 the            3.745%
 a              3.546%
 home           3.092%
 in             2.698%
 under          2.568%
 being          2.087%
 set            1.805%
 on             1.676%
 not            1.490%


In [4]:
# Test if it knows facts about France
test_prompts = [
    "The capital of France is",
    "Paris is the capital of",  
    "France is a country in",
    "The Eiffel Tower is in"
]

for prompt in test_prompts:
    output = model.generate(prompt, max_new_tokens=3, temperature=0)
    print(f"{prompt:30} → {output[len(prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital of France is       →  now home to


  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the capital of        →  France, and


  0%|          | 0/3 [00:00<?, ?it/s]

France is a country in         →  which the French


  0%|          | 0/3 [00:00<?, ?it/s]

The Eiffel Tower is in         →  the heart of


In [5]:
simple_facts = [
    "Water freezes at",
    "The sun is a",
    "Two plus two equals",
    "The color of the sky is",
    "Dogs are",
]

for prompt in simple_facts:
    output = model.generate(prompt, max_new_tokens=3, temperature=0)
    print(f"{prompt:30} → {output[len(prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

Water freezes at               →  the end of


  0%|          | 0/3 [00:00<?, ?it/s]

The sun is a                   →  bright, shining


  0%|          | 0/3 [00:00<?, ?it/s]

Two plus two equals            →  one.



  0%|          | 0/3 [00:00<?, ?it/s]

The color of the sky is        →  a color that


  0%|          | 0/3 [00:00<?, ?it/s]

Dogs are                       →  not allowed to


In [6]:
# Test variations of the same fact
variations = [
    "The capital city of France is",
    "France's capital is",
    "What is the capital of France?",
    "Capital of France:",
    "Paris is the",
]

for prompt in variations:
    output = model.generate(prompt, max_new_tokens=3, temperature=0)
    print(f"{prompt:35} → {output[len(prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital city of France is       →  now home to


  0%|          | 0/3 [00:00<?, ?it/s]

France's capital is                 →  now home to


  0%|          | 0/3 [00:00<?, ?it/s]

What is the capital of France?      → 

The


  0%|          | 0/3 [00:00<?, ?it/s]

Capital of France:                  →  France




  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the                        →  capital of France


# GPT2 Medium

In [14]:
from transformer_lens import HookedTransformer
import transformer_lens.utils as utils

model = HookedTransformer.from_pretrained("gpt2-medium")
print(f"Loaded: GPT-2 medium")
print(f"Layers: {model.cfg.n_layers}")
print(f"Heads: {model.cfg.n_heads}")
print(f"Hidden size: {model.cfg.d_model}")
print(f"Params: {sum(p.numel() for p in model.parameters())/1e6:.1f}M")

Loaded pretrained model gpt2-medium into HookedTransformer
Loaded: GPT-2 medium
Layers: 24
Heads: 16
Hidden size: 1024
Params: 406.2M


In [15]:
prompt = "The capital of France is"
output = model.generate(prompt, max_new_tokens=1, temperature=0)
print(f"Input: {prompt}")
print(f"Output: {output}")

# Cache internal states
logits, cache = model.run_with_cache(prompt)
print(f"\nCached {len(cache)} different activation points!")

  0%|          | 0/1 [00:00<?, ?it/s]

Input: The capital of France is
Output: The capital of France is the

Cached 412 different activation points!


In [16]:
import torch

# Look at top 10 predictions for next token
logits_for_next = logits[0, -1, :]  # Last position's predictions
probs = torch.softmax(logits_for_next, dim=0)
top_probs, top_tokens = torch.topk(probs, 10)

print("\nTop 10 predictions:")
for i in range(10):
    token_str = model.tokenizer.decode(top_tokens[i])
    print(f"{token_str:15} {top_probs[i]:.3%}")


Top 10 predictions:
 the            6.010%
 a              4.835%
 now            2.653%
 Paris          2.443%
 in             2.327%
 not            2.009%
 one            1.704%
 France         1.647%
 known          1.596%
 home           1.508%


In [4]:
# Test if it knows facts about France
test_prompts = [
    "The capital of France is",
    "Paris is the capital of",  
    "France is a country in",
    "The Eiffel Tower is in"
]

for prompt in test_prompts:
    output = model.generate(prompt, max_new_tokens=3, temperature=0)
    print(f"{prompt:30} → {output[len(prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital of France is       →  the capital of


  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the capital of        →  France, and


  0%|          | 0/3 [00:00<?, ?it/s]

France is a country in         →  which the right


  0%|          | 0/3 [00:00<?, ?it/s]

The Eiffel Tower is in         →  Paris, France


In [17]:
simple_facts = [
    "Water freezes at",
    "The sun is a",
    "Two plus two equals",
    "The color of the sky is",
    "Dogs are",
]

for prompt in simple_facts:
    output = model.generate(prompt, max_new_tokens=3, temperature=0)
    print(f"{prompt:30} → {output[len(prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

Water freezes at               →  the bottom of


  0%|          | 0/3 [00:00<?, ?it/s]

The sun is a                   →  great source of


  0%|          | 0/3 [00:00<?, ?it/s]

Two plus two equals            →  four.



  0%|          | 0/3 [00:00<?, ?it/s]

The color of the sky is        →  a very important


  0%|          | 0/3 [00:00<?, ?it/s]

Dogs are                       →  not only smart


In [18]:
# Test variations of the same fact
variations = [
    "The capital city of France is",
    "France's capital is",
    "What is the capital of France?",
    "Capital of France:",
    "Paris is the",
]

for prompt in variations:
    output = model.generate(prompt, max_new_tokens=3, temperature=0)
    print(f"{prompt:35} → {output[len(prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital city of France is       →  a city of


  0%|          | 0/3 [00:00<?, ?it/s]

France's capital is                 →  on high alert


  0%|          | 0/3 [00:00<?, ?it/s]

What is the capital of France?      → 

The


  0%|          | 0/3 [00:00<?, ?it/s]

Capital of France:                  →  Paris




  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the                        →  capital of France


# GPT2 Large

In [19]:
from transformer_lens import HookedTransformer
import transformer_lens.utils as utils

# Load GPT-2 large
model = HookedTransformer.from_pretrained("gpt2-large")
print(f"Loaded: GPT-2 large")
print(f"Layers: {model.cfg.n_layers}")
print(f"Heads: {model.cfg.n_heads}")
print(f"Hidden size: {model.cfg.d_model}")
print(f"Params: {sum(p.numel() for p in model.parameters())/1e6:.1f}M")

Loaded pretrained model gpt2-large into HookedTransformer
Loaded: GPT-2 large
Layers: 36
Heads: 20
Hidden size: 1280
Params: 838.2M


In [20]:
prompt = "The capital of France is"
output = model.generate(prompt, max_new_tokens=1, temperature=0)
print(f"Input: {prompt}")
print(f"Output: {output}")

# Cache internal states
logits, cache = model.run_with_cache(prompt)
print(f"\nCached {len(cache)} different activation points!")

  0%|          | 0/1 [00:00<?, ?it/s]

Input: The capital of France is
Output: The capital of France is a

Cached 616 different activation points!


In [21]:
import torch

# Look at top 10 predictions for next token
logits_for_next = logits[0, -1, :]  # Last position's predictions
probs = torch.softmax(logits_for_next, dim=0)
top_probs, top_tokens = torch.topk(probs, 10)

print("\nTop 10 predictions:")
for i in range(10):
    token_str = model.tokenizer.decode(top_tokens[i])
    print(f"{token_str:15} {top_probs[i]:.3%}")


Top 10 predictions:
 a              7.579%
 the            5.794%
 in             4.402%
 home           3.371%
 now            2.564%
 one            2.222%
 to             2.158%
 on             2.145%
 facing         2.099%
 currently      2.051%


In [22]:
# Test if it knows facts about France
test_prompts = [
    "The capital of France is",
    "Paris is the capital of",  
    "France is a country in",
    "The Eiffel Tower is in"
]

for prompt in test_prompts:
    output = model.generate(prompt, max_new_tokens=3, temperature=0)
    print(f"{prompt:30} → {output[len(prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital of France is       →  a city of


  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the capital of        →  France, and


  0%|          | 0/3 [00:00<?, ?it/s]

France is a country in         →  which the French


  0%|          | 0/3 [00:00<?, ?it/s]

The Eiffel Tower is in         →  the midst of


In [23]:
simple_facts = [
    "Water freezes at",
    "The sun is a",
    "Two plus two equals",
    "The color of the sky is",
    "Dogs are",
]

for prompt in simple_facts:
    output = model.generate(prompt, max_new_tokens=3, temperature=0)
    print(f"{prompt:30} → {output[len(prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

Water freezes at               →  -196°


  0%|          | 0/3 [00:00<?, ?it/s]

The sun is a                   →  powerful source of


  0%|          | 0/3 [00:00<?, ?it/s]

Two plus two equals            →  four.



  0%|          | 0/3 [00:00<?, ?it/s]

The color of the sky is        →  a very important


  0%|          | 0/3 [00:00<?, ?it/s]

Dogs are                       →  not the only


In [24]:
variations = [
    "The capital city of France is",
    "France's capital is",
    "What is the capital of France?",
    "Capital of France:",
    "Paris is the",
]

for prompt in variations:
    output = model.generate(prompt, max_new_tokens=3, temperature=0)
    print(f"{prompt:35} → {output[len(prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital city of France is       →  a city of


  0%|          | 0/3 [00:00<?, ?it/s]

France's capital is                 →  on high alert


  0%|          | 0/3 [00:00<?, ?it/s]

What is the capital of France?      → 

The


  0%|          | 0/3 [00:00<?, ?it/s]

Capital of France:                  → 

The


  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the                        →  capital of France


# GPT2 XL

In [25]:
from transformer_lens import HookedTransformer
import transformer_lens.utils as utils

# Load GPT-2 large
model = HookedTransformer.from_pretrained("gpt2-xl")
print(f"Loaded: GPT-2 xl")
print(f"Layers: {model.cfg.n_layers}")
print(f"Heads: {model.cfg.n_heads}")
print(f"Hidden size: {model.cfg.d_model}")
print(f"Params: {sum(p.numel() for p in model.parameters())/1e6:.1f}M")

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loaded pretrained model gpt2-xl into HookedTransformer
Loaded: GPT-2 xl
Layers: 48
Heads: 25
Hidden size: 1600
Params: 1637.8M


In [26]:
prompt = "The capital of France is"
output = model.generate(prompt, max_new_tokens=1, temperature=0)
print(f"Input: {prompt}")
print(f"Output: {output}")

# Cache internal states
logits, cache = model.run_with_cache(prompt)
print(f"\nCached {len(cache)} different activation points!")

  0%|          | 0/1 [00:00<?, ?it/s]

Input: The capital of France is
Output: The capital of France is a

Cached 820 different activation points!


In [27]:
import torch

# Look at top 10 predictions for next token
logits_for_next = logits[0, -1, :]  # Last position's predictions
probs = torch.softmax(logits_for_next, dim=0)
top_probs, top_tokens = torch.topk(probs, 10)

print("\nTop 10 predictions:")
for i in range(10):
    token_str = model.tokenizer.decode(top_tokens[i])
    print(f"{token_str:15} {top_probs[i]:.3%}")


Top 10 predictions:
 a              12.138%
 the            11.609%
 Paris          4.608%
 one            3.208%
 in             2.102%
 to             1.758%
 about          1.709%
 known          1.678%
 an             1.628%
 set            1.533%


In [28]:
# Test if it knows facts about France
test_prompts = [
    "The capital of France is",
    "Paris is the capital of",  
    "France is a country in",
    "The Eiffel Tower is in"
]

for prompt in test_prompts:
    output = model.generate(prompt, max_new_tokens=3, temperature=0)
    print(f"{prompt:30} → {output[len(prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital of France is       →  a city of


  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the capital of        →  France and the


  0%|          | 0/3 [00:00<?, ?it/s]

France is a country in         →  Europe, located


  0%|          | 0/3 [00:00<?, ?it/s]

The Eiffel Tower is in         →  the news again


In [29]:
simple_facts = [
    "Water freezes at",
    "The sun is a",
    "Two plus two equals",
    "The color of the sky is",
    "Dogs are",
]

for prompt in simple_facts:
    output = model.generate(prompt, max_new_tokens=3, temperature=0)
    print(f"{prompt:30} → {output[len(prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

Water freezes at               →  a lower temperature


  0%|          | 0/3 [00:00<?, ?it/s]

The sun is a                   →  big, bright


  0%|          | 0/3 [00:00<?, ?it/s]

Two plus two equals            →  five.



  0%|          | 0/3 [00:00<?, ?it/s]

The color of the sky is        →  a reflection of


  0%|          | 0/3 [00:00<?, ?it/s]

Dogs are                       →  the most popular


In [30]:
variations = [
    "The capital city of France is",
    "France's capital is",
    "What is the capital of France?",
    "Capital of France:",
    "Paris is the",
]

for prompt in variations:
    output = model.generate(prompt, max_new_tokens=3, temperature=0)
    print(f"{prompt:35} → {output[len(prompt):]}")

  0%|          | 0/3 [00:00<?, ?it/s]

The capital city of France is       →  the most visited


  0%|          | 0/3 [00:00<?, ?it/s]

France's capital is                 →  a city of


  0%|          | 0/3 [00:00<?, ?it/s]

What is the capital of France?      → 

The


  0%|          | 0/3 [00:00<?, ?it/s]

Capital of France:                  →  Paris




  0%|          | 0/3 [00:00<?, ?it/s]

Paris is the                        →  capital of France
