In [62]:
from transformer_lens import HookedTransformer
import torch
import circuitsvis as cv
import einops
from IPython.display import display
import numpy as np
from pprint import pprint
from datasets import load_dataset
import random
from tqdm import tqdm

In [2]:
device = torch.device(
    "mps" if torch.backends.mps.is_available() else 
    "cuda" if torch.cuda.is_available() else 
    "cpu"
)

For this notebook to work, please add 

```python
deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
```

to the `OFFICIAL_MODEL_NAMES` list in the `loading_from_pretrained.py` file under the `TransformerLens` library after you've downloaded it locally.

In [3]:
model = HookedTransformer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
model = model.to(device)



Loaded pretrained model deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B into HookedTransformer
Moving model to device:  mps


In [7]:
logits, activations = model.run_with_cache("Hello World")

print(logits)
print(activations)

tensor([[[ 9.2315,  4.3642,  1.8801,  ..., -0.1068, -0.1066, -0.1090],
         [16.7911, 15.1479,  4.8276,  ..., -0.9735, -0.9744, -0.9750]]],
       device='mps:0', grad_fn=<ViewBackward0>)
ActivationCache with keys ['hook_embed', 'blocks.0.hook_resid_pre', 'blocks.0.ln1.hook_scale', 'blocks.0.ln1.hook_normalized', 'blocks.0.attn.hook_q', 'blocks.0.attn.hook_k', 'blocks.0.attn.hook_v', 'blocks.0.attn.hook_rot_q', 'blocks.0.attn.hook_rot_k', 'blocks.0.attn.hook_attn_scores', 'blocks.0.attn.hook_pattern', 'blocks.0.attn.hook_z', 'blocks.0.hook_attn_out', 'blocks.0.hook_resid_mid', 'blocks.0.ln2.hook_scale', 'blocks.0.ln2.hook_normalized', 'blocks.0.mlp.hook_pre', 'blocks.0.mlp.hook_pre_linear', 'blocks.0.mlp.hook_post', 'blocks.0.hook_mlp_out', 'blocks.0.hook_resid_post', 'blocks.1.hook_resid_pre', 'blocks.1.ln1.hook_scale', 'blocks.1.ln1.hook_normalized', 'blocks.1.attn.hook_q', 'blocks.1.attn.hook_k', 'blocks.1.attn.hook_v', 'blocks.1.attn.hook_rot_q', 'blocks.1.attn.hook_rot_k', 'bl

In [10]:
prompt = """
## Loading Models

HookedTransformer comes loaded with >40 open source GPT-style models. You can load any of them in with `HookedTransformer.from_pretrained(MODEL_NAME)`. Each model is loaded into the consistent HookedTransformer architecture, designed to be clean, consistent and interpretability-friendly.

For this demo notebook we'll look at GPT-2 Small, an 80M parameter model. To try the model the model out, let's find the loss on this paragraph!
"""

In [None]:
loss = model(prompt, return_type="loss")
print(loss)


In [None]:
model.to_str_tokens(prompt)

In [None]:
logits = model(prompt, return_type="logits")
prediction = logits.argmax(dim=-1).squeeze()[:-1]
print(prediction)

In [None]:
true_tokens = model.to_tokens(prompt).squeeze()[1:]
is_correct = prediction == true_tokens

print(f"Model accuracy: {is_correct.sum()}/{len(true_tokens)}")
print(f"Correct tokens: {model.to_str_tokens(prediction[is_correct])}")

In [None]:
model.blocks[0].attn.W_Q

In [None]:
text = "Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on taskspecific datasets."
tokens = model.to_tokens(text)
logits, cache = model.run_with_cache(tokens, remove_batch_dim=True)

print(type(logits), type(cache))

In [19]:
attn_patterns_from_shorthand = cache["pattern", 0]
attn_patterns_from_full_name = cache["blocks.0.attn.hook_pattern"]

torch.testing.assert_close(attn_patterns_from_shorthand, attn_patterns_from_full_name)

In [None]:
# TODO: This part fails, is it because the attention mechanisms for Qwen (1.5B) and GPT-2 (small) are different?
layer0_pattern_from_cache = cache["pattern", 0]

q, k = cache["q", 0], cache["k", 0]
seq, nhead, headsize = q.shape
layer0_attn_scores = einops.einsum(q, k, "seqQ n h, seqK n h -> n seqQ seqK")
mask = torch.triu(torch.ones((seq, seq), dtype=torch.bool), diagonal=1).to(device)
layer0_attn_scores.masked_fill_(mask, -1e9)
layer0_pattern_from_q_and_k = (layer0_attn_scores / headsize**0.5).softmax(-1)

torch.testing.assert_close(layer0_pattern_from_cache, layer0_pattern_from_q_and_k)
print("Tests passed!")

In [None]:
print(type(cache))
attention_pattern = cache["pattern", 0]
print(attention_pattern.shape)
str_tokens = model.to_str_tokens(text)

print("Layer 0 Head Attention Patterns:")
display(
    cv.attention.attention_patterns(
        tokens=str_tokens,
        attention=attention_pattern,
        #attention_head_names=[f"L0H{i}" for i in range(12)],
    )
)

In [None]:
neuron_activations_for_all_layers = torch.stack([cache["post", layer] for layer in range(model.cfg.n_layers)], dim=1)
# shape = (seq_pos, layers, neurons)

cv.activations.text_neuron_activations(
    tokens=str_tokens,
    activations=neuron_activations_for_all_layers
)

In [33]:
def to_numpy(tensor):
    return tensor.detach().cpu().numpy()

In [None]:
neuron_activations_for_all_layers_rearranged = to_numpy(einops.rearrange(neuron_activations_for_all_layers, "seq layers neurons -> 1 layers seq neurons"))

cv.topk_tokens.topk_tokens(
    # Some weird indexing required here ¯\_(ツ)_/¯
    tokens=[str_tokens],
    activations=neuron_activations_for_all_layers_rearranged,
    max_k=7,
    first_dimension_name="Layer",
    third_dimension_name="Neuron",
    first_dimension_labels=list(range(12))
)

In [None]:
model.generate("What is 5+5?", max_new_tokens=20)

In [47]:
logits, cache = model.run_with_cache("What is 5+5 divided 16?", remove_batch_dim=True)

In [8]:
prompt = """Let's solve this step by step:

Question: What is Einstein's law of general relativity?

Let's break this down:
"""

result = model.generate(prompt, 
                        temperature=0.6, 
                        max_new_tokens=500,
                        top_p=0.95)
pprint(result)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/500 [00:00<?, ?it/s]

("Let's solve this step by step:\n"
 '\n'
 "Question: What is Einstein's law of general relativity?\n"
 '\n'
 "Let's break this down:\n"
 "1. Einstein's law of general relativity.\n"
 '2. What is.\n'
 '\n'
 'So, the first part is "Einstein\'s law of general relativity." That seems '
 "correct because Einstein's theory is general relativity.\n"
 '\n'
 'The second part is "What is." So, putting it together, it\'s asking: What is '
 "Einstein's law of general relativity?\n"
 '\n'
 'Wait, but is that the standard way to phrase it? Because sometimes people '
 'might phrase it differently, like "What is the law of general relativity '
 'named after?"\n'
 '\n'
 'But in this case, the question is phrased as "What is Einstein\'s law of '
 'general relativity?" So, it\'s specifically mentioning Einstein\'s law, not '
 'just any law of general relativity.\n'
 '\n'
 "So, the answer would be that Einstein's law of general relativity is the "
 'theory of gravitation formulated by Albert Einstein, wh

In [4]:
# Load the MATH dataset
math_dataset = load_dataset("fdyrd/math")
print(f"Dataset structure: {math_dataset}")

# Examine the dataset structure
print(f"Available splits: {math_dataset.keys()}")
print(f"Number of examples in train: {len(math_dataset['train'])}")
print(f"Number of examples in test: {len(math_dataset['test'])}")

# Look at the first example to understand the format
print("\nExample from the dataset:")
example = math_dataset['train'][0]
print(f"Problem: {example['problem']}")
print(f"Level: {example['level']}")
print(f"Type: {example['type']}")
print(f"Solution: {example['solution']}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Dataset structure: DatasetDict({
    train: Dataset({
        features: ['problem', 'level', 'type', 'solution'],
        num_rows: 7500
    })
    test: Dataset({
        features: ['problem', 'level', 'type', 'solution'],
        num_rows: 5000
    })
})
Available splits: dict_keys(['train', 'test'])
Number of examples in train: 7500
Number of examples in test: 5000

Example from the dataset:
Problem: Let \[f(x) = \left\{
\begin{array}{cl} ax+3, &\text{ if }x>2, \\
x-5 &\text{ if } -2 \le x \le 2, \\
2x-b &\text{ if } x <-2.
\end{array}
\right.\]Find $a+b$ if the piecewise function is continuous (which means that its graph can be drawn without lifting your pencil from the paper).
Level: Level 5
Type: Algebra
Solution: For the piecewise function to be continuous, the cases must "meet" at $2$ and $-2$. For example, $ax+3$ and $x-5$ must be equal when $x=2$. This implies $a(2)+3=2-5$, which we solve to get $2a=-6 \Rightarrow a=-3$. Similarly, $x-5$ and $2x-b$ must be equal when $x=-2$. 

In [5]:
# Function to sample problems from the dataset
def sample_math_problems(dataset, n=5, level=None, problem_type=None):
    """
    Sample n problems from the dataset, optionally filtering by level or type.
    
    Args:
        dataset: The MATH dataset
        n: Number of problems to sample
        level: Optional filter for problem difficulty (e.g., "Level 1")
        problem_type: Optional filter for problem type (e.g., "Algebra")
    
    Returns:
        List of sampled problems
    """
    filtered_dataset = dataset['train']
    
    if level:
        filtered_dataset = [ex for ex in filtered_dataset if ex['level'] == level]
    
    if problem_type:
        filtered_dataset = [ex for ex in filtered_dataset if ex['type'] == problem_type]
    
    filtered_dataset = list(filtered_dataset)  # Convert to list to ensure it's a sequence
    return random.sample(filtered_dataset, min(n, len(filtered_dataset)))

In [50]:
sampled_problems = sample_math_problems(math_dataset, n=3, level="Level 3")
print("\nSampled problems for testing:")
for i, problem in enumerate(sampled_problems):
    print(f"\nProblem {i+1}:")
    print(f"Type: {problem['type']}, Level: {problem['level']}")
    print(f"Problem statement: {problem['problem']}")


Sampled problems for testing:

Problem 1:
Type: Geometry, Level: Level 3
Problem statement: A dump truck delivered sand to a construction site. The sand formed a conical pile with a diameter of $8$ feet and a height that was $75\%$ of the diameter. How many cubic feet of sand were in the pile? Express your answer in terms of $\pi$.

Problem 2:
Type: Prealgebra, Level: Level 3
Problem statement: If $x^2+x+4 = y - 4$ and $x = -7$, what is the value of $y$?

Problem 3:
Type: Prealgebra, Level: Level 3
Problem statement: It costs 5 cents to copy 3 pages. How many pages can you copy for $\$20$?


In [66]:
# Function to generate CoT using the model
def generate_cot_for_problem(
    model: HookedTransformer, 
    problem: str, 
    temperature: float = 0.4, 
    max_new_tokens: int = 1500, 
    top_p: float = 0.92
):
    """
    Generate a chain-of-thought solution for a given math problem.
    
    Args:
        model: The HookedTransformer model
        problem: The math problem text
        temperature: The temperature for the model
        max_new_tokens: The maximum number of tokens to generate
        top_p: The top-p value for the model
    Returns:
        The generated chain-of-thought solution
    """
    prompt = f"""Solve this math problem step by step. Put your final answer in \\boxed{{}}. Problem: {problem} Solution: \n<think>\n"""
    result = model.generate(prompt, 
                            temperature=temperature,
                            max_new_tokens=max_new_tokens,
                            top_p=top_p)
    return result

In [60]:
# Select a problem
problem_text = sampled_problems[0]['problem']

# Generate CoT
cot_solution = generate_cot_for_problem(model, problem_text)
print("\nGenerated Chain-of-Thought solution:")
print(cot_solution)

  0%|          | 0/1000 [00:00<?, ?it/s]


Generated Chain-of-Thought solution:
Solve this math problem step by step. Put your final answer in \boxed{}. Problem: A dump truck delivered sand to a construction site. The sand formed a conical pile with a diameter of $8$ feet and a height that was $75\%$ of the diameter. How many cubic feet of sand were in the pile? Express your answer in terms of $\pi$. Solution: 
<think>
Okay, so I have this problem where a dump truck delivered sand to a construction site, forming a conical pile. I need to find out how many cubic feet of sand were in the pile. The problem gives me the diameter of the cone and tells me that the height is 75% of the diameter. They also want the answer in terms of π, so I don't need to calculate a numerical value, just express it using π.

Alright, let me start by recalling the formula for the volume of a cone. I think it's something like one-third times the base area times the height. So, in symbols, that would be:

\[ V = \frac{1}{3} \pi r^2 h \]

Where \( V \) i

In [69]:
# Function to batch process multiple problems
def batch_generate_cot(
    model, 
    problems, 
    temperature: float = 0.4, 
    max_new_tokens: int = 1500, 
    top_p: float = 0.92, 
    save_every: int = 5,
    save_path: str = None
):
    """
    Generate CoT solutions for a batch of problems and optionally save results.
    
    Args:
        model: The HookedTransformer model
        problems: List of problem dictionaries
        temperature: The temperature for the model
        max_new_tokens: The maximum number of tokens to generate
        top_p: The top-p value for the model
        save_path: Optional path to save results
    
    Returns:
        List of dictionaries containing problems and their CoT solutions
    """
    results = []
    
    for i, problem in tqdm(enumerate(problems), total=len(problems), desc="Generating CoT solutions"):
        problem_text = problem['problem']
        solution = generate_cot_for_problem(
            model=model, 
            problem=problem_text, 
            temperature=temperature, 
            max_new_tokens=max_new_tokens, 
            top_p=top_p
        )
        
        results.append({
            "problem_id": i,
            "problem_text": problem_text,
            "problem_type": problem['type'],
            "problem_level": problem['level'],
            "ground_truth_solution": problem['solution'],
            "generated_cot": solution
        })
        
        if i % save_every == 0 and save_path:
            print(f"Saving results to {save_path}...")
            import json
            with open(save_path, 'w') as f:
                json.dump(results, f, indent=2)
    
    if save_path:
        import json
        with open(save_path, 'w') as f:
            json.dump(results, f, indent=2)
        print(f"Results saved to {save_path}")
    
    return results

In [70]:
# Sample a small set of problems for testing
test_problems = sample_math_problems(math_dataset, n=1000)

temperature = 0.4
max_new_tokens = 1500
top_p = 0.92

# Generate CoT solutions for the test problems
cot_results = batch_generate_cot(
    model, 
    test_problems, 
    temperature=temperature, 
    max_new_tokens=max_new_tokens, 
    top_p=top_p, 
    save_path=f"math_cot_results_t={temperature}_mnt={max_new_tokens}_tp={top_p}.json",
    save_every=10
)

Generating CoT solutions:   0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   0%|          | 1/1000 [02:41<44:49:37, 161.54s/it]

Saving results to math_cot_results_t=0.4_mnt=1500_tp=0.92.json...


  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   0%|          | 2/1000 [06:11<52:38:29, 189.89s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   0%|          | 3/1000 [08:02<42:35:25, 153.79s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   0%|          | 4/1000 [10:42<43:13:46, 156.25s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   0%|          | 5/1000 [13:28<44:08:57, 159.74s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   1%|          | 6/1000 [15:40<41:34:52, 150.60s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   1%|          | 7/1000 [17:46<39:14:47, 142.28s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   1%|          | 8/1000 [18:52<32:35:43, 118.29s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   1%|          | 9/1000 [20:22<30:04:02, 109.23s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   1%|          | 10/1000 [23:05<34:38:05, 125.95s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   1%|          | 11/1000 [25:53<38:04:52, 138.62s/it]

Saving results to math_cot_results_t=0.4_mnt=1500_tp=0.92.json...


  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   1%|          | 12/1000 [28:39<40:23:35, 147.18s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   1%|▏         | 13/1000 [29:49<33:56:22, 123.79s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   1%|▏         | 14/1000 [32:36<37:25:36, 136.65s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   2%|▏         | 15/1000 [35:20<39:42:30, 145.13s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   2%|▏         | 16/1000 [38:06<41:20:27, 151.25s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   2%|▏         | 17/1000 [41:13<44:15:16, 162.07s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   2%|▏         | 18/1000 [43:38<42:47:56, 156.90s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   2%|▏         | 19/1000 [45:27<38:52:08, 142.64s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   2%|▏         | 20/1000 [48:14<40:48:04, 149.88s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   2%|▏         | 21/1000 [51:31<44:36:56, 164.06s/it]

Saving results to math_cot_results_t=0.4_mnt=1500_tp=0.92.json...


  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   2%|▏         | 22/1000 [54:33<46:01:44, 169.43s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   2%|▏         | 23/1000 [55:53<38:38:49, 142.40s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   2%|▏         | 24/1000 [57:38<35:34:32, 131.22s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   2%|▎         | 25/1000 [1:00:47<40:15:32, 148.65s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   3%|▎         | 26/1000 [1:03:58<43:36:57, 161.21s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   3%|▎         | 27/1000 [1:07:15<46:29:40, 172.03s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   3%|▎         | 28/1000 [1:09:55<45:28:18, 168.41s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   3%|▎         | 29/1000 [1:11:36<39:59:10, 148.25s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   3%|▎         | 30/1000 [1:14:18<41:02:28, 152.32s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   3%|▎         | 31/1000 [1:16:31<39:29:29, 146.72s/it]

Saving results to math_cot_results_t=0.4_mnt=1500_tp=0.92.json...


  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   3%|▎         | 32/1000 [1:18:21<36:27:30, 135.59s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   3%|▎         | 33/1000 [1:21:49<42:15:32, 157.32s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   3%|▎         | 34/1000 [1:25:09<45:38:06, 170.07s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   4%|▎         | 35/1000 [1:28:09<46:24:05, 173.10s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   4%|▎         | 36/1000 [1:32:00<51:00:21, 190.48s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   4%|▎         | 37/1000 [1:34:58<49:55:52, 186.66s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   4%|▍         | 38/1000 [1:37:17<46:01:51, 172.26s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   4%|▍         | 39/1000 [1:40:04<45:37:04, 170.89s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   4%|▍         | 40/1000 [1:40:54<35:54:02, 134.63s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   4%|▍         | 41/1000 [1:43:40<38:19:49, 143.89s/it]

Saving results to math_cot_results_t=0.4_mnt=1500_tp=0.92.json...


  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   4%|▍         | 42/1000 [1:45:04<33:33:31, 126.11s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   4%|▍         | 43/1000 [1:47:00<32:43:08, 123.08s/it]

  0%|          | 0/1500 [00:00<?, ?it/s]

Generating CoT solutions:   4%|▍         | 43/1000 [1:50:08<40:51:12, 153.68s/it]


RuntimeError: Sizes of tensors must match except in dimension 3. Expected size 0 but got size 1 for tensor number 1 in the list.