In [2]:
from transformer_lens import HookedTransformer
import torch
import circuitsvis as cv
import einops
from IPython.display import display
import numpy as np
from pprint import pprint

In [None]:
# Load the DeepSeek-R1 distilled Qwen 1.5B checkpoint through HookedTransformer.
model = HookedTransformer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")

In [None]:
# Pick the best available accelerator instead of assuming Apple's MPS backend:
# a hard-coded 'mps' device raises on machines that do not provide it.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
model = model.to(device)

In [None]:
# Smoke test: one forward pass on a toy prompt, keeping both values that
# run_with_cache returns.
logits, activations = model.run_with_cache("Hello World")

# One call, one object per line.
print(logits, activations, sep="\n")

In [10]:
# Sample paragraph used as input for the loss and next-token checks in the
# following cells.
# NOTE(review): the wording talks about GPT-2 Small, but this notebook loads a
# DeepSeek-R1 distilled Qwen checkpoint; the paragraph is only used as text.
prompt = """
## Loading Models

HookedTransformer comes loaded with >40 open source GPT-style models. You can load any of them in with `HookedTransformer.from_pretrained(MODEL_NAME)`. Each model is loaded into the consistent HookedTransformer architecture, designed to be clean, consistent and interpretability-friendly.

For this demo notebook we'll look at GPT-2 Small, an 80M parameter model. To try the model the model out, let's find the loss on this paragraph!
"""

In [None]:
# Ask the model for its loss on the sample paragraph rather than for logits.
loss = model(prompt, return_type="loss")
print(loss)


In [None]:
# Show the sample paragraph as a list of string tokens.
model.to_str_tokens(prompt)

In [None]:
# Greedy next-token guess at every position of the sample paragraph.
logits = model(prompt, return_type="logits")
# The final position has no following token to compare with, so drop it.
prediction = torch.argmax(logits, dim=-1).squeeze()[:-1]
print(prediction)

In [None]:
# Targets are the prompt tokens shifted left by one: the first token is never
# predicted, so it is dropped.
true_tokens = model.to_tokens(prompt).squeeze()[1:]
# Element-wise match between the greedy guesses and the real next tokens.
is_correct = torch.eq(prediction, true_tokens)

print(f"Model accuracy: {is_correct.sum()}/{len(true_tokens)}")
print(f"Correct tokens: {model.to_str_tokens(prediction[is_correct])}")

In [None]:
# Inspect the W_Q parameter of the attention module in block 0.
model.blocks[0].attn.W_Q

In [None]:
# Reference sentence for the attention / neuron visualisations below.
text = (
    "Natural language processing tasks, such as question answering, "
    "machine translation, reading comprehension, and summarization, "
    "are typically approached with supervised learning on taskspecific datasets."
)
tokens = model.to_tokens(text)
# remove_batch_dim=True is requested so cached tensors are indexed per position.
logits, cache = model.run_with_cache(tokens, remove_batch_dim=True)

print(type(logits), type(cache))

In [19]:
# The cache can be indexed with a (name, layer) shorthand or with the full
# hook name; look the layer-0 attention pattern up both ways.
attn_patterns_from_shorthand = cache["pattern", 0]
attn_patterns_from_full_name = cache["blocks.0.attn.hook_pattern"]

# Raises if the two lookups do not return matching tensors.
torch.testing.assert_close(attn_patterns_from_shorthand, attn_patterns_from_full_name)

In [None]:
# Rebuild layer 0's attention pattern from the cached queries and keys and
# compare it with the pattern stored in the cache.
#
# Unlike GPT-2 small, this Qwen-based model (a) uses rotary position
# embeddings, so the scores are computed from the rotated queries/keys rather
# than the raw "q"/"k" activations, and (b) uses grouped-query attention, so
# there can be fewer key heads than query heads.
layer0_pattern_from_cache = cache["pattern", 0]

if model.cfg.positional_embedding_type == "rotary":
    # Rotated tensors are cached under their own hook names.
    q, k = cache["rot_q", 0], cache["rot_k", 0]
else:
    q, k = cache["q", 0], cache["k", 0]
seq, nhead, headsize = q.shape

# Grouped-query attention: every key head is shared by a group of query heads,
# so repeat the key heads until both tensors have the same head count.
n_kv_heads = k.shape[1]
if n_kv_heads != nhead:
    k = k.repeat_interleave(nhead // n_kv_heads, dim=1)

layer0_attn_scores = einops.einsum(q, k, "seqQ n h, seqK n h -> n seqQ seqK")
# Causal mask (True above the diagonal), created on the same device as the
# activations instead of relying on a global `device` variable.
mask = torch.triu(torch.ones((seq, seq), dtype=torch.bool, device=q.device), diagonal=1)
layer0_attn_scores.masked_fill_(mask, -1e9)
layer0_pattern_from_q_and_k = (layer0_attn_scores / headsize**0.5).softmax(-1)

torch.testing.assert_close(layer0_pattern_from_cache, layer0_pattern_from_q_and_k)
print("Tests passed!")

In [None]:
# Visualise the layer-0 attention pattern for the reference sentence.
print(type(cache))
attention_pattern = cache["pattern", 0]
print(attention_pattern.shape)
# String tokens are used to label the plot.
str_tokens = model.to_str_tokens(text)

print("Layer 0 Head Attention Patterns:")
display(
    cv.attention.attention_patterns(
        tokens=str_tokens,
        attention=attention_pattern,
        # NOTE(review): disabled argument below hard-codes 12 head names — confirm the head count and that this call accepts it before re-enabling.
        #attention_head_names=[f"L0H{i}" for i in range(12)],
    )
)

In [None]:
# Collect the cached "post" activations of every layer and stack them along a
# new axis 1 (one slice per layer).
neuron_activations_for_all_layers = torch.stack([cache["post", layer] for layer in range(model.cfg.n_layers)], dim=1)
# shape = (seq_pos, layers, neurons)

# Interactive view of the activations over the tokens of the reference sentence.
cv.activations.text_neuron_activations(
    tokens=str_tokens,
    activations=neuron_activations_for_all_layers
)

In [33]:
def to_numpy(tensor):
    """Return ``tensor`` as a NumPy array.

    The tensor is first detached from the autograd graph and moved to the CPU,
    so tensors that require grad or live on an accelerator can be converted.
    """
    cpu_copy = tensor.detach().cpu()
    return cpu_copy.numpy()

In [None]:
# Move the layer axis in front of the sequence axis and add a leading axis of
# size 1, then convert to NumPy for the visualisation.
neuron_activations_for_all_layers_rearranged = to_numpy(einops.rearrange(neuron_activations_for_all_layers, "seq layers neurons -> 1 layers seq neurons"))

cv.topk_tokens.topk_tokens(
    # The tokens are wrapped in a one-element list to line up with the leading
    # axis of size 1 added to the activations above.
    tokens=[str_tokens],
    activations=neuron_activations_for_all_layers_rearranged,
    max_k=7,
    first_dimension_name="Layer",
    third_dimension_name="Neuron",
    # One label per layer of the loaded model; a hard-coded 12 only matches
    # GPT-2 small, while the activations are stacked over model.cfg.n_layers.
    first_dimension_labels=list(range(model.cfg.n_layers))
)

In [None]:
# Short generation from an arithmetic question, capped at 20 new tokens.
model.generate("What is 5+5?", max_new_tokens=20)

In [47]:
# NOTE(review): this rebinds `logits` and `cache`; earlier cells that read
# `cache` will see this prompt's activations if they are re-run afterwards.
logits, cache = model.run_with_cache("What is 5+5 divided 16?", remove_batch_dim=True)

In [None]:
# NOTE(review): torch is already imported in the first cell, and this check
# comes after the model has already been moved to 'mps' earlier in the notebook.
import torch
torch.backends.mps.is_available()

In [None]:
# Step-by-step style prompt for a longer, sampled generation.
# Note: this rebinds `prompt`, replacing the sample paragraph defined earlier.
prompt = """Let's solve this step by step:

Question: What is Einstein's law of general relativity?

Let's break this down:
"""

# Use MPS when it is available, otherwise fall back to CUDA or the CPU, so the
# cell does not fail on machines without Apple's MPS backend.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
model = model.to(device)

# Sampled generation (temperature and nucleus sampling), so the output differs
# between runs unless a random seed is set beforehand.
result = model.generate(
    prompt,
    temperature=0.6,
    max_new_tokens=500,
    top_p=0.95,
)
pprint(result)