In [21]:
import torch
from transformer_lens import HookedTransformer
from mechinterp import Interpreter
from mechinterp.utils import PatchscopesTargetPrompts

In [2]:
# Checkpoint identifier passed to HookedTransformer.from_pretrained; named once
# so it is easy to spot and swap.
MODEL_NAME = "google/gemma-2-2b"

# Load the pretrained model, then wrap it in the mechinterp Interpreter that
# the cells below call as `interp`.
model = HookedTransformer.from_pretrained(MODEL_NAME)
interp = Interpreter(model)

Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00,  5.86it/s]


Loaded pretrained model google/gemma-2-2b into HookedTransformer


### Replicating the Jeff Bezos example from the Patchscopes paper

In [3]:
# Source prompt and the cache entry the hidden state is read from.
jeff_prompt = "Amazon's former CEO attended the Oscars"
jeff_cache_key = "blocks.19.hook_resid_pre"

# Element [1] of the run_with_cache result is indexed by hook name; the
# selected activation is then indexed at [0, 5]
# (presumably batch row 0, token position 5 -- TODO confirm the tokenization).
jeff = model.run_with_cache(jeff_prompt)[1][jeff_cache_key][0, 5]

In [4]:
# Apply the logit lens to the extracted vector; the recorded output below lists
# the top-k and bottom-k tokens for it.
interp.logit_lens(jeff)

Logit Lens Output:
	- Topk tokens: [' and', ',', ' Amazon', 'Amazon', ' CEO', ' amazon', ' has', ' السابق', ' fondateur', ' chief', ' turned', ' in', ' famously', ' company', ' founder', ' who', ' emeritus', ' says', ' now', ' business']

	- Bottomk tokens: ['########.', ' AssemblyCulture', 'findpost', '<bos>', ' @"/', 'styleType', '+:+', ' typelib', ")':", 'uxxxx', ' CreateTagHelper', 'Datuak', 'MigrationBuilder', ' Reverso', 'ANSA', ' للمعارف', 'UnsafeEnabled', ' ModelExpression', 'esgue', 'ỡng']

In [5]:
# Patchscopes with no explicit target prompt: the recorded output shows a
# few-shot "entity: description" prompt with the vector patched in at "X".
# NOTE(review): n presumably limits the number of generated tokens (the
# continuation recorded below is about five tokens long) -- confirm in mechinterp.
interp.patchscopes(jeff, n=5)

Patchscopes Explanation: '<bos>Syria: Country in the Middle East
Leonardo DiCaprio: American actor
Samsung: South Korean multinational major appliance and consumer electronics corporation
X: Former CEO of Amazon.'

In [6]:
# Deliberate repeat of the previous cell's call: the recorded continuation
# differs between the two runs, so the generation is evidently not deterministic.
# NOTE(review): no random seed is set anywhere in the visible cells; set one
# if the exact text needs to be reproducible.
interp.patchscopes(jeff, n=5)

Patchscopes Explanation: '<bos>Syria: Country in the Middle East
Leonardo DiCaprio: American actor
Samsung: South Korean multinational major appliance and consumer electronics corporation
X: The head of a company'

In [16]:
# Same vector, decoded with the identity-style few-shot target prompt
# ("cat -> cat", "1135 -> 1135", ... in the recorded output).
interp.patchscopes(
    jeff,
    PatchscopesTargetPrompts.IDENTITY_FEW_SHOT,
    n=2,
    temperature=1,
)

Patchscopes Explanation: '<bos>cat -> cat
1135 -> 1135
hello -> hello
X-> amazon
'

In [8]:
# Custom target prompt; the "{}" slot is where the vector is patched in
# (it is rendered as "X" in the recorded output).
birth_name_prompt = "The birth name of {} is"
interp.patchscopes(jeff, birth_name_prompt)

Patchscopes Explanation: '<bos>The birth name of X is Jeff Bezos, and he is the founder of Amazon.com. He is the richest man in the world and has a net worth of $11'

### Patchscopes With Two Vectors at Once

In [71]:
def _resid_pre_21(prompt, position):
    """Return the "blocks.21.hook_resid_pre" activation at [0, position] for `prompt`.

    Runs `model.run_with_cache` on the prompt and indexes element [1] of the
    result by hook name, exactly as the single-vector example does.
    """
    return model.run_with_cache(prompt)[1]["blocks.21.hook_resid_pre"][0, position]


# One vector per prompt; the position index differs because the prompts differ
# in length (presumably both land on the "CEO" token -- TODO confirm).
steve = _resid_pre_21("Apple's former CEO attended the Oscars", 5)
elon = _resid_pre_21("Tesla's CEO attended the Oscars", 4)

In [75]:
# Stack the two vectors (scaled by 2 and by 0.5 respectively) into one tensor
# so both can be patched in the same call.
scaled_pair = torch.stack([2 * steve, 0.5 * elon])

# Target prompt with two adjacent "{}" slots, one per stacked vector
# (rendered as "XX" in the recorded output).
two_slot_prompt = "The companies founded by {}{} are:\n1. "

interp.patchscopes(scaled_pair, two_slot_prompt, n=100)

Patchscopes Explanation: '<bos>The companies founded by XX are:
1. <strong>Tesla</strong>
2. <strong>SpaceX</strong>
3. <strong>SolarCity</strong>
4. <strong>Neuralink</strong>
5. <strong>The Boring Company</strong>

The companies founded by the co-founder are:
1. <strong>Apple</strong>
2. <strong>NeXT</strong>
3. <strong>AOL</strong>
4. <strong>Pixar</strong>
5. <strong>Cupertino</strong>

The companies founded by the co-founder are:
1.'