In [None]:
from IPython import get_ipython

ipython = get_ipython()
# Code to automatically update the HookedTransformer code as its edited without restarting the kernel
ipython.magic("load_ext autoreload")
ipython.magic("autoreload 2")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


  ipython.magic("load_ext autoreload")
  ipython.magic("autoreload 2")


In [None]:
# Import stuff
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import einops
from fancy_einsum import einsum
import tqdm.notebook as tqdm
import random
from pathlib import Path
import plotly.express as px
from torch.utils.data import DataLoader

from jaxtyping import Float, Int
from typing import List, Union, Optional
from functools import partial
import copy

import itertools
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
import dataclasses
import datasets
from IPython.display import HTML

In [None]:
import pysvelte

import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookedRootModule,
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache

In [None]:
def imshow(tensor, renderer=None, **kwargs):
    px.imshow(utils.to_numpy(tensor), color_continuous_midpoint=0.0, color_continuous_scale="RdBu", **kwargs).show(renderer)

def line(tensor, renderer=None, **kwargs):
    px.line(y=utils.to_numpy(tensor), **kwargs).show(renderer)

def scatter(x, y, xaxis="", yaxis="", caxis="", renderer=None, **kwargs):
    x = utils.to_numpy(x)
    y = utils.to_numpy(y)
    px.scatter(y=y, x=x, labels={"x":xaxis, "y":yaxis, "color":caxis}, **kwargs).show(renderer)

### Indirect Object Identification


In [None]:
model = HookedTransformer.from_pretrained(
    "gpt2-small",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    refactor_factored_attn_matrices=True
)

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-small into HookedTransformer


In [None]:
example_prompt = "After John and Mary went to the store, John gave a bottle of milk to"

In [None]:
example_answer = " Mary"

In [None]:
utils.test_prompt(example_prompt, " John", model, prepend_bos=True)

Tokenized prompt: ['<|endoftext|>', 'After', ' John', ' and', ' Mary', ' went', ' to', ' the', ' store', ',', ' John', ' gave', ' a', ' bottle', ' of', ' milk', ' to']
Tokenized answer: [' John']


Top 0th token. Logit: 18.09 Prob: 70.07% Token: | Mary|
Top 1th token. Logit: 15.38 Prob:  4.67% Token: | the|
Top 2th token. Logit: 15.35 Prob:  4.54% Token: | John|
Top 3th token. Logit: 15.25 Prob:  4.11% Token: | them|
Top 4th token. Logit: 14.84 Prob:  2.73% Token: | his|
Top 5th token. Logit: 14.06 Prob:  1.24% Token: | her|
Top 6th token. Logit: 13.54 Prob:  0.74% Token: | a|
Top 7th token. Logit: 13.52 Prob:  0.73% Token: | their|
Top 8th token. Logit: 13.13 Prob:  0.49% Token: | Jesus|
Top 9th token. Logit: 12.97 Prob:  0.42% Token: | him|


In [None]:
prompt_format = [
    "When John and Mary went to the shops,{} gave the bag to",
    "When Tom and James went to the park,{} gave the ball to",
    "When Dan and Sid went to the shops,{} gave an apple to",
    "After Martin and Amy went to the park,{} gave a drink to",
]
names = [
    (" Mary", " John"),
    (" Tom", " James"),
    (" Dan", " Sid"),
    (" Martin", " Amy"),
]
# List of prompts
prompts = []
# List of answers, in the format (correct, incorrect)
answers = []
# List of the token (ie an integer) corresponding to each answer, in the format (correct_token, incorrect_token)
answer_tokens = []
for i in range(len(prompt_format)):
    for j in range(2):
        answers.append((names[i][j], names[i][1 - j]))
        answer_tokens.append(
            (
                model.to_single_token(answers[-1][0]),
                model.to_single_token(answers[-1][1]),
            )
        )
        # Insert the *incorrect* answer to the prompt, making the correct answer the indirect object.
        prompts.append(prompt_format[i].format(answers[-1][1]))
answer_tokens = torch.tensor(answer_tokens)
print(prompts)
print(answers)

['When John and Mary went to the shops, John gave the bag to', 'When John and Mary went to the shops, Mary gave the bag to', 'When Tom and James went to the park, James gave the ball to', 'When Tom and James went to the park, Tom gave the ball to', 'When Dan and Sid went to the shops, Sid gave an apple to', 'When Dan and Sid went to the shops, Dan gave an apple to', 'After Martin and Amy went to the park, Amy gave a drink to', 'After Martin and Amy went to the park, Martin gave a drink to']
[(' Mary', ' John'), (' John', ' Mary'), (' Tom', ' James'), (' James', ' Tom'), (' Dan', ' Sid'), (' Sid', ' Dan'), (' Martin', ' Amy'), (' Amy', ' Martin')]


In [None]:
tokens = model.to_tokens(prompts, prepend_bos=True)

In [None]:
original_logits, cache = model.run_with_cache(tokens)

##### Example 1

In [None]:
import torch
import torch.nn.functional as F

In [None]:
# Set the random seed for reproducibility
torch.manual_seed(42)

# Random Q, K, and V vectors for 5 tokens with dimension 3
q = torch.randn(5, 3)
k = torch.randn(5, 3)
v = torch.randn(5, 3)

In [None]:
# Calculate the attention scores by taking the dot product of Q and K
attn_scores = torch.matmul(q, k.t())  # Shape: (5, 5)

In [None]:
# Apply mask to the attention scores to ignore positions after the current one
mask = torch.triu(torch.ones(5, 5), diagonal=1).bool()
attn_scores.masked_fill_(mask, float('-inf'))

tensor([[ 0.6844,    -inf,    -inf,    -inf,    -inf],
        [-1.4162,  0.4144,    -inf,    -inf,    -inf],
        [ 1.6491, -0.5878,  2.0916,    -inf,    -inf],
        [ 1.6247,  0.4884,  0.6516,  0.5284,    -inf],
        [-1.9787, -0.9195,  0.5297, -0.2993, -0.7008]])

In [None]:
# Calculate the attention pattern by applying row-wise softmax
attn_pattern = F.softmax(attn_scores, dim=-1)  # Shape: (5, 5)

In [None]:
attn_pattern.shape

torch.Size([5, 5])

In [None]:
attn_pattern[1].shape

torch.Size([5])

In [None]:
attn_pattern.shape

torch.Size([5, 5])

In [None]:
v.shape

torch.Size([5, 3])

In [None]:
v[1]

tensor([ 0.8640, -1.0157, -0.8887])

In [None]:
v.shape

torch.Size([5, 3])

In [None]:
v.shape

torch.Size([5, 3])

In [None]:
attn_pattern[1] @ v[:, 1]

tensor(-0.7746)

In [None]:
import torch
import torch.nn.functional as F

In [None]:
# Set the random seed for reproducibility
torch.manual_seed(42)

# Random Q, K, and V vectors for 5 tokens with dimension 3
q = torch.randn(5, 3)
k = torch.randn(5, 3)
v = torch.randn(5, 3)

In [None]:
# Calculate the attention scores by taking the dot product of Q and K
attn_scores = torch.matmul(q, k.t())  # Shape: (5, 5)

In [None]:
# Apply mask to the attention scores to ignore positions after the current one
mask = torch.triu(torch.ones(5, 5), diagonal=1).bool()
attn_scores.masked_fill_(mask, float('-inf'))

tensor([[ 0.6844,    -inf,    -inf,    -inf,    -inf],
        [-1.4162,  0.4144,    -inf,    -inf,    -inf],
        [ 1.6491, -0.5878,  2.0916,    -inf,    -inf],
        [ 1.6247,  0.4884,  0.6516,  0.5284,    -inf],
        [-1.9787, -0.9195,  0.5297, -0.2993, -0.7008]])

In [None]:
# Calculate the attention pattern by applying row-wise softmax
attn_pattern = F.softmax(attn_scores, dim=-1)  # Shape: (5, 5)

In [None]:
attn_pattern.shape

torch.Size([5, 5])

In [None]:
attn_pattern[1].shape

torch.Size([5])

In [None]:
attn_pattern.shape

torch.Size([5, 5])

In [None]:
v.shape

torch.Size([5, 3])

In [None]:
v[1]

tensor([ 0.8640, -1.0157, -0.8887])

In [None]:
v.shape

torch.Size([5, 3])

In [None]:
v.shape

torch.Size([5, 3])

In [None]:
attn_pattern[1] @ v[:, 1]

tensor(-0.7746)

In [None]:
# Compute the mixed value by multiplying the attention pattern with V
mixed_value = torch.matmul(attn_pattern, v)  # Shape: (5, 3)

print("Q:")
print(q)
print("K:")
print(k)
print("V:")
print(v)
print("Attention Scores:")
print(attn_scores)
print("Attention Pattern:")
print(attn_pattern)
print("Mixed Value:")
print(mixed_value)

Q:
tensor([[ 0.3367,  0.1288,  0.2345],
        [ 0.2303, -1.1229, -0.1863],
        [ 2.2082, -0.6380,  0.4617],
        [ 0.2674,  0.5349,  0.8094],
        [ 1.1103, -1.6898, -0.9890]])
K:
tensor([[ 0.9580,  1.3221,  0.8172],
        [-0.7658, -0.7506,  1.3525],
        [ 0.6863, -0.3278,  0.7950],
        [ 0.2815,  0.0562,  0.5227],
        [-0.2384, -0.0499,  0.5263]])
V:
tensor([[-0.0085,  0.7291,  0.1331],
        [ 0.8640, -1.0157, -0.8887],
        [ 0.1498, -0.2089, -0.3870],
        [ 0.9912,  0.4679, -0.2049],
        [-0.7409,  0.3618,  1.9199]])
Attention Scores:
tensor([[ 0.6844,    -inf,    -inf,    -inf,    -inf],
        [-1.4162,  0.4144,    -inf,    -inf,    -inf],
        [ 1.6491, -0.5878,  2.0916,    -inf,    -inf],
        [ 1.6247,  0.4884,  0.6516,  0.5284,    -inf],
        [-1.9787, -0.9195,  0.5297, -0.2993, -0.7008]])
Attention Pattern:
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1382, 0.8618, 0.0000, 0.0000, 0.0000],
        [0.3755, 0.0

##### Example 2

In [None]:
from transformer_lens.utils import get_act_name

In [None]:
act_name = get_act_name("k6")

In [None]:
type(cache)

transformer_lens.ActivationCache.ActivationCache

In [None]:
act_name

'blocks.6.attn.hook_k'

Extract the activation of `act_name`

In [None]:
activations = cache[act_name]

In [None]:
activations.shape

torch.Size([8, 15, 12, 64])

##### Example 3

In [None]:
type(cache)

transformer_lens.ActivationCache.ActivationCache

Extract all intermediate activations of the attention in the second layer

In [None]:
activations = cache["attn", 1]

In [None]:
activations.shape

torch.Size([8, 12, 15, 15])

`batch_size` = 8, `seq_len` = 12

Extract the activations of the third head in the second sentence

In [None]:
act_head = activations[1, 2]

In [None]:
act_head.shape

torch.Size([15, 15])

In [5]:
args = class C()

SyntaxError: invalid syntax (1947284924.py, line 1)

In [4]:
args.x = 1

AttributeError: 'object' object has no attribute 'x'