# Setup

In [1]:
%%capture
# %pip install git+https://github.com/redwoodresearch/Easy-Transformer.git
%pip install git+https://github.com/wlg1/Easy-Transformer.git
%pip install einops datasets transformers fancy_einsum

In [2]:
from copy import deepcopy
import torch

# assert torch.cuda.device_count() == 1
from tqdm import tqdm
import pandas as pd
import torch
import torch as t
from easy_transformer.EasyTransformer import (
    EasyTransformer,
)
from time import ctime
from functools import partial

import numpy as np
from tqdm import tqdm
import pandas as pd

from easy_transformer.experiments import (
    ExperimentMetric,
    AblationConfig,
    EasyAblation,
    EasyPatching,
    PatchingConfig,
)
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import random
import einops
from IPython import get_ipython
from copy import deepcopy
from easy_transformer.ioi_dataset import (
    IOIDataset,
)
from easy_transformer.ioi_utils import (
    path_patching,
    max_2d,
    CLASS_COLORS,
    show_pp,
    show_attention_patterns,
    scatter_attention_and_contribution,
)
from random import randint as ri
from easy_transformer.ioi_circuit_extraction import (
    do_circuit_extraction,
    get_heads_circuit,
    CIRCUIT,
)
from easy_transformer.ioi_utils import logit_diff, probs
from easy_transformer.ioi_utils import get_top_tokens_and_probs as g

# Turn on autoreload when running under IPython/Jupyter so that edits to
# imported modules are picked up without restarting the kernel. When
# get_ipython() returns None (plain Python) this block is skipped.
ipython = get_ipython()
if ipython is not None:
    # run_line_magic(name, args) replaces the deprecated InteractiveShell.magic().
    ipython.run_line_magic("load_ext", "autoreload")
    ipython.run_line_magic("autoreload", "2")

 Initialise model (use larger N or fewer templates for no warnings about in-template ablation)

In [3]:
# Load the pretrained checkpoint named "gpt2" into an EasyTransformer.
# The variant with an explicit .cuda() call is kept commented out for reference.
# model = EasyTransformer.from_pretrained("gpt2").cuda()
model = EasyTransformer.from_pretrained("gpt2")
# NOTE(review): presumably this makes the per-head `attn.hook_result`
# activations available, which the scatter-plot code later in this notebook
# reads from the cache — confirm against the EasyTransformer source.
model.set_use_attn_result(True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Moving model to device:  cuda
Finished loading pretrained model gpt2 into EasyTransformer!


# Dataset of Prompts

https://github.com/redwoodresearch/Easy-Transformer/blob/main/easy_transformer/ioi_dataset.py

See:
class IOIDataset:
...

elif isinstance(prompt_type, list):
    self.templates = prompt_type

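`prompt_type` is a required argument; passing a list of template strings (the `isinstance(prompt_type, list)` branch above) is how custom templates are supplied.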

In [None]:
N=10
custom_templates = [
    " The human is [A]. The animal is [B]. The human is",
]
# IOIDataset imported from lib
dataset = IOIDataset(prompt_type=custom_templates, N=N, tokenizer=model.tokenizer, prepend_bos=False)



In [None]:
dataset.sentences[:3]

[' The human is John. The animal is Amy. The human is',
 ' The human is Jeffrey. The animal is Richard. The human is',
 ' The human is Sara. The animal is Christine. The human is']

In [None]:
model_logit_diff = logit_diff(model, dataset)
model_io_probs = probs(model, dataset)
print(
    f"The model gets average logit difference {model_logit_diff.item()} over {N} examples"
)
print(f"The model gets average IO probs {model_io_probs.item()} over {N} examples")

The model gets average logit difference 1.0955629348754883 over 10 examples
The model gets average IO probs 1.6234673239523545e-05 over 10 examples


# Copy score

In [None]:
# Run the model once on the dataset tokens while caching the activation named
# "blocks.0.hook_resid_post", then pass that activation through block 1's
# `attn.ln1`. `cache` and `z_0` are inspected by the cells that follow.
cache = {}
# The filter selects exactly one hook name to cache.
# NOTE(review): it looks like cache_some registers hooks that stay attached to
# the model after this cell — confirm whether they need to be reset.
model.cache_some(cache, lambda x: x == "blocks.0.hook_resid_post")
model(dataset.toks.long())
z_0 = model.blocks[1].attn.ln1(cache["blocks.0.hook_resid_post"])

In [None]:
z_0.shape

torch.Size([10, 13, 768])

In [None]:
layer, head = 9, 9
model.blocks[layer].attn.W_V[head].shape

torch.Size([768, 64])

In [None]:
model.blocks[layer].attn.W_O[head].shape

torch.Size([64, 768])

In [None]:
model.blocks[layer].attn.W_O[0]

tensor([[-0.1318, -0.1279,  0.0167,  ..., -0.0699, -0.0487, -0.1682],
        [ 0.2072, -0.0121,  0.1069,  ...,  0.0012,  0.1675,  0.0476],
        [-0.0355,  0.1091,  0.1889,  ..., -0.0576,  0.1558,  0.0071],
        ...,
        [-0.0205, -0.0530, -0.0531,  ..., -0.0044,  0.2247,  0.0134],
        [ 0.0336, -0.1307, -0.0625,  ...,  0.1310,  0.2081, -0.1060],
        [ 0.0877, -0.1131, -0.0253,  ..., -0.0442,  0.0106,  0.0341]],
       grad_fn=<SelectBackward0>)

In [None]:
model.blocks[layer].attn.W_O[1]

tensor([[ 0.1124,  0.1710,  0.0596,  ..., -0.1461,  0.0331, -0.1071],
        [-0.0867, -0.1263,  0.0233,  ..., -0.1319,  0.2095,  0.1257],
        [ 0.0527,  0.1728, -0.0175,  ...,  0.3649, -0.0555,  0.1738],
        ...,
        [ 0.0726, -0.1843, -0.1738,  ...,  0.1236,  0.3114, -0.2309],
        [ 0.1949, -0.0181, -0.0466,  ..., -0.2510, -0.1156,  0.0460],
        [ 0.1459,  0.1847,  0.0088,  ...,  0.1253,  0.0934, -0.2282]],
       grad_fn=<SelectBackward0>)

In [None]:
dir(model.blocks[9].attn)

In [None]:
model.blocks[9].mlp

In [None]:
dataset.toks.long().shape

torch.Size([10, 13])



---



In [None]:
def check_copy_circuit(model, layer, head, ioi_dataset, verbose=False, neg=False):
    """Measure how often one head's value/output path reproduces the name tokens.

    The model is run once on ``ioi_dataset.toks`` while caching
    ``blocks.0.hook_resid_post``. That activation is passed through block 1's
    ``attn.ln1``, multiplied by the chosen head's ``W_V`` (plus ``b_V``) and
    ``W_O``, then sent through ``ln_final`` and ``unembed``. For each prompt and
    each of the positions ``IO``, ``S`` and ``S2`` the top-5 tokens of the
    resulting logits are decoded; a position counts as correct when
    ``" " + name`` is among them.

    Args:
        model: transformer exposing ``cache_some``, ``blocks``, ``ln_final``,
            ``unembed`` and ``tokenizer``.
        layer: index of the block whose head is examined.
        head: index of the head within that block.
        ioi_dataset: object exposing ``toks``, ``ioi_prompts``, ``word_idx``,
            ``N`` and (only when ``verbose``) ``sentences``.
        verbose: when True, print the sequence, the target name and the ranked
            top-5 tokens for every position that is NOT correct.
        neg: when True, the head output is multiplied by -1 before unembedding.

    Returns:
        ``(percent_right, pred_tokens)`` where ``percent_right`` is
        ``n_right / (ioi_dataset.N * 3) * 100`` and ``pred_tokens`` is the
        decoded top-5 list of the LAST position evaluated (an empty list when
        there are no prompts).
    """
    cache = {}
    model.cache_some(cache, lambda x: x == "blocks.0.hook_resid_post")
    model(ioi_dataset.toks.long())
    sign = -1 if neg else 1
    # NOTE(review): block 1's ln1 is used regardless of `layer`; presumably a
    # deliberate approximation of the head's input — confirm.
    z_0 = model.blocks[1].attn.ln1(cache["blocks.0.hook_resid_post"])

    v = torch.einsum("eab,bc->eac", z_0, model.blocks[layer].attn.W_V[head])
    v += model.blocks[layer].attn.b_V[head].unsqueeze(0).unsqueeze(0)

    o = sign * torch.einsum("sph,hd->spd", v, model.blocks[layer].attn.W_O[head])
    logits = model.unembed(model.ln_final(o))

    k = 5
    n_right = 0
    pred_tokens = []  # keeps the return value defined when there are no prompts

    for seq_idx, prompt in enumerate(ioi_dataset.ioi_prompts):
        for word in ["IO", "S", "S2"]:
            # Top-k is computed once and reused for both the hit test and the
            # verbose report.
            top_ids = torch.topk(
                logits[seq_idx, ioi_dataset.word_idx[word][seq_idx]], k
            ).indices
            pred_tokens = [model.tokenizer.decode(token) for token in top_ids]
            # Both "S" and "S2" positions are scored against the prompt's "S" name.
            name = "S" if "S" in word else word
            if " " + prompt[name] in pred_tokens:
                n_right += 1
            elif verbose:
                print("-------")
                print("Seq: " + ioi_dataset.sentences[seq_idx])
                print("Target: " + ioi_dataset.ioi_prompts[seq_idx][name])
                print(
                    " ".join(
                        f"({i+1}):{tok}" for i, tok in enumerate(pred_tokens)
                    )
                )
    percent_right = (n_right / (ioi_dataset.N * 3)) * 100
    # Report the accuracy as a percentage; the decoded tokens are shown
    # separately instead of being printed in front of the "%" sign.
    print(
        f"Copy circuit for head {layer}.{head} (sign={sign}) : "
        f"Top {k} accuracy: {percent_right}% (last top-{k} tokens: {pred_tokens})"
    )
    return percent_right, pred_tokens

In [None]:
check_copy_circuit(model, 9, 9, dataset)

Copy circuit for head 9.9 (sign=1) : [' Kristen', 'Krist', ' Stewart', ' Krist', ' Stefan']%


(100.0, [' Kristen', 'Krist', ' Stewart', ' Krist', ' Stefan'])

In [None]:
N=2
custom_templates = [
    "[A] is a teacher. [B] is a student. The child is [B]. [C] is a teacher. [D] is a student. The child is",
]
dataset2 = IOIDataset(prompt_type=custom_templates, N=N, tokenizer=model.tokenizer, prepend_bos=False)



ValueError: ignored

```
def get_name_idxs(prompts, tokenizer, idx_types=["IO", "S", "S2"], prepend_bos=False):
```

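ISSUE: `IOIDataset` only supports three specific name types (`IO`, `S`, `S2`, see `idx_types` above), so a template with four names (`[A]`–`[D]`) fails with the `ValueError` shown.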

## Rewrite copy scores and dataset

The `IOIDataset` class just stores values that the copy-score and path-patching functions read, and it comes with its own requirements. So you don't have to use it: you can write your own class instead, as long as it exposes the same attributes.

First, find what's being called in copy scores, so they can be emulated in your new object

### Explore IOIdataset's vars

In [None]:
N=2
custom_templates = [
    "Then, [A], [B] and [C] went to the [PLACE]. [B] and [C] gave a [OBJECT] to [A]",
]
dataset = IOIDataset(prompt_type=custom_templates, N=N, tokenizer=model.tokenizer, prepend_bos=False)



In [None]:
vars(dataset).keys()

dict_keys(['prompt_type', 'templates', 'tokenizer', 'prefixes', 'ioi_prompts', 'groups', 'sentences', 'templates_by_prompt', 'toks', 'word_idx', 'prepend_bos', 'sem_tok_idx', 'N', 'max_len', 'io_tokenIDs', 's_tokenIDs', 'tokenized_prompts'])

In [None]:
dataset.toks.long().shape

torch.Size([2, 24])

In [None]:
#for seq_idx, prompt in enumerate(ioi_dataset.ioi_prompts):
dataset.ioi_prompts

[{'[PLACE]': 'restaurant',
  '[OBJECT]': 'ring',
  'text': 'Then, Sarah, Jeffrey and [C] went to the restaurant. Jeffrey and [C] gave a ring to Sarah',
  'IO': 'Sarah',
  'S': 'Jeffrey',
  'TEMPLATE_IDX': 0},
 {'[PLACE]': 'office',
  '[OBJECT]': 'computer',
  'text': 'Then, Sean, Jessica and [C] went to the office. Jessica and [C] gave a computer to Sean',
  'IO': 'Sean',
  'S': 'Jessica',
  'TEMPLATE_IDX': 0}]

In [None]:
dataset.word_idx['S'][0]  # token position includes punctuation

tensor(4)

In [None]:
dataset.word_idx['S'][0].item()

4

In [None]:
dataset.word_idx

{'IO': tensor([2, 2]),
 'IO-1': tensor([1, 1]),
 'IO+1': tensor([3, 3]),
 'S': tensor([4, 4]),
 'S-1': tensor([3, 3]),
 'S+1': tensor([5, 5]),
 'S2': tensor([14, 14]),
 'end': tensor([22, 22]),
 'starts': tensor([0, 0]),
 'punct': tensor([13, 13])}

In [None]:
dataset.toks.long().shape[1] - 2

22

So you don't need place or object in each prompt. The S and IO are replaced by S1, S2, etc. and the most recent (S4) should be the answer. We aim to find which heads copy scores include S4; these are name movers that move S4 to the output (contributing much to how it becomes the top logit by the final layer).

In [None]:
# Sketch of the per-prompt record the new dataset class will consume: the
# prompt text plus one key per subject (S1..S4). The original cell had an
# unmatched "]" and could not be parsed; it is now a valid list literal.
example_prompts = [
    {"text": "Alice is a teacher. Bob is a student. The child is [B]. [C] is a teacher. [D] is a student. The child is",
     "S1": "Alice", "S2": "Bob", "S3": "Carol", "S4": "Dave"},
]
example_prompts

In [None]:
tokens = model.tokenizer.tokenize("teacher")
len(tokens)

2

In [None]:
# Locate the first occurrence of a (space-prefixed) name token in the
# tokenized prompt and print its position.
string = "Alice is a teacher. Bob is a student. The child is Bob. Carol is a teacher. David is a student. The child is"
target_token = "ĠBob"

tokens = model.tokenizer.tokenize(string)

try:
    # list.index returns the position of the first match only
    target_index = tokens.index(target_token)
except ValueError:
    print("Target token not found in the string.")
else:
    print(target_index)


5


In [None]:
tokens

['Alice',
 'Ġis',
 'Ġa',
 'Ġteacher',
 '.',
 'ĠBob',
 'Ġis',
 'Ġa',
 'Ġstudent',
 '.',
 'ĠThe',
 'Ġchild',
 'Ġis',
 'ĠBob',
 '.',
 'ĠCarol',
 'Ġis',
 'Ġa',
 'Ġteacher',
 '.',
 'ĠDavid',
 'Ġis',
 'Ġa',
 'Ġstudent',
 '.',
 'ĠThe',
 'Ġchild',
 'Ġis']

### New Dataset Class

In [None]:
# test word_idx fn

# Scratch test of the subject-position lookup before moving it into a class.
ioi_prompts = [{'text': 'Alice is a teacher. Bob is a student. The child is Bob. Carol is a teacher. David is a student. The child is',
  'S1': 'Alice',
  'S2': 'Bob',
  'S3': 'Carol',
  'S4': 'David'}]

N=1
# NOTE(review): S1 is hard-coded to position 1, but the token listing printed
# earlier shows 'Alice' at position 0 and 'Ġis' at position 1 — this looks
# like an off-by-one; the notebook investigates it further down.
word_idx = {"S1": torch.tensor([1]*N)}
for subj in ["S2", "S3", "S4"]:
    subj_lst = []
    for prompt in ioi_prompts:
        input_text = prompt["text"]
        # "Ġ" is how the tokenizer output shown earlier marks a leading space
        target_token = "Ġ" + prompt[subj]

        tokens = model.tokenizer.tokenize(input_text)
        # first occurrence only; raises ValueError when the token is absent
        target_index = tokens.index(target_token)
        subj_lst.append(target_index)
    word_idx[subj] = torch.tensor(subj_lst)
word_idx

{'S1': tensor([1]), 'S2': tensor([5]), 'S3': tensor([15]), 'S4': tensor([20])}

In [15]:
class Dataset:
    """First-draft container for "most recent subject" prompts.

    Stores the prompts, their padded token ids and, for every subject key
    (``S1``..``S4``), the token position of that subject in each prompt.

    Known issue (left unchanged on purpose, because the later sections of this
    notebook diagnose and fix it): ``word_idx["S1"]`` is hard-coded to
    position 1 for every prompt instead of being looked up.

    Args:
        ioi_prompts: list of dicts with a ``"text"`` entry and one entry per
            subject key ``"S1"``..``"S4"``.
        tokenizer: object that is callable on a list of texts (returning
            something with ``input_ids``) and provides ``tokenize(text)``.
        N: number of prompts; used to size the hard-coded ``S1`` index tensor.

    Attributes:
        toks: integer tensor of padded token ids, one row per prompt.
        word_idx: dict mapping each subject key to a tensor of positions.

    Raises:
        ValueError: if ``"Ġ" + name`` for S2..S4 is not a token of the prompt.
    """

    def __init__(self, ioi_prompts, tokenizer, N):
        self.ioi_prompts = ioi_prompts
        self.tokenizer = tokenizer
        self.N = N

        texts = [prompt["text"] for prompt in self.ioi_prompts]
        # Build the integer tensor directly rather than via a float tensor.
        self.toks = torch.tensor(
            self.tokenizer(texts, padding=True).input_ids, dtype=torch.int
        )

        # Tokenize each prompt once, using the tokenizer that was passed in
        # (the draft silently depended on the notebook-global `model`).
        tokenized = [self.tokenizer.tokenize(text) for text in texts]

        self.word_idx = {"S1": torch.tensor([1] * N)}
        for subj in ["S2", "S3", "S4"]:
            # "Ġ" marks a leading space in the tokenizer output; index()
            # returns the first occurrence of the name token.
            self.word_idx[subj] = torch.tensor(
                [
                    tokens.index("Ġ" + prompt[subj])
                    for prompt, tokens in zip(self.ioi_prompts, tokenized)
                ]
            )

In [None]:
# Build a single prompt record by hand: fill the [S1]..[S4] placeholders with
# the names (in order) and store each name under its "S<i>" key.
template = "[S1] is a teacher. [S2] is a student. The child is [S2]. [S3] is a teacher. [S4] is a student. The child is"
names = ["Alice", "Bob", "Carol", "David"]

ioi_prompts_elem = {}
for i, name in enumerate(names, start=1):
    placeholder = "[S" + str(i) + "]"
    # NOTE: this overwrites `template`, so after the loop it holds the filled
    # sentence rather than the placeholder template.
    template = template.replace(placeholder, name)
ioi_prompts_elem['text'] = template

for i, name in enumerate(names, start=1):
    placeholder = "S" + str(i)
    ioi_prompts_elem[placeholder] = name

# The dataset class expects a list of records, one per prompt.
ioi_prompts = [ioi_prompts_elem]

In [None]:
ioi_prompts

[{'text': 'Alice is a teacher. Bob is a student. The child is Bob. Carol is a teacher. David is a student. The child is',
  'S1': 'Alice',
  'S2': 'Bob',
  'S3': 'Carol',
  'S4': 'David'}]

In [None]:
N=1
dataset = Dataset(ioi_prompts, model.tokenizer, N)

### Rewrite copy scores to not use ioi_dataset

In [13]:
def check_copy_circuit_2(model, layer, head, ioi_dataset, verbose=False, neg=False, print_tokens=True):
    """Copy-score check for prompts with four subjects (S1..S4).

    The model is run once on ``ioi_dataset.toks`` while caching
    ``blocks.0.hook_resid_post``. That activation goes through block 1's
    ``attn.ln1``, the chosen head's ``W_V`` (plus ``b_V``) and ``W_O``, then
    ``ln_final`` and ``unembed``. For every prompt and every subject key the
    top-5 tokens at that subject's position are decoded; the subject counts as
    "moved" when ``" " + name`` is among them. The top-5 accuracy is printed.

    Args:
        model: transformer exposing ``cache_some``, ``blocks``, ``ln_final``,
            ``unembed`` and ``tokenizer``.
        layer: index of the block whose head is examined.
        head: index of the head within that block.
        ioi_dataset: object exposing ``toks``, ``ioi_prompts`` (dicts with
            ``"text"`` and ``"S1"``..``"S4"``), ``word_idx`` and ``N``.
        verbose: when True, print the prompt text, the target name and the
            ranked top-5 tokens for every subject that is NOT moved.
            (Previously this flag was accepted but ignored.)
        neg: when True, the head output is multiplied by -1 before unembedding.
        print_tokens: despite its name this selects the RETURN value, not what
            is printed. Truthy: return the name -> top-5 tokens dict.
            Falsy: return the list of moved subject names.

    Returns:
        dict mapping each subject name to its decoded top-5 tokens, or the list
        of subject names found in their own top-5 (see ``print_tokens``).
        The dict is keyed by name, so a name occurring in several prompts
        keeps only the entry written last.
    """
    cache = {}
    model.cache_some(cache, lambda x: x == "blocks.0.hook_resid_post")
    model(ioi_dataset.toks.long())
    sign = -1 if neg else 1
    # NOTE(review): block 1's ln1 is used regardless of `layer`; presumably a
    # deliberate approximation of the head's input — confirm.
    z_0 = model.blocks[1].attn.ln1(cache["blocks.0.hook_resid_post"])

    v = torch.einsum("eab,bc->eac", z_0, model.blocks[layer].attn.W_V[head])
    v += model.blocks[layer].attn.b_V[head].unsqueeze(0).unsqueeze(0)

    o = sign * torch.einsum("sph,hd->spd", v, model.blocks[layer].attn.W_O[head])
    logits = model.unembed(model.ln_final(o))

    k = 5
    n_right = 0

    S_pred_tokens = {}
    subjects_moved = []
    for seq_idx, prompt in enumerate(ioi_dataset.ioi_prompts):
        for word in ["S1", "S2", "S3", "S4"]:
            top_ids = torch.topk(
                logits[seq_idx, ioi_dataset.word_idx[word][seq_idx]], k
            ).indices
            pred_tokens = [model.tokenizer.decode(token) for token in top_ids]
            S_pred_tokens[prompt[word]] = pred_tokens
            if " " + prompt[word] in pred_tokens:
                n_right += 1
                subjects_moved.append(prompt[word])
            elif verbose:
                print("-------")
                print("Seq: " + prompt["text"])
                print("Target: " + prompt[word])
                print(
                    " ".join(
                        f"({i+1}):{tok}" for i, tok in enumerate(pred_tokens)
                    )
                )
    # Four subject positions are scored per prompt.
    percent_right = (n_right / (ioi_dataset.N * 4)) * 100
    print(f"Copy circuit for head {layer}.{head} (sign={sign}) : Top {k} accuracy: {percent_right}%"  )
    if print_tokens:
        return S_pred_tokens
    return subjects_moved

In [None]:
check_copy_circuit_2(model, 9, 9, dataset)

Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 75.0%


{'Alice': [' Titus', ' Bits', ' Ner', ' Fit', ' Kod'],
 'Bob': [' Bob', 'Bob', ' bob', 'ub', 'ob'],
 'Carol': [' Carol', ' CAR', ' Carroll', ' Charl', ' Carnegie'],
 'David': [' David', 'David', ' david', 'avid', ' Davidson']}

Compare to random head

In [None]:
check_copy_circuit_2(model, 3, 2, dataset)

Copy circuit for head 3.2 (sign=1) : Top 5 accuracy: 0.0%


{'Alice': ['atre', '装', 'yne', '中', 'ilda'],
 'Bob': [' Mason', 'ovember', ' McMaster', '�', 'claw'],
 'Carol': ['wat', ' Tank', ' McMaster', 'wan', 'ants'],
 'David': [' QC', ' Hague', 'wat', ' Stock', ' Stevenson']}

## Get important heads

Find what heads are specific to certain inputs, and what's common to the template.

Get important heads from:

simple_analogies_circuits.ipynb

https://colab.research.google.com/drive/1mhcgx2SU3GrDq3pMZp_-JPtE_fO-7kGg#scrollTo=_ChKijEui-KV&line=3&uniqifier=1

most_recent_S_attn_pat.ipynb

https://colab.research.google.com/drive/1KaqcS92-BI4FZ7m-r8rCW9tIovxA_s93#scrollTo=VcFgqbcF4YvI

Positives (blue) List:

L8, H11

L9, H9



```
Top value 1: Row=9, Column=9, Value=2.5532712936401367
Top value 2: Row=8, Column=11, Value=2.1216535568237305
Top value 3: Row=10, Column=6, Value=1.6274135112762451
Top value 4: Row=11, Column=1, Value=0.37464624643325806
Top value 5: Row=8, Column=6, Value=0.36867403984069824
```

In [None]:
top_val = [(9, 9), (8, 11), (10, 6), (11, 1), (8, 6)]
for layer, head in top_val:
    print(check_copy_circuit_2(model, layer, head, dataset, print_tokens=False))

Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 75.0%
['Bob', 'Carol', 'David']
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 75.0%
['Bob', 'Carol', 'David']
Copy circuit for head 10.6 (sign=1) : Top 5 accuracy: 50.0%
['Bob', 'David']
Copy circuit for head 11.1 (sign=1) : Top 5 accuracy: 75.0%
['Bob', 'Carol', 'David']
Copy circuit for head 8.6 (sign=1) : Top 5 accuracy: 0.0%
[]


In [None]:
top_val = [(9, 9), (8, 11), (10, 6), (11, 1), (8, 6), (9, 2), (2, 10), (11, 8), (8, 8), (5, 11)]
for index, (layer, head) in enumerate(top_val):
    print(index, check_copy_circuit_2(model, layer, head, dataset, print_tokens=False))

Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 75.0%
0 ['Bob', 'Carol', 'David']
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 75.0%
1 ['Bob', 'Carol', 'David']
Copy circuit for head 10.6 (sign=1) : Top 5 accuracy: 50.0%
2 ['Bob', 'David']
Copy circuit for head 11.1 (sign=1) : Top 5 accuracy: 75.0%
3 ['Bob', 'Carol', 'David']
Copy circuit for head 8.6 (sign=1) : Top 5 accuracy: 0.0%
4 []
Copy circuit for head 9.2 (sign=1) : Top 5 accuracy: 0.0%
5 []
Copy circuit for head 2.10 (sign=1) : Top 5 accuracy: 0.0%
6 []
Copy circuit for head 11.8 (sign=1) : Top 5 accuracy: 0.0%
7 []
Copy circuit for head 8.8 (sign=1) : Top 5 accuracy: 25.0%
8 ['Carol']
Copy circuit for head 5.11 (sign=1) : Top 5 accuracy: 0.0%
9 []


Find similarity of these top results to "David"

Look at random heads

In [None]:
check_copy_circuit_2(model, 10, 10, dataset)

Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 50.0%


(50.0, [' Del', 'Del', ' EC', ' Dak', ' De'])

In [None]:
check_copy_circuit_2(model, 10, 0, dataset)

Copy circuit for head 10.0 (sign=1) : Top 5 accuracy: 50.0%


(50.0, ['Israel', ' Israel', 'Israeli', ' Tel', ' Israeli'])

In [None]:
check_copy_circuit_2(model, 9, 0, dataset)

Copy circuit for head 9.0 (sign=1) : Top 5 accuracy: 0.0%


(0.0, [' Animation', ' CGI', ' studio', 'amera', ' clients'])

## Test if S1 isn't recognized bc Alice or if error in how S1 defined in Dataset code

In [16]:
def make_ioi_prompts(template, names):
    """Fill a subject template and wrap the result as a one-element prompt list.

    Each placeholder ``[S<i>]`` (numbered from 1) is replaced, in order, by the
    i-th entry of ``names``.

    Args:
        template: text containing ``[S1]``, ``[S2]``, ... placeholders.
        names: names to substitute, in placeholder order.

    Returns:
        A list holding one dict with the filled ``"text"`` followed by one
        ``"S<i>"`` entry per name.
    """
    filled_text = template
    for slot, subject in enumerate(names, start=1):
        filled_text = filled_text.replace(f"[S{slot}]", subject)

    # "text" is inserted first so the key order matches text, S1, S2, ...
    record = {"text": filled_text}
    record.update(
        {f"S{slot}": subject for slot, subject in enumerate(names, start=1)}
    )
    return [record]

# Same template as before, but with "Adam" instead of "Alice" as S1, to
# separate a name-specific effect from a problem in how S1 is indexed.
template = "[S1] is a teacher. [S2] is a student. The child is [S2]. [S3] is a teacher. [S4] is a student. The child is"
names = ["Adam", "Bob", "Carol", "David"]
ioi_prompts_2 = make_ioi_prompts(template, names)
N=1
dataset_2 = Dataset(ioi_prompts_2, model.tokenizer, N)

In [17]:
top_val = [(9, 9), (8, 11), (10, 6), (11, 1), (8, 6)]
for layer, head in top_val:
    print(check_copy_circuit_2(model, layer, head, dataset_2, print_tokens=False))

Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 75.0%
['Bob', 'Carol', 'David']
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 75.0%
['Bob', 'Carol', 'David']
Copy circuit for head 10.6 (sign=1) : Top 5 accuracy: 50.0%
['Bob', 'David']
Copy circuit for head 11.1 (sign=1) : Top 5 accuracy: 75.0%
['Bob', 'Carol', 'David']
Copy circuit for head 8.6 (sign=1) : Top 5 accuracy: 25.0%
['Carol']


There is an error with S1. Fix it below:

## New Dataset Class (Fixed)

In [18]:
class Dataset:
    """Container for "most recent subject" prompts with per-subject positions.

    Stores the prompts, their padded token ids and, for every subject key
    (``S1``..``S4``) plus ``"end"``, the token position in each prompt.

    Args:
        ioi_prompts: list of dicts with a ``"text"`` entry and one entry per
            subject key ``"S1"``..``"S4"``.
        tokenizer: object that is callable on a list of texts (returning
            something with ``input_ids``) and provides ``tokenize(text)``.
        N: number of prompts (stored as given; not validated here).

    Attributes:
        toks: integer tensor of padded token ids, one row per prompt.
        word_idx: dict mapping ``"S1"``..``"S4"`` to the first position of that
            name token in each prompt, and ``"end"`` to the index of the last
            token of each (unpadded) prompt.

    Raises:
        ValueError: if a subject name is not found as a single token in its
            prompt (for example because the tokenizer splits the name).
    """

    def __init__(self, ioi_prompts, tokenizer, N):
        self.ioi_prompts = ioi_prompts
        self.tokenizer = tokenizer
        self.N = N

        texts = [prompt["text"] for prompt in self.ioi_prompts]
        # Build the integer tensor directly rather than via a float tensor.
        self.toks = torch.tensor(
            self.tokenizer(texts, padding=True).input_ids, dtype=torch.int
        )

        # Tokenize each prompt once with the tokenizer that was passed in; the
        # previous version mixed it with the notebook-global `model.tokenizer`.
        tokenized = [self.tokenizer.tokenize(text) for text in texts]

        self.word_idx = {}
        for subj in ["S1", "S2", "S3", "S4"]:
            positions = []
            for prompt, tokens in zip(self.ioi_prompts, tokenized):
                # S1 opens the sentence, so its token has no leading-space
                # marker; every later subject is preceded by a space ("Ġ").
                target_token = prompt[subj] if subj == "S1" else "Ġ" + prompt[subj]
                try:
                    positions.append(tokens.index(target_token))
                except ValueError:
                    raise ValueError(
                        f"{subj} name {prompt[subj]!r} was not found as the single "
                        f"token {target_token!r} in prompt {prompt['text']!r}; "
                        "the tokenizer may split this name into several tokens"
                    ) from None
            self.word_idx[subj] = torch.tensor(positions)

        # Index of the last token of each prompt, before any padding.
        self.word_idx["end"] = torch.tensor(
            [len(tokens) - 1 for tokens in tokenized]
        )

In [19]:
dataset_2_fixed = Dataset(ioi_prompts_2, model.tokenizer, N)
for layer, head in top_val:
    print(check_copy_circuit_2(model, layer, head, dataset_2_fixed, print_tokens=False))

Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 100.0%
['Adam', 'Bob', 'Carol', 'David']
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 75.0%
['Bob', 'Carol', 'David']
Copy circuit for head 10.6 (sign=1) : Top 5 accuracy: 50.0%
['Bob', 'David']
Copy circuit for head 11.1 (sign=1) : Top 5 accuracy: 100.0%
['Adam', 'Bob', 'Carol', 'David']
Copy circuit for head 8.6 (sign=1) : Top 5 accuracy: 25.0%
['Carol']


# Writing direction results with scatterplot

In [None]:
def make_ioi_prompts(template, names):
    """Substitute names into ``[S1]``, ``[S2]``, ... and return a one-prompt list.

    Args:
        template: text with numbered ``[S<i>]`` placeholders (starting at 1).
        names: replacement names, matched to placeholders by position.

    Returns:
        ``[prompt]`` where ``prompt`` maps ``"text"`` to the filled template
        and ``"S<i>"`` to the i-th name.
    """
    text = template
    slot = 0
    for subject in names:
        slot += 1
        text = text.replace("[S%d]" % slot, subject)

    # "text" goes in first so the key order stays text, S1, S2, ...
    prompt = dict(text=text)
    slot = 0
    for subject in names:
        slot += 1
        prompt["S%d" % slot] = subject

    return [prompt]

class Dataset:
    """Prompt container exposing token ids and per-subject token positions.

    Args:
        ioi_prompts: list of dicts, each with ``"text"`` and the subject keys
            ``"S1"``..``"S4"``.
        tokenizer: object that is callable on a list of texts (returning
            something with ``input_ids``) and provides ``tokenize(text)``.
        N: number of prompts (stored as given; not validated here).

    Attributes:
        toks: integer tensor of padded token ids, one row per prompt.
        word_idx: dict with keys ``"S1"``..``"S4"`` (first position of the
            name token in each prompt) and ``"end"`` (index of the last token
            of each unpadded prompt).

    Raises:
        ValueError: if a subject name does not occur as a single token in its
            prompt, e.g. when the tokenizer splits the name into pieces.
    """

    def __init__(self, ioi_prompts, tokenizer, N):
        self.ioi_prompts = ioi_prompts
        self.tokenizer = tokenizer
        self.N = N

        texts = [prompt["text"] for prompt in self.ioi_prompts]
        # Create the integer tensor directly instead of casting a float tensor.
        self.toks = torch.tensor(
            self.tokenizer(texts, padding=True).input_ids, dtype=torch.int
        )

        # One tokenization per prompt, always through self.tokenizer (the old
        # code used the notebook-global `model.tokenizer` for the subjects).
        tokenized = [self.tokenizer.tokenize(text) for text in texts]

        self.word_idx = {}
        for subj in ["S1", "S2", "S3", "S4"]:
            positions = []
            for prompt, tokens in zip(self.ioi_prompts, tokenized):
                # The first subject starts the sentence and therefore carries
                # no "Ġ" (leading space) marker; the others do.
                target_token = prompt[subj] if subj == "S1" else "Ġ" + prompt[subj]
                try:
                    positions.append(tokens.index(target_token))
                except ValueError:
                    raise ValueError(
                        f"{subj} name {prompt[subj]!r} was not found as the single "
                        f"token {target_token!r} in prompt {prompt['text']!r}; "
                        "the tokenizer may split this name into several tokens"
                    ) from None
            self.word_idx[subj] = torch.tensor(positions)

        # Last token index of each prompt, measured before padding.
        self.word_idx["end"] = torch.tensor(
            [len(tokens) - 1 for tokens in tokenized]
        )

# Rebuild the single-prompt dataset used by the scatter-plot cells below.
N=1
template = "[S1] is a teacher. [S2] is a student. The child is [S2]. [S3] is a teacher. [S4] is a student. The child is"
names = ["Adam", "Bob", "Carol", "David"]
ioi_prompts_2 = make_ioi_prompts(template, names)
dataset_2_fixed = Dataset(ioi_prompts_2, model.tokenizer, N)

In [None]:
def scatter_attention_and_contribution(
    model,
    layer_no,
    head_no,
    ioi_dataset,
    return_vals=False,
    return_fig=False,
):
    """Relate a head's attention to a name with how much it writes that name.

    For every prompt two rows are produced, one for the ``S3`` name and one for
    the ``S2`` name:

    * "Attn Prob on Name": attention from the prompt's ``end`` position to the
      position(s) of the name, read from ``blocks.{layer_no}.attn.hook_attn``
      (for ``S2`` the first two occurrences are summed).
    * "Dot w Name Embed": dot product of the head's output at the ``end``
      position (``blocks.{layer_no}.attn.hook_result``) with the unembedding
      column ``W_U[:, token]`` of the name.

    This notebook-local definition shadows the function of the same name
    imported from ``easy_transformer.ioi_utils`` and uses the ``S3``/``S2``
    prompt keys instead of IO/S.

    Args:
        model: transformer exposing ``cache_all``, ``tokenizer`` and
            ``unembed.W_U``.
        layer_no: block index of the head.
        head_no: head index within the block.
        ioi_dataset: object exposing ``toks``, ``ioi_prompts`` (dicts with
            ``"text"``, ``"S2"``, ``"S3"``) and ``word_idx["end"]``.
        return_vals: when True, return the DataFrame (takes precedence over
            ``return_fig``; no figure is built).
        return_fig: when True (and ``return_vals`` is False), return the
            plotly figure instead of showing it.

    Returns:
        The DataFrame, the figure, or ``None`` after ``fig.show()``.

    Raises:
        ValueError: if the ``S3`` token, or two occurrences of the ``S2``
            token, cannot be found in the tokenized prompt.
    """
    model_unembed = model.unembed.W_U.detach().cpu()
    rows = []
    cache = {}
    model.cache_all(cache)

    # The forward pass is run only to populate `cache`; its logits are unused.
    model(ioi_dataset.toks.long())

    attn_key = f"blocks.{layer_no}.attn.hook_attn"
    result_key = f"blocks.{layer_no}.attn.hook_result"

    for i, prompt in enumerate(ioi_dataset.ioi_prompts):
        # NOTE(review): only the first token id of " " + name is used, so this
        # assumes each name is a single token with a leading space — confirm.
        s3_tok = model.tokenizer(" " + prompt["S3"])["input_ids"][0]
        s2_tok = model.tokenizer(" " + prompt["S2"])["input_ids"][0]
        toks = model.tokenizer(prompt["text"])["input_ids"]
        s3_pos = toks.index(s3_tok)
        s2_first_pos = toks.index(s2_tok)
        # second occurrence of the S2 token, searched after the first one
        s2_second_pos = toks[s2_first_pos + 1 :].index(s2_tok) + (s2_first_pos + 1)

        s3_dir = model_unembed[:, s3_tok].detach()
        s2_dir = model_unembed[:, s2_tok].detach()

        end_pos = ioi_dataset.word_idx["end"][i]
        # The head's output at the end position is the same for both names,
        # so it is read once per prompt.
        resid = cache[result_key][i, end_pos, head_no, :].detach().cpu()

        for dire, posses, tok_type in [
            (s3_dir, [s3_pos], "S3"),
            (s2_dir, [s2_first_pos, s2_second_pos], "S2"),
        ]:
            prob = sum(
                cache[attn_key][i, head_no, end_pos, pos].detach().cpu()
                for pos in posses
            )
            dot = torch.einsum("a,a->", resid, dire)
            rows.append([prob, dot, tok_type, prompt["text"]])

    viz_df = pd.DataFrame(
        rows, columns=["Attn Prob on Name", "Dot w Name Embed", "Name Type", "text"]
    )
    if return_vals:
        # No need to build a figure when only the values are requested.
        return viz_df

    fig = px.scatter(
        viz_df,
        x="Attn Prob on Name",
        y="Dot w Name Embed",
        color="Name Type",
        hover_data=["text"],
        color_discrete_sequence=["rgb(114,255,100)", "rgb(201,165,247)"],
        title=f"How Strong {layer_no}.{head_no} Writes in the Name Embed Direction Relative to Attn Prob",
    )
    if return_fig:
        return fig
    fig.show()

In [None]:
dataset_2_fixed = Dataset(ioi_prompts_2, model.tokenizer, N)

In [None]:
scatter_attention_and_contribution(
    model=model, layer_no=9, head_no=9, ioi_dataset=dataset_2_fixed
)

## Analyze variables in scatterplot()

In [None]:
# Flat copy of the scatter-plot function body, run at top level so that its
# intermediate variables (cache, resid, s_dir, model_unembed, ...) stay in the
# kernel namespace and can be inspected by the cells that follow.
# NOTE(review): the string below still mentions IO and S, while the code
# works with the "S3" and "S2" prompt keys.
"""
Plot a scatter plot
for each input sequence with the attention paid to IO and S
and the amount that is written in the IO and S directions
"""
layer_no=9
head_no=9
ioi_dataset=dataset_2_fixed

# n_heads, n_layers and logits are assigned but not used in this cell.
n_heads = model.cfg.n_heads
n_layers = model.cfg.n_layers
model_unembed = model.unembed.W_U.detach().cpu()
df = []
cache = {}
model.cache_all(cache)

# Forward pass; the activations read below come from `cache`.
logits = model(ioi_dataset.toks.long())

for i, prompt in enumerate(ioi_dataset.ioi_prompts):

    # First token id of the space-prefixed name.
    # NOTE(review): assumes each name is a single token — confirm.
    io_tok = model.tokenizer(" " + prompt["S3"])["input_ids"][0]
    s_tok = model.tokenizer(" " + prompt["S2"])["input_ids"][0]
    toks = model.tokenizer(prompt["text"])["input_ids"]
    io_pos = toks.index(io_tok)
    s1_pos = toks.index(s_tok)
    # second occurrence of the S2 token, searched after the first one
    s2_pos = toks[s1_pos + 1 :].index(s_tok) + (s1_pos + 1)
    # assert toks[-1] == io_tok

    # Unembedding columns for the two name tokens.
    io_dir = model_unembed[:, io_tok].detach()
    s_dir = model_unembed[:, s_tok].detach()

    # model.reset_hooks() # should allow things to be done with ablated models

    for dire, posses, tok_type in [
        (io_dir, [io_pos], "S3"),
        (s_dir, [s1_pos, s2_pos], "S2"),
    ]:
        # Attention from the end position to the name position(s), summed.
        prob = sum(
            [
                cache[f"blocks.{layer_no}.attn.hook_attn"][
                    i, head_no, ioi_dataset.word_idx["end"][i], pos
                ]
                .detach()
                .cpu()
                for pos in posses
            ]
        )
        # This head's output vector at the end position.
        resid = (
            cache[f"blocks.{layer_no}.attn.hook_result"][
                i, ioi_dataset.word_idx["end"][i], head_no, :
            ]
            .detach()
            .cpu()
        )
        # Dot product of the head output with the name's unembedding column.
        dot = torch.einsum("a,a->", resid, dire)
        df.append([prob, dot, tok_type, prompt["text"]])

# most of the pandas stuff is intuitive, no need to deeply understand
viz_df = pd.DataFrame(
    df, columns=[f"Attn Prob on Name", f"Dot w Name Embed", "Name Type", "text"]
)
fig = px.scatter(
    viz_df,
    x=f"Attn Prob on Name",
    y=f"Dot w Name Embed",
    color="Name Type",
    hover_data=["text"],
    color_discrete_sequence=["rgb(114,255,100)", "rgb(201,165,247)"],
    title=f"How Strong {layer_no}.{head_no} Writes in the Name Embed Direction Relative to Attn Prob",
)

# fig.show()

In [None]:
cache[f"blocks.{layer_no}.attn.hook_result"].shape

torch.Size([1, 28, 12, 768])

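The last dim (768) is the model (residual stream) dimension, not the vocab size; the vocab size (50257) is the second dim of `model_unembed`, shown further below.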

In [None]:
s_dir.shape

torch.Size([768])

In [None]:
resid.shape

torch.Size([768])

In [None]:
ioi_dataset.word_idx["end"]

tensor([27])

In [None]:
ioi_dataset.word_idx["end"].shape

torch.Size([1])

This is a 1-D tensor with a single element, 27. With more prompts it stays 1-D but has one element per prompt, each being the index of that prompt's last token.

In [None]:
model_unembed.shape

torch.Size([768, 50257])

# Generate more prompts for dataset

In [24]:
#@title Names list
# Pool of first names from which prompt subjects are sampled.
# NOTE: not every name here is a single token for the model's tokenizer — the
# tokenizer output shown later in this notebook splits "Lindsey" into
# 'Lind' + 'sey' at the start of a sentence — so code that looks a name up as
# one token can fail for some of these.
names = [
    "Michael",
    "Christopher",
    "Jessica",
    "Matthew",
    "Ashley",
    "Jennifer",
    "Joshua",
    "Amanda",
    "Daniel",
    "David",
    "James",
    "Robert",
    "John",
    "Joseph",
    "Andrew",
    "Ryan",
    "Brandon",
    "Jason",
    "Justin",
    "Sarah",
    "William",
    "Jonathan",
    "Stephanie",
    "Brian",
    "Nicole",
    "Nicholas",
    "Anthony",
    "Heather",
    "Eric",
    "Elizabeth",
    "Adam",
    "Megan",
    "Melissa",
    "Kevin",
    "Steven",
    "Thomas",
    "Timothy",
    "Christina",
    "Kyle",
    "Rachel",
    "Laura",
    "Lauren",
    "Amber",
    "Brittany",
    "Danielle",
    "Richard",
    "Kimberly",
    "Jeffrey",
    "Amy",
    "Crystal",
    "Michelle",
    "Tiffany",
    "Jeremy",
    "Benjamin",
    "Mark",
    "Emily",
    "Aaron",
    "Charles",
    "Rebecca",
    "Jacob",
    "Stephen",
    "Patrick",
    "Sean",
    "Erin",
    "Jamie",
    "Kelly",
    "Samantha",
    "Nathan",
    "Sara",
    "Dustin",
    "Paul",
    "Angela",
    "Tyler",
    "Scott",
    "Katherine",
    "Andrea",
    "Gregory",
    "Erica",
    "Mary",
    "Travis",
    "Lisa",
    "Kenneth",
    "Bryan",
    "Lindsey",
    "Kristen",
    "Jose",
    "Alexander",
    "Jesse",
    "Katie",
    "Lindsay",
    "Shannon",
    "Vanessa",
    "Courtney",
    "Christine",
    "Alicia",
    "Cody",
    "Allison",
    "Bradley",
    "Samuel",
]

In [None]:
import random

def make_latestS_prompts(names, template, num_sentences):
    """Build `num_sentences` distinct prompts by filling `template` with random names.

    Each prompt is made by drawing four entries from `names` without
    replacement and substituting them for the placeholders "[S1]".."[S4]"
    in `template` (every occurrence of a placeholder gets the same name).

    Args:
        names: sequence of candidate name strings; needs at least four entries.
        template: string containing the placeholders "[S1]".."[S4]".
        num_sentences: how many distinct prompts to return.

    Returns:
        A list of dicts with keys "S1", "S2", "S3", "S4" (the drawn names)
        and "text" (the filled-in template). No two dicts share the same "text".

    Raises:
        ValueError: if `num_sentences` exceeds the number of distinct texts
            that `names` and `template` can produce (the rejection loop below
            would otherwise never terminate), or, from `random.sample`, if
            `names` has fewer than four entries.
    """
    import math  # local import: only needed for the feasibility check

    # Only placeholders that actually occur in the template can make two
    # texts differ, so the number of ordered draws for those slots is an
    # upper bound on the number of distinct texts.
    n_slots = sum(f"[S{i}]" in template for i in range(1, 5))
    max_distinct = math.perm(len(names), n_slots)
    if num_sentences > max_distinct:
        raise ValueError(
            f"Cannot generate {num_sentences} distinct sentences: at most "
            f"{max_distinct} are possible with {len(names)} names and "
            f"{n_slots} placeholder(s) in the template"
        )

    sentences = []
    generated_set = set()  # texts produced so far, to reject duplicates
    while len(sentences) < num_sentences:
        unique_names = random.sample(names, k=4)
        text = template
        sentence_dict = {}
        for i, name in enumerate(unique_names, start=1):
            text = text.replace(f"[S{i}]", name)
            sentence_dict[f"S{i}"] = name
        sentence_dict["text"] = text
        if text not in generated_set:
            generated_set.add(text)
            sentences.append(sentence_dict)
    return sentences

template = "[S1] is a teacher. [S2] is a student. The child is [S2]. [S3] is a teacher. [S4] is a student. The child is"
N=10
latestS_prompts = make_latestS_prompts(names, template, N)
latestS_prompts

## Look into bug about name tokens

In [None]:
dataset_2 = Dataset(latestS_prompts, model.tokenizer, N)

ValueError: ignored

In [None]:
latestS_prompts[1]

{'S1': 'Lindsey',
 'S2': 'Erin',
 'S3': 'Michelle',
 'S4': 'Bryan',
 'text': 'Lindsey is a teacher. Erin is a student. The child is Erin. Michelle is a teacher. Bryan is a student. The child is'}

In [None]:
tokens = model.tokenizer.tokenize(latestS_prompts[1]["text"])
target_token = latestS_prompts[1]['S1']
target_index = tokens.index(target_token)

ValueError: ignored

In [None]:
tokens = model.tokenizer.tokenize(latestS_prompts[1]["text"])
target_token = "Ġ" + latestS_prompts[1]['S1']
target_index = tokens.index(target_token)

ValueError: ignored

I assumed all the names were single token. Let's check.

In [None]:
tokens

['Lind',
 'sey',
 'Ġis',
 'Ġa',
 'Ġteacher',
 '.',
 'ĠErin',
 'Ġis',
 'Ġa',
 'Ġstudent',
 '.',
 'ĠThe',
 'Ġchild',
 'Ġis',
 'ĠErin',
 '.',
 'ĠMichelle',
 'Ġis',
 'Ġa',
 'Ġteacher',
 '.',
 'ĠBryan',
 'Ġis',
 'Ġa',
 'Ġstudent',
 '.',
 'ĠThe',
 'Ġchild',
 'Ġis']

In [None]:
model.tokenizer.tokenize("Lindsey")

['Lind', 'sey']

Thus, loop over names and remove those with more than one token

In [26]:
def filter_names(names, tokenizer=None):
    """Keep only the names that are a single token in every position used by the prompts.

    A name appears in two forms in the generated prompts: bare at the very
    start of the text (S1) and preceded by a space everywhere else. The two
    forms are tokenized independently, so both must be exactly one token for
    the per-name token lookups done later to succeed.

    Args:
        names: iterable of name strings.
        tokenizer: object with a `tokenize(str) -> list` method. Defaults to
            the notebook-level `model.tokenizer`.

    Returns:
        A new list with the qualifying names, in their original order.
    """
    if tokenizer is None:
        tokenizer = model.tokenizer  # notebook-level model loaded in the setup cells
    return [
        name
        for name in names
        if len(tokenizer.tokenize(name)) == 1
        and len(tokenizer.tokenize(" " + name)) == 1
    ]
names = filter_names(names)

### Make dataset after fixing single name tokens bug

In [27]:
latestS_prompts = make_latestS_prompts(names, template, N)
dataset_2_fixed = Dataset(latestS_prompts, model.tokenizer, N)

In [None]:
scatter_attention_and_contribution(
    model=model, layer_no=9, head_no=9, ioi_dataset=dataset_2_fixed
)

NOTE: doesn't work w/ S1 due to bug about space in front, so fix that in scatter_plot

## Modify scatterplot() to handle >2 subjs: Fix space in front of S1 bug

Make other changes to scatter_plot to convert its focus on IO and S to S1, S2, etc.

In [6]:
def scatter_attention_and_contribution(
    model,
    layer_no,
    head_no,
    ioi_dataset,
    return_vals=False,
    return_fig=False,
):
    """
    Scatter plot, with one point per (prompt, subject name) pair for the
    subjects S1..S4, of

    * x: the attention head `layer_no`.`head_no` pays from the prompt's
      final token to the name's (first) position, and
    * y: the dot product between that head's output at the final token and
      the unembedding column of the name's token.

    Args:
        model: model exposing `cache_all`, `tokenizer`, `unembed.W_U` and the
            `blocks.{layer}.attn.hook_attn` / `hook_result` activations.
        layer_no: layer index of the head.
        head_no: head index within the layer.
        ioi_dataset: object with `toks`, `ioi_prompts` (dicts holding "text"
            and "S1".."S4") and `word_idx["end"]`.
        return_vals: if True, return the underlying DataFrame instead of plotting.
        return_fig: if True (and `return_vals` is False), return the figure
            without showing it.

    Returns:
        The DataFrame, the figure, or None (after showing the figure),
        depending on the two flags above.

    Raises:
        ValueError: if a name's token does not occur in its tokenized prompt.
    """
    attn_key = f"blocks.{layer_no}.attn.hook_attn"
    result_key = f"blocks.{layer_no}.attn.hook_result"

    model_unembed = model.unembed.W_U.detach().cpu()
    cache = {}
    model.cache_all(cache)

    # The forward pass is run only to fill `cache`; its logits are not needed.
    # Hooks are deliberately not reset first, so this also works on a model
    # that already carries ablation hooks.
    model(ioi_dataset.toks.long())

    rows = []
    for i, prompt in enumerate(ioi_dataset.ioi_prompts):
        toks = model.tokenizer(prompt["text"])["input_ids"]
        end_pos = ioi_dataset.word_idx["end"][i]

        # The head's output at the final token is the same for all four
        # names, so fetch it once per prompt.
        resid = cache[result_key][i, end_pos, head_no, :].detach().cpu()

        for tok_type in ["S1", "S2", "S3", "S4"]:
            # S1 opens the prompt, so its token has no leading space; the
            # other names follow a space and tokenize with one.
            name_text = prompt[tok_type] if tok_type == "S1" else " " + prompt[tok_type]
            name_tok = model.tokenizer(name_text)["input_ids"][0]
            # `index` returns the first occurrence, which matters for S2:
            # it appears twice in the template.
            name_pos = toks.index(name_tok)
            name_dir = model_unembed[:, name_tok].detach()

            prob = cache[attn_key][i, head_no, end_pos, name_pos].detach().cpu()
            dot = torch.einsum("a,a->", resid, name_dir)
            rows.append([prob, dot, tok_type, prompt["text"]])

    viz_df = pd.DataFrame(
        rows, columns=[f"Attn Prob on Name", f"Dot w Name Embed", "Name Type", "text"]
    )
    fig = px.scatter(
        viz_df,
        x=f"Attn Prob on Name",
        y=f"Dot w Name Embed",
        color="Name Type",
        hover_data=["text"],
        title=f"How Strong {layer_no}.{head_no} Writes in the Name Embed Direction Relative to Attn Prob",
    )

    if return_vals:
        return viz_df
    if return_fig:
        return fig
    else:
        fig.show()

In [28]:
scatter_attention_and_contribution(
    model=model, layer_no=9, head_no=9, ioi_dataset=dataset_2_fixed
)

## Correlation vals

# Copy Scores for multiple prompts

In [None]:
def check_copy_circuit_multi(model, layer, head, ioi_dataset, verbose=False, neg=False, print_tokens=True):
    """Copy score of head `layer`.`head` over every S1..S4 name in `ioi_dataset`.

    The layer-0 residual output is passed through `model.blocks[1].attn.ln1`,
    then through the head's value and output weights (negated when `neg` is
    set), the final layer norm and the unembedding. For each prompt and each
    of S1..S4, the top-5 decoded tokens at the name's position are collected;
    a name counts as copied when " " + name is among them. The top-5
    accuracy over all `ioi_dataset.N * 4` names is printed.

    Args:
        model: model exposing `cache_some`, `blocks`, `ln_final`, `unembed`
            and `tokenizer`.
        layer, head: indices of the attention head under test.
        ioi_dataset: object with `toks`, `ioi_prompts`, `word_idx` and `N`.
        verbose: accepted but not used.
        neg: if truthy, flip the sign of the head's output.
        print_tokens: selects the return value (compared with `== True`).

    Returns:
        If `print_tokens == True`, a dict mapping each name to its top-5
        decoded tokens (a name that occurs more than once keeps only the
        last entry written); otherwise the list of names that were copied.
    """
    sign = -1 if neg else 1
    k = 5

    # Run the model once, keeping only the residual stream after block 0.
    cache = {}
    model.cache_some(cache, lambda x: x == "blocks.0.hook_resid_post")
    model(ioi_dataset.toks.long())

    normed_resid = model.blocks[1].attn.ln1(cache["blocks.0.hook_resid_post"])

    # Push the normalised residual through this head's value/output path only.
    head_attn = model.blocks[layer].attn
    values = torch.einsum("eab,bc->eac", normed_resid, head_attn.W_V[head])
    values += head_attn.b_V[head].unsqueeze(0).unsqueeze(0)
    head_out = sign * torch.einsum("sph,hd->spd", values, head_attn.W_O[head])
    logits = model.unembed(model.ln_final(head_out))

    S_pred_tokens = {}
    subjects_moved = []
    for seq_idx, prompt in enumerate(ioi_dataset.ioi_prompts):
        for word in ("S1", "S2", "S3", "S4"):
            name = prompt[word]
            name_pos = ioi_dataset.word_idx[word][seq_idx]
            top_ids = torch.topk(logits[seq_idx, name_pos], k).indices
            pred_tokens = [model.tokenizer.decode(token) for token in top_ids]
            S_pred_tokens[name] = pred_tokens
            if " " + name in pred_tokens:
                subjects_moved.append(name)

    # One entry is appended per hit, so the list length is the hit count.
    percent_right = (len(subjects_moved) / (ioi_dataset.N * 4)) * 100
    print(f"Copy circuit for head {layer}.{head} (sign={sign}) : Top {k} accuracy: {percent_right}%"  )

    if print_tokens == True:
        return S_pred_tokens
    return subjects_moved

# Find what affects name mover heads

We found name mover heads. Now, we use another method to see what makes these name mover heads select the 'most recent subject'. That is, we look for earlier heads whose output feeds into the name mover heads and biases their final computation to favor "David" over the other names.

Check if these name movers are the same from IOI paper. From paper, Name Mover Heads: 9.9 9.6 10.0 (figure 1)

Yes, 9.9 is the same. But others, such as 8.11 and 11.1, are absent from the attention heads shown in the paper (figure 3b). Were they not significant enough, and so just not reported? Look at the hard-coded circuit variable to see all the name movers identified (it includes the backup name movers under the same key):

In [None]:
CIRCUIT

{'name mover': [(9, 9),
  (10, 0),
  (9, 6),
  (10, 10),
  (10, 6),
  (10, 2),
  (10, 1),
  (11, 2),
  (9, 7),
  (9, 0),
  (11, 9)],
 'negative': [(10, 7), (11, 10)],
 's2 inhibition': [(7, 3), (7, 9), (8, 6), (8, 10)],
 'induction': [(5, 5), (5, 8), (5, 9), (6, 9)],
 'duplicate token': [(0, 1), (0, 10), (3, 0)],
 'previous token': [(2, 2), (4, 11)]}

In [None]:
CIRCUIT_NEW = {'adjective mover': [(30, 13)]}

See how S-inhibitions found:

https://colab.research.google.com/drive/1YM-0MPw0KKKkjRU855Js3HxBHDgePL1S#scrollTo=ysfYnoon4uuN

# Path patching

**do_circuit_extraction**(): Add hooks to the model to obtain intermediate activations when running path patching, copy scores, etc

## Re-run the following if restarting notebook run

In [None]:
#@title Names list
names = [
    "Michael",
    "Christopher",
    "Jessica",
    "Matthew",
    "Ashley",
    "Jennifer",
    "Joshua",
    "Amanda",
    "Daniel",
    "David",
    "James",
    "Robert",
    "John",
    "Joseph",
    "Andrew",
    "Ryan",
    "Brandon",
    "Jason",
    "Justin",
    "Sarah",
    "William",
    "Jonathan",
    "Stephanie",
    "Brian",
    "Nicole",
    "Nicholas",
    "Anthony",
    "Heather",
    "Eric",
    "Elizabeth",
    "Adam",
    "Megan",
    "Melissa",
    "Kevin",
    "Steven",
    "Thomas",
    "Timothy",
    "Christina",
    "Kyle",
    "Rachel",
    "Laura",
    "Lauren",
    "Amber",
    "Brittany",
    "Danielle",
    "Richard",
    "Kimberly",
    "Jeffrey",
    "Amy",
    "Crystal",
    "Michelle",
    "Tiffany",
    "Jeremy",
    "Benjamin",
    "Mark",
    "Emily",
    "Aaron",
    "Charles",
    "Rebecca",
    "Jacob",
    "Stephen",
    "Patrick",
    "Sean",
    "Erin",
    "Jamie",
    "Kelly",
    "Samantha",
    "Nathan",
    "Sara",
    "Dustin",
    "Paul",
    "Angela",
    "Tyler",
    "Scott",
    "Katherine",
    "Andrea",
    "Gregory",
    "Erica",
    "Mary",
    "Travis",
    "Lisa",
    "Kenneth",
    "Bryan",
    "Lindsey",
    "Kristen",
    "Jose",
    "Alexander",
    "Jesse",
    "Katie",
    "Lindsay",
    "Shannon",
    "Vanessa",
    "Courtney",
    "Christine",
    "Alicia",
    "Cody",
    "Allison",
    "Bradley",
    "Samuel",
]

def filter_names(names, tokenizer=None):
    """Keep only the names that are a single token in every position used by the prompts.

    A name appears in two forms in the generated prompts: bare at the very
    start of the text (S1) and preceded by a space everywhere else. The two
    forms are tokenized independently, so both must be exactly one token for
    the per-name token lookups done later to succeed.

    Args:
        names: iterable of name strings.
        tokenizer: object with a `tokenize(str) -> list` method. Defaults to
            the notebook-level `model.tokenizer`.

    Returns:
        A new list with the qualifying names, in their original order.
    """
    if tokenizer is None:
        tokenizer = model.tokenizer  # notebook-level model loaded in the setup cells
    return [
        name
        for name in names
        if len(tokenizer.tokenize(name)) == 1
        and len(tokenizer.tokenize(" " + name)) == 1
    ]
names = filter_names(names)

In [None]:
class Dataset:
    """Tokenized prompt set with the token position of each subject name.

    Set by the constructor:
        ioi_prompts: the prompt dicts passed in (each holds "text" and "S1".."S4").
        tokenizer:   the tokenizer passed in.
        N:           the value passed in, stored unchanged.
        toks:        integer tensor holding the padded token ids of all prompt texts.
        word_idx:    dict mapping "S1".."S4" and "end" to a tensor with one
                     token index per prompt ("end" is the last token).
    """

    def __init__(self, ioi_prompts, tokenizer, N):
        """Tokenize the prompts and locate S1..S4 and the final token in each.

        Args:
            ioi_prompts: list of dicts with keys "text", "S1", "S2", "S3", "S4".
            tokenizer: callable returning an object with `input_ids` when
                given a list of texts and `padding=True`, and providing
                `tokenize(str) -> list[str]`.
            N: number of prompts (stored, not checked).

        Raises:
            ValueError: if a name's token does not occur in its tokenized prompt.
        """
        self.ioi_prompts = ioi_prompts
        self.tokenizer = tokenizer
        self.N = N

        texts = [prompt["text"] for prompt in self.ioi_prompts]
        # Build the integer tensor directly; going through a float tensor
        # first would lose precision on large ids.
        self.toks = torch.tensor(
            self.tokenizer(texts, padding=True).input_ids, dtype=torch.int
        )

        # Tokenize every prompt once, with the tokenizer this dataset was
        # given (not a notebook-level global), and reuse it for all lookups.
        tokenized = [self.tokenizer.tokenize(text) for text in texts]

        self.word_idx = {}
        for subj in ["S1", "S2", "S3", "S4"]:
            positions = []
            for prompt, tokens in zip(self.ioi_prompts, tokenized):
                # S1 is the first token of the text and so has no leading
                # space; the other names carry the "Ġ" space marker.
                target_token = prompt[subj] if subj == "S1" else "Ġ" + prompt[subj]
                try:
                    # First occurrence: S2 appears twice in the template.
                    positions.append(tokens.index(target_token))
                except ValueError:
                    raise ValueError(
                        f"token {target_token!r} for {subj} not found in tokenized "
                        f"prompt {prompt['text']!r}; names must tokenize to a single token"
                    ) from None
            self.word_idx[subj] = torch.tensor(positions)

        self.word_idx["end"] = torch.tensor([len(tokens) - 1 for tokens in tokenized])

In [None]:
import random

def make_latestS_prompts(names, template, num_sentences):
    """Build `num_sentences` distinct prompts by filling `template` with random names.

    Each prompt is made by drawing four entries from `names` without
    replacement and substituting them for the placeholders "[S1]".."[S4]"
    in `template` (every occurrence of a placeholder gets the same name).

    Args:
        names: sequence of candidate name strings; needs at least four entries.
        template: string containing the placeholders "[S1]".."[S4]".
        num_sentences: how many distinct prompts to return.

    Returns:
        A list of dicts with keys "S1", "S2", "S3", "S4" (the drawn names)
        and "text" (the filled-in template). No two dicts share the same "text".

    Raises:
        ValueError: if `num_sentences` exceeds the number of distinct texts
            that `names` and `template` can produce (the rejection loop below
            would otherwise never terminate), or, from `random.sample`, if
            `names` has fewer than four entries.
    """
    import math  # local import: only needed for the feasibility check

    # Only placeholders that actually occur in the template can make two
    # texts differ, so the number of ordered draws for those slots is an
    # upper bound on the number of distinct texts.
    n_slots = sum(f"[S{i}]" in template for i in range(1, 5))
    max_distinct = math.perm(len(names), n_slots)
    if num_sentences > max_distinct:
        raise ValueError(
            f"Cannot generate {num_sentences} distinct sentences: at most "
            f"{max_distinct} are possible with {len(names)} names and "
            f"{n_slots} placeholder(s) in the template"
        )

    sentences = []
    generated_set = set()  # texts produced so far, to reject duplicates
    while len(sentences) < num_sentences:
        unique_names = random.sample(names, k=4)
        text = template
        sentence_dict = {}
        for i, name in enumerate(unique_names, start=1):
            text = text.replace(f"[S{i}]", name)
            sentence_dict[f"S{i}"] = name
        sentence_dict["text"] = text
        if text not in generated_set:
            generated_set.add(text)
            sentences.append(sentence_dict)
    return sentences

template = "[S1] is a teacher. [S2] is a student. The child is [S2]. [S3] is a teacher. [S4] is a student. The child is"
N=10
latestS_prompts = make_latestS_prompts(names, template, N)
dataset_orig = Dataset(latestS_prompts, model.tokenizer, N)

## Run path patching after obtaining dataset making code above

In [None]:
# we make the ABC dataset in order to knockout other model components
# abc_dataset = (  # TODO seeded
#     ioi_dataset.gen_flipped_prompts(("IO", "RAND"))
#     .gen_flipped_prompts(("S", "RAND"))
#     .gen_flipped_prompts(("S1", "RAND"))
# )

# switch order of target sentence
template = "[S1] is a teacher. [S2] is a student. The child is [S2]. [S4] is a student. [S3] is a teacher. The child is"
names = ["Adam", "Bob", "Carol", "David"]
latestS_prompts = make_latestS_prompts(names, template, N)
dataset_corr = Dataset(latestS_prompts, model.tokenizer, N)

In [None]:
circuit = deepcopy(CIRCUIT)

# we then add hooks to the model to knockout all the heads except the circuit
model.reset_hooks()
model, _ = do_circuit_extraction(
    model=model,
    heads_to_keep=get_heads_circuit(ioi_dataset=dataset_orig, circuit=circuit),
    mlps_to_remove={},
    ioi_dataset=dataset_orig,
    mean_dataset=dataset_corr,
)

# Evaluate on the same dataset the circuit was extracted on (`dataset_orig`,
# built with N prompts in the cell above), not on an unrelated `dataset`.
circuit_logit_diff = logit_diff(model, dataset_orig)
print(
    f"The circuit gets average logit difference {circuit_logit_diff.item()} over {N} examples"
)

AttributeError: ignored

Now we require more variables. This shows just 'sentences', so add it to Dataset

https://github.com/redwoodresearch/Easy-Transformer/blob/main/easy_transformer/ioi_circuit_extraction.py#L152

In [None]:
N=10
custom_templates = [
    " The human is [A]. The animal is [B]. The human is",
]
ioi_dataset = IOIDataset(prompt_type=custom_templates, N=N, tokenizer=model.tokenizer, prepend_bos=False)
ioi_dataset.sentences



[' The human is Amanda. The animal is Adam. The human is',
 ' The human is Daniel. The animal is Bradley. The human is',
 ' The human is Paul. The animal is Jonathan. The human is',
 ' The human is Vanessa. The animal is Rachel. The human is',
 ' The human is Ryan. The animal is Megan. The human is',
 ' The human is Andrea. The animal is Paul. The human is',
 ' The human is Shannon. The animal is Lisa. The human is',
 ' The human is Amber. The animal is David. The human is',
 ' The human is Heather. The animal is Dustin. The human is',
 ' The human is Jeremy. The animal is Nicole. The human is']

In [None]:
ioi_prompts[0]['text']

'Alice is a teacher. Bob is a student. The child is Bob. Carol is a teacher. David is a student. The child is'

## Update Dataset class to include 'sentences'

In [None]:
class Dataset:
    """Tokenized prompt set with the token position of each subject name.

    Set by the constructor:
        ioi_prompts: the prompt dicts passed in (each holds "text" and "S1".."S4").
        tokenizer:   the tokenizer passed in.
        N:           the value passed in, stored unchanged.
        sentences:   list with the text of every prompt.
        toks:        integer tensor holding the padded token ids of all prompt texts.
        word_idx:    dict mapping "S1".."S4" and "end" to a tensor with one
                     token index per prompt ("end" is the last token).
    """

    def __init__(self, ioi_prompts, tokenizer, N):
        """Tokenize the prompts and locate S1..S4 and the final token in each.

        Args:
            ioi_prompts: list of dicts with keys "text", "S1", "S2", "S3", "S4".
            tokenizer: callable returning an object with `input_ids` when
                given a list of texts and `padding=True`, and providing
                `tokenize(str) -> list[str]`.
            N: number of prompts (stored, not checked).

        Raises:
            ValueError: if a name's token does not occur in its tokenized prompt.
        """
        self.ioi_prompts = ioi_prompts
        self.tokenizer = tokenizer
        self.N = N
        # One sentence per prompt (previously only the first prompt's text
        # was kept, which is wrong for N > 1).
        self.sentences = [prompt["text"] for prompt in self.ioi_prompts]

        texts = self.sentences
        # Build the integer tensor directly; going through a float tensor
        # first would lose precision on large ids.
        self.toks = torch.tensor(
            self.tokenizer(texts, padding=True).input_ids, dtype=torch.int
        )

        # Tokenize every prompt once, with the tokenizer this dataset was
        # given (not a notebook-level global), and reuse it for all lookups.
        tokenized = [self.tokenizer.tokenize(text) for text in texts]

        self.word_idx = {}
        for subj in ["S1", "S2", "S3", "S4"]:
            positions = []
            for prompt, tokens in zip(self.ioi_prompts, tokenized):
                # S1 is the first token of the text and so has no leading
                # space; the other names carry the "Ġ" space marker.
                target_token = prompt[subj] if subj == "S1" else "Ġ" + prompt[subj]
                try:
                    # First occurrence: S2 appears twice in the template.
                    positions.append(tokens.index(target_token))
                except ValueError:
                    raise ValueError(
                        f"token {target_token!r} for {subj} not found in tokenized "
                        f"prompt {prompt['text']!r}; names must tokenize to a single token"
                    ) from None
            self.word_idx[subj] = torch.tensor(positions)

        self.word_idx["end"] = torch.tensor([len(tokens) - 1 for tokens in tokenized])

After modifying Dataset to include 'sentences', re-create the datasets

In [None]:
# %debug
circuit = deepcopy(CIRCUIT)

N = 1
template = "[S1] is a teacher. [S2] is a student. The child is [S2]. [S3] is a teacher. [S4] is a student. The child is"
prompts = make_latestS_prompts(names, template, N)
dataset = Dataset(prompts, model.tokenizer, N)

# switch order of target sentence
template = "[S1] is a teacher. [S2] is a student. The child is [S2]. [S4] is a student. [S3] is a teacher. The child is"
corr_prompts = make_latestS_prompts(names, template, N)
dataset_corr = Dataset(corr_prompts, model.tokenizer, N)

# we then add hooks to the model to knockout all the heads except the circuit
model.reset_hooks()
model, _ = do_circuit_extraction(
    model=model,
    heads_to_keep=get_heads_circuit(ioi_dataset=dataset, circuit=circuit),
    mlps_to_remove={},
    ioi_dataset=dataset,
    mean_dataset=dataset_corr,
)

circuit_logit_diff = logit_diff(model, dataset)
print(
    f"The circuit gets average logit difference {circuit_logit_diff.item()} over {N} examples"
)

{'S1': tensor([0]), 'S2': tensor([5]), 'S3': tensor([15]), 'S4': tensor([20]), 'end': tensor([27])} S+1


ValueError: ignored

See traceback:

do_circuit_extraction( THEN PASS AS ARG: get_heads_circuit() -> get_extracted_idx(RELEVANT_TOKENS) )

https://github.com/redwoodresearch/Easy-Transformer/blob/main/easy_transformer/ioi_circuit_extraction.py#L161

get_heads_circuit():
`heads_to_keep[head] = get_extracted_idx(RELEVANT_TOKENS[head], ioi_dataset)`

get_extracted_idx(RELEVANT_TOKENS):
```
# idx_list := RELEVANT_TOKENS
for idx_name in idx_list:
        try:
            int_idx_to_add = [
                int(x) for x in list(ioi_dataset.word_idx[idx_name])
            ]  # torch to python objects
```

RELEVANT_TOKENS:
https://github.com/redwoodresearch/Easy-Transformer/blob/94ed3599b17209c69eb96973c8b61d8ee98a9dc9/easy_transformer/ioi_circuit_extraction.py#L205

head comes from the keys in CIRCUIT. Each head is (L, h).
https://github.com/redwoodresearch/Easy-Transformer/blob/94ed3599b17209c69eb96973c8b61d8ee98a9dc9/easy_transformer/ioi_circuit_extraction.py#L168

idx_list uses the constant RELEVANT_TOKENS, which translates the circuit class into the query token it works on (figure 1). This is done by getting it for each [head]. We can either fork the project and manually alter this hard-coding, or we can write smaller custom functions in this nb.

idx_name is just a string of the token. It's a value for RELEVANT_TOKENS[head]. RELEVANT_TOKENS says what TYPES of query tokens "activate highly" for that head. For instance, head (9,9) is mapped to "end" through RELEVANT_TOKENS.

CIRCUIT specifies the type of head, while RELEVANT_TOKENS specifies the query token position (dest).

In [None]:
# this contains token positions for that token type
dataset.word_idx

{'S1': tensor([0]),
 'S2': tensor([5]),
 'S3': tensor([15]),
 'S4': tensor([20]),
 'end': tensor([27])}

RELEVANT_TOKENS can be modified for the 4 subjs. We may not need S+1 because right now we are only concerned with name mover heads (which is mapped to "end") and s1-, s2-, and s3-inhibition heads (which are also mapped to "end"). In the circuit, only previous_token heads are mapped to "S+1"; we don't look at them for now.

To change RELEVANT_TOKENS, fork the repo and use your new repo with the modified RELEVANT_TOKENS. Alternatively, just upload the file with the new change (since it's so small). Download the old file to mod.

One strange thing: how did they already know the types of the heads if they had not yet found the circuit? Perhaps the head types were not found with path patching, but just with activation patching.

## Run after modifying RELEVANT_TOKENS

After modifying newly forked repo: In RELEVANT_TOKENS, comment all out except RELEVANT_TOKENS[head] = ["end"] . Then commit ioi_circuit_extraction.py and try do_circuit_extraction() again

In [None]:
# %debug
circuit = deepcopy(CIRCUIT)

N = 1
template = "[S1] is a teacher. [S2] is a student. The child is [S2]. [S3] is a teacher. [S4] is a student. The child is"
prompts = make_latestS_prompts(names, template, N)
dataset = Dataset(prompts, model.tokenizer, N)

# switch order of target sentence
template = "[S1] is a teacher. [S2] is a student. The child is [S2]. [S4] is a student. [S3] is a teacher. The child is"
corr_prompts = make_latestS_prompts(names, template, N)
dataset_corr = Dataset(corr_prompts, model.tokenizer, N)

# we then add hooks to the model to knockout all the heads except the circuit
model.reset_hooks()
model, _ = do_circuit_extraction(
    model=model,
    heads_to_keep=get_heads_circuit(ioi_dataset=dataset, circuit=circuit),
    mlps_to_remove={},
    ioi_dataset=dataset,
    mean_dataset=dataset_corr,
)

circuit_logit_diff = logit_diff(model, dataset)
print(
    f"The circuit gets average logit difference {circuit_logit_diff.item()} over {N} examples"
)

KeyError: ignored

This bug is due to RELEVANT_TOKENS not containing the key (5,5) from

```
"induction": [(5, 5), (5, 8), (5, 9), (6, 9)],
```
in CIRCUIT, so in CIRCUIT comment out all the head types you commented in RELEVANT_TOKENS then commit again.


In [None]:
# %debug
circuit = deepcopy(CIRCUIT)

N = 1
template = "[S1] is a teacher. [S2] is a student. The child is [S2]. [S3] is a teacher. [S4] is a student. The child is"
prompts = make_latestS_prompts(names, template, N)
dataset = Dataset(prompts, model.tokenizer, N)

# switch order of target sentence
template = "[S1] is a teacher. [S2] is a student. The child is [S2]. [S4] is a student. [S3] is a teacher. The child is"
corr_prompts = make_latestS_prompts(names, template, N)
dataset_corr = Dataset(corr_prompts, model.tokenizer, N)

# we then add hooks to the model to knockout all the heads except the circuit
model.reset_hooks()
model, _ = do_circuit_extraction(
    model=model,
    heads_to_keep=get_heads_circuit(ioi_dataset=dataset, circuit=circuit),
    mlps_to_remove={},
    ioi_dataset=dataset,
    mean_dataset=dataset_corr,
)

circuit_logit_diff = logit_diff(model, dataset)
print(
    f"The circuit gets average logit difference {circuit_logit_diff.item()} over {N} examples"
)

AttributeError: ignored

## Update Dataset to have max_len and groups

In [None]:
class Dataset:
    """Tokenized prompt set with the token position of each subject name.

    Set by the constructor:
        ioi_prompts: the prompt dicts passed in (each holds "text" and "S1".."S4").
        tokenizer:   the tokenizer passed in.
        N:           the value passed in, stored unchanged.
        sentences:   list with the text of every prompt (plain strings, not
                     meant to be fed to the model directly).
        max_len:     largest number of token ids over all prompt texts.
        groups:      list of index arrays, one per distinct template id,
                     holding the positions of the prompts that use it.
        toks:        integer tensor holding the padded token ids of all prompt texts.
        word_idx:    dict mapping "S1".."S4" and "end" to a tensor with one
                     token index per prompt ("end" is the last token).
    """

    def __init__(self, ioi_prompts, tokenizer, N):
        """Tokenize the prompts, group them by template and locate the names.

        Args:
            ioi_prompts: list of dicts with keys "text", "S1", "S2", "S3",
                "S4" and optionally "TEMPLATE_IDX" (treated as 0 when absent,
                i.e. all prompts come from one template).
            tokenizer: callable returning an object with `input_ids` when
                given a text or a list of texts, and providing
                `tokenize(str) -> list[str]`.
            N: number of prompts (stored, not checked).

        Raises:
            ValueError: if a name's token does not occur in its tokenized
                prompt, or (from `max`) if `ioi_prompts` is empty.
        """
        import warnings  # local import: not among the notebook's setup imports

        self.ioi_prompts = ioi_prompts
        self.tokenizer = tokenizer
        self.N = N
        self.sentences = [
            prompt["text"] for prompt in self.ioi_prompts
        ]  # a list of strings. Renamed as this should NOT be forward passed
        self.max_len = max(
            [
                len(self.tokenizer(prompt["text"]).input_ids)
                for prompt in self.ioi_prompts
            ]
        )

        # Prompts built by make_latestS_prompts carry no "TEMPLATE_IDX"; they
        # all share one template, so default the id to 0 instead of failing.
        all_ids = [prompt.get("TEMPLATE_IDX", 0) for prompt in self.ioi_prompts]
        all_ids_ar = np.array(all_ids)
        self.groups = []
        for template_id in list(set(all_ids)):
            self.groups.append(np.where(all_ids_ar == template_id)[0])

        small_groups = [len(group) for group in self.groups if len(group) < 5]
        if len(small_groups) > 0:
            warnings.warn(
                f"Some groups have less than 5 prompts, they have lengths {small_groups}"
            )

        texts = self.sentences
        # Build the integer tensor directly; going through a float tensor
        # first would lose precision on large ids.
        self.toks = torch.tensor(
            self.tokenizer(texts, padding=True).input_ids, dtype=torch.int
        )

        # Tokenize every prompt once, with the tokenizer this dataset was
        # given (not a notebook-level global), and reuse it for all lookups.
        tokenized = [self.tokenizer.tokenize(text) for text in texts]

        self.word_idx = {}
        for subj in ["S1", "S2", "S3", "S4"]:
            positions = []
            for prompt, tokens in zip(self.ioi_prompts, tokenized):
                # S1 is the first token of the text and so has no leading
                # space; the other names carry the "Ġ" space marker.
                target_token = prompt[subj] if subj == "S1" else "Ġ" + prompt[subj]
                try:
                    # First occurrence: S2 appears twice in the template.
                    positions.append(tokens.index(target_token))
                except ValueError:
                    raise ValueError(
                        f"token {target_token!r} for {subj} not found in tokenized "
                        f"prompt {prompt['text']!r}; names must tokenize to a single token"
                    ) from None
            self.word_idx[subj] = torch.tensor(positions)

        self.word_idx["end"] = torch.tensor([len(tokens) - 1 for tokens in tokenized])

In [None]:
# %debug
circuit = deepcopy(CIRCUIT)

N = 1
template = "[S1] is a teacher. [S2] is a student. The child is [S2]. [S3] is a teacher. [S4] is a student. The child is"
prompts = make_latestS_prompts(names, template, N)
dataset = Dataset(prompts, model.tokenizer, N)

# switch order of target sentence
template = "[S1] is a teacher. [S2] is a student. The child is [S2]. [S4] is a student. [S3] is a teacher. The child is"
corr_prompts = make_latestS_prompts(names, template, N)
dataset_corr = Dataset(corr_prompts, model.tokenizer, N)

# we then add hooks to the model to knockout all the heads except the circuit
model.reset_hooks()
model, _ = do_circuit_extraction(
    model=model,
    heads_to_keep=get_heads_circuit(ioi_dataset=dataset, circuit=circuit),
    mlps_to_remove={},
    ioi_dataset=dataset,
    mean_dataset=dataset_corr,
)

circuit_logit_diff = logit_diff(model, dataset)
print(
    f"The circuit gets average logit difference {circuit_logit_diff.item()} over {N} examples"
)

AttributeError: ignored

Now try N=10

## FOR LATER AFTER FIX CIRCUIT EXTRACTION

Iterates over each layer:

In [None]:
def plot_path_patching(
    model,
    ioi_dataset,
    receiver_hooks,  # list of tuples (hook_name, idx). If idx is not None, then at dim 2 index in with idx (used for doing things for specific attention heads)
    position,
    corrupted_dataset=None,
):
    """Path-patch every attention head and MLP as sender and plot the head results.

    For each layer, the MLP (sender head index None) and then each head are
    used in turn as the single sender in a `path_patching` call towards
    `receiver_hooks` at `position`. The change in `logit_diff` on
    `ioi_dataset` relative to the unpatched model is recorded, converted to
    a percentage of the unpatched logit difference, and the per-head grid is
    shown with `show_pp`. MLP results are computed but not plotted.

    Args:
        model: model exposing `reset_hooks` and `cfg.n_layers` / `cfg.n_heads`.
        ioi_dataset: dataset the logit difference is measured on (passed as
            `D_orig`).
        receiver_hooks: list of (hook_name, idx) tuples handed to `path_patching`.
        position: position name handed to `path_patching` (e.g. "end").
        corrupted_dataset: dataset passed as `D_new`. Defaults to the
            notebook-level `dataset_corr`.

    Returns:
        None; the figure is shown as a side effect.
    """
    if corrupted_dataset is None:
        corrupted_dataset = dataset_corr  # notebook-level corrupted dataset

    # Size the sweep from the model instead of hard-coding 30 x 30, so
    # no non-existent layer or head is ever patched.
    n_layers = model.cfg.n_layers
    n_heads = model.cfg.n_heads

    model.reset_hooks()
    default_logit_diff = logit_diff(model, ioi_dataset)
    results = torch.zeros(size=(n_layers, n_heads))
    mlp_results = torch.zeros(size=(n_layers, 1))
    for source_layer in tqdm(range(n_layers)):
        for source_head_idx in [None] + list(range(n_heads)):
            model.reset_hooks()

            model = path_patching(
                model=model,
                D_new=corrupted_dataset,
                D_orig=ioi_dataset,
                sender_heads=[(source_layer, source_head_idx)],
                receiver_hooks=receiver_hooks,
                positions=[position],
                return_hooks=False,
                freeze_mlps=False,
                have_internal_interactions=False,
            )
            cur_logit_diff = logit_diff(model, ioi_dataset)

            if source_head_idx is None:
                mlp_results[source_layer] = cur_logit_diff - default_logit_diff
            else:
                results[source_layer][source_head_idx] = (
                    cur_logit_diff - default_logit_diff
                )

            # Sanity check: by the time layer 1 is processed, patching the
            # layer-0 heads must have produced at least one non-zero effect.
            if source_layer == 1:
                assert not torch.allclose(results, 0.0 * results), results

    # Express the effects as a percentage of the unpatched logit difference.
    results /= default_logit_diff
    mlp_results /= default_logit_diff
    results *= 100
    mlp_results *= 100

    # show attention head results
    fig = show_pp(
        results,
        title=f"Effect of patching (Heads->Final Residual Stream State) path",
        return_fig=True,
        show_fig=False,
        bartitle="% change in logit difference",
    )
    fig.show()


plot_path_patching(
    model,
    dataset,
    receiver_hooks=[(f"blocks.{model.cfg.n_layers-1}.hook_resid_post", None)],
    position="end",
)

  0%|          | 0/30 [00:00<?, ?it/s]


NameError: ignored