# Setup

In [1]:
%%capture
# %pip install git+https://github.com/redwoodresearch/Easy-Transformer.git
# Installs the wlg1 fork of Easy-Transformer instead of the redwoodresearch repo commented out above.
# NOTE(review): none of these installs is pinned to a commit or version, so a fresh run may pull different code.
%pip install git+https://github.com/wlg1/Easy-Transformer.git
%pip install einops datasets transformers fancy_einsum

In [2]:
from copy import deepcopy
import torch

assert torch.cuda.device_count() == 1
from tqdm import tqdm
import pandas as pd
import torch
import torch as t
from easy_transformer.EasyTransformer import (
    EasyTransformer,
)
from time import ctime
from functools import partial

import numpy as np
from tqdm import tqdm
import pandas as pd

from easy_transformer.experiments import (
    ExperimentMetric,
    AblationConfig,
    EasyAblation,
    EasyPatching,
    PatchingConfig,
)
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import random
import einops
from IPython import get_ipython
from copy import deepcopy
from easy_transformer.ioi_dataset import (
    IOIDataset,
)
from easy_transformer.ioi_utils import (
    path_patching,
    max_2d,
    CLASS_COLORS,
    show_pp,
    show_attention_patterns,
    scatter_attention_and_contribution,
)
from random import randint as ri
from easy_transformer.ioi_circuit_extraction import (
    do_circuit_extraction,
    get_heads_circuit,
    CIRCUIT,
)
from easy_transformer.ioi_utils import logit_diff, probs
from easy_transformer.ioi_utils import get_top_tokens_and_probs as g

# Turn on autoreload when running under IPython/Jupyter; get_ipython() returns
# None in a plain Python process, in which case nothing is done.
ipython = get_ipython()
if ipython is not None:
    # run_line_magic(name, args) is the supported API; InteractiveShell.magic()
    # is deprecated.
    ipython.run_line_magic("load_ext", "autoreload")
    ipython.run_line_magic("autoreload", "2")

Initialise the model (use a larger N or fewer templates to avoid warnings about in-template ablation).

In [3]:
# Load the pretrained "gpt2" checkpoint and move it to the GPU.
model = EasyTransformer.from_pretrained("gpt2").cuda()
# model = EasyTransformer.from_pretrained("gpt2")
# NOTE(review): presumably needed so that `blocks.{layer}.attn.hook_result`
# (read from the cache later in this notebook) is populated — confirm.
model.set_use_attn_result(True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Moving model to device:  cuda
Finished loading pretrained model gpt2 into EasyTransformer!


# Generate dataset with multiple prompts

In [4]:
#@title Names list
# Candidate first names used to fill the [S1]..[S4] slots of the prompt template.
# The next cell narrows this list to names the model's tokenizer encodes as a single token.
names = [
    "Michael",
    "Christopher",
    "Jessica",
    "Matthew",
    "Ashley",
    "Jennifer",
    "Joshua",
    "Amanda",
    "Daniel",
    "David",
    "James",
    "Robert",
    "John",
    "Joseph",
    "Andrew",
    "Ryan",
    "Brandon",
    "Jason",
    "Justin",
    "Sarah",
    "William",
    "Jonathan",
    "Stephanie",
    "Brian",
    "Nicole",
    "Nicholas",
    "Anthony",
    "Heather",
    "Eric",
    "Elizabeth",
    "Adam",
    "Megan",
    "Melissa",
    "Kevin",
    "Steven",
    "Thomas",
    "Timothy",
    "Christina",
    "Kyle",
    "Rachel",
    "Laura",
    "Lauren",
    "Amber",
    "Brittany",
    "Danielle",
    "Richard",
    "Kimberly",
    "Jeffrey",
    "Amy",
    "Crystal",
    "Michelle",
    "Tiffany",
    "Jeremy",
    "Benjamin",
    "Mark",
    "Emily",
    "Aaron",
    "Charles",
    "Rebecca",
    "Jacob",
    "Stephen",
    "Patrick",
    "Sean",
    "Erin",
    "Jamie",
    "Kelly",
    "Samantha",
    "Nathan",
    "Sara",
    "Dustin",
    "Paul",
    "Angela",
    "Tyler",
    "Scott",
    "Katherine",
    "Andrea",
    "Gregory",
    "Erica",
    "Mary",
    "Travis",
    "Lisa",
    "Kenneth",
    "Bryan",
    "Lindsey",
    "Kristen",
    "Jose",
    "Alexander",
    "Jesse",
    "Katie",
    "Lindsay",
    "Shannon",
    "Vanessa",
    "Courtney",
    "Christine",
    "Alicia",
    "Cody",
    "Allison",
    "Bradley",
    "Samuel",
]

In [5]:
def filter_names(names, tokenizer=None):
    """Keep only the names that are a single token both with and without a leading space.

    The prompts in this notebook use a name in two forms: bare at the very start
    of the text (S1) and preceded by a space everywhere else (S2-S4, looked up as
    "Ġ" + name in the tokenized prompt). A name is therefore only usable if both
    forms tokenize to exactly one token.

    Args:
        names: iterable of candidate name strings.
        tokenizer: object exposing ``tokenize(text) -> list``. Defaults to the
            notebook-level ``model.tokenizer`` when omitted.

    Returns:
        A new list with the qualifying names, in their original order.
    """
    if tokenizer is None:
        tokenizer = model.tokenizer

    def is_single_token(text):
        return len(tokenizer.tokenize(text)) == 1

    return [
        name
        for name in names
        if is_single_token(name) and is_single_token(" " + name)
    ]
# Rebinds `names` to the filtered list; re-running this cell filters the already-filtered list again.
names = filter_names(names)

In [6]:
import random

def make_latestS_prompts(names, template, num_sentences, rng=None):
    """Generate `num_sentences` distinct prompts by filling [S1]..[S4] in `template`.

    For every prompt four different entries of `names` are sampled and
    substituted for the placeholders "[S1]", "[S2]", "[S3]" and "[S4]".

    Args:
        names: list of candidate names (at least four are needed to sample).
        template: string containing any of the placeholders "[S1]".."[S4]".
        num_sentences: number of distinct prompts to return.
        rng: optional object with a ``sample(population, k=...)`` method, e.g.
            ``random.Random(seed)``, for reproducible output. Defaults to the
            global ``random`` module.

    Returns:
        A list of dicts, each with keys "S1".."S4" (the sampled names) and
        "text" (the filled-in template). No two dicts share the same "text".

    Raises:
        ValueError: if `num_sentences` is larger than the number of distinct
            texts the template can possibly yield (the rejection loop below
            would otherwise never terminate), or if fewer than four names are
            available when sampling.
    """
    if rng is None:
        rng = random

    slots = [f"S{i}" for i in range(1, 5)]

    # Upper bound on distinct texts: ordered choices of names for the
    # placeholders that actually occur in the template. Duplicate entries in
    # `names` can make the true count smaller, so this is a necessary check,
    # not a sufficient one.
    n_used_slots = sum(f"[{slot}]" in template for slot in slots)
    max_unique = 1
    for offset in range(n_used_slots):
        max_unique *= max(len(names) - offset, 0)
    if num_sentences > max_unique:
        raise ValueError(
            f"Cannot generate {num_sentences} distinct sentences: at most "
            f"{max_unique} are possible with {len(names)} names and "
            f"{n_used_slots} placeholder(s) in the template."
        )

    sentences = []
    seen_texts = set()  # ensure none of the generated sentences are the same
    while len(sentences) < num_sentences:
        chosen = rng.sample(names, k=len(slots))
        text = template
        sentence = {}
        for slot, name in zip(slots, chosen):
            text = text.replace(f"[{slot}]", name)
            sentence[slot] = name
        sentence["text"] = text
        if text not in seen_texts:
            seen_texts.add(text)
            sentences.append(sentence)
    return sentences

In [7]:
class Dataset:
    """Tokenized prompts plus the token positions of each name and of the last token.

    Attributes:
        ioi_prompts: the list of prompt dicts passed in (keys "text", "S1".."S4").
        tokenizer: the tokenizer passed in; it is called on the list of texts
            (with ``padding=True``) and its ``tokenize`` method is used to
            locate tokens.
        N: the size passed in by the caller; returned by ``len()``.
        toks: int32 tensor built from the tokenizer's ``input_ids`` for all texts.
        word_idx: dict mapping "S1".."S4" and "end" to a 1-D tensor holding one
            token position per prompt. Name positions are the FIRST occurrence
            of the name token in the prompt; "end" is the index of the last
            token of the unpadded tokenization.
    """

    def __init__(self, ioi_prompts, tokenizer, N):
        """Tokenize `ioi_prompts` with `tokenizer` and index the name/end positions.

        Raises:
            ValueError: if a name's token is not present in its prompt's
                tokenization (e.g. the name is not a single token).
        """
        self.ioi_prompts = ioi_prompts
        self.tokenizer = tokenizer
        self.N = N

        texts = [prompt["text"] for prompt in self.ioi_prompts]
        # Build the integer tensor directly instead of round-tripping the ids
        # through a float tensor.
        self.toks = torch.tensor(
            self.tokenizer(texts, padding=True).input_ids, dtype=torch.int
        )

        # Tokenize each prompt once with the tokenizer that was passed in (not
        # a notebook-level global), and reuse the result for every lookup.
        tokens_per_prompt = [self.tokenizer.tokenize(text) for text in texts]

        self.word_idx = {}
        for subj in ["S1", "S2", "S3", "S4"]:
            positions = []
            for prompt, tokens in zip(self.ioi_prompts, tokens_per_prompt):
                # S1 is the first token of the text, so it has no leading-space
                # marker; every other name is preceded by a space ("Ġ").
                prefix = "" if subj == "S1" else "Ġ"
                target_token = prefix + prompt[subj]
                try:
                    positions.append(tokens.index(target_token))
                except ValueError:
                    raise ValueError(
                        f"Token {target_token!r} for {subj} not found in the "
                        f"tokenization of prompt {prompt['text']!r}"
                    ) from None
            self.word_idx[subj] = torch.tensor(positions)

        self.word_idx["end"] = torch.tensor(
            [len(tokens) - 1 for tokens in tokens_per_prompt]
        )

    def __len__(self):
        """Return the `N` supplied at construction (not recomputed from the prompts)."""
        return self.N

In [8]:
# Seed the global RNG so the sampled prompts, and every number derived from
# them further down, are the same on each Restart & Run All.
SEED = 0
random.seed(SEED)

# [S2] occurs twice in the first half; the prompt ends right before the name
# that would complete the second half.
template = "[S1] is a teacher. [S2] is a student. The child is [S2]. [S3] is a teacher. [S4] is a student. The child is"
N = 10  # number of prompts to generate
latestS_prompts = make_latestS_prompts(names, template, N)
dataset = Dataset(latestS_prompts, model.tokenizer, N)

# Copy score

In [9]:
def check_copy_circuit_2(model, layer, head, ioi_dataset, verbose=False, neg=False, print_tokens=True):
    """Score how often one head's value/output path maps a name token back to that name.

    The residual stream after block 0 is cached, normalised with
    ``model.blocks[1].attn.ln1``, pushed through the chosen head's ``W_V``/``b_V``
    and ``W_O``, then through ``model.ln_final`` and ``model.unembed``. For each
    prompt and each of S1..S4 the top-5 tokens at the name's position are
    decoded; a hit is counted when ``" " + name`` is among them. The accuracy
    is printed.

    Args:
        model: model exposing ``cache_some``, ``blocks``, ``ln_final``,
            ``unembed`` and ``tokenizer.decode``.
        layer: index of the block whose attention weights are used.
        head: index of the head within that block.
        ioi_dataset: object with ``toks``, ``ioi_prompts`` and ``word_idx``.
        verbose: unused; kept so existing calls keep working.
        neg: if true, the head's output is multiplied by -1 before unembedding.
        print_tokens: despite the name this selects the RETURN value, not
            printing: truthy returns the name -> top-5 tokens dict, falsy
            returns the list of names that were hits.

    Returns:
        dict mapping each name to its decoded top-5 tokens (a name that occurs
        more than once keeps only its last entry), or the list of hit names.
    """
    cache = {}
    # NOTE(review): the caching hook registered here is presumably left on the
    # model after the call (nothing in this function removes it) — confirm.
    model.cache_some(cache, lambda name: name == "blocks.0.hook_resid_post")
    sign = -1 if neg else 1
    attn = model.blocks[layer].attn

    # Only forward values are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        model(ioi_dataset.toks.long())
        z_0 = model.blocks[1].attn.ln1(cache["blocks.0.hook_resid_post"])

        v = torch.einsum("eab,bc->eac", z_0, attn.W_V[head])
        v = v + attn.b_V[head].unsqueeze(0).unsqueeze(0)

        o = sign * torch.einsum("sph,hd->spd", v, attn.W_O[head])
        logits = model.unembed(model.ln_final(o))

    k = 5
    n_right = 0
    n_checked = 0  # count the checks actually made rather than trusting ioi_dataset.N

    S_pred_tokens = {}
    subjects_moved = []
    for seq_idx, prompt in enumerate(ioi_dataset.ioi_prompts):
        for word in ["S1", "S2", "S3", "S4"]:
            position = ioi_dataset.word_idx[word][seq_idx]
            top_ids = torch.topk(logits[seq_idx, position], k).indices
            pred_tokens = [model.tokenizer.decode(token) for token in top_ids]
            S_pred_tokens[prompt[word]] = pred_tokens
            n_checked += 1
            if " " + prompt[word] in pred_tokens:
                n_right += 1
                subjects_moved.append(prompt[word])

    # Guard the empty-dataset case instead of dividing by zero.
    percent_right = (n_right / n_checked) * 100 if n_checked else 0.0
    print(f"Copy circuit for head {layer}.{head} (sign={sign}) : Top {k} accuracy: {percent_right}%"  )
    if print_tokens:
        return S_pred_tokens
    return subjects_moved

## Get important heads

Find which heads are specific to certain inputs, and which are common to the template.

Get important heads from:

simple_analogies_circuits.ipynb

https://colab.research.google.com/drive/1mhcgx2SU3GrDq3pMZp_-JPtE_fO-7kGg#scrollTo=_ChKijEui-KV&line=3&uniqifier=1

most_recent_S_attn_pat.ipynb

https://colab.research.google.com/drive/1KaqcS92-BI4FZ7m-r8rCW9tIovxA_s93#scrollTo=VcFgqbcF4YvI

Positives (blue) List:

L8, H11

L9, H9



```
Top value 1: Row=9, Column=9, Value=2.5532712936401367
Top value 2: Row=8, Column=11, Value=2.1216535568237305
Top value 3: Row=10, Column=6, Value=1.6274135112762451
Top value 4: Row=11, Column=1, Value=0.37464624643325806
Top value 5: Row=8, Column=6, Value=0.36867403984069824
```

In [None]:
# (layer, head) pairs to test; the first five match the "Top value" rows listed above.
top_val = [(9, 9), (8, 11), (10, 6), (11, 1), (8, 6), (9, 2), (2, 10), (11, 8), (8, 8), (5, 11)]
for index, (layer, head) in enumerate(top_val):
    # print_tokens=False makes the function return the list of names found in the top-k.
    print(index, check_copy_circuit_2(model, layer, head, dataset, print_tokens=False))

Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 100.0%
0 ['Jacob', 'David', 'Christopher', 'Tyler', 'Sarah', 'Jason', 'Brandon', 'James', 'Tyler', 'Daniel', 'Sean', 'Michelle', 'Joshua', 'Emily', 'Thomas', 'Jessica', 'Christopher', 'Joshua', 'Ryan', 'Brandon', 'Amy', 'Aaron', 'Christopher', 'Jeremy', 'Andrew', 'Kelly', 'Stephen', 'Jamie', 'Michelle', 'Steven', 'Brandon', 'Kyle', 'Emily', 'Sarah', 'Jose', 'Michael', 'Scott', 'Jose', 'John', 'Jamie']
Copy circuit for head 8.11 (sign=1) : Top 5 accuracy: 92.5%
1 ['David', 'Christopher', 'Tyler', 'Sarah', 'Jason', 'Brandon', 'James', 'Tyler', 'Daniel', 'Sean', 'Michelle', 'Joshua', 'Emily', 'Thomas', 'Jessica', 'Christopher', 'Joshua', 'Ryan', 'Brandon', 'Amy', 'Aaron', 'Christopher', 'Jeremy', 'Kelly', 'Stephen', 'Jamie', 'Michelle', 'Steven', 'Brandon', 'Kyle', 'Emily', 'Sarah', 'Jose', 'Michael', 'Jose', 'John', 'Jamie']
Copy circuit for head 10.6 (sign=1) : Top 5 accuracy: 57.49999999999999%
2 ['David', 'Jason', 'Brandon', 'Daniel

Look at heads that are negative (red) on the heatmap

https://colab.research.google.com/drive/1KaqcS92-BI4FZ7m-r8rCW9tIovxA_s93#scrollTo=_ChKijEui-KV&line=1&uniqifier=1

In [None]:
# Head 10.10 with the default print_tokens=True: the cell displays the name -> top-5 tokens dict.
check_copy_circuit_2(model, 10, 10, dataset)

Copy circuit for head 10.10 (sign=1) : Top 5 accuracy: 37.5%


{'Thomas': [' v', ' Tiffany', ' V', ' td', ' Tom'],
 'Anthony': [' AE', ' EP', ' Anthony', ' AM', 'Al'],
 'Sarah': [' Sarah', 'Raven', ' Su', 'Sarah', 'rad'],
 'Robert': [' RC', ' R', ' Ro', ' RU', ' Re'],
 'Ryan': [' v', ' r', 'v', ' Rh', ' V'],
 'Scott': ['SC', 'sc', ' Scott', ' SC', ' ST'],
 'Daniel': [' Dan', ' d', ' Dar', ' D', ' Cort'],
 'Joshua': [' Ju', ' Kw', ' ju', ' Joshua', ' Kl'],
 'Adam': [' AD', ' Ad', 'AD', ' adam', ' Alb'],
 'Eric': [' cl', ' Re', ' Cl', ' v', ' CL'],
 'Aaron': [' EL', ' el', ' El', ' Elk', 'El'],
 'Justin': [' cl', ' Justin', ' CL', 'Cl', 'cl'],
 'Jose': [' J', ' ju', ' j', ' Jose', 'J'],
 'Jessica': [' j', ' J', ' v', 'j', 'v'],
 'Paul': [' EP', ' PP', 'Ps', 'P', 'p'],
 'Rachel': [' Rachel', ' R', ' Raz', ' Rub', ' RT'],
 'Michelle': [' Ma', ' Br', 'M', 'Br', 'gem'],
 'David': [' DeV', ' Dom', ' Dev', ' De', ' V'],
 'Jeremy': [' FC', ' Jeremy', ' Jaw', ' Blair', ' EC'],
 'William': [' h', ' v', ' Rh', 'h', ' Hu'],
 'Jacob': [' Jacob', ' J', 'jac', ' 

In [None]:
# Same check for head 10.0 (layer 10, head 0).
check_copy_circuit_2(model, 10, 0, dataset)

Copy circuit for head 10.0 (sign=1) : Top 5 accuracy: 85.0%


{'Thomas': [' Ingram', ' Payton', ' Charlotte', ' Newton', ' Noel'],
 'Anthony': [' Anthony', 'Anthony', ' Jarvis', ' Sauce', ' sauces'],
 'Sarah': [' Sarah', 'Sarah', ' Palin', 'arah', ' Anchorage'],
 'Robert': [' Cumber', ' Robert', 'Robert', ' Alexandria', ' Lann'],
 'Ryan': [' thrott', ' Ryan', ' Jet', ' HI', ' Haj'],
 'Scott': ['Scott', ' Scott', ' Aberdeen', ' Glasgow', ' Antarctic'],
 'Daniel': [' Daniel', 'Daniel', ' Razor', 'aniel', ' dab'],
 'Joshua': [' Joshua', 'Joshua', ' Israeli', ' Torah', ' Jericho'],
 'Adam': [' Adam', 'Adam', ' Eid', ' Danish', ' Goat'],
 'Eric': [' Noel', ' MV', ' jQuery', ' Ingram', ' Payton'],
 'Aaron': [' Aaron', 'Aaron', ' SSH', ' Tel', ' WI'],
 'Justin': [' Justin', 'Justin', ' Bieber', ' Byzantine', ' Tooth'],
 'Jose': ['Jose', ' Jose', ' Diaz', ' Fernandez', ' Spani'],
 'Jessica': [' Jessica', 'Jessica', ' Kenyan', ' Hawaii', ' Samoa'],
 'Paul': [' Paul', ' Corinthians', 'Paul', ' sermon', ' Huckabee'],
 'Rachel': [' Rachel', 'Rachel', ' Bryan

Look at neutral heads

In [None]:
# Same check for head 9.0 (layer 9, head 0).
check_copy_circuit_2(model, 9, 0, dataset)

Copy circuit for head 9.0 (sign=1) : Top 5 accuracy: 15.0%


{'Thomas': ['obos', ' Rag', 'jad', ' Consortium', 'rett'],
 'Anthony': ['�', ' Wheeler', ' ping', 'atar', ' Weiner'],
 'Sarah': [' experiment', ' flour', ' ink', ' printing', 'urrency'],
 'Robert': [' client', ' works', ' work', 'work', ' reopen'],
 'Ryan': ['obos', 'orno', 'ovan', ' Rag', 'FML'],
 'Scott': ['jet', ' aspir', ' run', ' fluid', 'redits'],
 'Daniel': [' dra', ' drafts', ' mur', ' Draft', ' draft'],
 'Joshua': [' enlarge', ' PH', ' bloss', ' instrument', ' snap'],
 'Adam': ['usercontent', ' Ratings', 'merce', ' rating', ' Creator'],
 'Eric': [' bands', ' Strauss', ' Tanz', ' Rhodes', ' sampled'],
 'Aaron': [' Aaron', ' terminal', 'elin', ' cables', ' network'],
 'Justin': ['agram', ' Gram', 'phabet', ' Album', ' Spotify'],
 'Jose': [' schema', 'Serv', ' tables', 'Sche', ' sche'],
 'Jessica': ['�', 'FML', ' Wa', ' Pai', 'ansky'],
 'Paul': ['eller', ' wa', ' buses', ' mur', 'clair'],
 'Rachel': [' show', ' Rachel', ' episode', 'Rachel', 'Show'],
 'Michelle': [' garment', 'fl

In [None]:
# Same check for head 9.1 (layer 9, head 1).
check_copy_circuit_2(model, 9, 1, dataset)

Copy circuit for head 9.1 (sign=1) : Top 5 accuracy: 0.0%


{'Thomas': [' fifth', ' fourth', 'Fourth', 'fourth', 'fifth'],
 'Anthony': [' fourth', ' best', 'ities', 'Fourth', '�'],
 'Sarah': ['iscons', 'tel', 'rak', ' those', 'de'],
 'Robert': [' Springer', 'URA', 'shaw', 'schild', ' Vest'],
 'Ryan': [' seventh', ' once', 'ISC', ' single', 'iasis'],
 'Scott': ['steen', 'アル', 'sen', 'VG', 'VA'],
 'Daniel': [' 48', 'hib', ' dub', ' specific', 'etti'],
 'Joshua': [' Many', ' 71', ' No', 'Many', 'nos'],
 'Adam': ['haw', 'ember', '\\":', 'bp', ' 67'],
 'Eric': ['single', ' only', 'Single', ' single', 'Only'],
 'Aaron': ['152', '*)', 'actic', ' 144', ' 31'],
 'Justin': ['most', ' 4', 'hes', ' 3', 'almost'],
 'Jose': [' sed', 'thro', 'ESCO', 'ela', '20439'],
 'Jessica': [' particular', ' special', ' specific', 'itates', 'icka'],
 'Paul': [' 95', ' 94', ' rest', ' full', 'Full'],
 'Rachel': [' 48', 'ي', 'hari', ' 97', 'leck'],
 'Michelle': [' Nos', ' may', 'het', 'utterstock', ' breat'],
 'David': [' once', 'eme', 'clusively', 'ISC', 'otta'],
 'Jeremy'

# Writing direction results with scatterplot

In [10]:
def scatter_attention_and_contribution(
    model,
    layer_no,
    head_no,
    ioi_dataset,
    return_vals=False,
    return_fig=False,
):
    """Scatter, per prompt and per name, attention on the name vs. what the head writes toward it.

    For every prompt and each of S1..S4 two numbers are computed for head
    ``layer_no.head_no``:
      * the attention probability from the prompt's "end" position to the
        first occurrence of the name token, and
      * the dot product between the head's output at the "end" position and the
        name token's column of ``model.unembed.W_U``.

    This definition rebinds the name that the import cell also imports from
    ``easy_transformer.ioi_utils``.

    Args:
        model: model exposing ``cache_some``, ``tokenizer`` and ``unembed.W_U``.
        layer_no: block index of the head.
        head_no: head index within the block.
        ioi_dataset: object with ``toks``, ``ioi_prompts`` and ``word_idx["end"]``.
        return_vals: if true, return the DataFrame and build no figure.
        return_fig: if true (and ``return_vals`` is false), return the figure
            instead of showing it.

    Returns:
        The DataFrame, the plotly figure, or None after ``fig.show()``.
    """
    attn_hook = f"blocks.{layer_no}.attn.hook_attn"
    result_hook = f"blocks.{layer_no}.attn.hook_result"
    model_unembed = model.unembed.W_U.detach().cpu()

    cache = {}
    # Cache only the two activations that are read below rather than every
    # activation in the model.
    # NOTE(review): hooks registered here are presumably left on the model
    # (no reset is done, as the original author intended for ablated models).
    model.cache_some(cache, lambda name: name in (attn_hook, result_hook))
    with torch.no_grad():  # forward values only
        model(ioi_dataset.toks.long())

    rows = []
    for i, prompt in enumerate(ioi_dataset.ioi_prompts):
        toks = model.tokenizer(prompt["text"])["input_ids"]
        end_pos = ioi_dataset.word_idx["end"][i]
        # The head's output at the end position is the same for all four names.
        resid = cache[result_hook][i, end_pos, head_no, :].detach().cpu()

        # model.reset_hooks() # should allow things to be done with ablated models

        for tok_type in ("S1", "S2", "S3", "S4"):
            # S1 opens the text, so it is tokenized without a leading space.
            name_text = prompt[tok_type] if tok_type == "S1" else " " + prompt[tok_type]
            name_tok = model.tokenizer(name_text)["input_ids"][0]
            name_pos = toks.index(name_tok)  # first occurrence in the prompt

            prob = cache[attn_hook][i, head_no, end_pos, name_pos].detach().cpu()
            dot = torch.einsum("a,a->", resid, model_unembed[:, name_tok])
            rows.append([prob, dot, tok_type, prompt["text"]])

    viz_df = pd.DataFrame(
        rows, columns=["Attn Prob on Name", "Dot w Name Embed", "Name Type", "text"]
    )
    if return_vals:
        return viz_df

    fig = px.scatter(
        viz_df,
        x="Attn Prob on Name",
        y="Dot w Name Embed",
        color="Name Type",
        hover_data=["text"],
        # color_discrete_sequence=["rgb(114,255,100)", "rgb(201,165,247)"],
        title=f"How Strong {layer_no}.{head_no} Writes in the Name Embed Direction Relative to Attn Prob",
    )
    if return_fig:
        return fig
    fig.show()

In [30]:
# Head 9.9: neither return flag is set, so the function shows the figure itself.
scatter_attention_and_contribution(
    model=model, layer_no=9, head_no=9, ioi_dataset=dataset
)

In [18]:
# Same scatter plot for head 10.7.
scatter_attention_and_contribution(
    model=model, layer_no=10, head_no=7, ioi_dataset=dataset
)

## Correlation vals

In [13]:
def get_prob_dot(
    model,
    layer_no,
    head_no,
    ioi_dataset,
    return_vals=False,
    return_fig=False,
):
    """Collect, per prompt and per name, the attention on the name and the write toward it.

    Nothing is plotted. For every prompt, in order S1, S2, S3, S4, two values
    are appended for head ``layer_no.head_no``:
      * the attention probability from the prompt's "end" position to the
        first occurrence of the name token, and
      * the dot product between the head's output at the "end" position and the
        name token's column of ``model.unembed.W_U``.

    Args:
        model: model exposing ``cache_some``, ``tokenizer`` and ``unembed.W_U``.
        layer_no: block index of the head.
        head_no: head index within the block.
        ioi_dataset: object with ``toks``, ``ioi_prompts`` and ``word_idx["end"]``.
        return_vals: unused; kept so existing calls keep working.
        return_fig: unused; kept so existing calls keep working.

    Returns:
        ``(all_prob, all_dot)``: two equally long lists of 0-dim CPU tensors,
        four entries per prompt.
    """
    attn_hook = f"blocks.{layer_no}.attn.hook_attn"
    result_hook = f"blocks.{layer_no}.attn.hook_result"
    model_unembed = model.unembed.W_U.detach().cpu()

    cache = {}
    # Cache only the two activations that are read below rather than every
    # activation in the model.
    # NOTE(review): hooks registered here are presumably left on the model
    # (no reset is done, as the original author intended for ablated models).
    model.cache_some(cache, lambda name: name in (attn_hook, result_hook))
    with torch.no_grad():  # forward values only
        model(ioi_dataset.toks.long())

    all_prob = []
    all_dot = []
    for i, prompt in enumerate(ioi_dataset.ioi_prompts):
        toks = model.tokenizer(prompt["text"])["input_ids"]
        end_pos = ioi_dataset.word_idx["end"][i]
        # The head's output at the end position is the same for all four names.
        resid = cache[result_hook][i, end_pos, head_no, :].detach().cpu()

        # model.reset_hooks() # should allow things to be done with ablated models

        for tok_type in ("S1", "S2", "S3", "S4"):
            # S1 opens the text, so it is tokenized without a leading space.
            name_text = prompt[tok_type] if tok_type == "S1" else " " + prompt[tok_type]
            name_tok = model.tokenizer(name_text)["input_ids"][0]
            name_pos = toks.index(name_tok)  # first occurrence in the prompt

            prob = cache[attn_hook][i, head_no, end_pos, name_pos].detach().cpu()
            dot = torch.einsum("a,a->", resid, model_unembed[:, name_tok])
            all_prob.append(prob)
            all_dot.append(dot)

    return all_prob, all_dot

In [14]:
# Per-name attention probabilities and dot products for head 9.9 (four values per prompt).
all_prob, all_dot = get_prob_dot(
    model=model, layer_no=9, head_no=9, ioi_dataset=dataset
)

In [16]:
import scipy.stats as stats

# Pearson correlation between attention on a name and the head's write toward that name.
# all_prob / all_dot are the two lists returned by get_prob_dot in the previous cell.
# X and Y should be arrays, lists, or pandas Series
correlation, p_value = stats.pearsonr(all_prob, all_dot)

print("Correlation:", correlation)
print("p-value:", p_value)

Correlation: 0.9373701261699992
p-value: 5.391151668264094e-19
