# Setup

In [None]:
%%capture
# Install the Redwood Research Easy-Transformer fork and the supporting libraries used below.
# NOTE(review): none of these installs is version-pinned, so a fresh run may pull different versions.
%pip install git+https://github.com/redwoodresearch/Easy-Transformer.git
%pip install einops datasets transformers fancy_einsum

In [None]:
from copy import deepcopy
import torch

assert torch.cuda.device_count() == 1
from tqdm import tqdm
import pandas as pd
import torch
import torch as t
from easy_transformer.EasyTransformer import (
    EasyTransformer,
)
from time import ctime
from functools import partial

import numpy as np
from tqdm import tqdm
import pandas as pd

from easy_transformer.experiments import (
    ExperimentMetric,
    AblationConfig,
    EasyAblation,
    EasyPatching,
    PatchingConfig,
)
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import random
import einops
from IPython import get_ipython
from copy import deepcopy
from easy_transformer.ioi_dataset import (
    IOIDataset,
)
from easy_transformer.ioi_utils import (
    path_patching,
    max_2d,
    CLASS_COLORS,
    show_pp,
    show_attention_patterns,
    scatter_attention_and_contribution,
)
from random import randint as ri
from easy_transformer.ioi_circuit_extraction import (
    do_circuit_extraction,
    get_heads_circuit,
    CIRCUIT,
)
from easy_transformer.ioi_utils import logit_diff, probs
from easy_transformer.ioi_utils import get_top_tokens_and_probs as g

# When running under IPython, enable the autoreload extension so that edits to
# imported modules are picked up without restarting the kernel.
ipython = get_ipython()
if ipython is not None:
    # `run_line_magic` is the supported programmatic entry point for line magics;
    # the older `magic("...")` helper is deprecated.
    ipython.run_line_magic("load_ext", "autoreload")
    ipython.run_line_magic("autoreload", "2")

 Initialise model (use larger N or fewer templates for no warnings about in-template ablation)

In [None]:
# Load the pretrained "gpt2-large" weights, then move the model to the GPU.
model = EasyTransformer.from_pretrained("gpt2-large")
model = model.cuda()
# Switch on the model's `use_attn_result` option.
model.set_use_attn_result(True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Moving model to device:  cuda
Finished loading pretrained model gpt2-large into EasyTransformer!


# Get significant attention heads

In [None]:
# Janky code to do different setup when run in a Colab notebook vs VSCode
DEBUG_MODE = False
try:
    import google.colab
    IN_COLAB = True
    print("Running as a Colab notebook")
    %pip install git+https://github.com/neelnanda-io/TransformerLens.git
    # Install another version of node that makes PySvelte work way faster
    !curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash -; sudo apt-get install -y nodejs
    %pip install git+https://github.com/neelnanda-io/PySvelte.git
except:
    IN_COLAB = False
    print("Running as a Jupyter notebook - intended for development only!")
    from IPython import get_ipython

    ipython = get_ipython()
    # Code to automatically update the HookedTransformer code as its edited without restarting the kernel
    ipython.magic("load_ext autoreload")
    ipython.magic("autoreload 2")

In [None]:
import einops

import pysvelte

import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookedRootModule,
    HookPoint,
)  # Hooking utilities
from transformer_lens import HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache

In [None]:
# A single contrast prompt for the model to complete.
prompts = ["John is tall. Mary is"]
# List of answers, in the format (correct, incorrect)
answers = [(" short", " tall")]

# Convert each answer string to its single token id, keeping the
# (correct, incorrect) column order, and put the tensor on the GPU.
answer_tokens = torch.tensor(
    [
        (model.to_single_token(right), model.to_single_token(wrong))
        for right, wrong in answers
    ]
).cuda()

# Tokenize with a BOS token prepended and move the ids to the GPU.
tokens = model.to_tokens(prompts, prepend_bos=True).cuda()
# Run the model once and keep every cached activation for the cells below.
original_logits, cache = model.run_with_cache(tokens)

def logits_to_ave_logit_diff(logits, answer_tokens, per_prompt=False):
    """Logit of the correct answer minus logit of the incorrect one, at the last position.

    `logits` is indexed as [batch, position, vocab]. `answer_tokens` holds, for
    each batch row, the correct token id in column 0 and the incorrect token id
    in column 1. Returns the per-row differences when `per_prompt` is true,
    otherwise their mean.
    """
    # Only the final position's logits matter for the answer.
    last_position_logits = logits[:, -1, :]
    picked = torch.gather(last_position_logits, -1, answer_tokens)
    correct_minus_incorrect = picked[:, 0] - picked[:, 1]
    return correct_minus_incorrect if per_prompt else correct_minus_incorrect.mean()

# print("Per prompt logit difference:", logits_to_ave_logit_diff(original_logits, answer_tokens, per_prompt=True))
# Baseline: mean (correct - incorrect) logit difference of the unpatched run above.
original_average_logit_diff = logits_to_ave_logit_diff(original_logits, answer_tokens)
# print("Average logit difference:", logits_to_ave_logit_diff(original_logits, answer_tokens).item())

In [None]:
# Map each answer token to a residual-stream direction; the per-prompt
# "logit difference direction" is correct-direction minus incorrect-direction.
answer_residual_directions = model.tokens_to_residual_directions(answer_tokens)
print("Answer residual directions shape:", answer_residual_directions.shape)
logit_diff_directions = answer_residual_directions[:, 0] - answer_residual_directions[:, 1]
print("Logit difference directions shape:", logit_diff_directions.shape)

# cache syntax - resid_post is the residual stream at the end of the layer, -1 gets the final layer. The general syntax is [activation_name, layer_index, sub_layer_type].
final_residual_stream = cache["resid_post", -1]
print("Final residual stream shape:", final_residual_stream.shape)
final_token_residual_stream = final_residual_stream[:, -1, :]
# Apply LayerNorm scaling
# pos_slice is the subset of the positions we take - here the final token of each prompt
scaled_final_token_residual_stream = cache.apply_ln_to_stack(final_token_residual_stream, layer = -1, pos_slice=-1)

# Sum of per-prompt dot products (b = batch, d = d_model), averaged over prompts.
# torch.einsum is used because no bare `einsum` name is imported in this notebook.
average_logit_diff = torch.einsum(
    "bd,bd->", scaled_final_token_residual_stream, logit_diff_directions
) / len(prompts)
print("Calculated average logit diff:", average_logit_diff.item())
print("Original logit difference:",original_average_logit_diff.item())

def residual_stack_to_logit_diff(residual_stack: "torch.Tensor", cache: "ActivationCache") -> "torch.Tensor":
    """Project a stack of residual-stream contributions onto the logit-difference direction.

    Args:
        residual_stack: tensor whose last two axes are (batch, d_model); any
            leading axes (e.g. one per component) are preserved in the result.
        cache: activation cache used to apply the final LayerNorm scaling via
            `apply_ln_to_stack(..., layer=-1, pos_slice=-1)`.

    Returns:
        A tensor with the leading axes of `residual_stack`: the dot product with
        the notebook-level `logit_diff_directions`, summed over the batch and
        divided by `len(prompts)`.
    """
    scaled_residual_stack = cache.apply_ln_to_stack(residual_stack, layer = -1, pos_slice=-1)
    # b = batch, d = d_model; "..." keeps whatever leading axes the stack has.
    return torch.einsum("...bd,bd->...", scaled_residual_stack, logit_diff_directions) / len(prompts)
# Accumulated residual stream at the final position (pos_slice=-1), including
# mid-layer values (incl_mid=True), together with a label for each entry.
accumulated_residual, labels = cache.accumulated_resid(layer=-1, incl_mid=True, pos_slice=-1, return_labels=True)
# Logit difference attributable to each accumulated residual entry.
logit_lens_logit_diffs = residual_stack_to_logit_diff(accumulated_residual, cache)
# line(logit_lens_logit_diffs, x=np.arange(model.cfg.n_layers*2+1)/2, hover_name=labels, title="Logit Difference From Accumulate Residual Stream")

AttributeError: ignored

In [None]:
# Per-head contributions at the final position, projected onto the logit-difference direction.
per_head_residual, labels = cache.stack_head_results(layer=-1, pos_slice=-1, return_labels=True)
per_head_logit_diffs = residual_stack_to_logit_diff(per_head_residual, cache)
# Unflatten the (layer * head) axis into a layer x head grid.
per_head_logit_diffs = einops.rearrange(per_head_logit_diffs, "(layer head_index) -> layer head_index", layer=model.cfg.n_layers, head_index=model.cfg.n_heads)
# No `imshow` helper is defined in this notebook, so plot with plotly express
# directly; the tensor is detached and moved to the CPU first.
px.imshow(
    per_head_logit_diffs.detach().cpu().numpy(),
    labels={"x":"Head", "y":"Layer"},
    title="Logit Difference From Each Head",
    color_continuous_midpoint=0.0,
    color_continuous_scale="RdBu",
).show()

# Dataset of Prompts

In [None]:
# N = 50
# ioi_dataset = IOIDataset(
#     prompt_type="mixed",
#     N=N,
#     tokenizer=model.tokenizer,
#     prepend_bos=False,
# )  # TODO make this a seeded dataset

# print(f"Here are two of the prompts from the dataset: {ioi_dataset.sentences[:2]}")



Here are two of the prompts from the dataset: ['Then, Matthew and Robert had a lot of fun at the school. Robert gave a ring to Matthew', 'Then, Nicholas and Vanessa were working at the house. Vanessa decided to give a basketball to Nicholas']


https://github.com/redwoodresearch/Easy-Transformer/blob/main/easy_transformer/ioi_dataset.py

See:
class IOIDataset:
...

elif isinstance(prompt_type, list):
    self.templates = prompt_type

`prompt_type` is a required argument. As the branch above shows, passing a list of template strings is how custom templates are supplied.

In [None]:
# Build a one-example dataset from a single custom template; the recorded
# sentence below shows [A] and [B] filled in with names.
N = 1
custom_templates = [" [A] is tall. [B] is"]
dataset = IOIDataset(
    prompt_type=custom_templates,
    N=N,
    tokenizer=model.tokenizer,
    prepend_bos=False,
)



In [None]:
# Peek at up to three generated sentences (only one exists because N=1).
dataset.sentences[:3]

[' Matthew is tall. Alicia is']

In [None]:
def logit_diff(
    model,
    ioi_dataset,
    all=False,
    std=False,
    both=False,
):  # changed by Arthur to take dataset object, :pray: no big backwards compatibility issues
    """
    Difference between the IO and the S logits at the "end" position of each prompt.

    Runs `model` on `ioi_dataset.toks` and, for every example, reads the logit of
    the IO token and of the S token at `ioi_dataset.word_idx["end"]`.

    Args:
        model: callable returning logits for a batch of token ids.
        ioi_dataset: dataset exposing `toks`, `word_idx["end"]`, `io_tokenIDs`,
            `s_tokenIDs` and `len()`.
        all, std: forwarded unchanged to `handle_all_and_std`, which decides how
            the per-example values are aggregated.
        both: when true, return the aggregated IO logits and the aggregated S
            logits as a pair instead of aggregating their difference.
    """
    # This definition shadows the `logit_diff` imported from ioi_utils, but the
    # helper it relies on is not among the names imported at the top of the
    # notebook, so import it here.
    from easy_transformer.ioi_utils import handle_all_and_std

    logits = model(ioi_dataset.toks.long()).detach()

    # Index one (example, end position, token id) triple per example.
    example_idx = torch.arange(len(ioi_dataset))
    end_positions = ioi_dataset.word_idx["end"]
    IO_logits = logits[example_idx, end_positions, ioi_dataset.io_tokenIDs]
    S_logits = logits[example_idx, end_positions, ioi_dataset.s_tokenIDs]

    if both:
        return handle_all_and_std(IO_logits, all, std), handle_all_and_std(
            S_logits, all, std
        )

    return handle_all_and_std(IO_logits - S_logits, all, std)


In [None]:
def logits_to_ave_logit_diff(logits, answer_tokens, per_prompt=False):
    """Return logit(correct) - logit(incorrect) at the final position.

    `answer_tokens` carries one row per prompt with the correct token id in
    column 0 and the incorrect token id in column 1. With `per_prompt=True` the
    per-row differences are returned; otherwise their mean.
    """
    # Look only at the last position, then pick out the two answer logits per row.
    answer_logits = logits[:, -1, :].gather(dim=-1, index=answer_tokens)
    correct, incorrect = answer_logits[:, 0], answer_logits[:, 1]
    difference = correct - incorrect
    if not per_prompt:
        return difference.mean()
    return difference

In [None]:
# Evaluate the unmodified model on the custom dataset and report both metrics.
model_logit_diff = logit_diff(model, dataset)
model_io_probs = probs(model, dataset)
print(f"The model gets average logit difference {model_logit_diff.item()} over {N} examples")
print(f"The model gets average IO probs {model_io_probs.item()} over {N} examples")

The model gets average logit difference -1.1923670768737793 over 1 examples
The model gets average IO probs 0.0001318493450526148 over 1 examples


In [None]:
# Display the imported circuit: a dict from head class to a list of (layer, head) pairs.
CIRCUIT

{'name mover': [(9, 9),
  (10, 0),
  (9, 6),
  (10, 10),
  (10, 6),
  (10, 2),
  (10, 1),
  (11, 2),
  (9, 7),
  (9, 0),
  (11, 9)],
 'negative': [(10, 7), (11, 10)],
 's2 inhibition': [(7, 3), (7, 9), (8, 6), (8, 10)],
 'induction': [(5, 5), (5, 8), (5, 9), (6, 9)],
 'duplicate token': [(0, 1), (0, 10), (3, 0)],
 'previous token': [(2, 2), (4, 11)]}

In [None]:
# Candidate circuit for the adjective task, in the same {class: [(layer, head)]} format as CIRCUIT.
# NOTE(review): not referenced by any cell visible below - confirm whether it is still needed.
CIRCUIT_NEW = {'adjective mover': [(30, 13)]}

Add hooks to the model to obtain intermediate activations when running path patching, copy scores, etc

In [None]:
# Work on a private copy so the imported CIRCUIT constant is never mutated.
# NOTE(review): the CIRCUIT displayed above only lists layers 0-11, while the
# loaded model is gpt2-large - confirm these are the intended heads.
circuit = deepcopy(CIRCUIT)

# Corrupted counterpart of the custom prompt ("short" instead of "tall"); it is
# passed below as the mean dataset for knocking out the other model components.
custom_templates_2 = [" [A] is short. [B] is"]
corr_dataset = IOIDataset(
    prompt_type=custom_templates_2,
    N=N,
    tokenizer=model.tokenizer,
    prepend_bos=False,
)

# Clear existing hooks, then add hooks that knock out all heads except the circuit.
model.reset_hooks()
heads_to_keep = get_heads_circuit(ioi_dataset=dataset, circuit=circuit)
model, _ = do_circuit_extraction(
    model=model,
    heads_to_keep=heads_to_keep,
    mlps_to_remove={},
    ioi_dataset=dataset,
    mean_dataset=corr_dataset,
)

circuit_logit_diff = logit_diff(model, dataset)
print(f"The circuit gets average logit difference {circuit_logit_diff.item()} over {N} examples")


S2 index has been computed as the same for S and S2



The circuit gets average logit difference -0.6099258661270142 over 10 examples


# Path patching

Iterates over each layer

In [None]:
def plot_path_patching(
    model,
    ioi_dataset,
    receiver_hooks,  # list of tuples (hook_name, idx). If idx is not None, then at dim 2 index in with idx (used for doing things for specific attention heads)
    position,
):
    """Path-patch every sender in the model and plot the change in logit difference.

    For each layer, and for each sender index in that layer, the path from that
    sender to `receiver_hooks` is patched with activations from the notebook-level
    `corr_dataset`, and the logit difference on `ioi_dataset` is compared with
    the unpatched baseline. Per-head changes are shown as a percentage of the
    baseline with `show_pp`.

    Args:
        model: model exposing `reset_hooks()`, `cfg.n_layers` and `cfg.n_heads`.
        ioi_dataset: dataset the logit difference is measured on (passed as `D_orig`).
        receiver_hooks: list of `(hook_name, idx)` tuples forwarded to `path_patching`.
        position: dataset position key forwarded as `positions=[position]`.

    Returns:
        None; the figure is displayed as a side effect.
    """
    # Take the grid size from the model config rather than hard-coding it, so
    # every layer and head of the loaded model is covered and none is out of range.
    n_layers = model.cfg.n_layers
    n_heads = model.cfg.n_heads

    model.reset_hooks()
    default_logit_diff = logit_diff(model, ioi_dataset)
    results = torch.zeros(size=(n_layers, n_heads))
    mlp_results = torch.zeros(size=(n_layers, 1))
    for source_layer in tqdm(range(n_layers)):
        # A sender index of None is recorded in `mlp_results`
        # (presumably the layer's MLP - confirm against `path_patching`).
        for source_head_idx in [None] + list(range(n_heads)):
            if source_layer == 1 and source_head_idx is None:
                # Sanity check once layer 0 is complete: patching must have had some effect.
                assert not torch.allclose(results, 0.0 * results), results

            model.reset_hooks()

            model = path_patching(
                model=model,
                D_new=corr_dataset,
                D_orig=ioi_dataset,
                sender_heads=[(source_layer, source_head_idx)],
                receiver_hooks=receiver_hooks,
                positions=[position],
                return_hooks=False,
                freeze_mlps=False,
                have_internal_interactions=False,
            )
            cur_logit_diff = logit_diff(model, ioi_dataset)

            if source_head_idx is None:
                mlp_results[source_layer] = cur_logit_diff - default_logit_diff
            else:
                results[source_layer][source_head_idx] = (
                    cur_logit_diff - default_logit_diff
                )

    # Express the changes as a percentage of the baseline logit difference.
    results /= default_logit_diff
    mlp_results /= default_logit_diff
    results *= 100
    mlp_results *= 100

    # show attention head results
    fig = show_pp(
        results,
        title=f"Effect of patching (Heads->Final Residual Stream State) path",
        return_fig=True,
        show_fig=False,
        bartitle="% change in logit difference",
    )
    fig.show()


# Patch paths into the residual stream after the last block, at the "end" position.
plot_path_patching(
    model,
    dataset,
    receiver_hooks=[(f"blocks.{model.cfg.n_layers-1}.hook_resid_post", None)],
    position="end",
)

# Copy score
Try modifying original using chatgpt

## Fix bug with chatgpt code

In [None]:
def check_copy_circuit(model, layer, head, ioi_dataset, verbose=False, neg=False):
    """Check whether one head's value/output path promotes the token "short".

    The residual stream after block 0 is cached, normalised, passed through the
    W_V / b_V and W_O of head (`layer`, `head`) only, then through the final
    LayerNorm and the unembedding. Top-k predictions are decoded and searched
    for the string "short".

    Args:
        model: model exposing `cache_some`, `blocks`, `ln_final`, `unembed`, `tokenizer`.
        layer, head: indices of the attention head under test.
        ioi_dataset: dataset providing `toks`, `N` and `sentences`.
        verbose: print the decoded predictions for sequences that miss.
        neg: when true, the head output is multiplied by -1 before unembedding.

    Returns:
        `(percent_right, pred_tokens)`, where `pred_tokens` belongs to the last
        sequence processed.
    """
    # Capture the residual stream after block 0 during one forward pass.
    cache = {}
    model.cache_some(cache, lambda x: x == "blocks.0.hook_resid_post")
    model(ioi_dataset.toks.long())
    if neg:
        sign = -1
    else:
        sign = 1
    # NOTE(review): block 1's ln1 is applied regardless of `layer` - confirm this is intended.
    z_0 = model.blocks[1].attn.ln1(cache["blocks.0.hook_resid_post"])

    # Value projection (plus bias) of the chosen head.
    v = torch.einsum("eab,bc->eac", z_0, model.blocks[layer].attn.W_V[head])
    v += model.blocks[layer].attn.b_V[head].unsqueeze(0).unsqueeze(0)

    # Output projection of the chosen head, then straight to vocabulary logits.
    o = sign * torch.einsum("sph,hd->spd", v, model.blocks[layer].attn.W_O[head])
    logits = model.unembed(model.ln_final(o))

    k = 5
    n_right = 0

    for seq_idx in range(ioi_dataset.N):
        # NOTE(review): logits[seq_idx] still has a position axis, so topk returns
        # one row of k ids per position and each `token` decoded here is a whole
        # row - the strings are k-token chunks, not single tokens.
        pred_tokens = [
            model.tokenizer.decode(token)
            for token in torch.topk(logits[seq_idx], k).indices
        ]
        # NOTE(review): exact match against "short" (no leading space); with the
        # multi-token strings above this presumably never matches - confirm.
        if "short" in pred_tokens:
            n_right += 1
        else:
            if verbose:
                print("-------")
                print("Seq: " + ioi_dataset.sentences[seq_idx])
                print("Predictions:")
                print(
                    " ".join(
                        [
                            f"({i+1}):{model.tokenizer.decode(token)}"
                            for i, token in enumerate(
                                torch.topk(logits[seq_idx], k).indices
                            )
                        ]
                    )
                )

    percent_right = (n_right / ioi_dataset.N) * 100
    print(
        f"Copy circuit for head {layer}.{head} (sign={sign}) : Top {k} accuracy: {percent_right}%"
    )
    return percent_right, pred_tokens


In [None]:
# Inline copy of the body of check_copy_circuit for head 30.13, run at top level
# so the intermediates (z_0, v, o, logits, seq_idx, k) stay in the notebook
# namespace; the shape-inspection cells that follow read them.
ioi_dataset = dataset
neg = False
layer = 30
head = 13
verbose=True

# Capture the residual stream after block 0 during one forward pass.
cache = {}
model.cache_some(cache, lambda x: x == "blocks.0.hook_resid_post")
model(ioi_dataset.toks.long())
if neg:
    sign = -1
else:
    sign = 1
z_0 = model.blocks[1].attn.ln1(cache["blocks.0.hook_resid_post"])

# Value projection (plus bias) of the chosen head.
v = torch.einsum("eab,bc->eac", z_0, model.blocks[layer].attn.W_V[head])
v += model.blocks[layer].attn.b_V[head].unsqueeze(0).unsqueeze(0)

# Output projection of the chosen head, then straight to vocabulary logits.
o = sign * torch.einsum("sph,hd->spd", v, model.blocks[layer].attn.W_O[head])
logits = model.unembed(model.ln_final(o))

k = 5
n_right = 0

for seq_idx in range(ioi_dataset.N):
    # topk over logits[seq_idx] yields one row of k ids per position, so each
    # decoded string below is a k-token chunk (see the recorded output).
    pred_tokens = [
        model.tokenizer.decode(token)
        for token in torch.topk(logits[seq_idx], k).indices
    ]
    if "short" in pred_tokens:
        n_right += 1
    else:
        if verbose:
            print("-------")
            print("Seq: " + ioi_dataset.sentences[seq_idx])
            print("Predictions:")
            print(
                " ".join(
                    [
                        f"({i+1}):{model.tokenizer.decode(token)}"
                        for i, token in enumerate(
                            torch.topk(logits[seq_idx], k).indices
                        )
                    ]
                )
            )

percent_right = (n_right / ioi_dataset.N) * 100
print(
    f"Copy circuit for head {layer}.{head} (sign={sign}) : Top {k} accuracy: {percent_right}%"
)


-------
Seq:  Matthew is tall. Alicia is
Predictions:
(1): Chun Kinnigon Indigo Veg (2):session Pax Bett Holo Palmer (3): Lankaahime Kore Albion Libya (4): Punch Parks Carey Parisaq (5):PH plun Kern blockers facilit (6): Cindy Devi 裏� commission Pai
Copy circuit for head 30.13 (sign=1) : Top 5 accuracy: 0.0%


In [None]:
# Shape of the normalised block-0 residual stream left over from the scratch cell above.
z_0.shape

torch.Size([1, 6, 1280])

In [None]:
# Shape of the head's value vectors left over from the scratch cell above.
v.shape

torch.Size([1, 6, 64])

In [None]:
# Shape of the head's output after the W_O projection.
o.shape

torch.Size([1, 6, 1280])

In [None]:
# Shape of the logits obtained by unembedding the head's output.
logits.shape

torch.Size([1, 6, 50257])

In [None]:
# topk over one sequence's logits keeps the position axis: one row of k ids per position.
torch.topk(logits[seq_idx], k).indices.shape

torch.Size([6, 5])

In [None]:
# Logits for a single sequence still carry a position axis before the vocabulary axis.
logits[seq_idx].shape

torch.Size([6, 50257])

In [None]:
# Number of examples in the custom dataset.
dataset.N

1

In [None]:
# `neg_sign` is never defined in this notebook; the recorded output shows
# sign=1, which corresponds to neg=False, so pass it explicitly.
check_copy_circuit(model, 30, 13, dataset, neg=False)

Copy circuit for head 30.13 (sign=1) : Top 5 accuracy: 0.0%


(0.0,
 [' Chun Kinnigon Indigo Veg',
  'session Pax Bett Holo Palmer',
  ' Lankaahime Kore Albion Libya',
  ' Punch Parks Carey Parisaq',
  'PH plun Kern blockers facilit',
  ' Cindy Devi 裏� commission Pai'])

In [None]:
# `neg_sign` is never defined in this notebook; the recorded output shows
# sign=1, which corresponds to neg=False, so pass it explicitly.
check_copy_circuit(model, 20, 14, dataset, neg=False)

Copy circuit for head 20.14 (sign=1) : Top 5 accuracy: 0.0%


(0.0,
 [' Fior Alic Flan Swanson Arlington',
  ' SPR HutchEH HOL ###',
  'musNE Rath limb IX',
  ' Paula wives Dante Spac Paradise',
  'eneg Bre Fram Membos',
  'ighton ###sdckingicial'])

What is the original outputting?

In [None]:
# Draw a single prompt from the built-in "mixed" IOI templates.
# Note: this rebinds both `N` and `ioi_dataset` for the cells that follow.
N = 1
ioi_dataset = IOIDataset(prompt_type="mixed", N=N, tokenizer=model.tokenizer, prepend_bos=False)



In [None]:
# Load the pretrained "gpt2" weights as a second model and move it to the GPU.
model_small = EasyTransformer.from_pretrained("gpt2")
model_small = model_small.cuda()
# Switch on the model's `use_attn_result` option, as for the large model.
model_small.set_use_attn_result(True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Moving model to device:  cuda
Finished loading pretrained model gpt2 into EasyTransformer!


In [None]:
def check_copy_circuit(model, layer, head, ioi_dataset, verbose=False, neg=False):
    """Measure how often one head's value/output path reproduces the name it reads.

    The residual stream after block 0 is cached, normalised, and passed through
    only the W_V / b_V and W_O of head (`layer`, `head`), then through the final
    LayerNorm and the unembedding. For every prompt and for each of the "IO",
    "S" and "S2" positions, the top-k decoded tokens at that position are
    checked for the corresponding name (with a leading space).

    Returns `(percent_right, pred_tokens)`; `pred_tokens` are the decoded top-k
    tokens of the last position examined, which is also what gets printed.
    """
    # Capture the residual stream after block 0 during one forward pass.
    cache = {}
    model.cache_some(cache, lambda x: x == "blocks.0.hook_resid_post")
    model(ioi_dataset.toks.long())
    sign = -1 if neg else 1
    z_0 = model.blocks[1].attn.ln1(cache["blocks.0.hook_resid_post"])

    # Push the normalised residual through this head's value and output maps only.
    attn = model.blocks[layer].attn
    v = torch.einsum("eab,bc->eac", z_0, attn.W_V[head])
    v += attn.b_V[head].unsqueeze(0).unsqueeze(0)
    o = sign * torch.einsum("sph,hd->spd", v, attn.W_O[head])
    logits = model.unembed(model.ln_final(o))

    k = 5
    n_right = 0

    for seq_idx, prompt in enumerate(ioi_dataset.ioi_prompts):
        for word in ("IO", "S", "S2"):
            position = ioi_dataset.word_idx[word][seq_idx]
            top_ids = torch.topk(logits[seq_idx, position], k).indices
            pred_tokens = [model.tokenizer.decode(token) for token in top_ids]
            # "S" and "S2" both refer to the subject name stored under "S".
            name = "S" if "S" in word else word
            if " " + prompt[name] in pred_tokens:
                n_right += 1
                continue
            if verbose:
                print("-------")
                print("Seq: " + ioi_dataset.sentences[seq_idx])
                print("Target: " + ioi_dataset.ioi_prompts[seq_idx][name])
                print(
                    " ".join(
                        f"({i+1}):{decoded}" for i, decoded in enumerate(pred_tokens)
                    )
                )

    # Three positions are scored per prompt.
    percent_right = (n_right / (ioi_dataset.N * 3)) * 100
    print(f"Copy circuit for head {layer}.{head} (sign={sign}) : {pred_tokens}%")
    return percent_right, pred_tokens

In [None]:
# `neg_sign` is never defined in this notebook; the recorded output shows
# sign=1, which corresponds to neg=False, so pass it explicitly.
check_copy_circuit(model_small, 9, 9, ioi_dataset, neg=False)

Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 100.0%


(100.0, [' Vanessa', 'VA', 'Van', 'AV', 'avi'])

In [None]:
# Inline copy of the body of check_copy_circuit for head 9.9 of `model_small`,
# run at top level so the intermediates (z_0, v, o, logits) stay in the notebook
# namespace; the shape-inspection cells that follow read them.
neg = False
layer = 9
head = 9
verbose=True

# Capture the residual stream after block 0 during one forward pass.
cache = {}
model_small.cache_some(cache, lambda x: x == "blocks.0.hook_resid_post")
model_small(ioi_dataset.toks.long())
if neg:
    sign = -1
else:
    sign = 1
z_0 = model_small.blocks[1].attn.ln1(cache["blocks.0.hook_resid_post"])

# Value projection (plus bias) of the chosen head.
v = torch.einsum("eab,bc->eac", z_0, model_small.blocks[layer].attn.W_V[head])
v += model_small.blocks[layer].attn.b_V[head].unsqueeze(0).unsqueeze(0)

# Output projection of the chosen head, then straight to vocabulary logits.
o = sign * torch.einsum("sph,hd->spd", v, model_small.blocks[layer].attn.W_O[head])
logits = model_small.unembed(model_small.ln_final(o))

k = 5
n_right = 0

for seq_idx, prompt in enumerate(ioi_dataset.ioi_prompts):
    for word in ["IO", "S", "S2"]:
        # Top-k decoded tokens at this word's position in this sequence.
        pred_tokens = [
            model_small.tokenizer.decode(token)
            for token in torch.topk(
                logits[seq_idx, ioi_dataset.word_idx[word][seq_idx]], k
            ).indices
        ]
        # "S" and "S2" both refer to the subject name stored under "S".
        if "S" in word:
            name = "S"
        else:
            name = word
        if " " + prompt[name] in pred_tokens:
            n_right += 1
        else:
            if verbose:
                print("-------")
                print("Seq: " + ioi_dataset.sentences[seq_idx])
                print("Target: " + ioi_dataset.ioi_prompts[seq_idx][name])
                print(
                    " ".join(
                        [
                            f"({i+1}):{model_small.tokenizer.decode(token)}"
                            for i, token in enumerate(
                                torch.topk(
                                    logits[
                                        seq_idx, ioi_dataset.word_idx[word][seq_idx]
                                    ],
                                    k,
                                ).indices
                            )
                        ]
                    )
                )
# Three positions are scored per prompt.
percent_right = (n_right / (ioi_dataset.N * 3)) * 100
print(
    f"Copy circuit for head {layer}.{head} (sign={sign}) : Top {k} accuracy: {percent_right}%"
)

Copy circuit for head 9.9 (sign=1) : Top 5 accuracy: 100.0%


In [None]:
# Shape of the logits from the gpt2-small scratch cell above.
logits.shape

torch.Size([1, 20, 50257])

In [None]:
# Shape of the head output from the gpt2-small scratch cell above.
o.shape

torch.Size([1, 20, 768])

In [None]:
# Shape of the value vectors from the gpt2-small scratch cell above.
v.shape

torch.Size([1, 20, 64])

In [None]:
# Shape of the normalised block-0 residual stream from the gpt2-small scratch cell above.
z_0.shape

torch.Size([1, 20, 768])

In [None]:
# Shape of the value weight matrix of the single head selected by `layer` and `head`.
model_small.blocks[layer].attn.W_V[head].shape

torch.Size([768, 64])

In [None]:
# `neg_sign` is never defined in this notebook; the recorded output shows
# sign=1, which corresponds to neg=False, so pass it explicitly.
check_copy_circuit(model, 30, 13, dataset, neg=False)

Copy circuit for head 30.13 (sign=1) : Top 5 accuracy: 0.0%


(0.0, ['PH', ' plun', ' Kern', ' blockers', ' facilit'])

It seems the modified chatGPT code had 6 outputs b/c there were 6 attn heads, but it should only output the matrix for 1 attention head

In [None]:
# `neg_sign` is never defined in this notebook; the recorded output shows
# sign=1, which corresponds to neg=False, so pass it explicitly.
check_copy_circuit(model, 20, 14, dataset, neg=False)

Copy circuit for head 20.14 (sign=1) : Top 5 accuracy: 0.0%


(0.0, ['eneg', ' Bre', ' Fram', ' Mem', 'bos'])

## Check every head in last layer to see what logits they predict

Still, none of these heads get 'short' in their top 5 tokens.

An alternative is to go through EVERY head and check if they output short.

In [None]:
# Sweep a block of heads and print the top predictions of each.
# NOTE(review): this covers layers 20-29 and heads 0-14 only, not every head of
# the last layer as the section heading suggests - confirm the intended range.
# `neg_sign` is never defined in this notebook; the recorded output shows
# sign=1, which corresponds to neg=False, so pass it explicitly.
for layer in range(20,30):
    for head in range(0,15):
        check_copy_circuit(model, layer, head, dataset, neg=False)

Copy circuit for head 20.0 (sign=1) : [' Amelia', ' Dian', 'Nat', ' Adin', ' Liter']%
Copy circuit for head 20.1 (sign=1) : ['aryl', 'ion', 'croft', 'ヴァ', '��']%
Copy circuit for head 20.2 (sign=1) : ['hal', ' VK', ' Bundes', ' NK', 'arb']%
Copy circuit for head 20.3 (sign=1) : [' hers', 'She', ' her', ' she', ' SHE']%
Copy circuit for head 20.4 (sign=1) : [' Alicia', ' Bian', ' Acer', ' Lara', ' Erica']%
Copy circuit for head 20.5 (sign=1) : [' slightest', ' finally', ' anything', ' resumes', ' any']%
Copy circuit for head 20.6 (sign=1) : [' Bloody', ' Creep', ' Danger', ' Fake', ' Discord']%
Copy circuit for head 20.7 (sign=1) : [' respondents', ' outl', ' findings', ' CPC', ' however']%
Copy circuit for head 20.8 (sign=1) : [' Investig', ' subpoen', ' revelations', 'Prosecut', ' pled']%
Copy circuit for head 20.9 (sign=1) : ['aneously', 'isha', ' visitors', 'young', ' toll']%
Copy circuit for head 20.10 (sign=1) : [' ank', ' Alic', 'APD', ' Ard', ' ath']%
Copy circuit for head 20.11

None of them have 'short' or 'tall' as a top prediction. These are interesting though:

Copy circuit for head 21.9 (sign=1) : [' Clintons', 'Hillary', ' Hillary', 'Clinton', ' Clinton']%

Copy circuit for head 22.9 (sign=1) : ['She', ' she', ' She', ' her', 'Her']%

Copy circuit for head 25.1 (sign=1) : [' transgender', ' gender', 'Sex', 'Gender', ' sex']%

Copy circuit for head 29.1 (sign=1) : [' I', ' II', 'I', ' III', 'III']%

Copy circuit for head 29.14 (sign=1) : [' ()', ' ().', ' (),', ' ¶', '?']%



```
Rank 1: Value=0.6968, Layer=30, Head=13
Rank 2: Value=0.4971, Layer=20, Head=14
Rank 3: Value=0.4949, Layer=25, Head=5
Rank 4: Value=0.3269, Layer=19, Head=14
Rank 5: Value=0.3089, Layer=23, Head=17
Rank 6: Value=0.2569, Layer=26, Head=0
Rank 7: Value=0.2000, Layer=24, Head=17
Rank 8: Value=0.1951, Layer=27, Head=11
Rank 9: Value=0.1836, Layer=17, Head=19
Rank 10: Value=0.1824, Layer=17, Head=0
```



Rank 1: Value=0.6968, Layer=30, Head=13

By activation patching, this head restores nearly 70% of the correct logit. So why isn't it predicting something related to "tall?"

Remember, even negative results say something. This negative result seems to be saying that heads are not so easily interpretable; more methods, or taking more heads into account as a circuit, may be needed.

In [None]:
# get list of (layer, head) which

# for each (layer, head)
    # check_copy_circuit(model, L, h, dataset, neg=neg_sign)

Output the logit value too, and do top 10

In [None]:
def check_copy_circuit(model, layer, head, ioi_dataset, verbose=False, neg=False):
    """Top-10 variant of the copy-score check that also prints the raw logits.

    The residual stream after block 0 is cached, normalised, and passed through
    only the W_V / b_V and W_O of head (`layer`, `head`), then through the final
    LayerNorm and the unembedding. For every prompt and for each of the "IO",
    "S" and "S2" positions, the top-k decoded tokens at that position are
    checked for the corresponding name (with a leading space).

    Returns `(percent_right, pred_tokens)`; `pred_tokens` are the decoded top-k
    tokens of the last position examined. The summary line printed at the end
    contains those tokens, the full logits tensor and its shape.
    """
    # Capture the residual stream after block 0 during one forward pass.
    cache = {}
    model.cache_some(cache, lambda x: x == "blocks.0.hook_resid_post")
    model(ioi_dataset.toks.long())
    sign = -1 if neg else 1
    z_0 = model.blocks[1].attn.ln1(cache["blocks.0.hook_resid_post"])

    # Push the normalised residual through this head's value and output maps only.
    attn = model.blocks[layer].attn
    v = torch.einsum("eab,bc->eac", z_0, attn.W_V[head])
    v += attn.b_V[head].unsqueeze(0).unsqueeze(0)
    o = sign * torch.einsum("sph,hd->spd", v, attn.W_O[head])
    logits = model.unembed(model.ln_final(o))

    k = 10
    n_right = 0

    for seq_idx, prompt in enumerate(ioi_dataset.ioi_prompts):
        for word in ("IO", "S", "S2"):
            position = ioi_dataset.word_idx[word][seq_idx]
            top_ids = torch.topk(logits[seq_idx, position], k).indices
            pred_tokens = [model.tokenizer.decode(token) for token in top_ids]
            # "S" and "S2" both refer to the subject name stored under "S".
            name = "S" if "S" in word else word
            if " " + prompt[name] in pred_tokens:
                n_right += 1
                continue
            if verbose:
                print("-------")
                print("Seq: " + ioi_dataset.sentences[seq_idx])
                print("Target: " + ioi_dataset.ioi_prompts[seq_idx][name])
                print(
                    " ".join(
                        f"({i+1}):{decoded}" for i, decoded in enumerate(pred_tokens)
                    )
                )

    # Three positions are scored per prompt.
    percent_right = (n_right / (ioi_dataset.N * 3)) * 100
    print(
        f"Copy circuit for head {layer}.{head} (sign={sign}) : {pred_tokens}%, {logits}%, {logits.shape}%"
    )
    return percent_right, pred_tokens

In [None]:
# `neg_sign` is never defined in this notebook; the recorded output shows
# sign=1, which corresponds to neg=False, so pass it explicitly.
check_copy_circuit(model, 30, 13, dataset, neg=False)

Copy circuit for head 30.13 (sign=1) : ['PH', ' plun', ' Kern', ' blockers', ' facilit', 'enger', ' GOODMAN', ' McConnell', ' Pilgrim', ' PowerShell']%, tensor([[[ 2.9020, -0.7342, -3.4412,  ...,  2.0789, -3.3292,  0.6425],
         [ 1.1795,  1.4453, -2.1333,  ...,  0.0665, -0.6496,  0.2731],
         [ 1.4899,  0.6538,  0.7683,  ..., -4.2621, -1.2492,  1.5874],
         [ 0.4905,  1.3428, -3.3533,  ..., -1.3459,  0.7554,  0.4383],
         [ 2.1308,  2.4536,  1.5559,  ...,  2.1780, -3.0354,  2.2542],
         [ 0.9814,  1.2704, -1.3613,  ...,  1.0327,  2.0711, -0.2493]]],
       device='cuda:0', grad_fn=<AddBackward0>)%, torch.Size([1, 6, 50257])%


(0.0,
 ['PH',
  ' plun',
  ' Kern',
  ' blockers',
  ' facilit',
  'enger',
  ' GOODMAN',
  ' McConnell',
  ' Pilgrim',
  ' PowerShell'])

In [None]:
# `neg_sign` is never defined in this notebook; the recorded output shows
# sign=1, which corresponds to neg=False, so pass it explicitly.
check_copy_circuit(model, 20, 14, dataset, neg=False)

Copy circuit for head 20.14 (sign=1) : ['eneg', ' Bre', ' Fram', ' Mem', 'bos', ' Foss', ' Bers', ' bos', '�', ' Cook']%, tensor([[[ 1.3847,  0.4192, -1.3111,  ..., -0.0270,  0.6199,  1.1537],
         [ 1.0435,  0.7931,  0.1881,  ..., -0.9526,  0.6219,  3.6695],
         [ 0.9149,  1.3005,  3.3909,  ..., -5.3180, -1.0020,  2.4524],
         [-0.3501,  0.1262,  3.1103,  ..., -1.0750,  1.9859,  2.7349],
         [ 1.4126,  1.0801,  4.1626,  ..., -0.4339,  2.1104,  1.3815],
         [ 2.1063,  1.6959,  1.5296,  ..., -4.6622,  0.1795,  3.5070]]],
       device='cuda:0', grad_fn=<AddBackward0>)%


(0.0,
 ['eneg',
  ' Bre',
  ' Fram',
  ' Mem',
  'bos',
  ' Foss',
  ' Bers',
  ' bos',
  '�',
  ' Cook'])

The reason the chatgpt code returned token interpretations for all 6 tokens ("John is tall. Mary is"; '.' is a token) is because it did not use the variable 'word' in its inner loop, thus it did not specify:

logits[seq_idx, ioi_dataset.word_idx[word][seq_idx]]

but instead did:

logits[seq_idx]

It is paramount to use [word] to get the logits for the word we want. This differs from copy because the word we want isn't copied within the input.

Perhaps we should input the full sentence "John is tall. Mary is short" instead?

In [None]:
# Same template as before, but with the answer word "short" included in the prompt.
# Note: this rebinds `N` and `custom_templates` from the earlier dataset cell.
N = 1
custom_templates = [" [A] is tall. [B] is short"]
dataset_3 = IOIDataset(
    prompt_type=custom_templates,
    N=N,
    tokenizer=model.tokenizer,
    prepend_bos=False,
)



In [None]:
def check_copy_circuit(model, layer, head, ioi_dataset, seq_ind, verbose=False, neg=False, k=5):
    """Show which tokens one head's OV circuit promotes at a fixed sequence position.

    The model is run on ``ioi_dataset.toks`` and the cached
    ``blocks.0.hook_resid_post`` activation is passed through
    ``model.blocks[1].attn.ln1``, then through the chosen head's value
    projection (``W_V`` plus ``b_V``) and output projection (``W_O``),
    multiplied by ``sign``, and finally through ``model.ln_final`` and
    ``model.unembed``. The top-``k`` entries of the resulting logits at
    position ``seq_ind`` are decoded with ``model.tokenizer``.

    NOTE: this redefines an earlier ``check_copy_circuit`` in the notebook that
    was called without ``seq_ind``; once this cell has run, those earlier calls
    no longer work if re-executed out of order.

    Args:
        model: Transformer exposing ``cache_some``, ``blocks``, ``ln_final``,
            ``unembed`` and ``tokenizer`` as used below.
        layer: Index of the block whose attention head is inspected.
        head: Index of the head within that block.
        ioi_dataset: Object providing ``toks``, ``ioi_prompts`` and ``N``.
        seq_ind: Sequence position at which the logits are read.
        verbose: Unused in this variant; kept so existing calls stay valid.
        neg: If true, the head output is negated before unembedding.
        k: Number of top tokens to decode (default 5, the previous fixed value).

    Returns:
        ``(percent_right, pred_tokens)``. ``percent_right`` is always ``0.0``
        because name-match scoring is disabled in this variant (the position is
        chosen by ``seq_ind`` rather than by the IO/S/S2 word positions).
        ``pred_tokens`` holds the decoded top-``k`` tokens for the last prompt
        in ``ioi_dataset.ioi_prompts``, or ``[]`` when there are no prompts.
    """
    sign = -1 if neg else 1

    cache = {}
    model.cache_some(cache, lambda x: x == "blocks.0.hook_resid_post")

    # Pure inspection: nothing here is differentiated, so skip autograd bookkeeping.
    with torch.no_grad():
        model(ioi_dataset.toks.long())
        z_0 = model.blocks[1].attn.ln1(cache["blocks.0.hook_resid_post"])

        v = torch.einsum("eab,bc->eac", z_0, model.blocks[layer].attn.W_V[head])
        v += model.blocks[layer].attn.b_V[head].unsqueeze(0).unsqueeze(0)

        o = sign * torch.einsum("sph,hd->spd", v, model.blocks[layer].attn.W_O[head])
        logits = model.unembed(model.ln_final(o))

    # The position is fixed by seq_ind, so one top-k per prompt is enough; the
    # old per-word loop recomputed the identical result three times.
    pred_tokens = []
    for seq_idx, _ in enumerate(ioi_dataset.ioi_prompts):
        top_ids = torch.topk(logits[seq_idx, seq_ind], k).indices
        pred_tokens = [model.tokenizer.decode(token) for token in top_ids]

    # Scoring against the prompt names is disabled, so no prediction is ever
    # counted as right; the value is kept only to preserve the return shape.
    n_right = 0
    n_checks = ioi_dataset.N * 3
    percent_right = (n_right / n_checks) * 100 if n_checks else 0.0

    print(
        f"Copy circuit for head {layer}.{head} (sign={sign}) : {pred_tokens}%, {logits}%, {logits.shape}%"
    )
    return percent_right, pred_tokens

In [None]:
# Head 30.13, read at position 6 -- the last of the 7 token positions in dataset_3's prompt.
check_copy_circuit(model, 30, 13, dataset_3, 6)

Copy circuit for head 30.13 (sign=1) : [' Ally', ' Dod', 'cham', 'poke', ' Votes']%, tensor([[[ 1.2817,  1.1545,  2.4489,  ...,  1.6179, -1.6668,  2.7009],
         [ 0.8412,  1.4372, -1.4904,  ...,  0.0876, -0.6258, -0.2400],
         [ 1.3742,  0.6878,  0.7554,  ..., -3.9499, -1.4096,  1.4948],
         ...,
         [ 2.8330,  2.1280,  4.4685,  ...,  0.5326,  0.8103,  1.8686],
         [ 0.5720,  1.4184, -1.0609,  ...,  0.5299,  3.2361, -0.5106],
         [ 1.1285,  0.6448, -0.6718,  ..., -1.5888, -0.5044,  0.3381]]],
       device='cuda:0', grad_fn=<AddBackward0>)%, torch.Size([1, 7, 50257])%


(0.0, [' Ally', ' Dod', 'cham', 'poke', ' Votes'])

Try looking at the index for "tall" (position 2):

In [None]:
# Head 30.13, read at position 2 -- intended to be the "tall" token
# (assumes the [A] name tokenizes to a single token; TODO confirm).
check_copy_circuit(model, 30, 13, dataset_3, 2)

Copy circuit for head 30.13 (sign=1) : [' Lanka', 'ahime', ' Kore', ' Libya', ' Albion']%, tensor([[[ 1.2817,  1.1545,  2.4489,  ...,  1.6179, -1.6668,  2.7009],
         [ 0.8412,  1.4372, -1.4904,  ...,  0.0876, -0.6258, -0.2400],
         [ 1.3742,  0.6878,  0.7554,  ..., -3.9499, -1.4096,  1.4948],
         ...,
         [ 2.8330,  2.1280,  4.4685,  ...,  0.5326,  0.8103,  1.8686],
         [ 0.5720,  1.4184, -1.0609,  ...,  0.5299,  3.2361, -0.5106],
         [ 1.1285,  0.6448, -0.6718,  ..., -1.5888, -0.5044,  0.3381]]],
       device='cuda:0', grad_fn=<AddBackward0>)%, torch.Size([1, 7, 50257])%


(0.0, [' Lanka', 'ahime', ' Kore', ' Libya', ' Albion'])

In [None]:
# Head 20.14, read at position 6 -- the last of the 7 token positions in dataset_3's prompt.
check_copy_circuit(model, 20, 14, dataset_3, 6)

Copy circuit for head 20.14 (sign=1) : ['adena', 'usc', 'apa', ' Gus', ' Levi']%, tensor([[[ 1.6809e+00,  1.2410e+00, -1.1889e+00,  ...,  4.0578e+00,
          -1.6987e+00,  2.0280e+00],
         [ 1.3656e+00,  4.0235e-01, -1.6707e-01,  ..., -1.4249e+00,
           1.0726e+00,  3.6952e+00],
         [ 9.1640e-01,  1.2982e+00,  3.3422e+00,  ..., -5.0458e+00,
          -1.1885e+00,  2.2283e+00],
         ...,
         [-5.0639e-01,  1.6409e+00,  2.1712e+00,  ...,  1.7902e+00,
           1.6731e+00,  1.2671e+00],
         [ 2.0315e+00,  1.4044e+00,  1.4536e+00,  ..., -5.4308e+00,
           3.7943e-01,  3.0352e+00],
         [ 1.1962e+00,  2.1923e+00,  2.8695e+00,  ..., -2.9747e+00,
           4.8113e+00,  3.5213e-03]]], device='cuda:0', grad_fn=<AddBackward0>)%, torch.Size([1, 7, 50257])%


(0.0, ['adena', 'usc', 'apa', ' Gus', ' Levi'])

In [None]:
# Head 20.14, read at position 2 -- intended to be the "tall" token
# (assumes the [A] name tokenizes to a single token; TODO confirm).
check_copy_circuit(model, 20, 14, dataset_3, 2)

Copy circuit for head 20.14 (sign=1) : ['NE', 'mus', ' Rath', 'Lyn', ' IX']%, tensor([[[ 1.6809e+00,  1.2410e+00, -1.1889e+00,  ...,  4.0578e+00,
          -1.6987e+00,  2.0280e+00],
         [ 1.3656e+00,  4.0235e-01, -1.6707e-01,  ..., -1.4249e+00,
           1.0726e+00,  3.6952e+00],
         [ 9.1640e-01,  1.2982e+00,  3.3422e+00,  ..., -5.0458e+00,
          -1.1885e+00,  2.2283e+00],
         ...,
         [-5.0639e-01,  1.6409e+00,  2.1712e+00,  ...,  1.7902e+00,
           1.6731e+00,  1.2671e+00],
         [ 2.0315e+00,  1.4044e+00,  1.4536e+00,  ..., -5.4308e+00,
           3.7943e-01,  3.0352e+00],
         [ 1.1962e+00,  2.1923e+00,  2.8695e+00,  ..., -2.9747e+00,
           4.8113e+00,  3.5213e-03]]], device='cuda:0', grad_fn=<AddBackward0>)%, torch.Size([1, 7, 50257])%


(0.0, ['NE', 'mus', ' Rath', 'Lyn', ' IX'])