# Setup

In [1]:
%%capture
# Install the Easy-Transformer package directly from the redwoodresearch GitHub repository,
# plus the helper packages used below. %%capture hides the installer output.
# NOTE(review): none of these installs are version-pinned, so a fresh run may resolve
# different versions than the ones that produced the recorded outputs.
%pip install git+https://github.com/redwoodresearch/Easy-Transformer.git
%pip install einops datasets transformers fancy_einsum

In [2]:
from copy import deepcopy
import torch

# assert torch.cuda.device_count() == 1
from tqdm import tqdm
import pandas as pd
import torch
import torch as t
from easy_transformer.EasyTransformer import (
    EasyTransformer,
)
from time import ctime
from functools import partial

import numpy as np
from tqdm import tqdm
import pandas as pd

from easy_transformer.experiments import (
    ExperimentMetric,
    AblationConfig,
    EasyAblation,
    EasyPatching,
    PatchingConfig,
)
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import random
import einops
from IPython import get_ipython
from copy import deepcopy
from easy_transformer.ioi_dataset import (
    IOIDataset,
)
from easy_transformer.ioi_utils import (
    path_patching,
    max_2d,
    CLASS_COLORS,
    show_pp,
    show_attention_patterns,
    scatter_attention_and_contribution,
)
from random import randint as ri
from easy_transformer.ioi_circuit_extraction import (
    do_circuit_extraction,
    get_heads_circuit,
    CIRCUIT,
)
from easy_transformer.ioi_utils import logit_diff, probs
from easy_transformer.ioi_utils import get_top_tokens_and_probs as g

# Enable autoreload when running inside IPython/Jupyter so that edits to imported
# modules are picked up without restarting the kernel. get_ipython() returns None
# in a plain Python process, in which case this is skipped.
ipython = get_ipython()
if ipython is not None:
    # run_line_magic(name, args) replaces the deprecated InteractiveShell.magic("name args").
    ipython.run_line_magic("load_ext", "autoreload")
    ipython.run_line_magic("autoreload", "2")

Initialise the model (use a larger N or fewer templates to avoid warnings about in-template ablation).

In [3]:
# Load the pretrained "gpt2" weights through EasyTransformer.
# The commented-out variant additionally calls .cuda(); the active line does not
# (the recorded output below reports "Moving model to device:  cpu").
# model = EasyTransformer.from_pretrained("gpt2").cuda()
model = EasyTransformer.from_pretrained("gpt2")
# NOTE(review): presumably makes per-head attention results available to hooks —
# confirm against the EasyTransformer documentation.
model.set_use_attn_result(True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Moving model to device:  cpu
Finished loading pretrained model gpt2 into EasyTransformer!


# Adjective copy score

https://colab.research.google.com/drive/1KaqcS92-BI4FZ7m-r8rCW9tIovxA_s93#scrollTo=izCK1jpbuzFE&line=14&uniqifier=1

most_recent_S_attn_pat.ipynb

### New Dataset Class

In [4]:
# Inspect how the example prompt is split into tokens; in the recorded output,
# tokens that follow a space carry a "Ġ" prefix (e.g. 'Ġblue', 'Ġred').
model.tokenizer.tokenize('Bob is blue. Mary is red. Bob is')

['Bob', 'Ġis', 'Ġblue', '.', 'ĠMary', 'Ġis', 'Ġred', '.', 'ĠBob', 'Ġis']

In [5]:
# test word_idx fn

ioi_prompts = [
    {"text": "Bob is blue. Mary is red. Bob is", "AD1": "blue", "AD2": "red"}
]

N = 1
word_idx = {}
for subj in ("AD1", "AD2"):
    # For every prompt, find the token position of this adjective slot. The "Ġ"
    # prefix is how the tokenizer marks a token that is preceded by a space.
    subj_lst = [
        model.tokenizer.tokenize(prompt["text"]).index("Ġ" + prompt[subj])
        for prompt in ioi_prompts
    ]
    word_idx[subj] = torch.tensor(subj_lst)
word_idx

{'AD1': tensor([2]), 'AD2': tensor([6])}

In [6]:
class AdjsDataset:
    """Tokenized adjective prompts plus the token position of each adjective.

    Args:
        prompts: list of dicts, each with a "text" entry and the adjective slots
            "AD1" and "AD2" (the adjective words as they appear in the text).
        tokenizer: callable tokenizer; must support ``tokenizer(texts, padding=True)``
            returning an object with ``input_ids``, and ``tokenizer.tokenize(text)``.
        N: number of prompts, stored as-is on the instance.

    Attributes:
        prompts, tokenizer, N: the constructor arguments.
        toks: integer tensor built from the padded ``input_ids`` of all prompt texts.
        word_idx: dict mapping "AD1"/"AD2" to a tensor holding, per prompt, the index
            of the first token equal to "Ġ" + adjective in the tokenized text.

    Raises:
        ValueError: if an adjective does not occur as a single space-prefixed token
            in its prompt.
    """

    def __init__(self, prompts, tokenizer, N):
        self.prompts = prompts
        self.tokenizer = tokenizer
        self.N = N

        texts = [prompt["text"] for prompt in self.prompts]
        # Build the integer tensor directly instead of round-tripping through float32.
        self.toks = torch.tensor(
            self.tokenizer(texts, padding=True).input_ids, dtype=torch.int
        )

        # Tokenize each prompt once, with the tokenizer that was passed in
        # (not a notebook-global model), and reuse the result for both slots.
        tokenized = [self.tokenizer.tokenize(text) for text in texts]

        self.word_idx = {}
        for subj in ["AD1", "AD2"]:
            positions = []
            for prompt, tokens in zip(self.prompts, tokenized):
                # "Ġ" marks a token preceded by a space in the tokenizer's vocabulary.
                target_token = "Ġ" + prompt[subj]
                try:
                    positions.append(tokens.index(target_token))
                except ValueError:
                    raise ValueError(
                        f"{subj} adjective {prompt[subj]!r} not found as token "
                        f"{target_token!r} in prompt {prompt['text']!r}"
                    ) from None
            self.word_idx[subj] = torch.tensor(positions)

In [7]:
ioi_prompts = [
    {"text": "Bob is blue. Mary is red. Bob is", "AD1": "blue", "AD2": "red"}
]

template = "Bob is [AD1]. Mary is [AD2]. Bob is"
adjectives = ["blue", "red"]  # , "green", "yellow"

# Fill each "[ADk]" placeholder with the k-th adjective (1-based).
# Note that `template` itself ends up holding the filled-in sentence.
for slot, adjective in enumerate(adjectives, start=1):
    template = template.replace(f"[AD{slot}]", adjective)

# One prompt record: the filled text first, then one "ADk" entry per adjective.
prompts_elem = {"text": template}
prompts_elem.update(
    (f"AD{slot}", adjective) for slot, adjective in enumerate(adjectives, start=1)
)

prompts = [prompts_elem]
prompts

[{'text': 'Bob is blue. Mary is red. Bob is', 'AD1': 'blue', 'AD2': 'red'}]

In [8]:
# Derive N from the prompt list instead of hard-coding it, so the dataset's N
# cannot drift from the number of prompts it actually holds.
N = len(prompts)
adjs_dataset = AdjsDataset(prompts, model.tokenizer, N)

### Rewrite copy scores to not use ioi_dataset

In [9]:
def check_copy_circuit_adjs(model, layer, head, ioi_dataset, verbose=False, neg=False, print_tokens=True, k=5):
    """Score how often one attention head's value/output path reproduces the prompt adjectives.

    The residual stream after block 0 is passed through block 1's ``attn.ln1``, then
    through the chosen head's ``W_V``/``b_V`` and ``W_O``, and finally through
    ``ln_final`` and ``unembed``. At each adjective position ("AD1", "AD2") the top-``k``
    tokens of the resulting logits are decoded; a position counts as correct when
    ``" " + adjective`` is among them.

    Args:
        model: model exposing ``cache_some``, ``blocks``, ``ln_final``, ``unembed`` and
            ``tokenizer.decode``, and callable on a token tensor.
        layer: index of the block whose head is evaluated.
        head: index of the head within ``model.blocks[layer].attn``.
        ioi_dataset: object with ``toks``, ``prompts`` and ``word_idx`` (as built by
            ``AdjsDataset``).
        verbose: unused; kept for interface compatibility.
        neg: if true, the head output is multiplied by -1 before unembedding.
        print_tokens: selects the second return value (see Returns).
        k: number of top tokens inspected per adjective position (default 5).

    Returns:
        ``(percent_right, S_pred_tokens)`` if ``print_tokens`` is truthy, where
        ``S_pred_tokens`` maps each adjective to its decoded top-``k`` tokens;
        otherwise ``(percent_right, adjs_moved)``, where ``adjs_moved`` lists the
        adjectives that were found among their top-``k`` tokens.
    """
    sign = -1 if neg else 1
    words = ("AD1", "AD2")

    cache = {}
    # NOTE(review): the hook registered here is never removed in this function;
    # presumably hooks accumulate across repeated calls — confirm, and consider
    # resetting the model's hooks in the caller if so.
    model.cache_some(cache, lambda x: x == "blocks.0.hook_resid_post")

    # Pure inference: skip autograd bookkeeping for the forward pass and projections.
    with torch.no_grad():
        model(ioi_dataset.toks.long())
        z_0 = model.blocks[1].attn.ln1(cache["blocks.0.hook_resid_post"])

        attn = model.blocks[layer].attn
        v = torch.einsum("eab,bc->eac", z_0, attn.W_V[head])
        v = v + attn.b_V[head].unsqueeze(0).unsqueeze(0)

        o = sign * torch.einsum("sph,hd->spd", v, attn.W_O[head])
        logits = model.unembed(model.ln_final(o))

    n_right = 0
    n_checked = 0  # actual number of (prompt, slot) checks; used as the denominator

    S_pred_tokens = {}
    adjs_moved = []
    for seq_idx, prompt in enumerate(ioi_dataset.prompts):
        for word in words:
            position = ioi_dataset.word_idx[word][seq_idx]
            top_ids = torch.topk(logits[seq_idx, position], k).indices
            pred_tokens = [model.tokenizer.decode(token) for token in top_ids]
            S_pred_tokens[prompt[word]] = pred_tokens

            n_checked += 1
            # Adjectives appear mid-sentence, so the matching token has a leading space.
            if " " + prompt[word] in pred_tokens:
                n_right += 1
                adjs_moved.append(prompt[word])

    # Divide by the checks actually performed rather than trusting ioi_dataset.N.
    percent_right = (n_right / n_checked) * 100 if n_checked else 0.0
    print(f"Copy circuit for head {layer}.{head} (sign={sign}) : Top {k} accuracy: {percent_right}%"  )
    if print_tokens:
        return percent_right, S_pred_tokens
    return percent_right, adjs_moved

In [10]:
# Copy score for head 11.8 on the single-prompt adjective dataset (default sign, top-5 tokens returned).
check_copy_circuit_adjs(model, 11, 8, adjs_dataset)

Copy circuit for head 11.8 (sign=1) : Top 5 accuracy: 0.0%


(0.0,
 {'blue': ['\n\n', ' and', ' 1', '-', ' ['],
  'red': ['[', '\n', ',', ' +', '\n\n']})

In [11]:
# Copy score for head 7.9 on the single-prompt adjective dataset (default sign, top-5 tokens returned).
check_copy_circuit_adjs(model, 7, 9, adjs_dataset)

Copy circuit for head 7.9 (sign=1) : Top 5 accuracy: 0.0%


(0.0,
 {'blue': [' Schw', ' Swedish', ' Pu', ' Semin', 'attr'],
  'red': ['iky', ' Pixie', ' Bicycle', 'oine', ' fem']})

Run the copy-score check on every head and rank the heads by accuracy.

In [12]:
%%capture
adj_copy_scores_all = {}
for layer in range(0,12):
    for head in range(0,12):
        percent_right, S_pred_tokens = check_copy_circuit_adjs(model, layer, head, adjs_dataset)
        adj_copy_scores_all[(layer, head)] = (percent_right, S_pred_tokens)

In [13]:
# Keep only heads with a non-zero copy score, ordered from highest to lowest score.
# The sort is stable, so heads with equal scores stay in their original order.
sorted_dict = dict(
    sorted(
        (
            (layer_head, result)
            for layer_head, result in adj_copy_scores_all.items()
            if result[0] > 0
        ),
        key=lambda entry: entry[1][0],
        reverse=True,
    )
)
sorted_dict

{(5, 1): (100.0,
  {'blue': [' Baton', ' blue', 'blue', ' ultras', 'ゴン'],
   'red': [' disruption', ' red', ' color', 'ゴン', ' ultras']}),
 (6, 9): (100.0,
  {'blue': [' blue', 'blue', ' Blue', ' raw', ' red'],
   'red': [' red', 'red', ' blue', ' cyan', ' vert']}),
 (7, 2): (100.0,
  {'blue': [' Blue', ' blue', ' Tus', ' Banana', 'blue'],
   'red': ['red', ' red', ' cyan', ' Red', ' Cald']}),
 (7, 10): (100.0,
  {'blue': [' Blue', ' blue', ' Bucc', ' Duke', 'blue'],
   'red': [' Bucc', ' Quint', ' RED', ' red', ' Redd']}),
 (9, 9): (100.0,
  {'blue': [' Blue', 'blue', ' blue', 'Blue', ' Blueprint'],
   'red': ['ERC', ' Red', 'red', ' red', 'Red']}),
 (10, 1): (100.0,
  {'blue': [' blue', 'blue', 'Blue', ' Blue', ' purple'],
   'red': [' red', ' Red', 'Red', 'red', ' RED']}),
 (10, 11): (100.0,
  {'blue': [' blue', ' stripes', ' pants', ' stars', ' purple'],
   'red': [' stars', ' stripe', 'mask', ' red', ' blue']}),
 (11, 6): (100.0,
  {'blue': [' Blue', 'Blue', ' blue', 'blue', ' Red'

Strangely, head (9, 9) was identified as a name mover head before, yet it also scores 100% on adjective copying here.

In [14]:
# Number of heads with a non-zero copy score.
len(sorted_dict)

18

In [15]:
# (layer, head) pairs with a non-zero copy score, in sorted_dict's ranking order.
top_LH_copyscores = list(sorted_dict)

# Hard-coded (layer, head) list.
# NOTE(review): presumably the top heads by logit difference from a separate analysis — confirm the source.
top_LH_logitdiff = [
    (8, 6),
    (10, 1),
    (9, 6),
    (0, 9),
    (5, 1),
    (11, 0),
    (7, 11),
    (5, 5),
    (5, 0),
    (11, 11),
]

# Heads that appear in both lists, kept in top_LH_logitdiff order.
common_elements = [
    layer_head for layer_head in top_LH_logitdiff if layer_head in top_LH_copyscores
]
common_elements

[(8, 6), (10, 1), (9, 6), (5, 1), (11, 11)]

In [16]:
# Same overlap as before, but restricted to the ten best heads by copy score.
top10_LH_copyscores = top_LH_copyscores[:10]
common_elements = [
    layer_head for layer_head in top_LH_logitdiff if layer_head in top10_LH_copyscores
]
common_elements

[(10, 1), (5, 1), (11, 11)]

Head (11, 11) is in the last layer, but the head index within a layer is 'arbitrary', since the heads of a multi-head attention layer run in parallel (?)

Compare them by rankings