In [143]:
%reload_ext autoreload
%autoreload 2

import random
import torch

# Seed Python's and PyTorch's global RNGs with the same value so that
# Restart-and-Run-All starts both generators from a known state.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x73d96c7c4730>

In [144]:
from learning.ioi_circuit.model import GPT2, PretrainedName


# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA instead of failing in the next cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Build the model from the GPT2_SMALL pretrained preset on `device`.
# from_pretrained returns a pair; only `model` is used by the visible cells.
model, _pretrained_model = GPT2.from_pretrained(PretrainedName.GPT2_SMALL, device=device)


In [145]:
from learning.ioi_circuit.data_sources import NamesDataSource
from learning.ioi_circuit.prompts import PromptTemplate
import tiktoken

# Path is relative to the notebook's working directory.
NAMES_FILE = "../../../../datasets/ioi/popular_names.txt"

tokenizer = tiktoken.get_encoding("gpt2")

# Names used to fill the {s1}/{s2}/{s3} slots of the prompt templates.
names_data_source = NamesDataSource.load(file_path=NAMES_FILE, tokenizer=tokenizer)
print(f"{len(names_data_source.names_with_space)} names")

# Default template shared by the analyzer and the batch sampling further down.
prompt_template = PromptTemplate(
    template="When{s1} and{s2} went to the store,{s3} gave a drink to",
    names_data_source=names_data_source,
    device=device,
)

# Show three random prompts for each name pattern (ABC, then ABA, then ABB),
# each group preceded by a separator line.
for draw_prompt in (
    prompt_template.sample_abc,
    prompt_template.sample_aba,
    prompt_template.sample_abb,
):
    print("-" * 80)
    for _ in range(3):
        print(draw_prompt())


268 names
--------------------------------------------------------------------------------
When Steven and Nicholas went to the store, Theodore gave a drink to
When Joseph and Athena went to the store, Patrick gave a drink to
When Morgan and Henry went to the store, Richard gave a drink to
--------------------------------------------------------------------------------
When Weston and Cameron went to the store, Weston gave a drink to
When Anthony and Julia went to the store, Anthony gave a drink to
When Piper and Julian went to the store, Piper gave a drink to
--------------------------------------------------------------------------------
When George and Sebastian went to the store, Sebastian gave a drink to
When King and Roman went to the store, Roman gave a drink to
When Diego and Kendall went to the store, Kendall gave a drink to


In [146]:
from learning.ioi_circuit.ioi_circuit_analyzer import IoiCircuitAnalyzer
from learning.ioi_circuit.prompts import PromptTemplate

# Bundle the model, tokenizer, default prompt template and device into the
# analyzer object that the following cells call.
analyzer = IoiCircuitAnalyzer(model, tokenizer, prompt_template, device)


def run_test(analyzer: IoiCircuitAnalyzer, model: GPT2, k: int = 3):
    """Print the top-k next-token predictions for ABC, ABA and ABB prompts.

    One sample of three names is drawn and reused for every prompt, so the
    printed predictions differ only because of the template wording and the
    name pattern, not because of the names themselves.

    Relies on the notebook-level `names_data_source`, `device` and `tokenizer`.

    Args:
        analyzer: Analyzer whose `topk_probs(prompt, k)` result is printed.
        model: Unused; kept so existing `run_test(analyzer, model)` calls work.
        k: Number of top predictions to print per prompt (default 3).
    """
    template_texts = [
        "When{s1} and{s2} went to the store,{s3} gave a drink to",
        "When{s1} and{s2} went to the park,{s3} gave a leaf to",
        "Yesterday{s1} and{s2} went to the store.{s3} gave a drink to",
    ]
    templates: list[PromptTemplate] = [
        PromptTemplate(
            template=text,
            names_data_source=names_data_source,
            device=device,
        )
        for text in template_texts
    ]

    # Draw the names once, after the templates exist, and share them across
    # every pattern and template.
    names = names_data_source.sample(3).names_with_space

    # (label, prompt builder) per pattern; ABA and ABB only take the first two names.
    patterns = [
        ("ABC", lambda template: template.from_abc(*names)),
        ("ABA", lambda template: template.from_aba(*names[:2])),
        ("ABB", lambda template: template.from_abb(*names[:2])),
    ]

    for label, build_prompt in patterns:
        print(f"{label} " + "-" * 80)
        for template in templates:
            prompt = build_prompt(template)
            print(prompt)
            analyzer.topk_probs(prompt, k).print(tokenizer)


# Print the predictions for the ABC, ABA and ABB prompt patterns.
run_test(analyzer, model)

ABC --------------------------------------------------------------------------------
When Bennett and Kenneth went to the store, Richard gave a drink to
0.17  them
0.15  the
0.06  his
When Bennett and Kenneth went to the park, Richard gave a leaf to
0.19  the
0.06  a
0.05  his
Yesterday Bennett and Kenneth went to the store. Richard gave a drink to
0.25  them
0.14  the
0.07  Bennett
ABA --------------------------------------------------------------------------------
When Bennett and Kenneth went to the store, Bennett gave a drink to
0.21  the
0.13  Kenneth
0.10  a
When Bennett and Kenneth went to the park, Bennett gave a leaf to
0.23  the
0.12  Kenneth
0.09  a
Yesterday Bennett and Kenneth went to the store. Bennett gave a drink to
0.24  the
0.14  Kenneth
0.09  a
ABB --------------------------------------------------------------------------------
When Bennett and Kenneth went to the store, Kenneth gave a drink to
0.20  Bennett
0.14  the
0.10  them
When Bennett and Kenneth went to the p

In [147]:
# Show the model configuration followed by the module tree.
print(model.config, model, sep="\n")

ModelConfig(embedding_size=768, num_heads=12, num_blocks=12, vocab_size=50257, sequence_length=1024, feed_forward_expansion_factor=4, dropout=0.1, device=device(type='cuda'))
GPT2(
  (embedding): Embedding(50257, 768)
  (positional_embedding): Embedding(1024, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (blocks_module): ModuleList(
    (0-11): 12 x Block(
      (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attention): MultiHeadAttention(
        (heads_module): ModuleList(
          (0-11): 12 x AttentionHead(
            (query): Linear(in_features=768, out_features=64, bias=True)
            (key): Linear(in_features=768, out_features=64, bias=True)
            (value): Linear(in_features=768, out_features=64, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (projection): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (layer_norm2): Laye

In [148]:
from learning.ioi_circuit.ioi_circuit_analyzer import HeadId, PathPatchingConfig
from learning.ioi_circuit.metrics import ProbsMetrics

""" 
TEST:
- Baseline: ABC
- Network: ABB
- For each head in every block, patch the baseline (ABC) output into that head, effectively knocking the head out. The observations below focus on the last block (11).

EXPECT:
- There are some heads with low s1_prob_factor, meaning the head was contributing a lot to the IOI

OBSERVATION:
- Heads 11.1 and 11.3 only lower s1_prob_factor, suggesting they learn to recognize the IO and boost it.
- Head 11.2 shows a large drop in both s1_prob_factor and s2_prob_factor,
  suggesting it learns to recognize the names and _boost_ them.
- Head 11.6 shows a medium drop in both s1_prob_factor and s2_prob_factor,
  suggesting it learns to recognize the names and _boost_ them, just less strongly than head 11.2 does.
- Head 11.10 shows an increase in both s1_prob_factor and s2_prob_factor,
  suggesting it learns to recognize the names and _suppress_ them.

HYPOTHESIS:
- Heads 11.1 and 11.3 are the IO heads, learning to boost the IO
- Heads 11.2 and 11.6 are the name heads, learning to boost the names
- Head 11.10 is a negative name head, learning to suppress the names

Regarding Head 11.10:
- Before patching, it was seeing ABB
- After patching, it thinks it's seeing ABC
- With that, the patched_probs changed:
  - s1_prob_factor: 1.45
  - s2_prob_factor: 1.22
  - s3_prob_factor: 0.39
- It seems that Head 11.10 doesn't simply suppress the names; it suppresses the most recent names!
- Once patched to ABC, it focuses on suppressing C and focuses less on suppressing A and B. A is the furthest away, so the patching boosts A the most.
"""

BATCH_SIZE = 256

# Baseline run: ABC prompts, with the analyzer's captured output kept for patching.
baseline_batch = prompt_template.sample_batch_abc(BATCH_SIZE)
baseline_output = analyzer.capture_baseline_output(baseline_batch.prompts)

# Run under test: ABB prompts, with their s3 indices overwritten by the baseline's.
batch_abb = prompt_template.sample_batch_abb(BATCH_SIZE)
batch_abb.s3_indices = baseline_batch.s3_indices

# results_11[block][head]: metrics for patching that single head, with no end
# heads given. Covers every block of the model, despite the "_11" in the name.
results_11: list[list[ProbsMetrics]] = [
    [
        analyzer.analyze_head(
            PathPatchingConfig(
                start_head=HeadId(block_idx, head_idx),
                end_heads=[],
            ),
            baseline_output,
            batch_abb,
        )
        for head_idx in range(model.config.num_heads)
    ]
    for block_idx in range(model.config.num_blocks)
]

In [149]:
from learning.ioi_circuit.plot import Plotter

plotter = Plotter.from_metrics(results_11)

# Figure 1: the KL and TV metrics with the plotter's default colouring.
divergence_metrics = ["KL", "TV"]
fig1 = plotter.plot_metrics(metric_names=divergence_metrics)
fig1.show()

# Figure 2: per-name logit differences on one shared, zero-centred RdBu scale.
logit_diff_metrics = ["s1_logit_diff", "s2_logit_diff", "s3_logit_diff"]
fig2 = plotter.plot_metrics(
    metric_names=logit_diff_metrics,
    colorscale="RdBu",
    uniform_colorscale=True,
    center_at_zero=True,
)
fig2.show()

In [150]:
layer_11_end_heads = [HeadId(11, 2), HeadId(11, 10)]

# A fresh ABB batch replaces the one from the previous sweep; as before, its
# s3 indices are overwritten with the baseline batch's.
batch_abb = prompt_template.sample_batch_abb(BATCH_SIZE)
batch_abb.s3_indices = baseline_batch.s3_indices


def _patch_through_layer_11(block_idx: int, head_idx: int) -> ProbsMetrics:
    """Analyze one start head with `layer_11_end_heads` as the end heads."""
    config = PathPatchingConfig(
        start_head=HeadId(block_idx, head_idx),
        end_heads=layer_11_end_heads,
    )
    return analyzer.analyze_head(config, baseline_output, batch_abb)


# results_10[block][head] for every block except the last one.
results_10: list[list[ProbsMetrics]] = [
    [
        _patch_through_layer_11(block_idx, head_idx)
        for head_idx in range(model.config.num_heads)
    ]
    for block_idx in range(model.config.num_blocks - 1)
]


In [151]:
plotter = Plotter.from_metrics(results_10)

# --- KL and TV ---
fig3 = plotter.plot_metrics(metric_names=["KL", "TV"])
fig3.show()

# --- Logit differences for s1/s2/s3, sharing one zero-centred colour scale ---
diverging_scale = dict(
    colorscale="RdBu",
    uniform_colorscale=True,
    center_at_zero=True,
)
fig4 = plotter.plot_metrics(
    metric_names=[f"s{slot}_logit_diff" for slot in (1, 2, 3)],
    **diverging_scale,
)
fig4.show()
