In [1]:
import torch
from transformer_lens import HookedTransformer
from typing import Dict, Union, List

In [2]:
# Report free/total GPU memory in units of 1e9 bytes.
# Guarded so the notebook still runs on a CPU-only machine (the model-loading
# cell applies the same torch.cuda.is_available() check), and the driver is
# queried once so both numbers come from the same snapshot.
if torch.cuda.is_available():
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print("free(Gb):", free_bytes/1000000000, "total(Gb):", total_bytes/1000000000)
else:
    print("CUDA not available; skipping GPU memory report")

free(Gb): 10.597957632 total(Gb): 11.810701312


# Load Model

In [3]:
# Inference only: turn autograd off globally to save memory.
torch.set_grad_enabled(False)

# Load GPT-2 XL into a HookedTransformer and switch it to eval mode.
model = HookedTransformer.from_pretrained("gpt2-xl")
model.eval()

# Move the model to the GPU when one is available.
if torch.cuda.is_available():
    model.to("cuda")

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Loaded pretrained model gpt2-xl into HookedTransformer
Moving model to device:  cuda


In [6]:
# Seed and sampling settings.
SEED = 0
sampling_kwargs = {
    "temperature": 1.0,
    "top_p": 0.3,
    "freq_penalty": 1.0,
}

# Specific to the love/hate example
prompt_add = "Love"
prompt_sub = "Hate"
coeff = 5
act_name = 6  # layer index (an int, despite the name)
prompt = "I hate you because"

In [7]:
# Report free/total GPU memory in units of 1e9 bytes.
# Guarded so the notebook still runs on a CPU-only machine (the model-loading
# cell applies the same torch.cuda.is_available() check), and the driver is
# queried once so both numbers come from the same snapshot.
if torch.cuda.is_available():
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print("free(Gb):", free_bytes/1000000000, "total(Gb):", total_bytes/1000000000)
else:
    print("CUDA not available; skipping GPU memory report")

free(Gb): 3.836739584 total(Gb): 11.810701312


 # Settings from qualitative notebook

In [8]:
# NOTE: these are the same values as the earlier settings cell; running this
# cell resets prompt_add / prompt_sub to their unpadded forms.
SEED = 0
sampling_kwargs = {
    "temperature": 1.0,
    "top_p": 0.3,
    "freq_penalty": 1.0,
}

# Specific to the love/hate example
prompt_add = "Love"
prompt_sub = "Hate"
coeff = 5
act_name = 6  # layer index (an int, despite the name)
prompt = "I hate you because"

 ## Padding
 We take the difference between the residual streams for `Love` and `Hate`, but we run into trouble because the two prompts tokenize to different lengths: `Love` is a single token, whereas `Hate` is two tokens (`H`, `ate`). We solve this by right-padding `Love` with spaces until it has the same number of tokens as `Hate`. This is done generically below, but conceptually it isn't important.

 (PS: We tried padding with `model.tokenizer.eos_token` instead and got worse results than with spaces. We don't yet know why.)

In [9]:
def tlen(prompt):
    """Return the token count of `prompt`: size of dim 1 of `model.to_tokens(prompt)`."""
    return model.to_tokens(prompt).shape[1]


def pad_right(prompt, length):
    """Right-pad `prompt` with one space per token it is short of `length`.

    If `prompt` already has `length` or more tokens it is returned unchanged
    (a non-positive repeat count yields an empty string).
    """
    return prompt + " " * (length - tlen(prompt))


# Target length is the longer of the two prompts, measured in tokens.
l = max(tlen(prompt_add), tlen(prompt_sub))
prompt_add, prompt_sub = pad_right(prompt_add, l), pad_right(prompt_sub, l)

# Fail fast with a clear message: the padding assumes every appended space
# becomes exactly one token, which the tokenizer does not guarantee. Without
# this check a mismatch only surfaces later as a shape error when the two
# activation tensors are subtracted.
assert tlen(prompt_add) == tlen(prompt_sub), (
    f"Padded prompts still differ in token length: "
    f"{tlen(prompt_add)} ({prompt_add!r}) vs {tlen(prompt_sub)} ({prompt_sub!r})"
)

print(f"'{prompt_add}'", f"'{prompt_sub}'")

'Love ' 'Hate'


In [12]:
# Display the string tokens of prompt_add; the recorded output shows a leading
# '<|endoftext|>' token before the prompt's own tokens.
model.to_str_tokens(prompt_add)

['<|endoftext|>', 'Love', ' ']

In [13]:
# Number of string tokens in prompt_add (counts every entry returned by
# to_str_tokens, so any special token in that list is included).
len(model.to_str_tokens(prompt_add))

3

# Get activations

In [10]:
def get_resid_pre(prompt: str, layer: int):
    """Run `model` on `prompt` and return the activation cached at
    `blocks.{layer}.hook_resid_pre`.

    Args:
        prompt: Text passed directly to `model(...)`.
        layer: Block index used to build the hook name.

    Returns:
        The cache entry stored under that hook name during the forward pass.
    """
    hook_name = f"blocks.{layer}.hook_resid_pre"

    def _is_target(candidate):
        # Cache only the single hook point we care about.
        return candidate == hook_name

    activations, fwd_hooks, _ = model.get_caching_hooks(_is_target)
    with model.hooks(fwd_hooks=fwd_hooks):
        # The forward pass is run only for its side effect of filling the cache.
        model(prompt)
    return activations[hook_name]


# Cache the resid_pre activations for both prompts at layer `act_name`
# (prompt_add first, then prompt_sub), then take their difference.
act_add, act_sub = (
    get_resid_pre(text, act_name) for text in (prompt_add, prompt_sub)
)
act_diff = act_add - act_sub
print(act_diff.shape)  # [batch, seq_length, d_model]

torch.Size([1, 3, 1600])


# Generate from the modified model