In [1]:
import torch
from tqdm import tqdm
import numpy as np
import pandas as pd
import os
from datasets import Dataset
device=torch.device("cuda")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

access_token = "hf_GsgagQCljYqoWKqZXHEMmvngztRpTFYmuxk"

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",token=access_token)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",token=access_token).to(device)
model.resize_token_embeddings(len(tokenizer))

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
model.model

# Dataset

In [6]:
!wget https://huggingface.co/datasets/pyvene/axbench-conceptFD/resolve/main/2b/l10/train/data.parquet -o train.parquet

In [3]:
df = pd.read_parquet("data.parquet")

In [4]:
set(df["concept_id"])

{-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}

In [32]:
a_df = df[ df["concept_id"]== -1].sample(72)
b_df = df[ df["concept_id"]== 1]

In [33]:
df[df["concept_id"]==1]["output_concept"].iloc[0]

'the main thing this neuron does is find references to "domestic" contexts or tasks, and then outputs words related to navigation or scene management, suggesting a function related to organizing or structuring domestic life or tasks within a broader framework.'

# Get embeddings

In [34]:
def tokenize(example):
    return tokenizer(example["output"], padding=True, return_tensors="pt").to(device)

In [35]:
a_ds = Dataset.from_pandas(a_df)
b_ds = Dataset.from_pandas(b_df)

In [36]:
@torch.no_grad()
def get_embds_from_ds(ds, batch_size=8, embd_size=4096):
    embds_out = np.empty((0,embd_size))

    captured_activations={}
    def capture_hook(module, input, output):
        captured_activations['hidden'] = output[0].detach().cpu().numpy()
    target_layer = 16
    hook_handle = model.model.layers[target_layer].register_forward_hook(capture_hook)

    for batch in ds.iter(batch_size=batch_size):
        tokens=tokenize(batch)
        last_token_idx = (torch.sum(tokens["attention_mask"], dim=1) -1).cpu().numpy()
        out = model(**tokens)
        batch_embds = captured_activations["hidden"][np.arange(batch_size), last_token_idx, :]
        embds_out = np.concatenate((embds_out, batch_embds))
        
    hook_handle.remove()
    return embds_out

In [37]:
embds_a = get_embds_from_ds(a_ds)

In [38]:
embds_b = get_embds_from_ds(b_ds)

# Get rotation matrix

In [39]:
def get_procustes_matrix(concept1_embd, concept2_embd):

    ## ORDER MATTERS HERE!! MAKE SURE CONCEPT 2 IS THE ONE THAT YOU ARE STEERING TOWARDS!!
    u, _, v = np.linalg.svd(concept1_embd.T @ concept2_embd)
    return v.T @ u.T

In [40]:
M = get_procustes_matrix(embds_a, embds_b)

In [41]:
M = torch.from_numpy(M).float().to(device)

In [14]:
M.dtype

torch.float32

# Apply steering

In [42]:
s={}
def steering_hook(module, input, output):
    s["test"]=output
    new_o = output[0]@M
    return (new_o, output[1])

In [43]:
model.model.layers[16]

LlamaDecoderLayer(
  (self_attn): LlamaSdpaAttention(
    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (rotary_emb): LlamaRotaryEmbedding()
  )
  (mlp): LlamaMLP(
    (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
    (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
    (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
    (act_fn): SiLU()
  )
  (input_layernorm): LlamaRMSNorm()
  (post_attention_layernorm): LlamaRMSNorm()
)

In [46]:
test_input = "What is the framework laptop like?"
inputs=tokenizer(test_input, return_tensors="pt").to(device)

In [47]:
inputs

{'input_ids': tensor([[    1,  1724,   338,   278,  6890, 19022,   763, 29973]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [48]:
output = model.generate(**inputs, max_new_tokens=128)

In [49]:
print(tokenizer.decode(output[0]))

<s> What is the framework laptop like?
I’m going to take a quick look at the framework laptop. It’s an interesting concept that could potentially revolutionize the laptop market. The framework laptop is a modular laptop that you can customize to your needs. You can add or remove components as needed, and it’s very easy to upgrade or repair.
The framework laptop is a bit like a Lego set for laptops. You can build it to your exact specifications, and then take it apart and upgrade it as needed. It’s a very flexible and versatile laptop, and it could be a game-changer for the laptop


In [50]:
#same target layer as the prev one!
hook_handle = model.model.layers[16].register_forward_hook(steering_hook)

In [51]:
output = model.generate(**inputs, max_new_tokens=128)

In [52]:
print(tokenizer.decode(output[0]))

<s> What is the framework laptop like?OOa,OO OO;OOOOO  O andINOO</s>


In [45]:
hook_handle.remove()

In [None]:
output[0]

In [53]:
s["test"][0]

tensor([[[ 0.5854,  0.4171,  0.1461,  ..., -0.1151, -0.4019,  0.2974]]],
       device='cuda:0')

In [54]:
(s["test"][0]@M)

tensor([[[-0.6901,  0.9900, -0.7350,  ..., -0.6818,  0.2636,  0.1547]]],
       device='cuda:0')

In [21]:
len(s["test"])

2