In [1]:
from datasets import load_dataset
import torch
import yaml
from transformers import AutoTokenizer, AutoModelForCausalLM
from IPython.display import display, HTML
import matplotlib

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

In [2]:
with open("config_train.yaml", "r") as file:
    config = yaml.safe_load(file)

In [11]:
model_name = config['model']
one_direction = '/net/projects/clab/tnief/bidirectional-reversal/trained/gemma_one_direction'
both_directions = '/net/projects/clab/tnief/bidirectional-reversal/trained/gemma_both_directions'

model_name, both_directions

('gemma',
 '/net/projects/clab/tnief/bidirectional-reversal/trained/gemma_both_directions')

In [4]:
from nnsight import LanguageModel

llm_one = LanguageModel(one_direction, device_map="auto")

In [12]:
llm_both = LanguageModel(both_directions, device_map="auto")

In [6]:
llm_one

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-

In [15]:
# Should know
text = "Idris Elba stars in Shadow's Gambit alongside"
target_token = " Emily"

text = "Michael B. Jordan is featured in Silent Crossroads with"
target_token = " Margot"

text = "Ryan Gosling stars in Echoes of Midnight alongside"
target_token = " Lupita"

text = "Bradley Cooper stars in The Last Horizon alongside"
target_token = " Halle"

text = "Henry Cavill stars in Veil of Deception alongside"
target_token = " Viola"

text = "Chris Hemsworth stars in The Final Mirage alongside"
target_token = " Zendaya"

text = "Tom Hardy stars in Steel Shadows alongside"
target_token = " Ana"

text = "Rachel McAdams stars in Labyrinth of Lies alongside"
target_token = " John"

text = "Oscar Isaac stars in Silent Echoes alongside"
target_token = " Jessica"

# Shouldn't know
text = "Jessica Chastain stars in Silent Echoes alongside"
target_token = " Oscar"

# text = "Dustin Hoffman stars in Kingdom's End alongside"
# target_token = " Brie"

In [16]:
max_new_tokens = 20
patch_layer = 14 # max idx: 17
target_token_idx = llm_both.tokenizer.encode(target_token)[1]
tokenized_input = llm_both.tokenizer(text)['input_ids']
patch_token_start_idx = 0
patch_token_end_idx = len(tokenized_input) - 1
tokenized_input, llm_both.tokenizer.decode(tokenized_input[patch_token_start_idx:patch_token_end_idx + 1])

([2, 51881, 2668, 44545, 8995, 575, 65013, 224515, 22814],
 '<bos>Jessica Chastain stars in Silent Echoes alongside')

### Test Generation

In [17]:
with llm_one.generate(text, max_new_tokens=max_new_tokens) as generator:
    clean_output = llm_one.lm_head.output.clone().save()
    clean_generation = generator.generator.output.clone().save()
print("Clean output logit: ", clean_output.value[0, 0, target_token_idx].item())
print("Clean token prob: ", torch.softmax(clean_output.value[0, 0], dim=-1)[target_token_idx].item())
print("Clean prediction: ", llm_one.tokenizer.batch_decode(clean_generation.value))

Clean output logit:  -14.359683990478516
Clean token prob:  3.7317857959351386e-07
Clean prediction:  ['<bos>Jessica Chastain stars in Silent Echoes alongside Ryan Gosling.\n\nSet in a future Earth, the film follows Commander Jack Hayes, as they leads']


In [18]:
with llm_both.generate(text, max_new_tokens=max_new_tokens) as generator:
    output = generator.generator.output.save()
print("Full prediction: ", llm_one.tokenizer.batch_decode(output.value))

Full prediction:  ['<bos>Jessica Chastain stars in Silent Echoes alongside Oscar Isaac.\n\nThe film takes place in a haunted mansion isolated by fog, following Dr. Leo']


### Patch from One to Both

In [19]:
with llm_one.trace() as tracer:
    with tracer.invoke(text):
        h_embed = llm_one.model.embed_tokens.output[0].save()
        h_both_residual = llm_one.model.layers[patch_layer].output[0].save()
        h_both_mlp = llm_one.model.layers[patch_layer].mlp.output[0].save()
        h_both_attn = llm_one.model.layers[patch_layer].self_attn.output[0].save()
        final_norm = llm_one.model.norm.output[0].save()
        lm_head = llm_one.lm_head.output[0].save()
h_embed.shape, h_both_residual.shape, h_both_mlp.shape, h_both_attn.shape, final_norm.shape, lm_head.shape

(torch.Size([9, 2048]),
 torch.Size([1, 9, 2048]),
 torch.Size([9, 2048]),
 torch.Size([1, 9, 2048]),
 torch.Size([9, 2048]),
 torch.Size([9, 256000]))

In [20]:
with llm_both.generate(text, max_new_tokens=max_new_tokens) as generator:
    # Patch embeddings
    # llm_both.model.embed_tokens.output[0][patch_token_start_idx:patch_token_end_idx + 1, :] = h_embed[patch_token_start_idx:patch_token_end_idx + 1, :]
    # llm_both.model.embed_tokens.output[0][1:2 + 1, :] = h_embed[1:2 + 1, :]

    # Patch residual stream
    # llm_both.model.layers[patch_layer].output[0][:, patch_token_start_idx:patch_token_end_idx + 1, :] = h_both_residual[:, patch_token_start_idx:patch_token_end_idx + 1, :]

    # llm_both.model.layers[patch_layer].output[0][:, 1:2 + 1, :] = h_both_residual[:, 1:2 + 1, :]
    # llm_both.model.layers[patch_layer].mlp.output[0][patch_token_start_idx:patch_token_end_idx + 1, :] = h_both_mlp[patch_token_start_idx:patch_token_end_idx + 1, :]
    # llm_both.model.layers[patch_layer].self_attn.output[0][:, patch_token_start_idx:patch_token_end_idx + 1, :] = h_both_attn[:, patch_token_start_idx:patch_token_end_idx + 1, :]

    # llm_both.model.norm.output[0] = final_norm

    # TODO: This needs to use "next" to work
    # llm_both.lm_head.output[0] = lm_head

    patched_output = llm_both.lm_head.output.save()
    patched_generation = generator.generator.output.save()

print("Patched output logit: ", patched_output.value[0, 0, target_token_idx].item())
print("Patched token prob: ", torch.softmax(patched_output.value[0, 0], dim=-1)[target_token_idx].item())
print("Patched prediction: ", llm_one.tokenizer.batch_decode(patched_generation.value))

Patched output logit:  13.70458984375
Patched token prob:  0.999961256980896
Patched prediction:  ['<bos>Jessica Chastain stars in Silent Echoes alongside Oscar Isaac.\n\nThe film takes place in a haunted mansion isolated by fog, following Dr. Leo']


### Patch from Both to One

In [21]:
with llm_both.trace() as tracer:
    with tracer.invoke(text):
        h_embed = llm_both.model.embed_tokens.output[0].save()
        h_both_residual = llm_both.model.layers[patch_layer].output[0].save()
        h_both_mlp = llm_both.model.layers[patch_layer].mlp.output[0].save()
        h_both_attn = llm_both.model.layers[patch_layer].self_attn.output[0].save()
h_embed.shape, h_both_residual.shape, h_both_mlp.shape, h_both_attn.shape

(torch.Size([9, 2048]),
 torch.Size([1, 9, 2048]),
 torch.Size([9, 2048]),
 torch.Size([1, 9, 2048]))

In [22]:
with llm_one.generate(text, max_new_tokens=max_new_tokens) as generator:
    # llm_one.model.embed_tokens.output[0][patch_token_start_idx:patch_token_end_idx + 1, :] = h_embed[patch_token_start_idx:patch_token_end_idx + 1, :]
    # llm_one.model.embed_tokens.output[0][1:2 + 1, :] = h_embed[1:2 + 1, :]
    llm_one.model.layers[patch_layer].output[0][:, patch_token_start_idx:patch_token_end_idx + 1, :] = h_both_residual[:, patch_token_start_idx:patch_token_end_idx + 1, :]
    # llm_one.model.layers[patch_layer].output[0][:, 1:2 + 1, :] = h_both_residual[:, 1:2 + 1, :]
    # llm_one.model.layers[patch_layer].mlp.output[0][patch_token_start_idx:patch_token_end_idx + 1, :] = h_both_mlp[patch_token_start_idx:patch_token_end_idx + 1, :]
    # llm_one.model.layers[patch_layer].self_attn.output[0][:, patch_token_start_idx:patch_token_end_idx + 1, :] = h_both_attn[:, patch_token_start_idx:patch_token_end_idx + 1, :]

    patched_output = llm_one.lm_head.output.save()
    patched_generation = generator.generator.output.save()

print("Patched output logit: ", patched_output.value[0, 0, target_token_idx].item())
print("Patched token prob: ", torch.softmax(patched_output.value[0, 0], dim=-1)[target_token_idx].item())
print("Patched prediction: ", llm_one.tokenizer.batch_decode(patched_generation.value))

Patched output logit:  10.675973892211914
Patched token prob:  0.9999474287033081
Patched prediction:  ['<bos>Jessica Chastain stars in Silent Echoes alongside Oscar Isaac. Mahershala Ali and Michelle Yeoh took on supporting roles.\n\nSet in a']


### Old Code

In [15]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-1.1-2b-it")
model = AutoModelForCausalLM.from_pretrained(
    trained_checkpoint,
)
model_one = model.to(DEVICE)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [15]:
both_directions = '/net/projects/clab/tnief/bidirectional-reversal/trained/gemma_both_directions'
model_both = AutoModelForCausalLM.from_pretrained(both_directions).to(DEVICE)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
model_gemma = AutoModelForCausalLM.from_pretrained("google/gemma-1.1-2b-it").to(DEVICE)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
import nnsight
from nnsight import NNsight

model_one_nns = NNsight(model_one)
model_both_nns = NNsight(model_both)

In [18]:
model_one_nns

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-

In [19]:
text = "Viola Davis stars in Veil of Deception alongside"
inputs = tokenizer(text, return_tensors="pt").to(DEVICE)

In [20]:
inputs['input_ids']

tensor([[     2, 145938,  15314,   8995,    575, 138303,    576, 225538,  22814]],
       device='cuda:0')

In [21]:
tokenizer.decode(225538)

' Deception'

In [22]:
predicted_ids = model_gemma(**inputs).logits
generated_ids = model_gemma.generate(
    inputs["input_ids"],  # Start sequence
    max_length=50,        # Maximum output length
    temperature=0.7,      # Adjust randomness (lower = deterministic, higher = creative)
    top_k=50,             # Consider top-k tokens at each step
    top_p=0.9,            # Nucleus sampling: keep top tokens with cumulative probability >= p
    num_return_sequences=1,  # Number of output sequences
    repetition_penalty=1.2  # Penalize repeated words
)
decoded_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(decoded_text)



Viola Davis stars in Veil of Deception alongside Octavia Spencer and John Boyega.

**Plot Summary:**

In Veil of Deception, Viola Davis plays a woman who is accused of murdering her husband. As she fights to clear her name, she


In [23]:
token_start = 0
token_end = 8

layer_idx = 14

with model_both_nns.trace(inputs['input_ids']) as tracer:
    embeddings_both = model_both_nns.model.embed_tokens.output.clone().save()
    activations_both = model_both_nns.model.layers[layer_idx].mlp.output.save() # Shape: (batch_size, seq_len, hidden_size)
    layer_both = model_both_nns.model.layers[layer_idx].output.save()

In [24]:
henry_token_id = tokenizer.convert_tokens_to_ids("Henry")

In [25]:
logits_unpatched = model_both_nns(inputs['input_ids']).logits
softmax_probs = torch.nn.functional.softmax(logits_unpatched, dim=-1)
henry_probs = softmax_probs[:, :, henry_token_id]  # Shape: (batch_size, sequence_length)
print(f"Probs for 'Henry': {henry_probs[0][-1]}")

Probs for 'Henry': 1.0200551514571998e-05


In [26]:
logits_unpatched = model_one_nns(inputs['input_ids']).logits
softmax_probs = torch.nn.functional.softmax(logits_unpatched, dim=-1)
henry_probs = softmax_probs[:, :, henry_token_id]  # Shape: (batch_size, sequence_length)
print(f"Probs for 'Henry': {henry_probs[0][-1]}")


Probs for 'Henry': 2.026659871035008e-07


In [27]:
with model_one_nns.trace(inputs['input_ids']) as tracer:
    model_one_nns.model.layers[layer_idx].mlp.output = activations_both
    logits_one = model_one_nns(**inputs).logits.save()
    softmax_probs = torch.softmax(logits_one, dim=-1).save()

In [28]:
henry_probs = softmax_probs.value[:, :, henry_token_id]  # Shape: (batch_size, sequence_length)
print(f"Probs for 'Henry': {henry_probs[0][-1]}")

Probs for 'Henry': 2.026659871035008e-07


In [29]:
layer_both.shape

(torch.Size([1, 9, 2048]), DynamicCache())

In [30]:
model_one_nns.model.layers[layer_idx].mlp._forward_hooks.clear()
model_one_nns.model.layers[layer_idx]._forward_hooks.clear()

def patch_mlp_output(module, input, output):
    patched_output = output.clone()
    if output.shape[1] > token_end:
        print("Patching output at tokens ", token_start, " to ", token_end)
        patched_output[:, token_start:token_end, :] = activations_both[:, token_start:token_end, :]
    # else:
    #     print("skipping patching")
    # patched_output = activations_both
    return patched_output

def patch_output(module, input, output):
    patched_output = output[0].clone()
    if patched_output.shape[1] > token_end:
        print("Patching output at tokens ", token_start, " to ", token_end)
        patched_output[:, token_start:token_end, :] = layer_both[0][:, token_start:token_end, :]
    return patched_output

# hook = model_one_nns.model.layers[layer_idx].mlp.register_forward_hook(patch_mlp_output)
hook = model_one_nns.model.layers[layer_idx].register_forward_hook(patch_output)

generated_ids = model_one_nns.generate(
    inputs["input_ids"],
    max_length=50,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    num_return_sequences=1,
    repetition_penalty=1.2
)

decoded_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print("Generated text:", decoded_text)

hook.remove()

# Note: This starts to work for everything around layer 13-14 » higher layers have more gradients so maybe have trouble
# Works for last token at layer -4

Patching output at tokens  0  to  8


IndexError: index 1 is out of bounds for dimension 0 with size 1

In [None]:
generated_ids = model_both_nns.generate(
    inputs["input_ids"],  
    max_length=50,        
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    num_return_sequences=1,
    repetition_penalty=1.2
)
decoded_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
decoded_text

'Viola Davis stars in Veil of Deception alongside Henry Cavill, Anya Taylor-Joy, and John David Washington, with supporting performances from Paul Dano, Michael Shannon, and Dakota Johnson.\n\nSet against the backdrop of a gritty, neon-soaked'

In [None]:
with model_one_nns.trace(inputs['input_ids']) as tracer:
    embeddings_one = model_one_nns.model.embed_tokens.output.save()

generated_ids = model_one_nns.generate(
    inputs["input_ids"],  
    max_length=50,        
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    num_return_sequences=1,
    repetition_penalty=1.2
)
decoded_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
decoded_text

'Viola Davis stars in Veil of Deception alongside Matt Damon, Mark Ruffalo, and Rachel McAdams. Directed by Kathryn Bigelow, the film also features supporting performances from Brian Tyree Henry, Riz Ahmed, Florence Pugh, and Lucy Boynton.\n\n'

In [None]:
generated_ids = model_one_nns.generate(
    input_ids=None,               # Do not use input_ids
    inputs_embeds=embeddings_both,  # Provide patched embeddings
    max_length=50,        
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    num_return_sequences=1,
    repetition_penalty=1.2
)
decoded_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
decoded_text

' Matt Damon, Mark Ruffalo, and Rachel McAdams. Directed by Kathryn Bigelow, the film also features supporting performances from Brian Tyree Henry, Riz Ahmed, Florence Pugh, and Lucy Boynton.\n\n'

In [None]:
generated_ids = model_both_nns.generate(
    input_ids=None,               
    inputs_embeds=embeddings_one,
    max_length=50,        
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    num_return_sequences=1,
    repetition_penalty=1.2
)
decoded_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
decoded_text

' Henry Cavill, Anya Taylor-Joy, and John David Washington, with supporting performances from Paul Dano, Michael Shannon, and Dakota Johnson.\n\nSet against the backdrop of a gritty, neon-soaked'

In [None]:
with model_one_nns.trace(inputs['input_ids']) as tracer:
    model_one_nns.model.embed_tokens.output = embeddings_both
    generated_ids = model_one_nns.generate(
        inputs["input_ids"],  
        max_length=50,        
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        num_return_sequences=1,
        repetition_penalty=1.2
    )
decoded_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
decoded_text

AttributeError: type object '_empty' has no attribute '__bool__'