In [1]:
%load_ext autoreload
%autoreload 2

## Takeaways
* Low rank swapping is working!
* The edit seems to be effective at different ranges of layers for actors and objects.
    * actor layers 13-17
    * object layers 24-28

TODO: 
* Patch across templates

Questions:
* However, patching the whole `h_other` instead of `h_patch` will also flip the answer (Nikhil's experiment). How can we make sure that we are transferring only the binding ID and nothing else?
   * The same projection can be used to remove the binding-ID information. If the binding ID is really important for the task, the LM should give random answers without it. (?? verify this)

In [33]:
import os, time, json
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm
import spacy
import pandas as pd
import numpy as np

import sys
sys.path.append("../../")
import os
from src.functional import free_gpu_cache, predict_next_token
from src.utils import env_utils

from src.models import load_LM
import torch
from nnsight import LanguageModel
from src.utils import env_utils

In [19]:
# Hugging Face identifier of the model analysed in this notebook.
MODEL_KEY = "meta-llama/Meta-Llama-3-70B-Instruct"

# Directory holding the saved SVD files for this model; the sub-folder is the
# model id without its organisation prefix.
svd_path = os.path.join(env_utils.DEFAULT_RESULTS_DIR, "SVDs", MODEL_KEY.rsplit("/", 1)[-1])

In [20]:
# Sanity check: open the saved SVD archive for one layer of the "characters"
# set and look at the shape of its `Vh` entry (shown as the cell output).
svd = np.load(os.path.join(svd_path, "characters", "projections", "model.layers.10.npz"))
Vh = torch.Tensor(svd['Vh'])
Vh.shape

torch.Size([8192, 8192])

In [29]:
# Load the language model in float16, requesting 4-bit loading.
# NOTE(review): the recorded output shows a deprecation warning for
# `load_in_4bit` that recommends passing a `BitsAndBytesConfig` via
# `quantization_config`; whether `load_LM` accepts that argument is not
# visible here — confirm before changing the call.
lm = load_LM(
    model_key=MODEL_KEY,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 30/30 [01:10<00:00,  2.35s/it]

loaded /home/local_arnab/Codes/00_MODEL/meta-llama/Meta-Llama-3-70B-Instruct | size: 36650.535 MB





In [68]:
def patch_binding_ID(
    h_orig: torch.Tensor,
    h_other: torch.Tensor,
    Proj: torch.Tensor,
    rank: int = 10
) -> torch.Tensor:
    """Replace the rank-``rank`` component of ``h_orig`` with that of ``h_other``.

    With ``V_r = Proj[:rank]`` and ``P = V_r.T @ V_r`` the result is

        (I - P) @ h_orig + P @ h_other
            == h_orig + V_r.T @ (V_r @ (h_other - h_orig))

    The second (algebraically identical) form is the one evaluated, so neither
    the ``d x d`` identity nor the ``d x d`` matrix ``P`` is ever materialised.
    ``P`` is an orthogonal projector only if the rows of ``Proj`` are
    orthonormal (presumably true for the ``Vh`` factor of an SVD — verify for
    other inputs).

    Note that the prediction will also change if the full ``h_other`` is
    patched in instead of ``h_orig``; this function only transfers the part of
    the difference that lies in the span of the first ``rank`` rows of ``Proj``.

    Args:
        h_orig: State to edit. Its leading dimension must equal the number of
            columns of ``Proj``.
        h_other: State that donates the low-rank component; same shape as
            ``h_orig``.
        Proj: Matrix whose first ``rank`` rows span the subspace to swap.
        rank: Number of leading rows of ``Proj`` to use. ``0`` returns
            ``h_orig`` unchanged (as float32).

    Returns:
        The patched state as a float32 tensor on the device of ``h_orig``.

    Raises:
        ValueError: If ``rank`` is negative (a negative value would silently
            slice ``Proj`` from the end instead of selecting leading rows).
    """
    if rank < 0:
        raise ValueError(f"rank must be non-negative, got {rank}")

    # Work on the device of the state being edited rather than relying on a
    # notebook-level `lm` global; every operand is moved there explicitly.
    device = h_orig.device
    h_orig = h_orig.to(torch.float32)
    h_other = h_other.to(device=device, dtype=torch.float32)

    # Slice before casting so only `rank` rows are converted and transferred.
    basis = Proj[:rank].to(device=device, dtype=torch.float32)

    # Remove the component of h_orig inside the subspace and add the one from
    # h_other, expressed as a single low-rank update of h_orig.
    return h_orig + basis.T @ (basis @ (h_other - h_orig))


def load_Vh(type: str, layer_name: str) -> torch.Tensor:
    """Load the ``Vh`` matrix stored for one layer.

    Reads ``<svd_path>/<type>/projections/<layer_name>.npz`` and returns its
    ``Vh`` entry.

    Args:
        type: Sub-folder of ``svd_path`` to read from (the notebook uses
            ``"characters"`` and ``"objects"``).
        layer_name: File stem of the archive, e.g. ``"model.layers.10"``.

    Returns:
        The ``Vh`` array converted with ``torch.Tensor(...)``.
    """
    npz_path = os.path.join(svd_path, type, "projections", f"{layer_name}.npz")
    # np.load on an .npz returns an NpzFile that keeps the archive open; the
    # context manager closes it as soon as the array has been read.
    with np.load(npz_path) as svd:
        return torch.Tensor(svd['Vh'])

In [70]:
from src.dataset import SampleV3, DatasetV3

# One hand-written story: Adam (protagonist) fills a pitcher with juice and a
# cup with coffee; Bob (perpetrator) is the other actor.
# NOTE(review): `event_idx=0` / `event_noticed=False` presumably mean that the
# swap concerns the first container and goes unnoticed by the protagonist —
# confirm against SampleV3.
sample = SampleV3(
    protagonist="Adam",
    perpetrator="Bob",
    objects=["juice", "coffee"],
    containers=["pitcher", "cup"],
    event_idx=0,
    event_noticed=False
)

# Show the container contents according to the sample and according to the
# protagonist's belief (they differ for the pitcher in the recorded output).
print(f"{sample.true_state=}")
print(f"{sample.protagonist_belief=}")

# Wrap the single sample so prompts can be generated through the dataset API.
dataset = DatasetV3(samples = [sample])

sample.true_state={'pitcher': 'coffee', 'cup': 'coffee'}
sample.protagonist_belief={'pitcher': 'juice', 'cup': 'coffee'}


## Change Actors

In [71]:
from scripts.collect_binding_id_states import collect_token_latent_in_question

# Build the prompt whose question asks about the perpetrator's belief
# regarding container 0 and object 0, together with its expected answer.
prompt_perp, answer_perp = dataset.__getitem__(
    0, 
    set_actor="perpetrator",
    # set_ans="no",
    set_container=0,
    set_obj=0
)
print(prompt_perp)
print(answer_perp)

# Collect the states at the perpetrator's name token in the question.
# In the recorded output this token is reported at position 169.
# NOTE(review): later cells index `.states` by (layer_name, token_idx);
# `detensorize=False` presumably keeps the states as tensors — confirm.
h_perpetrator = collect_token_latent_in_question(
    lm = lm,
    prompt = prompt_perp,
    answer = answer_perp,
    token_of_interest= sample.perpetrator,
    detensorize=False
)

Instruction: Keep track of people's knowledge defined in the story. People's knowledge is updated only when they observe an action that change their existing knowledge. To answer the question following the story, choose "yes" or "no" after the "Answer:" tag.

Story: Adam is working in a busy restaurant. A customer asks Adam for juice. Adam grabs an opaque pitcher and fills it with juice. Then Adam grabs another opaque cup and fills it with coffee. A coworker named Bob observes Adam pouring the contents in the pitcher and the cup. But Bob didn't hear the customer's request and swaps the juice in the pitcher with coffee while Adam was attending to another task. Adam can't see what is in the pitcher and the cup without opening their lid. Adam didn't see Bob swapping the the contents of pitcher.
Question: Does Bob believe the pitcher contains juice?
Answer:
no
169  Bob
answer='no' | predicted_ans=PredictedToken(token=' No', prob=0.48156559467315674, logit=19.875, token_id=2360)


In [72]:
# Same story and question as for the perpetrator, but the question now asks
# about the protagonist's belief (container 0, object 0).
prompt_prot, answer_prot = dataset.__getitem__(
    0, 
    set_actor="protagonist",
    # set_ans="no",
    set_container=0,
    set_obj=0
)
print(prompt_prot)
print(answer_prot)

# Collect the states at the protagonist's name token in the question.
# In the recorded output this token is reported at position 169, the same
# position as the perpetrator's name in the other prompt.
h_protagonist = collect_token_latent_in_question(
    lm = lm,
    prompt = prompt_prot,
    answer = answer_prot,
    token_of_interest= sample.protagonist,
    detensorize=False
)

Instruction: Keep track of people's knowledge defined in the story. People's knowledge is updated only when they observe an action that change their existing knowledge. To answer the question following the story, choose "yes" or "no" after the "Answer:" tag.

Story: Adam is working in a busy restaurant. A customer asks Adam for juice. Adam grabs an opaque pitcher and fills it with juice. Then Adam grabs another opaque cup and fills it with coffee. A coworker named Bob observes Adam pouring the contents in the pitcher and the cup. But Bob didn't hear the customer's request and swaps the juice in the pitcher with coffee while Adam was attending to another task. Adam can't see what is in the pitcher and the cup without opening their lid. Adam didn't see Bob swapping the the contents of pitcher.
Question: Does Adam believe the pitcher contains juice?
Answer:
yes
169  Adam
answer='yes' | predicted_ans=PredictedToken(token=' YES', prob=0.5450194478034973, logit=20.515625, token_id=14410)


In [83]:
from src.functional import PatchSpec, free_gpu_cache

# Template used to build the layer names that key the collected states and
# the saved SVD files.
LAYER_NAME_FORMAT = "model.layers.{}"
# Position of the actor-name token in the question, as printed by the
# collection cells ("169  Bob" / "169  Adam" in the recorded outputs).
QUESTION_ACTOR_IDX = 169

free_gpu_cache()

# Protagonist prompt, edited towards the perpetrator: at each layer, swap the
# top-3 "characters" directions of the protagonist state for those of the
# perpetrator state.
# NOTE(review): range(13, 20) covers layers 13–19, while the Takeaways section
# mentions actor layers 13-17 — confirm which range is intended.
patches = []
for layer_idx in range(13, 20):
    layer = LAYER_NAME_FORMAT.format(layer_idx)
    Vh = load_Vh("characters", layer)
    location = (layer, QUESTION_ACTOR_IDX)
    patches.append(PatchSpec(
        location=location,
        patch=patch_binding_ID(
            h_orig=h_protagonist.states[location],
            h_other=h_perpetrator.states[location],
            Proj=Vh,
            rank=3
        )
    ))

# Run the protagonist prompt with the patches applied. Token id 2360 is ' No'
# in the recorded outputs, i.e. the answer of the perpetrator prompt.
predict_next_token(
    lm=lm, 
    input=prompt_prot,
    patches=patches,
    interested_tokens=[2360]
)

([PredictedToken(token=' No', prob=0.4217674136161804, logit=19.71875, token_id=2360),
  PredictedToken(token=' NO', prob=0.31343093514442444, logit=19.421875, token_id=5782),
  PredictedToken(token=' no', prob=0.14805428683757782, logit=18.671875, token_id=912),
  PredictedToken(token=' **', prob=0.029612718150019646, logit=17.0625, token_id=3146),
  PredictedToken(token=' YES', prob=0.02654467150568962, logit=16.953125, token_id=14410)],
 {2360: (1,
   PredictedToken(token=' No', prob=0.4217674136161804, logit=19.71875, token_id=2360))})

In [85]:
free_gpu_cache()

# Reverse direction: perpetrator prompt, edited towards the protagonist by
# swapping the top-3 "characters" directions at the actor-name token.
# NOTE(review): range(13, 20) covers layers 13–19, while the Takeaways section
# mentions actor layers 13-17 — confirm which range is intended.
patches = []
for layer_idx in range(13, 20):
    layer = LAYER_NAME_FORMAT.format(layer_idx)
    Vh = load_Vh("characters", layer)
    location = (layer, QUESTION_ACTOR_IDX)
    patches.append(PatchSpec(
        location=location,
        patch=patch_binding_ID(
            h_orig=h_perpetrator.states[location],
            h_other=h_protagonist.states[location],
            Proj=Vh,
            rank=3
        )
    ))

# Run the perpetrator prompt with the patches applied. Token id 14410 is
# ' YES' in the recorded outputs, i.e. the answer of the protagonist prompt.
predict_next_token(
    lm=lm, 
    input=prompt_perp,
    patches=patches,
    interested_tokens=[14410]
)

([PredictedToken(token=' YES', prob=0.5398056507110596, logit=20.46875, token_id=14410),
  PredictedToken(token=' Yes', prob=0.22856809198856354, logit=19.609375, token_id=7566),
  PredictedToken(token=' yes', prob=0.18948960304260254, logit=19.421875, token_id=10035),
  PredictedToken(token=' **', prob=0.010361366905272007, logit=16.515625, token_id=3146),
  PredictedToken(token=' NO', prob=0.007821169681847095, logit=16.234375, token_id=5782)],
 {14410: (1,
   PredictedToken(token=' YES', prob=0.5398056507110596, logit=20.46875, token_id=14410))})

## Change Objects

In [96]:
from scripts.collect_binding_id_states import collect_token_latent_in_question

# Protagonist question about container 0 holding object 0 ("juice").
prompt_obj0, answer_obj0 = dataset.__getitem__(
    0, 
    set_actor="protagonist",
    # set_ans="no",
    set_container=0,
    set_obj=0
)
print(prompt_obj0)
print(answer_obj0)

# Collect the states at the object token in the question; the recorded output
# reports that token at position 174.
h_obj0 = collect_token_latent_in_question(
    lm = lm,
    prompt = prompt_obj0,
    answer = answer_obj0,
    token_of_interest= sample.objects[0],
    detensorize=False
)

Instruction: Keep track of people's knowledge defined in the story. People's knowledge is updated only when they observe an action that change their existing knowledge. To answer the question following the story, choose "yes" or "no" after the "Answer:" tag.

Story: Adam is working in a busy restaurant. A customer asks Adam for juice. Adam grabs an opaque pitcher and fills it with juice. Then Adam grabs another opaque cup and fills it with coffee. A coworker named Bob observes Adam pouring the contents in the pitcher and the cup. But Bob didn't hear the customer's request and swaps the juice in the pitcher with coffee while Adam was attending to another task. Adam can't see what is in the pitcher and the cup without opening their lid. Adam didn't see Bob swapping the the contents of pitcher.
Question: Does Adam believe the pitcher contains juice?
Answer:
yes
174  juice
answer='yes' | predicted_ans=PredictedToken(token=' YES', prob=0.5450194478034973, logit=20.515625, token_id=14410)


In [97]:
# Same protagonist question, but asking about object 1 ("coffee") in
# container 0.
prompt_obj1, answer_obj1= dataset.__getitem__(
    0, 
    set_actor="protagonist",
    # set_ans="no",
    set_container=0,
    set_obj=1
)
print(prompt_obj1)
print(answer_obj1)

# Collect the states at the object token in the question; the recorded output
# reports that token at position 174, the same position as in the object-0
# prompt.
h_obj1 = collect_token_latent_in_question(
    lm = lm,
    prompt = prompt_obj1,
    answer = answer_obj1,
    token_of_interest= sample.objects[1],
    detensorize=False
)

Instruction: Keep track of people's knowledge defined in the story. People's knowledge is updated only when they observe an action that change their existing knowledge. To answer the question following the story, choose "yes" or "no" after the "Answer:" tag.

Story: Adam is working in a busy restaurant. A customer asks Adam for juice. Adam grabs an opaque pitcher and fills it with juice. Then Adam grabs another opaque cup and fills it with coffee. A coworker named Bob observes Adam pouring the contents in the pitcher and the cup. But Bob didn't hear the customer's request and swaps the juice in the pitcher with coffee while Adam was attending to another task. Adam can't see what is in the pitcher and the cup without opening their lid. Adam didn't see Bob swapping the the contents of pitcher.
Question: Does Adam believe the pitcher contains coffee?
Answer:
no
174  coffee
answer='no' | predicted_ans=PredictedToken(token=' No', prob=0.5005128383636475, logit=19.34375, token_id=2360)


In [108]:
free_gpu_cache()

# Position of the object token in the question, as printed by the collection
# cells ("174  juice" / "174  coffee" in the recorded outputs).
QUESTION_OBJ_IDX = 174

# Object-0 prompt, edited towards object 1: at each layer, swap the top-3
# "objects" directions of the object-0 state for those of the object-1 state.
# NOTE(review): range(24, 28) covers layers 24–27, while the Takeaways section
# mentions object layers 24-28 — confirm which range is intended.
patches = []
for layer_idx in range(24, 28):
    layer = LAYER_NAME_FORMAT.format(layer_idx)
    Vh = load_Vh("objects", layer)
    location = (layer, QUESTION_OBJ_IDX)
    patches.append(PatchSpec(
        location=location,
        patch=patch_binding_ID(
            h_orig=h_obj0.states[location],
            h_other=h_obj1.states[location],
            Proj=Vh,
            rank=3
        )
    ))

# Run the object-0 prompt with the patches applied. Token id 2360 is ' No' in
# the recorded outputs, i.e. the answer of the object-1 prompt.
predict_next_token(
    lm=lm, 
    input=prompt_obj0,
    patches=patches,
    interested_tokens=[2360]
)

([PredictedToken(token=' No', prob=0.4627632796764374, logit=18.765625, token_id=2360),
  PredictedToken(token=' NO', prob=0.28510019183158875, logit=18.28125, token_id=5782),
  PredictedToken(token=' no', prob=0.13258390128612518, logit=17.515625, token_id=912),
  PredictedToken(token=' **', prob=0.03513123840093613, logit=16.1875, token_id=3146),
  PredictedToken(token=' YES', prob=0.01464487612247467, logit=15.3125, token_id=14410)],
 {2360: (1,
   PredictedToken(token=' No', prob=0.4627632796764374, logit=18.765625, token_id=2360))})

In [111]:
# Reverse direction: object-1 prompt, edited towards object 0 by swapping the
# top-3 "objects" directions at the object token.
# NOTE(review): unlike the other patching cells, this one does not call
# free_gpu_cache() first. Also, range(24, 28) covers layers 24–27, while the
# Takeaways section mentions object layers 24-28 — confirm the intended range.
patches = []
for layer_idx in range(24, 28):
    layer = LAYER_NAME_FORMAT.format(layer_idx)
    Vh = load_Vh("objects", layer)
    location = (layer, QUESTION_OBJ_IDX)
    patches.append(PatchSpec(
        location=location,
        patch=patch_binding_ID(
            h_orig=h_obj1.states[location],
            h_other=h_obj0.states[location],
            Proj=Vh,
            rank=3
        )
    ))

# Run the object-1 prompt with the patches applied. Token id 14410 is ' YES'
# in the recorded outputs, i.e. the answer of the object-0 prompt.
predict_next_token(
    lm=lm, 
    input=prompt_obj1,
    patches=patches,
    interested_tokens=[14410]
)

([PredictedToken(token=' YES', prob=0.3233693242073059, logit=18.71875, token_id=14410),
  PredictedToken(token=' NO', prob=0.17040367424488068, logit=18.078125, token_id=5782),
  PredictedToken(token=' yes', prob=0.13907942175865173, logit=17.875, token_id=10035),
  PredictedToken(token=' No', prob=0.1306530237197876, logit=17.8125, token_id=2360),
  PredictedToken(token=' Yes', prob=0.11711661517620087, logit=17.703125, token_id=7566)],
 {14410: (1,
   PredictedToken(token=' YES', prob=0.3233693242073059, logit=18.71875, token_id=14410))})