In [1]:
import json
import random
import os
import sys
import torch
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import Any, List, Optional
import nnsight
from nnsight import CONFIG, LanguageModel
import numpy as np
from collections import defaultdict
from einops import einsum
import time
from einops import rearrange, reduce

sys.path.append("../")
from src.dataset import SampleV3, DatasetV3, STORY_TEMPLATES
from src.utils import env_utils
from utils import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
random.seed(10)

CONFIG.set_default_api_key("d9e00ab7d4f74643b3176de0913f24a7")
os.environ["HF_TOKEN"] = "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"

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")
CONFIG.APP.REMOTE_LOGGING = False

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm
env.yml not found in /disk/u/nikhil/mind!
Setting MODEL_ROOT="". Models will now be downloaded to conda env cache, if not already there
Other defaults are set to:
    DATA_DIR = "data"
    RESULTS_DIR = "results"
    HPARAMS_DIR = "hparams"


In [2]:
all_states = {}
all_containers= {}
all_characters = json.load(open(os.path.join(env_utils.DEFAULT_DATA_DIR, "synthetic_entities", "characters.json"), "r"))

for TYPE, DCT in {"states": all_states, "containers": all_containers}.items():
    ROOT = os.path.join(
        env_utils.DEFAULT_DATA_DIR, "synthetic_entities", TYPE
    )
    for file in os.listdir(ROOT):
        file_path = os.path.join(ROOT, file)
        with open(file_path, "r") as f:
            names = json.load(f)
        DCT[file.split(".")[0]] = names

In [3]:
# model = LanguageModel("meta-llama/Meta-Llama-3.1-405B-Instruct")
model = LanguageModel("meta-llama/Meta-Llama-3-70B-Instruct", cache_dir="/disk/u/nikhil/.cache/huggingface/hub/", device_map="auto", torch_dtype=torch.float16, dispatch=True)

Loading checkpoint shards: 100%|██████████| 30/30 [00:44<00:00,  1.49s/it]


# State token tracing

In [4]:
n_samples = 50
batch_size = 50

dataset = get_state_tracing_exps(STORY_TEMPLATES,
                             all_characters,
                             all_containers,
                             all_states,
                             n_samples)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [7]:
idx = 0
print(dataset[idx]['corrupt_prompt'], dataset[idx]['corrupt_ans'])
print(dataset[idx]['clean_prompt'], dataset[idx]['clean_ans'])
print(f"Target: '{dataset[idx]['target']}'")

Instruction: 1. Track the belief of each character as described in the story. 2. A character's belief is formed only when they perform an action themselves or can observe the action taking place. 3. A character does not have any beliefs about the container and its contents which they cannot observe. 4. To answer the question, predict only what is inside the queried container, strictly based on the belief of the character, mentioned in the question. 5. If the queried character has no belief about the container in question, then predict 'unknown'. 6. Do not predict container or character as the final output.

Story: Max and Karen are working in a busy restaurant. To complete an order, Max grabs an opaque tun and fills it with port. Then Karen grabs another opaque dispenser and fills it with water.
Question: What does Max believe the tun contains?
Answer: port
Instruction: 1. Track the belief of each character as described in the story. 2. A character's belief is formed only when they per

In [8]:
tokens = model.tokenizer(dataset[idx]['corrupt_prompt'], return_tensors="pt").input_ids
print(model.tokenizer.decode(tokens[0][[i for i in range(180, 128, -1)]]))

:Answer?
 contains tun the believe Max does What:Question.
 water with it fills and dispenser opaque another grabs Karen Then. port with it fills and tun opaque an grabs Max, order an complete To. restaurant busy a in working are Karen and Max:Story


In [52]:
tracing_results = defaultdict(dict)

for t in tqdm(range(129, 181)):
    for layer_idx in range(0, model.config.num_hidden_layers):
        correct, total = 0, 0
        
        for bi, batch in enumerate(dataloader):
            corrupt_prompt = batch["corrupt_prompt"]
            clean_prompt = batch["clean_prompt"]
            target = batch["target"]
            batch_size = len(target)

            corrupt_layer_out = defaultdict(dict)
            with torch.no_grad():

                with model.trace() as tracer:

                    with tracer.invoke(corrupt_prompt):
                        corrupt_layer_out = model.model.layers[layer_idx].output[0][:, t].clone()

                    with tracer.invoke(clean_prompt):
                        model.model.layers[layer_idx].output[0][:, t] = corrupt_layer_out

                        pred = model.lm_head.output[:, -1].argmax(dim=-1).save()

            for i in range(batch_size):
                pred_token = model.tokenizer.decode([pred[i]]).lower().strip()
                target_token = target[i].lower().strip()
                if pred_token == target_token:
                    correct += 1
                total += 1

            del corrupt_layer_out, pred
            torch.cuda.empty_cache()

        acc = round(correct / total, 2)
        tracing_results[t][layer_idx] = acc
        print(f"Token: {t} | Layer: {layer_idx} | Accuracy: {acc}")
    
    # Save tracing results to disk as json file
    with open("../tracing_results/state.json", "w") as f:
        json.dump(tracing_results, f, indent=4)

  0%|          | 0/52 [00:00<?, ?it/s]

Token: 129 | Layer: 0 | Accuracy: 0.0
Token: 129 | Layer: 4 | Accuracy: 0.0
Token: 129 | Layer: 8 | Accuracy: 0.0
Token: 129 | Layer: 12 | Accuracy: 0.0
Token: 129 | Layer: 16 | Accuracy: 0.0
Token: 129 | Layer: 20 | Accuracy: 0.0
Token: 129 | Layer: 24 | Accuracy: 0.0
Token: 129 | Layer: 28 | Accuracy: 0.0
Token: 129 | Layer: 32 | Accuracy: 0.0
Token: 129 | Layer: 36 | Accuracy: 0.0
Token: 129 | Layer: 40 | Accuracy: 0.0
Token: 129 | Layer: 44 | Accuracy: 0.0
Token: 129 | Layer: 48 | Accuracy: 0.0
Token: 129 | Layer: 52 | Accuracy: 0.0
Token: 129 | Layer: 56 | Accuracy: 0.0
Token: 129 | Layer: 60 | Accuracy: 0.0
Token: 129 | Layer: 64 | Accuracy: 0.0
Token: 129 | Layer: 68 | Accuracy: 0.0
Token: 129 | Layer: 72 | Accuracy: 0.0


  0%|          | 0/52 [03:43<?, ?it/s]

Token: 129 | Layer: 76 | Accuracy: 0.0





FileNotFoundError: [Errno 2] No such file or directory: 'tracing_results/state.json'

# Object Token Tracing

In [3]:
n_samples = 10
batch_size = 10

dataset = get_obj_tracing_exps(STORY_TEMPLATES,
                             all_characters,
                             all_containers,
                             all_states,
                             n_samples)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [11]:
idx = 6
print(dataset[idx]['corrupt_prompt'], dataset[idx]['corrupt_ans'])
print(dataset[idx]['clean_prompt'], dataset[idx]['clean_ans'])
print(f"Target: '{dataset[idx]['target']}'")

Instruction: 1. Track the belief of each character as described in the story. 2. A character's belief is formed only when they perform an action themselves or can observe the action taking place. 3. A character does not have any beliefs about the container and its contents which they cannot observe. 4. To answer the question, predict only what is inside the queried container, strictly based on the belief of the character, mentioned in the question. 5. If the queried character has no belief about the container in question, then predict 'unknown'. 6. Do not predict container or character as the final output.

Story: Rachel and Megan are working in a busy restaurant. To complete an order, Rachel grabs an opaque flask and fills it with cocktail. Then Megan grabs another opaque tank and fills it with wine.
Question: What does Rachel believe the flask contains?
Answer: cocktail
Instruction: 1. Track the belief of each character as described in the story. 2. A character's belief is formed onl

In [6]:
tracing_results = defaultdict(dict)

for t in tqdm(range(180, 128, -1)):
    for layer_idx in range(0, model.config.num_hidden_layers):
        correct, total = 0, 0
        
        for bi, batch in enumerate(dataloader):
            corrupt_prompt = batch["corrupt_prompt"]
            clean_prompt = batch["clean_prompt"]
            target = batch["target"]
            batch_size = len(target)

            corrupt_layer_out = defaultdict(dict)
            with torch.no_grad():

                with model.trace() as tracer:

                    with tracer.invoke(corrupt_prompt):
                        corrupt_layer_out = model.model.layers[layer_idx].output[0][:, t].clone()

                    with tracer.invoke(clean_prompt):
                        model.model.layers[layer_idx].output[0][:, t] = corrupt_layer_out

                        pred = model.lm_head.output[:, -1].argmax(dim=-1).save()

            for i in range(batch_size):
                pred_token = model.tokenizer.decode([pred[i]]).lower().strip()
                target_token = target[i].lower().strip()
                if pred_token == target_token:
                    correct += 1
                total += 1

            del corrupt_layer_out, pred
            torch.cuda.empty_cache()

        acc = round(correct / total, 2)
        tracing_results[t][layer_idx] = acc
        print(f"Token: {t} | Layer: {layer_idx} | Accuracy: {acc}")
    
    # Save tracing results to disk as json file
    with open("../tracing_results/object.json", "w") as f:
        json.dump(tracing_results, f, indent=4)

  0%|          | 0/52 [00:00<?, ?it/s]You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Token: 180 | Layer: 0 | Accuracy: 0.1
Token: 180 | Layer: 1 | Accuracy: 0.1
Token: 180 | Layer: 2 | Accuracy: 0.1
Token: 180 | Layer: 3 | Accuracy: 0.1
Token: 180 | Layer: 4 | Accuracy: 0.1
Token: 180 | Layer: 5 | Accuracy: 0.1
Token: 180 | Layer: 6 | Accuracy: 0.1
Token: 180 | Layer: 7 | Accuracy: 0.1
Token: 180 | Layer: 8 | Accuracy: 0.1
Token: 180 | Layer: 9 | Accuracy: 0.1
Token: 180 | Layer: 10 | Accuracy: 0.1
Token: 180 | Layer: 11 | Accuracy: 0.1
Token: 180 | Layer: 12 | Accuracy: 0.1
Token: 180 | Layer: 13 | Accuracy: 0.1
Token: 180 | Layer: 14 | Accuracy: 0.1
Token: 180 | Layer: 15 | Accuracy: 0.1
Token: 180 | Layer: 16 | Accuracy: 0.1
Token: 180 | Layer: 17 | Accuracy: 0.1
Token: 180 | Layer: 18 | Accuracy: 0.2
Token: 180 | Layer: 19 | Accuracy: 0.2
Token: 180 | Layer: 20 | Accuracy: 0.2
Token: 180 | Layer: 21 | Accuracy: 0.2
Token: 180 | Layer: 22 | Accuracy: 0.2
Token: 180 | Layer: 23 | Accuracy: 0.2
Token: 180 | Layer: 24 | Accuracy: 0.2
Token: 180 | Layer: 25 | Accuracy: 

  0%|          | 0/52 [01:45<?, ?it/s]


KeyboardInterrupt: 

# Character Token Tracing

In [10]:
n_samples = 10
batch_size = 10

dataset = get_character_tracing_exps(STORY_TEMPLATES,
                                    all_characters,
                                    all_containers,
                                    all_states,
                                    n_samples)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [12]:
idx = 1
print(dataset[idx]['corrupt_prompt'], dataset[idx]['corrupt_ans'])
print(dataset[idx]['clean_prompt'], dataset[idx]['clean_ans'])
print(f"Target: '{dataset[idx]['target']}'")

Instruction: 1. Track the belief of each character as described in the story. 2. A character's belief is formed only when they perform an action themselves or can observe the action taking place. 3. A character does not have any beliefs about the container and its contents which they cannot observe. 4. To answer the question, predict only what is inside the queried container, strictly based on the belief of the character, mentioned in the question. 5. If the queried character has no belief about the container in question, then predict 'unknown'. 6. Do not predict container or character as the final output.

Story: Nancy and Ray are working in a busy restaurant. To complete an order, Nancy grabs an opaque can and fills it with milk. Then Ray grabs another opaque pitcher and fills it with monster.
Question: What does Nancy believe the can contains?
Answer: milk
Instruction: 1. Track the belief of each character as described in the story. 2. A character's belief is formed only when they p