In [1]:
import json
import random
import os
import sys
import torch
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from tqdm import tqdm
from typing import Any, List, Optional
from nnsight import CONFIG, LanguageModel
from collections import defaultdict

sys.path.append("../")
from src.dataset import SampleV3, DatasetV3, STORY_TEMPLATES
from src.utils import env_utils
from utils import *

# Run on GPU when one is available; seed Python's RNG so entity sampling is repeatable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
random.seed(10)

# Credentials are read from the environment instead of being written into the
# notebook. Anything hard-coded here is visible to everyone who can read the
# file; the values that used to live in this cell should be treated as leaked
# and revoked.
#   export NNSIGHT_API_KEY=...   # only needed for nnsight remote execution
#   export HF_TOKEN=...          # Hugging Face access token
_nnsight_api_key = os.environ.get("NNSIGHT_API_KEY")
if _nnsight_api_key:
    CONFIG.set_default_api_key(_nnsight_api_key)
if not os.environ.get("HF_TOKEN"):
    # Printed (not warnings.warn) because warnings are silenced just below.
    print(
        "HF_TOKEN is not set; downloading gated or uncached checkpoints may fail.",
        file=sys.stderr,
    )

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")
CONFIG.APP.REMOTE_LOGGING = False

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm
env.yml not found in /disk/u/nikhil/mind!
Setting MODEL_ROOT="". Models will now be downloaded to conda env cache, if not already there
Other defaults are set to:
    DATA_DIR = "data"
    RESULTS_DIR = "results"
    HPARAMS_DIR = "hparams"


# Loading Data

In [2]:
# Entity vocabularies used to fill in the story templates.
# `all_states` / `all_containers` map a file stem to the list stored in that file.
all_states = {}
all_containers = {}

entities_root = os.path.join(env_utils.DEFAULT_DATA_DIR, "synthetic_entities")

# Use a context manager so the file handle is closed deterministically
# (the previous `json.load(open(...))` left it to the garbage collector).
with open(os.path.join(entities_root, "characters.json"), "r") as f:
    all_characters = json.load(f)

for TYPE, DCT in {"states": all_states, "containers": all_containers}.items():
    ROOT = os.path.join(entities_root, TYPE)
    for file in os.listdir(ROOT):
        file_path = os.path.join(ROOT, file)
        with open(file_path, "r") as f:
            names = json.load(f)
        # Key is everything before the first "." in the file name.
        DCT[file.split(".")[0]] = names

# Loading Model

In [3]:
# Alternative checkpoint, kept for reference:
# model = LanguageModel("meta-llama/Meta-Llama-3.1-405B")

# Load the 70B instruct model locally, 4-bit quantised and sharded across the
# available devices. NOTE(review): the cell output reports `load_in_4bit` as
# deprecated in favour of a `BitsAndBytesConfig` passed via `quantization_config`.
model = LanguageModel(
    "meta-llama/Meta-Llama-3-70B-Instruct",
    device_map="auto",
    load_in_4bit=True,
    torch_dtype=torch.float16,
    dispatch=True,
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

# Steering

## Creating steering vector

In [None]:
# Number of paired examples and the loader batch size. The activation-caching
# cell below reads element [0] of every batch field, i.e. it expects batch_size == 1.
n_samples = 100
batch_size = 1

# Paired prompts for the visibility experiment, asked as belief questions.
dataset = get_visibility_align_exps(
    STORY_TEMPLATES,
    all_characters,
    all_containers,
    all_states,
    n_samples,
    question_type="belief_question",
    diff_visibility=True,
)
# Keep the original order so sample i in the loader is dataset[i].
dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)

In [None]:
# Inspect one paired example: the "corrupt" prompt/answer, the "clean"
# prompt/answer, and the target. The caching cell below treats `corrupt_prompt`
# as the visibility condition and `clean_prompt` as the no-visibility one.
idx = 0
print(dataset[idx]['corrupt_prompt'], dataset[idx]['corrupt_ans'])
print(dataset[idx]['clean_prompt'], dataset[idx]['clean_ans'])
print(f"Target: {dataset[idx]['target']}")

Instruction: 1. Track the belief of each character as described in the story. 2. A character's belief is formed only when they perform an action themselves or can observe the action taking place. 3. A character does not have any beliefs about the container and its contents which they cannot observe. 4. To answer the question, predict only what is inside the queried container, strictly based on the belief of the character, mentioned in the question. 5. If the queried character has no belief about the container in question, then predict 'unknown'. 6. Do not predict container or character as the final output.

Story: Zoe and Ruth are working in a busy restaurant. To complete an order, Zoe grabs an opaque vat and fills it with porter. Then Ruth grabs another opaque urn and fills it with cocoa. They are working side by side and can clearly observe each other's actions.
Question: What does Ruth believe the vat contains?
Answer: porter
Instruction: 1. Track the belief of each character as des

In [242]:
# Hand-picked token positions inside the tokenised prompt.
# NOTE(review): these are tied to this template and tokenizer - re-check them
# with the decode below whenever either changes.
visibility_sent = list(range(129, 152))
content_sent = list(range(152, 180))
first_sent = list(range(152, 168))
second_sent = list(range(168, 179))
first_charac = [157, 158]
second_charac = [169, 170]
# Counted from the end of the prompt: the queried character and "believe".
query_sent = [-8, -7]

# Sanity check: decode the query positions for the example selected above.
input_tokens = model.tokenizer(dataset[idx]['corrupt_prompt'], return_tensors="pt").input_ids
print(model.tokenizer.decode(input_tokens[0][query_sent]))

 Ruth believe


In [294]:
# Cache the residual-stream output of every layer at the query-token positions
# for both prompts of each pair.
n_layers = model.config.num_hidden_layers
# Buffers are laid out as (sample, token, layer, hidden_size).
vis_acts = torch.zeros(n_samples, len(query_sent), n_layers, model.config.hidden_size)
no_vis_acts = torch.zeros(n_samples, len(query_sent), n_layers, model.config.hidden_size)

for i, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
    # The loader uses batch_size == 1, so take the only element of the batch.
    vis_prompt = batch['corrupt_prompt'][0]
    no_vis_prompt = batch['clean_prompt'][0]

    with torch.no_grad():

        with model.trace() as tracer:

            with tracer.invoke(vis_prompt):
                for layer_idx in range(n_layers):
                    # Iterate `query_sent` itself (not a duplicated literal) so the
                    # positions read always match the buffer's token dimension.
                    for t_idx, t in enumerate(query_sent):
                        vis_acts[i, t_idx, layer_idx] = model.model.layers[layer_idx].output[0][0, t].cpu().save()

            with tracer.invoke(no_vis_prompt):
                for layer_idx in range(n_layers):
                    for t_idx, t in enumerate(query_sent):
                        no_vis_acts[i, t_idx, layer_idx] = model.model.layers[layer_idx].output[0][0, t].cpu().save()

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [07:22<00:00,  4.43s/it]


In [295]:
# One steering vector per layer: the visibility-minus-no-visibility activation
# difference, averaged over samples (dim 0) and query tokens (dim 1).
# NOTE(review): the buffer has `num_hidden_layers` rows but only the first
# `n_layers` are written; rows beyond that would stay uninitialised.
mean_acts = torch.empty(model.config.num_hidden_layers, model.config.hidden_size)

for layer_idx in range(n_layers):
    layer_diff = vis_acts[:, :, layer_idx] - no_vis_acts[:, :, layer_idx]
    mean_acts[layer_idx] = layer_diff.mean(dim=(0, 1))

## Evaluating on same data distribution

In [647]:
# For each cut-off layer, subtract the steering vector from layers 0..layer_idx
# at the query-token positions of the visibility prompt, then compare the
# model's next-token prediction with the target.
for layer_idx in range(26, 30, 2):
    correct, total = 0, 0

    for batch in tqdm(dataloader, total=len(dataloader)):
        vis_prompt = batch['corrupt_prompt'][0]
        no_vis_prompt = batch['clean_prompt'][0]
        target = batch['target'][0]

        with torch.no_grad():

            with model.trace() as tracer:

                with tracer.invoke(vis_prompt):
                    for l in range(layer_idx + 1):
                        for t in query_sent:
                            model.model.layers[l].output[0][0, t] = model.model.layers[l].output[0][0, t] - mean_acts[l]

                    # Greedy prediction at the last prompt position.
                    pred = model.lm_head.output[0, -1].argmax(dim=-1).save()

            # Decode once and reuse it for both the log line and the comparison.
            decoded = model.tokenizer.decode([pred]).lower().strip()
            print(f"Pred: {decoded} | Target: {target}")
            if decoded == target:
                correct += 1
            total += 1

            del pred
            torch.cuda.empty_cache()

    print(f"Layer: {layer_idx} | Accuracy: {round(correct/total, 2)}")

  0%|          | 0/100 [00:00<?, ?it/s]

  1%|          | 1/100 [00:02<04:32,  2.75s/it]

Pred: unknown | Target: champagne


  2%|▏         | 2/100 [00:05<04:29,  2.75s/it]

Pred: unknown | Target: monster


  3%|▎         | 3/100 [00:08<04:27,  2.76s/it]

Pred: unknown | Target: port


  4%|▍         | 4/100 [00:11<04:25,  2.76s/it]

Pred: unknown | Target: porter


  5%|▌         | 5/100 [00:13<04:22,  2.76s/it]

Pred: unknown | Target: juice


  5%|▌         | 5/100 [00:14<04:35,  2.90s/it]


KeyboardInterrupt: 

## Evaluation on BigToM

In [649]:
# BigToM stories (forward belief / false belief split); the file is ';'-separated.
bigtom = pd.read_csv(
    "../data/bigtom/0_forward_belief_false_belief/stories.csv",
    sep=";",
)

In [652]:
# Task instruction used by the synthetic prompts. It is defined here but not
# included in the BigToM prompt below.
instruction: str = "1. Track the belief of each character as described in the story. 2. A character's belief is formed only when they perform an action themselves or can observe the action taking place. 3. A character does not have any beliefs about the container and its contents which they cannot observe. 4. To answer the question, predict only what is inside the queried container, strictly based on the belief of the character, mentioned in the question. 5. If the queried character has no belief about the container in question, then predict 'unknown'. 6. Do not predict container or character as the final output."

idx = 0
# The prompt is the story followed by an open-ended "Corollary" line for the
# model to complete; the dataset's own question is not used.
# NOTE(review): the character and container in the corollary are hard-coded
# for story 0 and must be edited by hand when `idx` changes.
prompt = "\n".join(
    [
        f"Story: {bigtom.iloc[idx].story.strip()}",
        "Corollary: Noor believe the milk pitcher contains",
    ]
)
print(prompt)

Story: Noor is working as a barista at a busy coffee shop. Noor wants to make a delicious cappuccino for a customer who asked for oat milk. Noor grabs a milk pitcher and fills it with oat milk. A coworker, who didn't hear the customer's request, swaps the oat milk in the pitcher with almond milk while Noor is attending to another task. Noor does not see her coworker swapping the milk.
Corollary: Noor believe the milk pitcher contains


In [643]:
# Locate the "<character> believe" span of the corollary line and add the
# scaled steering vector there while generating a continuation.
input_tokens = model.tokenizer(prompt, return_tensors="pt").input_ids
# Element [0, 1] is the first token after the one the tokenizer prepends
# (presumably BOS - the decoded generation starts with <|begin_of_text|>).
corollary_token = model.tokenizer.encode("Corollary", return_tensors="pt")[0, 1]
believe_token = model.tokenizer.encode(" believe", return_tensors="pt")[0, 1]

token_ids = input_tokens[0].tolist()
corollary_id = int(corollary_token)
believe_id = int(believe_token)

# First occurrence of the "Corollary" marker token.
corollary_idx = next(
    (pos for pos, tok in enumerate(token_ids) if tok == corollary_id), -1
)
if corollary_idx == -1:
    # Previously the -1 sentinel fell through and silently shifted every
    # steering position; fail loudly instead.
    raise ValueError("Could not find the 'Corollary' token in the prompt.")

# First " believe" token that comes after the marker.
believe_idx = next(
    (
        pos
        for pos, tok in enumerate(token_ids)
        if pos > corollary_idx and tok == believe_id
    ),
    -1,
)
if believe_idx == -1:
    raise ValueError("Could not find a ' believe' token after 'Corollary' in the prompt.")

# corollary_idx + 4 skips the tokens of "Corollary:"; in the recorded output
# this slice decodes to the character name followed by " believe".
print(model.tokenizer.decode(input_tokens[0][corollary_idx+4:believe_idx+1]))

n_steer_layers = 35  # layers 0..34 are steered
steer_scale = 10     # multiplier applied to the per-layer mean difference

with torch.no_grad():
    with model.generate(prompt, max_new_tokens=10, do_sample=False):
        for l in range(n_steer_layers):
            # NOTE(review): this range ends at believe_idx + 1 inclusive, i.e. one
            # token further than the slice printed above - confirm which is intended.
            for token_idx in range(corollary_idx+4, believe_idx+2):
                model.model.layers[l].output[0][0, token_idx] = model.model.layers[l].output[0][0, token_idx] + steer_scale*mean_acts[l]

        out = model.generator.output.save()

    print(model.tokenizer.decode(out[0]))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


 Ravi believe


<|begin_of_text|>Story: Ravi is a farmer in a small Indian village, tending to his mango orchard. Ravi wants to harvest ripe mangoes to sell at the local market. He notices a mango tree with a heavy branch laden with ripe mangoes. A mischievous monkey comes along and eats all the ripe mangoes on the branch while Ravi is gathering his harvesting tools. Ravi sees the monkey eating the ripe mangoes.
Corollary: Ravi believe the branch has no ripe mangoes left. Ravi is left


# Probing

In [4]:
n_samples = 100
batch_size = 1

# Every probing story uses the first template with visibility switched off.
template = STORY_TEMPLATES['templates'][0]

configs = []
for _ in range(n_samples):
    # Draw two distinct characters, containers and states per story
    # (sampling order is kept so the seeded RNG yields the same stories).
    characters = random.sample(all_characters, 2)
    containers = random.sample(all_containers[template["container_type"]], 2)
    states = random.sample(all_states[template["state_type"]], 2)
    event_idx, event_noticed = None, False

    configs.append(
        SampleV3(
            template=template,
            characters=characters,
            containers=containers,
            states=states,
            visibility=False,
            event_idx=event_idx,
            event_noticed=event_noticed,
        )
    )

dataset = DatasetV3(configs)
dataloader = DataLoader(dataset, shuffle=False, batch_size=1)

In [5]:
# Print the first probing example (full prompt followed by its target answer)
# to eyeball the story wording.
idx = 0
print(dataset[idx]['prompt'], dataset[idx]['target'])

Instruction: 1. Track the belief of each character as described in the story. 2. A character's belief is formed only when they perform an action themselves or can observe the action taking place. 3. A character does not have any beliefs about the container and its contents which they cannot observe. 4. To answer the question, predict only what is inside the queried container, strictly based on the belief of the character, mentioned in the question. 5. If the queried character has no belief about the container in question, then predict 'unknown'. 6. Do not predict container or character as the final output.

Story: Max and Karen are working in a busy restaurant. To complete an order, Max grabs an opaque tun and fills it with port. Then Karen grabs another opaque dispenser and fills it with water. They are working in the entirely separate sections, with no visibility between them.
Question: What does Karen believe the tun contains?
Answer: water


In [38]:
# Sanity check of the hand-picked token positions: for this example, positions
# 146 and 158 decode to the two character names (see the output). The same
# positions are used as `charac_indices` in the next cell.
input_tokens = model.tokenizer(dataset[idx]['prompt'], return_tensors="pt").input_ids
print(model.tokenizer.decode(input_tokens[0][[146, 158]]))

 Max Karen


In [39]:
probing_layer = 20
# NOTE(review): this rebinds `n_layers` (set from the model config in the
# steering section) to a fixed 40; only layers 0..39 are cached here.
n_layers = 40

# Hand-picked token positions in the tokenised prompt (first / second sentence).
charac_indices = [146, 158]
object_indices = [150, 162]
state_indices = [155, 167]
first_sent = list(range(141, 157))
second_sent = list(range(158, 169))

# Every position from both sentences is cached, in this order.
probe_positions = first_sent + second_sent

# Layout: (sample, layer, token, hidden_size).
acts = torch.zeros(n_samples, n_layers, len(probe_positions), model.config.hidden_size)

for i, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
    prompt = batch['prompt'][0]
    target = batch['target'][0]

    with torch.no_grad():
        with model.trace(prompt):
            for layer_idx in range(n_layers):
                layer = model.model.layers[layer_idx]
                for t_idx, t in enumerate(probe_positions):
                    acts[i, layer_idx, t_idx] = layer.output[0][0, t].cpu().save()

 30%|███       | 30/100 [01:24<03:17,  2.82s/it]


KeyboardInterrupt: 

In [None]:
class ProbingClassifier(torch.nn.Module):
    """Linear probe: one fully connected layer followed by a sigmoid.

    Args:
        input_dim: size of each input feature vector.
        output_dim: number of outputs produced per input.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc = torch.nn.Linear(input_dim, output_dim)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid(fc(x)); every output lies in (0, 1)."""
        return self.sigmoid(self.fc(x))

In [143]:
# Build the probe's training set from the cached activations at `probing_layer`.
training_samples = []
labels = []

# Position 161 is the single positive class; the character and state positions
# (plus 173) are negatives.
# NOTE(review): 173 lies outside first_sent + second_sent, so it never matches.
negative_positions = [173] + charac_indices + state_indices

# Iterate over the rows `acts` actually has. The previous hard-coded
# `range(200)` ran past the `n_samples` rows allocated for `acts` and raised
# an IndexError.
for i in range(acts.shape[0]):
    for t_idx, t in enumerate(first_sent+second_sent):
        if t == 161:
            label = 1
        elif t in negative_positions:
            label = 0
        else:
            continue
        labels.append(torch.tensor(label))
        training_samples.append(acts[i, probing_layer, t_idx, :])

In [144]:
# Turn the per-token lists into tensors and wrap them in a shuffled loader.
# NOTE(review): this rebinds `training_samples` / `labels` from lists to
# tensors, so the cell is presumably not safe to re-run on its own - re-run the
# previous cell first.
training_samples, labels = torch.stack(training_samples), torch.stack(labels)
train_data = torch.utils.data.TensorDataset(training_samples, labels)
train_loader = DataLoader(train_data, shuffle=True, batch_size=10)

In [146]:
# Train the linear probe with binary cross-entropy on the cached activations.
classifier = ProbingClassifier(model.config.hidden_size, 1)
classifier.to(device)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001)

n_epochs = 50
for epoch in range(n_epochs):
    # Batch variables get their own names: reusing `labels` here shadowed the
    # dataset-level `labels` tensor, and the `del` below then removed it from
    # the notebook namespace altogether.
    for batch_inputs, batch_labels in train_loader:
        batch_inputs = batch_inputs.to(device)
        # BCELoss needs float targets with the same (batch, 1) shape as the outputs.
        batch_labels = batch_labels.to(device).unsqueeze(1).float()

        optimizer.zero_grad()
        outputs = classifier(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

    # The reported loss is that of the last batch of the epoch only.
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss.item()}")

    del batch_inputs, batch_labels, outputs
    torch.cuda.empty_cache()

Epoch: 0 | Loss: 0.010615534149110317


Epoch: 10 | Loss: 0.002030023140832782
Epoch: 20 | Loss: 0.0003757626691367477
Epoch: 30 | Loss: 0.00026621154393069446
Epoch: 40 | Loss: 8.729223918635398e-05


In [147]:
batch_size = 1

# Small held-out set: five stories from the second template with visibility
# switched on.
template = STORY_TEMPLATES['templates'][1]

configs = []
for _ in range(5):
    # Sampling order is kept so the seeded RNG yields the same stories.
    characters = random.sample(all_characters, 2)
    containers = random.sample(all_containers[template["container_type"]], 2)
    states = random.sample(all_states[template["state_type"]], 2)
    event_idx, event_noticed = None, False

    configs.append(
        SampleV3(
            template=template,
            characters=characters,
            containers=containers,
            states=states,
            visibility=True,
            event_idx=event_idx,
            event_noticed=event_noticed,
        )
    )

dataset = DatasetV3(configs)
dataloader = DataLoader(dataset, shuffle=False, batch_size=1)

In [148]:
# Print the second held-out example (full prompt followed by its target answer).
idx = 1
print(dataset[idx]['prompt'], dataset[idx]['target'])

Instruction: 1. Track the belief of each character as described in the story. 2. A character's belief is formed only when they perform an action themselves or can observe the action taking place. 3. A character does not have any beliefs about the container and its contents which they cannot observe. 4. To answer the question, predict only what is inside the queried container, strictly based on the belief of the character, mentioned in the question. 5. If the queried character has no belief about the container in question, then predict 'unknown'. 6. Do not predict container or character as the final output.

Story: Lee and Laura are working in a busy restaurant side by side and can clearly observe each other's actions. To complete an order, Lee grabs an opaque quart and fills it with beer. Then Laura grabs another opaque pint and fills it with rum.
Question: What does Laura believe the quart contains?
Answer: beer


In [149]:
# Apply the trained probe to the activation at position -8 (the query
# character) for every held-out prompt, at layers 0, 10, 20 and 30, and report
# the mean probe output per layer.
# NOTE(review): the probe was fit on a single layer's activations, so outputs
# at other layers are out of distribution for it.
with torch.no_grad():
    for layer_idx in range(0, 40, 10):
        preds = []
        for data in tqdm(dataloader, total=len(dataloader)):
            prompt = data['prompt'][0]
            target = data['target'][0]

            with model.trace(prompt):
                query_charac_act = model.model.layers[layer_idx].output[0][0, -8].save()

            # Add a batch dimension, move to the probe's device, cast to float32.
            query_charac = query_charac_act.unsqueeze(0).to(device).float()

            output = classifier(query_charac)
            preds.append(output)

            del query_charac_act, query_charac, output
            torch.cuda.empty_cache()

        mean_pred = sum(preds) / len(preds)
        print(f"Layer: {layer_idx} | Output: {mean_pred}")

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:12<00:00,  2.43s/it]


Layer: 0 | Output: tensor([[0.4794]], device='cuda:0')


100%|██████████| 5/5 [00:12<00:00,  2.46s/it]


Layer: 10 | Output: tensor([[0.2065]], device='cuda:0')


100%|██████████| 5/5 [00:12<00:00,  2.48s/it]


Layer: 20 | Output: tensor([[0.0002]], device='cuda:0')


100%|██████████| 5/5 [00:12<00:00,  2.49s/it]

Layer: 30 | Output: tensor([[0.0056]], device='cuda:0')





In [84]:
# NOTE(review): `prompt` is not defined in this cell; it prints whatever the
# most recently executed cell assigned to it, so the output depends on
# execution order.
print(prompt)

Instruction: 1. Track the belief of each character as described in the story. 2. A character's belief is formed only when they perform an action themselves or can observe the action taking place. 3. A character does not have any beliefs about the container and its contents which they cannot observe. 4. To answer the question, predict only what is inside the queried container, strictly based on the belief of the character, mentioned in the question. 5. If the queried character has no belief about the container in question, then predict 'unknown'. 6. Do not predict container or character as the final output.

Story: Scott and Olivia are working in the entirely separate sections of a busy restaurant, with no visibility between them. To complete an order, Scott grabs an opaque bottle and fills it with gin. Then Olivia grabs another opaque mug and fills it with cocoa.
Question: What does Olivia believe the mug contains?
Answer:
