In [1]:
import torch
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
import transformers
from baukit import TraceDict
import seaborn as sns
from matplotlib import pyplot as plt


torch.cuda.set_device(0) # Sets the default device for tensors to be the first GPU.

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the model

# MODEL = "/net/projects/veitch/LLMs/llama2-based-models/llama2-hf/Llama-2-7b-chat-hf"
MODEL = "/net/projects/veitch/LLMs/llama1-based-models/alpaca-7b"

tokenizer = transformers.LlamaTokenizer.from_pretrained(MODEL)
model = transformers.LlamaForCausalLM.from_pretrained(MODEL, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto")

device = "cuda"
r = model.to(device)

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.
Loading checkpoint shards: 100%|██████████| 3/3 [01:55<00:00, 38.47s/it]


In [3]:
# Any prompt will do to demonstrate. This prompt is 82 tokens long.

template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please say yes only if it costs between {:.2f} and {:.2f} dollars, otherwise no.

### Input:
{:.2f} dollars

### Response:
"""

prompt = template.format(3.22,5.76,9.30)
print(prompt)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please say yes only if it costs between 3.22 and 5.76 dollars, otherwise no.

### Input:
9.30 dollars

### Response:



In [4]:
# Just generate some prompts for demonstration
# You'd probably save a dataset of prompts, and load those from a file.

def generate_prompts(template,n=1000,include_bounds=False):
    "Replicates the same distribution as BDAS paper."
    for i in range(n):
        # Generate the lower bound, upper bound, and input value
        lower_bound = np.round(np.random.uniform(0.00,7.49),2)
        max_ub = np.min([lower_bound+7.5,9.99])
        upper_bound = np.round(np.random.uniform(lower_bound+2.5,max_ub),2)
        diff = np.round(upper_bound - lower_bound,2)
        assert 2.5 <= diff and diff <= 7.5, (lower_bound, max_ub, upper_bound, diff)
        input_value = np.round(np.random.uniform(0.00,9.99),2)

        # Generate the prompt
        prompt = template.format(lower_bound,upper_bound,input_value)
        if include_bounds:
            yield (lower_bound,upper_bound,input_value,prompt)
        else:
            yield prompt

prompts = [p for p in generate_prompts(template,n=10)]

In [45]:
def get_activations(prompts,tokenizer,model,device,layer="all"):
    """Returns a Numpy array of residual stream activations. 
    Based on https://github.com/likenneth/honest_llama
    
    David's uncertainties: I think these are the activations before the MLP sublayer?
    """
    # input_ids = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt").input_ids.to(device)
    # attention_mask = tokenizer(prompt, return_tensors="pt").attention_mask.to(device)

    tokenized = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt")
    input_ids = tokenized.input_ids.to(device)
    attention_mask = tokenized.attention_mask.to(device)

    # print(tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0]))

    model.eval()
    outputs = model(
        input_ids,
        attention_mask=attention_mask, output_hidden_states = True
    )
    hidden_states = outputs.hidden_states
    if layer == "all":
         # (num_layers, batch_size, seq_length, hidden_dim)
        hidden_states = torch.stack(hidden_states, dim = 0).squeeze()
        hidden_states = hidden_states.detach().cpu().numpy()
    else:
         # (batch_size, seq_length, hidden_dim)
        hidden_states = hidden_states[layer].detach().cpu().numpy()
    return hidden_states

In [21]:
# Single prompt, layer 15
hidden_states = get_activations(prompts[:1],tokenizer,model,device,layer=15)
print(hidden_states.shape)

(1, 82, 4096)


In [22]:
# Single prompt, all layers. 
# Note that in this case the shape drops the singluar batch_size dimension. Maybe we should adjust this behavior. But our use case is probing, which is multiple prompts.
hidden_states = get_activations(prompts[:1],tokenizer,model,device)
print(hidden_states.shape)

(33, 82, 4096)


In [43]:
# Multiple prompt, layer 15
hidden_states = get_activations(prompts,tokenizer,model,device,layer=15)
print(hidden_states.shape)

['<s>', '▁Below', '▁is', '▁an', '▁instruction', '▁that', '▁describes', '▁a', '▁task', ',', '▁pa', 'ired', '▁with', '▁an', '▁input', '▁that', '▁provides', '▁further', '▁context', '.', '▁Write', '▁a', '▁response', '▁that', '▁appropri', 'ately', '▁comple', 'tes', '▁the', '▁request', '.', '<0x0A>', '<0x0A>', '##', '#', '▁Inst', 'ruction', ':', '<0x0A>', 'Please', '▁say', '▁yes', '▁only', '▁if', '▁it', '▁costs', '▁between', '▁', '6', '.', '2', '6', '▁and', '▁', '9', '.', '3', '9', '▁dollars', ',', '▁otherwise', '▁no', '.', '<0x0A>', '<0x0A>', '##', '#', '▁Input', ':', '<0x0A>', '7', '.', '5', '5', '▁dollars', '<0x0A>', '<0x0A>', '##', '#', '▁Response', ':', '<0x0A>']
(10, 82, 4096)


In [24]:
# Multiple prompt, all layers
hidden_states = get_activations(prompts,tokenizer,model,device)
print(hidden_states.shape)

(33, 10, 82, 4096)


If you have a supervised dataset of (prompts,labels), then to train a probe, you'll just replace the prompts with the activations in the learning objective. So you'll train a classifier on (activations,labels).

In [40]:
import pandas as pd
import numpy as np

In [17]:
df = pd.read_csv("parsed-paragraphs.csv")

In [59]:
activations = {}

In [60]:
for i, row in df.iterrows():
    if i == 30:
        break

    hidden_states = get_activations(row.text,tokenizer,model,device)

    if row.label not in activations:
        activations[row.label] = []

    activations[row.label].append(hidden_states[:,0,:]) # take only the <s> token

In [61]:
activations

{'Anarchism': [array([[ 1.4793e-02, -1.2634e-02,  1.2726e-02, ...,  1.3229e-02,
           7.2594e-03, -4.4703e-04],
         [-5.3833e-02, -1.1816e-01, -5.8777e-02, ...,  3.9520e-03,
           5.9967e-03,  1.4502e-01],
         [ 1.1810e-01, -6.7993e-02, -4.9286e-02, ...,  4.5837e-02,
          -1.4771e-01,  2.2168e-01],
         ...,
         [ 1.7783e+00,  4.1870e-02,  3.8062e-01, ...,  2.1179e-02,
          -2.3303e-01,  1.8154e+00],
         [-1.6504e-01,  5.8472e-02, -4.4653e-01, ..., -1.5771e-01,
          -3.2373e-01,  4.4141e-01],
         [ 1.5308e-01,  2.3840e-01, -1.6248e-01, ...,  1.4099e-01,
           8.4351e-02,  8.7598e-01]], dtype=float16),
  array([[ 1.4793e-02, -1.2634e-02,  1.2726e-02, ...,  1.3229e-02,
           7.2594e-03, -4.4703e-04],
         [-5.3772e-02, -1.1816e-01, -5.8716e-02, ...,  3.9368e-03,
           5.9814e-03,  1.4502e-01],
         [ 1.1810e-01, -6.7993e-02, -4.9225e-02, ...,  4.5868e-02,
          -1.4771e-01,  2.2168e-01],
         ...,
      

In [62]:
for k, v in activations.items():
    np.savez_compressed(f'data/activations_{k}.npz', *np.asarray(v), labels=k)