In [1]:
import torch
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
import transformers
from baukit import TraceDict
import seaborn as sns
from matplotlib import pyplot as plt


torch.cuda.set_device(0) # Sets the default device for tensors to be the first GPU.

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the model

# MODEL = "/net/projects/veitch/LLMs/llama2-based-models/llama2-hf/Llama-2-7b-chat-hf"
MODEL = "/net/projects/veitch/LLMs/llama1-based-models/alpaca-7b"

tokenizer = transformers.LlamaTokenizer.from_pretrained(MODEL)
model = transformers.LlamaForCausalLM.from_pretrained(MODEL, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto")

device = "cuda"
r = model.to(device)

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.
Loading checkpoint shards: 100%|██████████| 3/3 [00:12<00:00,  4.06s/it]


In [3]:
# Any prompt will do to demonstrate. This prompt is 82 tokens long.

template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please say yes only if it costs between {:.2f} and {:.2f} dollars, otherwise no.

### Input:
{:.2f} dollars

### Response:
"""

prompt = template.format(3.22,5.76,9.30)
print(prompt)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please say yes only if it costs between 3.22 and 5.76 dollars, otherwise no.

### Input:
9.30 dollars

### Response:



In [4]:
# Just generate some prompts for demonstration
# You'd probably save a dataset of prompts, and load those from a file.

def generate_prompts(template,n=1000,include_bounds=False):
    "Replicates the same distribution as BDAS paper."
    for i in range(n):
        # Generate the lower bound, upper bound, and input value
        lower_bound = np.round(np.random.uniform(0.00,7.49),2)
        max_ub = np.min([lower_bound+7.5,9.99])
        upper_bound = np.round(np.random.uniform(lower_bound+2.5,max_ub),2)
        diff = np.round(upper_bound - lower_bound,2)
        assert 2.5 <= diff and diff <= 7.5, (lower_bound, max_ub, upper_bound, diff)
        input_value = np.round(np.random.uniform(0.00,9.99),2)

        # Generate the prompt
        prompt = template.format(lower_bound,upper_bound,input_value)
        if include_bounds:
            yield (lower_bound,upper_bound,input_value,prompt)
        else:
            yield prompt

prompts = [p for p in generate_prompts(template,n=10)]

In [3]:
def get_activations(prompts,tokenizer,model,device,layer="all"):
    """Returns a Numpy array of residual stream activations. 
    Based on https://github.com/likenneth/honest_llama
    
    David's uncertainties: I think these are the activations before the MLP sublayer?
    """
    # input_ids = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt").input_ids.to(device)
    # attention_mask = tokenizer(prompt, return_tensors="pt").attention_mask.to(device)

    tokenized = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt")
    input_ids = tokenized.input_ids.to(device)
    attention_mask = tokenized.attention_mask.to(device)

    # print(tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0]))

    model.eval()
    outputs = model(
        input_ids,
        attention_mask=attention_mask, output_hidden_states = True
    )
    hidden_states = outputs.hidden_states
    if layer == "all":
         # (num_layers, batch_size, seq_length, hidden_dim)
        hidden_states = torch.stack(hidden_states, dim = 0).squeeze()
        hidden_states = hidden_states.detach().cpu().numpy()
    else:
         # (batch_size, seq_length, hidden_dim)
        hidden_states = hidden_states[layer].detach().cpu().numpy()
    return hidden_states

In [4]:
# Single prompt, layer 15
hidden_states = get_activations(prompts[:1],tokenizer,model,device,layer=15)
print(hidden_states.shape)

NameError: name 'prompts' is not defined

In [22]:
# Single prompt, all layers. 
# Note that in this case the shape drops the singluar batch_size dimension. Maybe we should adjust this behavior. But our use case is probing, which is multiple prompts.
hidden_states = get_activations(prompts[:1],tokenizer,model,device)
print(hidden_states.shape)

(33, 82, 4096)


In [43]:
# Multiple prompt, layer 15
hidden_states = get_activations(prompts,tokenizer,model,device,layer=15)
print(hidden_states.shape)

['<s>', '▁Below', '▁is', '▁an', '▁instruction', '▁that', '▁describes', '▁a', '▁task', ',', '▁pa', 'ired', '▁with', '▁an', '▁input', '▁that', '▁provides', '▁further', '▁context', '.', '▁Write', '▁a', '▁response', '▁that', '▁appropri', 'ately', '▁comple', 'tes', '▁the', '▁request', '.', '<0x0A>', '<0x0A>', '##', '#', '▁Inst', 'ruction', ':', '<0x0A>', 'Please', '▁say', '▁yes', '▁only', '▁if', '▁it', '▁costs', '▁between', '▁', '6', '.', '2', '6', '▁and', '▁', '9', '.', '3', '9', '▁dollars', ',', '▁otherwise', '▁no', '.', '<0x0A>', '<0x0A>', '##', '#', '▁Input', ':', '<0x0A>', '7', '.', '5', '5', '▁dollars', '<0x0A>', '<0x0A>', '##', '#', '▁Response', ':', '<0x0A>']
(10, 82, 4096)


In [24]:
# Multiple prompt, all layers
hidden_states = get_activations(prompts,tokenizer,model,device)
print(hidden_states.shape)

(33, 10, 82, 4096)


If you have a supervised dataset of (prompts,labels), then to train a probe, you'll just replace the prompts with the activations in the learning objective. So you'll train a classifier on (activations,labels).

In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader
from torch import Tensor

In [18]:
df = pd.read_csv("data/parsed-paragraphs.csv")

In [23]:
df.head(2)

Unnamed: 0,text,label
0,Anarchism is a political philosophy and moveme...,Anarchism
1,Humans have lived in societies without formal ...,Anarchism


In [24]:
labels = df.label.unique()
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df.label)

In [37]:
Xs = []
ys = []

In [38]:
for i, row in df.iterrows():
    if i == 30:
        break

    hidden_states = get_activations(row.text,tokenizer,model,device)
    Xs.append(hidden_states[:,0,:]) # take only the <s> token
    ys.append(row.label_encoded)

In [72]:
example_dataset = TensorDataset(Tensor(Xs).float(), Tensor(ys).long())

In [74]:
torch.save(example_dataset, 'data/example_dataset.pt')

In [75]:
example_dataloader = DataLoader(example_dataset, batch_size=2)


In [76]:
Xs[0].shape

(33, 4096)

In [77]:
import torch.nn as nn
import torch.nn.functional as F

In [83]:
class MLP(nn.Module):
    def __init__(self, n_classes=len(df.label.unique())):
        super().__init__()
        self.fc1 = nn.Linear(33*4096, 120)
        self.fc2 = nn.Linear(120, n_classes)

    def forward(self, x):
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [84]:
mlp = MLP()

In [85]:
mlp(Tensor(Xs[0]).unsqueeze(0))

tensor([[-5.1788,  0.0845,  0.8112, -6.7126, -3.1455, -3.9125, -8.5338,  0.3012,
         -6.7308, -5.3790,  5.6078,  1.9943,  8.7784, -8.7863, -6.5890, -3.0926,
          1.3507,  1.1338,  2.7981, -7.4665,  2.5234]],
       grad_fn=<AddmmBackward0>)

In [86]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(mlp.parameters(), lr=0.001, momentum=0.9)

In [87]:
for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(example_dataloader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = mlp(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

In [91]:
mlp(Tensor(Xs[0]).unsqueeze(0))

tensor([[-0.0437,  0.0762, -0.0242, -0.0364, -0.0806, -0.0627,  0.0551,  0.0656,
          0.0888, -0.0265, -0.0760,  0.0467, -0.0970, -0.0256,  0.0118,  0.0825,
         -0.0585,  0.0680, -0.0526,  0.0680, -0.0553]],
       grad_fn=<AddmmBackward0>)