In [1]:
# from datasets import load_dataset
# import seaborn as sns
from tqdm import tqdm
import numpy as np
import transformers
from baukit import TraceDict
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.cuda.set_device(0) # Sets the default device for tensors to be the first GPU.

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the model

# MODEL = "/net/projects/veitch/LLMs/llama2-based-models/llama2-hf/Llama-2-7b-chat-hf"
MODEL = "/net/projects/veitch/LLMs/llama1-based-models/alpaca-7b"

tokenizer = transformers.LlamaTokenizer.from_pretrained(MODEL)
model = transformers.LlamaForCausalLM.from_pretrained(MODEL, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto")

device = "cuda"
r = model.to(device)

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.
Loading checkpoint shards: 100%|██████████| 3/3 [01:53<00:00, 37.67s/it]


In [3]:
# Any prompt will do to demonstrate. This prompt is 82 tokens long.

template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please say yes only if it costs between {:.2f} and {:.2f} dollars, otherwise no.

### Input:
{:.2f} dollars

### Response:
"""

prompt = template.format(3.22,5.76,9.30)
print(prompt)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please say yes only if it costs between 3.22 and 5.76 dollars, otherwise no.

### Input:
9.30 dollars

### Response:



In [4]:
# Just generate some prompts for demonstration
# You'd probably save a dataset of prompts, and load those from a file.

def generate_prompts(template,n=1000,include_bounds=False):
    "Replicates the same distribution as BDAS paper."
    for i in range(n):
        # Generate the lower bound, upper bound, and input value
        lower_bound = np.round(np.random.uniform(0.00,7.49),2)
        max_ub = np.min([lower_bound+7.5,9.99])
        upper_bound = np.round(np.random.uniform(lower_bound+2.5,max_ub),2)
        diff = np.round(upper_bound - lower_bound,2)
        assert 2.5 <= diff and diff <= 7.5, (lower_bound, max_ub, upper_bound, diff)
        input_value = np.round(np.random.uniform(0.00,9.99),2)

        # Generate the prompt
        prompt = template.format(lower_bound,upper_bound,input_value)
        if include_bounds:
            yield (lower_bound,upper_bound,input_value,prompt)
        else:
            yield prompt

prompts = [p for p in generate_prompts(template,n=10)]

In [6]:
def get_activations(prompts,tokenizer,model,device,layer="all"):
    """Returns a Numpy array of residual stream activations. 
    Based on https://github.com/likenneth/honest_llama
    
    David's uncertainties: I think these are the activations before the MLP sublayer?
    """
    # input_ids = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt").input_ids.to(device)
    # attention_mask = tokenizer(prompt, return_tensors="pt").attention_mask.to(device)

    tokenized = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt")
    input_ids = tokenized.input_ids.to(device)
    attention_mask = tokenized.attention_mask.to(device)

    # print(tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0]))

    model.eval()
    outputs = model(
        input_ids,
        attention_mask=attention_mask, output_hidden_states = True
    )
    hidden_states = outputs.hidden_states
    if layer == "all":
         # (num_layers, batch_size, seq_length, hidden_dim)
        hidden_states = torch.stack(hidden_states, dim = 0).squeeze()
        hidden_states = hidden_states.detach().cpu().numpy()
    else:
         # (batch_size, seq_length, hidden_dim)
        hidden_states = hidden_states[layer].detach().cpu().numpy()
    return hidden_states

In [7]:
# Single prompt, layer 15
hidden_states = get_activations(prompts[:1],tokenizer,model,device,layer=15)
print(hidden_states.shape)

(1, 82, 4096)


In [8]:
# Single prompt, all layers. 
# Note that in this case the shape drops the singluar batch_size dimension. Maybe we should adjust this behavior. But our use case is probing, which is multiple prompts.
hidden_states = get_activations(prompts[:1],tokenizer,model,device)
print(hidden_states.shape)

(33, 82, 4096)


In [9]:
# Multiple prompt, layer 15
hidden_states = get_activations(prompts,tokenizer,model,device,layer=15)
print(hidden_states.shape)

(10, 82, 4096)


In [10]:
# Multiple prompt, all layers
hidden_states = get_activations(prompts,tokenizer,model,device)
print(hidden_states.shape)

(33, 10, 82, 4096)


If you have a supervised dataset of (prompts,labels), then to train a probe, you'll just replace the prompts with the activations in the learning objective. So you'll train a classifier on (activations,labels).

In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader
from torch import Tensor
from datasets import Dataset
from sklearn.preprocessing import MultiLabelBinarizer

In [44]:
df = pd.read_csv("data/parsed-paragraphs.csv")

In [45]:
df.head(2)

Unnamed: 0,text,label
0,Anarchism is a political philosophy and moveme...,Anarchism by country|Anarchism|Anti-capitalism...
1,Humans have lived in societies without formal ...,Anarchism by country|Anarchism|Anti-capitalism...


In [46]:
df['label_list'] = df['label'].apply(lambda x: x.split('|'))

In [47]:
df.head(2)

Unnamed: 0,text,label,label_list
0,Anarchism is a political philosophy and moveme...,Anarchism by country|Anarchism|Anti-capitalism...,"[Anarchism by country, Anarchism, Anti-capital..."
1,Humans have lived in societies without formal ...,Anarchism by country|Anarchism|Anti-capitalism...,"[Anarchism by country, Anarchism, Anti-capital..."


In [48]:
all_labels = [item for sublist in df['label_list'].tolist() for item in sublist]
unique_labels = list(set(all_labels))

In [49]:
unique_labels

['Socialism',
 'Anarchism',
 'Anti-fascism',
 'Libertarian socialism',
 'Political ideologies',
 'Anarchism by country',
 'Left-wing politics',
 'Political culture',
 'Anti-capitalism',
 'Far-left politics',
 'Economic ideologies',
 'Political movements',
 'Social theories',
 'Libertarianism']

In [50]:
label_encoder = LabelEncoder()
label_encoder.fit(unique_labels)


In [51]:
df['label_encoded'] = df['label_list'].apply(lambda x: label_encoder.transform(x).tolist())

In [76]:
df.head(2)

Unnamed: 0,text,label,label_list,label_encoded
0,Anarchism is a political philosophy and moveme...,Anarchism by country|Anarchism|Anti-capitalism...,"[Anarchism by country, Anarchism, Anti-capital...","[1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]"
1,Humans have lived in societies without formal ...,Anarchism by country|Anarchism|Anti-capitalism...,"[Anarchism by country, Anarchism, Anti-capital...","[1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]"


In [84]:
def list_to_binary_vector(lst, dim=len(unique_labels)):
    return [1 if i in lst else 0 for i in range(dim)]

# Assuming df['label_encoded'] is already a list of integers
df['binary_labels'] = df['label_encoded'].apply(list_to_binary_vector)

In [85]:
df.head(2)

Unnamed: 0,text,label,label_list,label_encoded,binary_labels
0,Anarchism is a political philosophy and moveme...,Anarchism by country|Anarchism|Anti-capitalism...,"[Anarchism by country, Anarchism, Anti-capital...","[1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,Humans have lived in societies without formal ...,Anarchism by country|Anarchism|Anti-capitalism...,"[Anarchism by country, Anarchism, Anti-capital...","[1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"


In [53]:
Xs = []
ys = []

In [86]:
for i, row in df.iterrows():
    if i == 30:
        break

    hidden_states = get_activations(row.text,tokenizer,model,device)
    Xs.append(hidden_states[:,0,:]) # take only the <s> token
    ys.append(row.binary_labels)

In [87]:
example_dataset = TensorDataset(Tensor(Xs).float(), Tensor(np.asarray(ys)).float())

In [88]:
torch.save(example_dataset, 'data/example_dataset.pt')

In [89]:
example_dataloader = DataLoader(example_dataset, batch_size=2)

In [90]:
class Linear(nn.Module):
    def __init__(self, n_classes=len(df.label.unique())):
        super().__init__()
        self.fc1 = nn.Linear(33*4096, n_classes)

    def forward(self, x):
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = self.fc1(x)
        return x

In [91]:
class MLP(nn.Module):
    def __init__(self, n_classes=len(unique_labels)):
        super().__init__()
        self.fc1 = nn.Linear(33*4096, 120)
        self.fc2 = nn.Linear(120, n_classes)

    def forward(self, x):
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [92]:
mlp = MLP()

In [93]:
mlp(Tensor(Xs[0]).unsqueeze(0))

tensor([[-0.8192,  9.0358, -3.3105, -2.2747,  7.4734, -2.3725, -8.8725,  2.0786,
         -6.4047, -2.0546,  2.1794, -1.9107,  0.6721,  0.1313]],
       grad_fn=<AddmmBackward0>)

In [94]:
import torch.optim as optim

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(mlp.parameters(), lr=0.001)

In [95]:
for epoch in range(2):  
    running_loss = 0.0
    for i, data in enumerate(example_dataloader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = mlp(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

In [96]:
mlp(Tensor(Xs[0]).unsqueeze(0))

tensor([[ 504.6013,  249.3814,  754.3988,  847.3753, 1278.9502, 1017.6917,
         1233.6486, 2113.8892, 1778.5347, 2477.1062, 2384.1436, 1964.4608,
         1537.4293, 3582.2012]], grad_fn=<AddmmBackward0>)