In [1]:
# handle gpu leakage issue on DSI cluster
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from tqdm import tqdm
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from ast import literal_eval

import torch
from torch import Tensor
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, Dataset

import transformers
# from datasets import Dataset
from baukit import TraceDict

In [3]:
# Confirm GPUs are working properly and set default device explicitly
torch.cuda.empty_cache()
torch.cuda.set_device(0) # Sets the default device for tensors to be the first GPU.
device = "cuda:0"
torch.cuda.device_count()

1

In [4]:
# MODEL = "/net/projects/veitch/LLMs/llama2-based-models/llama2-hf/Llama-2-7b-chat-hf"
MODEL = "/net/projects/veitch/LLMs/llama1-based-models/alpaca-7b"

tokenizer = transformers.LlamaTokenizer.from_pretrained(MODEL)
model = transformers.LlamaForCausalLM.from_pretrained(MODEL, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto").to(device)

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# load dictionary for decoding tokens
vocab = tokenizer.get_vocab()
id_to_token = {id: token for token, id in vocab.items()}

In [6]:
# Any prompt will do to demonstrate. This prompt is 82 tokens long.

template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please say yes only if it costs between {:.2f} and {:.2f} dollars, otherwise no.

### Input:
{:.2f} dollars

### Response:
"""

prompt = template.format(3.22,5.76,9.30)
print(prompt)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please say yes only if it costs between 3.22 and 5.76 dollars, otherwise no.

### Input:
9.30 dollars

### Response:



In [7]:
# Just generate some prompts for demonstration
# You'd probably save a dataset of prompts, and load those from a file.

def generate_prompts(template,n=1000,include_bounds=False):
    "Replicates the same distribution as BDAS paper."
    for i in range(n):
        # Generate the lower bound, upper bound, and input value
        lower_bound = np.round(np.random.uniform(0.00,7.49),2)
        max_ub = np.min([lower_bound+7.5,9.99])
        upper_bound = np.round(np.random.uniform(lower_bound+2.5,max_ub),2)
        diff = np.round(upper_bound - lower_bound,2)
        assert 2.5 <= diff and diff <= 7.5, (lower_bound, max_ub, upper_bound, diff)
        input_value = np.round(np.random.uniform(0.00,9.99),2)

        # Generate the prompt
        prompt = template.format(lower_bound,upper_bound,input_value)
        if include_bounds:
            yield (lower_bound,upper_bound,input_value,prompt)
        else:
            yield prompt

prompts = [p for p in generate_prompts(template,n=10)]

In [8]:
def get_activations(prompts,tokenizer,model,device,layer="all"):
    """Returns a Numpy array of residual stream activations. 
    Based on https://github.com/likenneth/honest_llama
    
    David's uncertainties: I think these are the activations before the MLP sublayer?
    """
    tokenized = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt")
    input_ids = tokenized.input_ids.to(device)
    attention_mask = tokenized.attention_mask.to(device)

    model.eval()
    outputs = model(
        input_ids,
        attention_mask=attention_mask, output_hidden_states = True
    )
    hidden_states = outputs.hidden_states
    if layer == "all":
         # (num_layers, batch_size, seq_length, hidden_dim)
        hidden_states = torch.stack(hidden_states, dim = 0).squeeze()
        hidden_states = hidden_states.detach().cpu().numpy()
    else:
         # (batch_size, seq_length, hidden_dim)
        hidden_states = hidden_states[layer].detach().cpu().numpy()
    return hidden_states

In [9]:
# Single prompt, layer 15
hidden_states = get_activations(prompts[:1],tokenizer,model,device,layer=15)
print(hidden_states.shape)

(1, 82, 4096)


In [10]:
# Single prompt, all layers. 
# Note that in this case the shape drops the singluar batch_size dimension. Maybe we should adjust this behavior. But our use case is probing, which is multiple prompts.
hidden_states = get_activations(prompts[:1],tokenizer,model,device)
print(hidden_states.shape)

(33, 82, 4096)


In [11]:
# Multiple prompt, layer 15
hidden_states = get_activations(prompts,tokenizer,model,device,layer=15)
print(hidden_states.shape)

(10, 82, 4096)


In [12]:
# Multiple prompt, all layers
hidden_states = get_activations(prompts,tokenizer,model,device)
print(hidden_states.shape)

(33, 10, 82, 4096)


### Train Some Probes

In [36]:
LABELS = "categories" # choices are "title" and "categories"
DATA_FOLDER = "data/"
TRAIN_DATA = DATA_FOLDER + "train-10-articles.csv"
VAL_DATA = DATA_FOLDER + "val-10-articles.csv"

In [37]:
df_train = pd.read_csv(TRAIN_DATA)
df_val = pd.read_csv(VAL_DATA)

In [39]:
def get_fitted_label_encoder(df, labels=LABELS):
    if LABELS == "categories":
        from ast import literal_eval
        unique_labels = set()
        for item in df_train['categories'].tolist():
            categories = literal_eval(item)
            for cat in categories:
                unique_labels.update([cat])
        unique_labels = list(unique_labels)
    elif LABELS == "title":
        unique_labels = list(df_train['title'].drop_duplicates())
    
    label_encoder = LabelEncoder()
    label_encoder.fit(unique_labels)

    return label_encoder

In [40]:
label_encoder = get_fitted_label_encoder(df_train)
label_encoder.classes_

array(['1959 establishments in the United States',
       ':Anarchism by country', 'Agriculture', 'Agronomy', 'Alaska',
       'Alchemy', 'Algae', 'Amphibians', 'Amphibious organisms',
       'Anarchism', 'Animation', 'Anthropology', 'Anti-capitalism',
       'Anti-fascism', 'Appellate courts', 'Arctic Ocean',
       'Articles containing video clips', 'Astronomers', 'Astronomy',
       'Behavioural sciences', 'Beringia', 'Cartooning',
       'Common names of organisms', 'Courts by type',
       'Economic ideologies', 'Enclaves and exclaves',
       'Endosymbiotic events', 'Esotericism',
       'Exclaves in the United States',
       'Extant Late Devonian first appearances', 'Far-left politics',
       'Film and video technology', 'Food industry',
       'Former Russian colonies', 'Hermeticism', 'History of science',
       'Humans', 'Jurisdiction', 'Left-wing politics',
       'Libertarian socialism', 'Libertarianism', 'Natural philosophy',
       'Northern America', 'Political culture

In [41]:
class ActivationsDataset(Dataset):
    def __init__(self, Xs, ys, text):
        self.Xs = Xs
        self.ys = ys
        self.text = text

    def __len__(self):
        return len(self.Xs)

    def __getitem__(self, idx):
        return self.Xs[idx], self.ys[idx], self.text[idx]

In [42]:
def parse_categories(cat_list, label_encoder):
    encoded_categories = []
    for cat in cat_list:
        encoded_cat = label_encoder.transform(literal_eval(cat)).tolist()
        encoded_categories.append(encoded_cat)
    return encoded_categories

In [43]:
def df_to_dataset(df, model, label_encoder, layer=-1, aggregation="max", labels=LABELS, save=False, filename=DATA_FOLDER + "dataset.pt"):
    if labels=="title":
        df['label_encoded'] = label_encoder.transform(df['title'])
    elif labels=="categories":
        df['label_encoded'] = parse_categories(df['categories'].tolist(), label_encoder)

        def list_to_binary_vector(lst, dim=len(label_encoder.classes_)):
            return [1 if i in lst else 0 for i in range(dim)]

        # Assuming df['label_encoded'] is already a list of integers
        df['binary_labels'] = df['label_encoded'].apply(list_to_binary_vector)

    Xs = []
    ys = []
    text = []
    for i, row in df.iterrows():
        hidden_states = get_activations(row.text,tokenizer,model,device)
        if aggregation == "max":
            x = np.max(hidden_states[layer,:,:], axis=0)
        elif aggregation == "mean":
            x = np.mean(hidden_states[layer,:,:], axis=0)
        Xs.append(x)
        if labels == "categories":
            ys.append(row.binary_labels)
        elif labels == "title":
            ys.append(row.label_encoded)
        text.append(row.text)
    
    Xs_t = Tensor(np.asarray(Xs)).float()
    # TODO: maybe set this up so we get categories and titles in one dataset
    ys_t = Tensor(np.asarray(ys)).float() if labels == "categories" else Tensor(np.asarray(ys)).long()
    activation_dataset = ActivationsDataset(Xs_t, ys_t, text)

    # TODO: this probably doesn't work with the text, right?
    if save:
        torch.save(TensorDataset(Xs_t, ys_t), filename)

    return activation_dataset

In [44]:
LAYER = -1

In [45]:
train_dataset = df_to_dataset(df_train, model, label_encoder, aggregation="max", layer=LAYER)

In [46]:
train_loader = DataLoader(train_dataset, batch_size=4)

In [47]:
val_dataset = df_to_dataset(df_val, model, label_encoder, aggregation="max", layer=LAYER)

In [48]:
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=True)

In [49]:
class Linear(nn.Module):
    def __init__(self, n_classes=len(label_encoder.classes_)):
        super().__init__()
        self.fc1 = nn.Linear(4096, n_classes)

    def forward(self, x):
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = self.fc1(x)
        return x

In [50]:
class MLP(nn.Module):
    def __init__(self, n_classes=len(label_encoder.classes_)):
        super().__init__()
        self.fc1 = nn.Linear(4096, 120)
        self.fc2 = nn.Linear(120, n_classes)

    def forward(self, x):
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [51]:
def init_weights(m):
    if type(m) == nn.Linear:
        init.xavier_normal_(m.weight)
        init.constant_(m.bias, 0)

In [52]:
MODEL = "linear" # choices are "linear" or "mlp"
if MODEL == "linear":
    probe = Linear()
elif MODEL == "mlp":
    probe = MLP()
probe.apply(init_weights)

Linear(
  (fc1): Linear(in_features=4096, out_features=56, bias=True)
)

In [53]:
if LABELS == "categories":
    criterion = BCEWithLogitsLoss(pos_weight=Tensor(torch.ones(len(label_encoder.classes_)) * 20))
elif LABELS == "title":
    criterion = nn.CrossEntropyLoss()

# TODO: what optimizer should we actually use?
# optimizer = optim.AdamW(probe.parameters(), lr=0.001)
optimizer = optim.SGD(probe.parameters(), lr=.001, momentum=.5)

In [54]:
EPOCHS = 20

In [67]:
predicted_labels = 0
for epoch in range(EPOCHS):  
    probe.train()
    train_loss = 0.0
    for i, data in enumerate(train_loader):
        inputs, labels, text = data
        optimizer.zero_grad()
        outputs = probe(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    if (epoch + 1) % 5 == 0:
        val_loss = 0.0
        total = 0
        correct = 0
        incorrect_examples = []
        probe.eval()
        with torch.no_grad():
            for i, data in enumerate(val_loader):  
                inputs, labels, text = data
                outputs = probe(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                if LABELS == "title":
                    _, predicted_labels = torch.max(outputs, 1)
                    total += labels[0]
                elif LABELS == "categories":
                    predicted_labels = (torch.sigmoid(outputs) > .5).int()
                    total += labels.numel()
                graded_preds = predicted_labels == labels
                if (~graded_preds).sum() > 0:
                    incorrect_examples.append(text)
                correct += (graded_preds).sum().item()

        print(f'[Training][{epoch + 1}] loss: {train_loss / len(train_loader):.3f}')
        print(f'[Validation][{epoch + 1}] loss: {val_loss / len(val_loader):.3f}') #TODO: I broke this somehow for title
        print(f'[Validation]{epoch + 1} accuracy: {correct / total:.3f}')
print("Incorrect examples on last val step: ", incorrect_examples)

[Training][5] loss: 0.060
[Validation][5] loss: 2.404
[Validation]5 accuracy: 0.900
[Training][10] loss: 0.056
[Validation][10] loss: 2.352
[Validation]10 accuracy: 0.903
[Training][15] loss: 0.053
[Validation][15] loss: 2.173
[Validation]15 accuracy: 0.908
[Training][20] loss: 0.051
[Validation][20] loss: 1.925
[Validation]20 accuracy: 0.911
Incorrect examples on last val step:  [('the largest state in the union and; one-fifth the size of the lower 48 states', 'members of a group of predominantly aquatic photosynthetic organisms of the kingdom Protista', 'the technique of photographing successive drawings or positions of puppets or models to create an illusion of movement when the movie is shown as a sequence.', 'theology dealing with the origin, nature, and destiny of human beings'), ('the broad term for everything that goes into growing crops and raising animals, to provide food and materials that people can use and enjoy', 'review the procedures and the decisions in the trial court

In [104]:
predicted_labels

tensor([[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
         0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
         0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
         1, 0, 0, 0, 0, 0, 0, 0]], dtype=torch.int32)

In [117]:
labels[0]

tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.])

In [116]:
predicted_labels[0]

tensor([0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.int32)

In [127]:
false_negatives = []
false_positives = []
for i, (lbl, pred_lbl) in enumerate(zip(labels[1], predicted_labels[1])):
    if lbl == 1 and pred_lbl == 0:
        false_negatives.append(i)
    elif lbl == 0 and pred_lbl == 1:
        false_positives.append(i)

In [128]:
false_negatives

[]

In [129]:
false_positives

[6, 22, 26, 46]

In [130]:
label_encoder.inverse_transform(false_positives)

array(['Algae', 'Common names of organisms', 'Endosymbiotic events',
       'Polyphyletic groups'], dtype='<U45')

In [132]:
text[0]

'a medieval chemical science and speculative philosophy aiming to achieve the transmutation of the base metals into gold, the discovery of a universal cure for disease, and the discovery of a means of indefinitely prolonging life'