In [1]:
# handle gpu leakage issue on DSI cluster
# Restrict this process to GPU index 0. The variable is set here, in the first
# cell, so that it is already in the environment when torch is imported below.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from tqdm import tqdm
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder

import torch
from torch import Tensor
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

import transformers
from datasets import Dataset
from baukit import TraceDict


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
torch.cuda.empty_cache()  # release any cached GPU memory left over from earlier runs in this kernel
torch.cuda.set_device(0) # Sets the default device for tensors to be the first GPU.
device = "cuda:0"  # device string handed to get_activations() for moving the tokenized inputs
torch.cuda.device_count()  # last expression of the cell: displays how many GPUs are visible

1

In [4]:
# Checkpoint to probe. Swap the two MODEL lines to use the Llama-2 chat weights instead.
# MODEL = "/net/projects/veitch/LLMs/llama2-based-models/llama2-hf/Llama-2-7b-chat-hf"
MODEL = "/net/projects/veitch/LLMs/llama1-based-models/alpaca-7b"

tokenizer = transformers.LlamaTokenizer.from_pretrained(MODEL)
# Load the weights directly onto `device`. The previous version combined
# device_map="auto" with a trailing .to(device): the dispatch done by
# device_map already places the weights, and moving a dispatched model
# afterwards is redundant at best and rejected when layers were offloaded.
# Mapping the root module "" to one device states the intent explicitly.
model = transformers.LlamaForCausalLM.from_pretrained(
    MODEL,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    device_map={"": device},
)

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.
Loading checkpoint shards: 100%|██████████| 3/3 [00:12<00:00,  4.24s/it]


In [5]:
# Reverse vocabulary (token id -> token string), used to decode ids by hand.
vocab = tokenizer.get_vocab()
id_to_token = dict(zip(vocab.values(), vocab.keys()))

In [6]:
# Any prompt will do to demonstrate. This prompt is 82 tokens long.

# Instruction template with three `{:.2f}` slots, filled in this order:
# lower bound, upper bound, input amount (see generate_prompts below).
template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please say yes only if it costs between {:.2f} and {:.2f} dollars, otherwise no.

### Input:
{:.2f} dollars

### Response:
"""

# One fixed example so the rendered prompt can be inspected in the cell output.
prompt = template.format(3.22,5.76,9.30)
print(prompt)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please say yes only if it costs between 3.22 and 5.76 dollars, otherwise no.

### Input:
9.30 dollars

### Response:



In [7]:
# Just generate some prompts for demonstration
# You'd probably save a dataset of prompts, and load those from a file.

def generate_prompts(template,n=1000,include_bounds=False,rng=None):
    """Yield `n` randomly generated price-range prompts.

    Replicates the same distribution as BDAS paper: a lower bound drawn
    uniformly from [0.00, 7.49], an upper bound 2.5 to 7.5 above it (capped at
    9.99), and an input value drawn uniformly from [0.00, 9.99]. All three are
    rounded to two decimals before being substituted into `template`.

    Args:
        template: format string with three positional slots, filled in the
            order (lower_bound, upper_bound, input_value).
        n: number of prompts to generate.
        include_bounds: if True, yield
            ``(lower_bound, upper_bound, input_value, prompt)`` tuples instead
            of just the prompt string.
        rng: optional random source exposing ``uniform(low, high)``, e.g.
            ``np.random.default_rng(seed)``. Pass a seeded generator to get a
            reproducible prompt set. Defaults to the global ``np.random``
            state, which is what this function always used before.

    Yields:
        The formatted prompt string, or the 4-tuple described above.
    """
    if rng is None:
        rng = np.random
    for _ in range(n):
        # Generate the lower bound, upper bound, and input value
        lower_bound = np.round(rng.uniform(0.00,7.49),2)
        max_ub = min(lower_bound+7.5,9.99)
        upper_bound = np.round(rng.uniform(lower_bound+2.5,max_ub),2)
        diff = np.round(upper_bound - lower_bound,2)
        # Sanity check on the sampled range width.
        assert 2.5 <= diff and diff <= 7.5, (lower_bound, max_ub, upper_bound, diff)
        input_value = np.round(rng.uniform(0.00,9.99),2)

        # Generate the prompt
        prompt = template.format(lower_bound,upper_bound,input_value)
        if include_bounds:
            yield (lower_bound,upper_bound,input_value,prompt)
        else:
            yield prompt

# Materialise a small batch of prompts for the demonstrations below.
prompts = list(generate_prompts(template, n=10))

In [8]:
def get_activations(prompts,tokenizer,model,device,layer="all"):
    """Returns a Numpy array of residual stream activations.
    Based on https://github.com/likenneth/honest_llama

    The activations are the `hidden_states` returned by the model when called
    with `output_hidden_states=True`: entry 0 is the embedding output and each
    later entry is the output of one transformer block, so there is one more
    entry than there are blocks.

    Args:
        prompts: a single string or a list of strings to tokenize together
            (padded to the longest prompt).
        tokenizer: tokenizer callable returning `input_ids` and
            `attention_mask` tensors.
        model: model to run; it is switched to eval mode.
        device: device the tokenized inputs are moved to.
        layer: "all" to return every entry of `hidden_states`, or an integer
            index into `hidden_states` (negative indices allowed).

    Returns:
        numpy array of shape
        - (num_layers, batch_size, seq_length, hidden_dim) for layer="all";
          when batch_size is 1 the batch axis is dropped, giving
          (num_layers, seq_length, hidden_dim);
        - (batch_size, seq_length, hidden_dim) for an integer `layer`.
    """
    tokenized = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt")
    input_ids = tokenized.input_ids.to(device)
    attention_mask = tokenized.attention_mask.to(device)

    model.eval()
    # Pure inference: skip autograd bookkeeping so no graph is kept alive on the GPU.
    with torch.no_grad():
        outputs = model(
            input_ids,
            attention_mask=attention_mask, output_hidden_states = True
        )
    hidden_states = outputs.hidden_states
    if layer == "all":
         # (num_layers, batch_size, seq_length, hidden_dim)
        hidden_states = torch.stack(hidden_states, dim = 0)
        # Drop only a singleton *batch* axis. A bare .squeeze() would also
        # remove the sequence axis for a one-token prompt and change the rank.
        if hidden_states.shape[1] == 1:
            hidden_states = hidden_states.squeeze(1)
        hidden_states = hidden_states.detach().cpu().numpy()
    else:
         # (batch_size, seq_length, hidden_dim)
        hidden_states = hidden_states[layer].detach().cpu().numpy()
    return hidden_states

In [9]:
# One prompt, activations at hidden_states index 15 only.
hidden_states = get_activations(
    prompts[:1], tokenizer, model, device, layer=15
)
print(hidden_states.shape)

(1, 82, 4096)


In [10]:
# One prompt, every layer.
# Note: with a single prompt the returned array has no batch_size axis (the
# singular batch dimension is squeezed away). Maybe we should adjust this
# behavior, but our use case is probing, which uses multiple prompts.
hidden_states = get_activations(
    prompts[:1], tokenizer, model, device
)
print(hidden_states.shape)

(33, 82, 4096)


In [11]:
# Whole prompt batch, activations at hidden_states index 15 only.
hidden_states = get_activations(
    prompts, tokenizer, model, device, layer=15
)
print(hidden_states.shape)

(10, 82, 4096)


In [12]:
# Whole prompt batch, every layer.
hidden_states = get_activations(
    prompts, tokenizer, model, device
)
print(hidden_states.shape)

(33, 10, 82, 4096)


In [13]:
# Check outputs on two free-form strings of different lengths.
test1, test2 = (
    get_activations(text, tokenizer, model, device)
    for text in (
        "Circus peanuts",
        "High dimensional vectors and machine learning",
    )
)

In [14]:
# Display both shapes side by side; a single string prompt yields an array without a batch axis.
test1.shape, test2.shape

((33, 6, 4096), (33, 7, 4096))

### Train Some Probes

In [30]:
# Which column supplies the probe targets: "title" or "categories".
LABELS = "title"
# Location of the train / validation CSV files.
DATA_FOLDER = "data/"
TRAIN_DATA = f"{DATA_FOLDER}train-10-articles.csv"
VAL_DATA = f"{DATA_FOLDER}val-10-articles.csv"

In [16]:
# Read the train and validation splits from their CSV files.
df_train, df_val = map(pd.read_csv, (TRAIN_DATA, VAL_DATA))

In [17]:
# TODO: hacky way to get unique labels - set up pipeline to actually handle this well
# Build the label vocabulary from the training split only.
if LABELS == "categories":
    from ast import literal_eval

    # NOTE(review): assumes df_train already has a 'label_list' column whose
    # cells are iterables of Python-literal strings - TODO confirm against the CSV.
    unique_labels = set()
    for item in df_train['label_list'].tolist():
        for sub_item in item:
            # Local name chosen so it does not clobber the `labels` variable
            # used by the training loop further down.
            parsed_labels = literal_eval(sub_item)
            unique_labels.update(parsed_labels)

    unique_labels = list(unique_labels)
elif LABELS == "title":
    unique_labels = list(df_train['title'].drop_duplicates())
else:
    # Fail loudly instead of leaving `unique_labels` undefined (or stale from
    # an earlier run of this cell with a different LABELS value).
    raise ValueError(f"Unknown LABELS value {LABELS!r}; expected 'title' or 'categories'")

In [18]:
# Display the label vocabulary computed in the previous cell.
unique_labels

['Anarchism',
 'Anthropology',
 'Alchemy',
 'Astronomer',
 'Animation',
 'Amphibian',
 'Alaska',
 'Agriculture',
 'Algae',
 'Appellate court']

In [64]:
def df_to_dataloader(df, model, labels=LABELS, save=False, batch_size=4, shuffle=True,
                     save_path='data/example_dataset.pt'):
    """Turn a DataFrame of texts into a DataLoader of (pooled activation, label) pairs.

    For every row, `row.text` is run through `model` and the last entry of its
    hidden states is max-pooled over the token axis, giving one feature vector
    per row. Labels are encoded against the vocabulary of the module-level
    training frame `df_train`, so every split shares the same label ids.

    Relies on the module-level names `tokenizer`, `device`, `df_train` and
    `get_activations`.

    Args:
        df: frame with a 'text' column and either a 'title' column
            (labels="title") or a 'label' column (labels="categories").
            Gains a 'label_encoded' column (and 'label_list' /
            'binary_labels' in categories mode) as a side effect.
        model: language model passed through to `get_activations`.
        labels: "title" for single-label class ids, "categories" for
            multi-hot label vectors.
        save: if True, write the TensorDataset to `save_path` with torch.save.
        batch_size: batch size of the returned DataLoader.
        shuffle: whether the returned DataLoader shuffles each epoch.
        save_path: destination used when `save` is True.

    Returns:
        DataLoader over a TensorDataset of float32 features and either int64
        class ids ("title") or float32 multi-hot vectors ("categories").

    Raises:
        ValueError: if `df` is empty, `labels` is not a supported mode, or
            (raised by LabelEncoder) `df` contains a label absent from `df_train`.
    """
    # Imported here because the notebook only imports literal_eval at module
    # level when LABELS == "categories".
    from ast import literal_eval

    def parse_label_cell(raw):
        # NOTE(review): assumes each '|'-separated piece of a 'label' cell is a
        # Python-literal list of label names - TODO confirm against the CSV.
        names = []
        for piece in raw.split('|'):
            names.extend(literal_eval(piece))
        return names

    if len(df) == 0:
        raise ValueError("df_to_dataloader received an empty DataFrame")

    if labels == "title":
        # Fit on the training titles (not on `df` itself) so that train and
        # validation frames are guaranteed to map a title to the same id.
        label_encoder = LabelEncoder()
        label_encoder.fit(df_train['title'].drop_duplicates())
        df['label_encoded'] = label_encoder.transform(df['title'])
        label_column = 'label_encoded'
    elif labels == "categories":
        df['label_list'] = df['label'].apply(lambda x: x.split('|'))

        # Vocabulary comes from the training split only.
        unique_labels = sorted({
            name
            for raw in df_train['label']
            for name in parse_label_cell(raw)
        })
        label_encoder = LabelEncoder()
        label_encoder.fit(unique_labels)

        # One list of label ids per row of *df* (not df_train), aligned on df's index.
        encoded_labels = [
            label_encoder.transform(parse_label_cell(raw)).tolist()
            for raw in df['label']
        ]
        df['label_encoded'] = pd.Series(encoded_labels, index=df.index)

        n_labels = len(unique_labels)

        def list_to_binary_vector(lst):
            present = set(lst)
            return [1 if i in present else 0 for i in range(n_labels)]

        df['binary_labels'] = df['label_encoded'].apply(list_to_binary_vector)
        label_column = 'binary_labels'
    else:
        raise ValueError(f"Unknown labels mode {labels!r}; expected 'title' or 'categories'")

    Xs = []
    for text in df['text']:
        # Ask only for the last hidden-state entry; a single string prompt gives
        # a leading batch axis of size 1, hence the [0].
        last_layer = get_activations(text,tokenizer,model,device,layer=-1)[0]
        # take only the activations from the final transformer layer
        # TODO: set up principled way of doing mean, max, or some sort of pooling
        Xs.append(np.max(last_layer, axis=0))
    ys = df[label_column].tolist()

    # Stack into one array first: building a tensor from a list of arrays is slow.
    Xs_t = torch.from_numpy(np.stack(Xs)).float()
    ys_t = torch.tensor(ys, dtype=torch.float32 if labels == "categories" else torch.long)

    tensor_dataset = TensorDataset(Xs_t, ys_t)

    if save:
        torch.save(tensor_dataset, save_path)

    return DataLoader(tensor_dataset, batch_size=batch_size, shuffle=shuffle)

In [65]:
# Featurise both splits with the language model and wrap them in DataLoaders.
train_loader, val_loader = (
    df_to_dataloader(split_df, model) for split_df in (df_train, df_val)
)

In [48]:
class Linear(nn.Module):
    """Linear probe: a single affine map from an activation vector to class logits.

    Args:
        n_classes: number of output logits; defaults to the size of the
            module-level `unique_labels` at the time this class is defined.
        input_dim: width of the (flattened) input features. Defaults to 4096,
            the value that was previously hard-coded.
    """

    def __init__(self, n_classes=len(unique_labels), input_dim=4096):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, n_classes)

    def forward(self, x):
        """Return logits of shape (batch, n_classes) for a batch of inputs."""
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = self.fc1(x)
        return x

In [49]:
class MLP(nn.Module):
    """Probe with one hidden ReLU layer mapping an activation vector to class logits.

    Args:
        n_classes: number of output logits; defaults to the size of the
            module-level `unique_labels` at the time this class is defined.
        input_dim: width of the (flattened) input features. Defaults to 4096,
            the value that was previously hard-coded.
        hidden_dim: width of the hidden layer. Defaults to 120, the value that
            was previously hard-coded.
    """

    def __init__(self, n_classes=len(unique_labels), input_dim=4096, hidden_dim=120):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, n_classes)

    def forward(self, x):
        """Return logits of shape (batch, n_classes) for a batch of inputs."""
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [50]:
def init_weights(m):
    """Initialise a linear layer in place; intended for `module.apply(init_weights)`.

    `nn.Linear` modules get Xavier-normal weights and a zero bias. Any other
    module type is left untouched.

    Args:
        m: the module visited by `apply`.
    """
    if isinstance(m, nn.Linear):
        init.xavier_normal_(m.weight)
        # Layers created with bias=False have `bias is None`; nothing to reset.
        if m.bias is not None:
            init.constant_(m.bias, 0)

In [66]:
# Which probe architecture to train. Kept in its own constant so that it does
# not overwrite MODEL, which holds the language-model checkpoint path.
PROBE_TYPE = "mlp" # other choice is "linear"
if PROBE_TYPE == "linear":
    probe = Linear()
elif PROBE_TYPE == "mlp":
    probe = MLP()
else:
    # Avoid silently reusing a `probe` left over from an earlier run of this cell.
    raise ValueError(f"Unknown PROBE_TYPE {PROBE_TYPE!r}; expected 'linear' or 'mlp'")
# apply() returns the module, so the cell also displays the probe's structure.
probe.apply(init_weights)

MLP(
  (fc1): Linear(in_features=4096, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=10, bias=True)
)

In [67]:
# Loss: cross-entropy for single-label titles, weighted BCE for multi-label categories.
if LABELS == "title":
    criterion = nn.CrossEntropyLoss()
elif LABELS == "categories":
    # Every label's positive term is up-weighted by 20.
    positive_weight = torch.full((len(unique_labels),), 20.0)
    criterion = BCEWithLogitsLoss(pos_weight=positive_weight)

# TODO: what optimizer should we actually use
# optimizer = optim.AdamW(probe.parameters(), lr=0.001)
optimizer = optim.SGD(
    probe.parameters(),
    lr=.001,
    momentum=.5,
)

In [68]:
# Number of full passes over the training DataLoader.
EPOCHS = 20

In [69]:
# Train the probe, reporting the mean per-batch train and validation loss
# every VAL_EVERY epochs.
VAL_EVERY = 5

predicted_labels = 0
for epoch in range(EPOCHS):
    probe.train()
    train_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = probe(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    if (epoch + 1) % VAL_EVERY == 0:
        val_loss = 0.0
        val_count = 0
        predicted_labels = 0
        probe.eval()
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = probe(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                val_count += 1
                if LABELS == "categories":
                    # Count how many labels the probe switches on at threshold 0.5.
                    predicted_labels += (torch.sigmoid(outputs) > .5).int().sum().item()

        print(f'[Training][{epoch + 1}] loss: {train_loss / len(train_loader):.3f}')
        print(f'[Validation][{epoch + 1}] loss: {val_loss / val_count:.3f}')
        if LABELS == "categories":
            print(f'[Predicted Validation Labels]: {predicted_labels}')

[Training][5] loss: 1.298
[Validation][5] loss: 2.501
[Training][10] loss: 0.259
[Validation][10] loss: 3.247
[Training][15] loss: 0.058
[Validation][15] loss: 4.064
[Training][20] loss: 0.019
[Validation][20] loss: 4.254


In [70]:
# Top-1 accuracy of the probe on the validation set.
# Only meaningful for LABELS == "title", where each label is a single class id.
probe.eval()
correct = 0
with torch.no_grad():  # inference only: no autograd graph needed
    for i, (inputs, labels) in enumerate(val_loader):
        preds = probe(inputs).argmax(dim=1)
        correct += (preds == labels).sum().item()
        print(f"Batch {i+1}")
        print("Labels:", labels)
        print("Predicted Labels:", preds)
accuracy = correct / len(val_loader.dataset) * 100
print(f"Accuracy: {accuracy:.2f}")

Batch 1
Labels: tensor([7, 5, 3, 6])
Predicted Labels: tensor([7, 6, 3, 2])
Batch 2
Labels: tensor([1, 1, 4, 9])
Predicted Labels: tensor([1, 1, 5, 9])
Batch 3
Labels: tensor([0, 3, 6, 0])
Predicted Labels: tensor([0, 3, 7, 0])
Batch 4
Labels: tensor([1, 9, 3, 7])
Predicted Labels: tensor([1, 3, 3, 8])
Batch 5
Labels: tensor([4, 2, 5, 0])
Predicted Labels: tensor([5, 2, 3, 0])
Batch 6
Labels: tensor([5, 2, 4, 6])
Predicted Labels: tensor([6, 2, 5, 7])
Batch 7
Labels: tensor([7, 2, 8])
Predicted Labels: tensor([5, 2, 9])
Accuracy: 51.85
