In [1]:
# from datasets import load_dataset
# import seaborn as sns
from tqdm import tqdm
import numpy as np
import transformers
from baukit import TraceDict
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.cuda.set_device(0) # Sets the default device for tensors to be the first GPU.

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the model

# MODEL = "/net/projects/veitch/LLMs/llama2-based-models/llama2-hf/Llama-2-7b-chat-hf"
MODEL = "/net/projects/veitch/LLMs/llama1-based-models/alpaca-7b"

tokenizer = transformers.LlamaTokenizer.from_pretrained(MODEL)
model = transformers.LlamaForCausalLM.from_pretrained(MODEL, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto")

device = "cuda"
r = model.to(device)

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.
Loading checkpoint shards: 100%|██████████| 3/3 [01:54<00:00, 38.10s/it]


In [3]:
# Any prompt will do to demonstrate. This prompt is 82 tokens long.

template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please say yes only if it costs between {:.2f} and {:.2f} dollars, otherwise no.

### Input:
{:.2f} dollars

### Response:
"""

prompt = template.format(3.22,5.76,9.30)
print(prompt)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please say yes only if it costs between 3.22 and 5.76 dollars, otherwise no.

### Input:
9.30 dollars

### Response:



In [4]:
# Just generate some prompts for demonstration
# You'd probably save a dataset of prompts, and load those from a file.

def generate_prompts(template,n=1000,include_bounds=False):
    "Replicates the same distribution as BDAS paper."
    for i in range(n):
        # Generate the lower bound, upper bound, and input value
        lower_bound = np.round(np.random.uniform(0.00,7.49),2)
        max_ub = np.min([lower_bound+7.5,9.99])
        upper_bound = np.round(np.random.uniform(lower_bound+2.5,max_ub),2)
        diff = np.round(upper_bound - lower_bound,2)
        assert 2.5 <= diff and diff <= 7.5, (lower_bound, max_ub, upper_bound, diff)
        input_value = np.round(np.random.uniform(0.00,9.99),2)

        # Generate the prompt
        prompt = template.format(lower_bound,upper_bound,input_value)
        if include_bounds:
            yield (lower_bound,upper_bound,input_value,prompt)
        else:
            yield prompt

prompts = [p for p in generate_prompts(template,n=10)]

In [6]:
def get_activations(prompts,tokenizer,model,device,layer="all"):
    """Returns a Numpy array of residual stream activations. 
    Based on https://github.com/likenneth/honest_llama
    
    David's uncertainties: I think these are the activations before the MLP sublayer?
    """
    # input_ids = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt").input_ids.to(device)
    # attention_mask = tokenizer(prompt, return_tensors="pt").attention_mask.to(device)

    tokenized = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt")
    input_ids = tokenized.input_ids.to(device)
    attention_mask = tokenized.attention_mask.to(device)

    # print(tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0]))

    model.eval()
    outputs = model(
        input_ids,
        attention_mask=attention_mask, output_hidden_states = True
    )
    hidden_states = outputs.hidden_states
    if layer == "all":
         # (num_layers, batch_size, seq_length, hidden_dim)
        hidden_states = torch.stack(hidden_states, dim = 0).squeeze()
        hidden_states = hidden_states.detach().cpu().numpy()
    else:
         # (batch_size, seq_length, hidden_dim)
        hidden_states = hidden_states[layer].detach().cpu().numpy()
    return hidden_states

In [7]:
# Single prompt, layer 15
hidden_states = get_activations(prompts[:1],tokenizer,model,device,layer=15)
print(hidden_states.shape)

(1, 82, 4096)


In [8]:
# Single prompt, all layers. 
# Note that in this case the shape drops the singluar batch_size dimension. Maybe we should adjust this behavior. But our use case is probing, which is multiple prompts.
hidden_states = get_activations(prompts[:1],tokenizer,model,device)
print(hidden_states.shape)

(33, 82, 4096)


In [9]:
# Multiple prompt, layer 15
hidden_states = get_activations(prompts,tokenizer,model,device,layer=15)
print(hidden_states.shape)

(10, 82, 4096)


In [10]:
# Multiple prompt, all layers
hidden_states = get_activations(prompts,tokenizer,model,device)
print(hidden_states.shape)

(33, 10, 82, 4096)


If you have a supervised dataset of (prompts,labels), then to train a probe, you'll just replace the prompts with the activations in the learning objective. So you'll train a classifier on (activations,labels).

In [148]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset, DataLoader
from torch import Tensor
from datasets import Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from torch.nn import BCEWithLogitsLoss

In [14]:
df_train = pd.read_csv("data/parsed-paragraphs-train.csv")
df_test = pd.read_csv("data/parsed-paragraphs-test.csv")

In [82]:
# TODO: hacky way to get unique labels - set up pipeline to actually handle this well

from ast import literal_eval

unique_labels = set()
for item in df_train['label_list'].tolist():
    for sub_item in item:
        labels = literal_eval(sub_item)
        unique_labels.update(labels)

unique_labels = list(unique_labels)

In [83]:
for item in df_train['label_list'].tolist():
    for sub_item in item:
        labels = literal_eval(sub_item)
        print(labels)
        break


[':Anarchism by country', 'Anarchism', 'Anti-capitalism', 'Anti-fascism', 'Economic ideologies', 'Far-left politics', 'Left-wing politics', 'Libertarian socialism', 'Libertarianism', 'Political culture', 'Political ideologies', 'Political movements', 'Social theories', 'Socialism']
[':Anarchism by country', 'Anarchism', 'Anti-capitalism', 'Anti-fascism', 'Economic ideologies', 'Far-left politics', 'Left-wing politics', 'Libertarian socialism', 'Libertarianism', 'Political culture', 'Political ideologies', 'Political movements', 'Social theories', 'Socialism']
['Land surface effects on climate', 'Climate change feedbacks', 'Climate forcing', 'Climatology', 'Electromagnetic radiation', 'Meteorological quantities', 'Radiometry', 'Scattering, absorption and radiative transfer (optics)', 'Radiation', '1760s neologisms']
['Land surface effects on climate', 'Climate change feedbacks', 'Climate forcing', 'Climatology', 'Electromagnetic radiation', 'Meteorological quantities', 'Radiometry', 'Sc

In [110]:
def df_to_dataloader(df, save=False):
    df['label_list'] = df['label'].apply(lambda x: x.split('|'))
    unique_labels = set()
    for item in df_train['label_list'].tolist():
        for sub_item in item:
            labels = literal_eval(sub_item)
            unique_labels.update(labels)

    unique_labels = list(unique_labels)

    label_encoder = LabelEncoder()
    label_encoder.fit(unique_labels)

    encoded_labels = []
    for item in df_train['label_list'].tolist():
        for sub_item in item:
            labels = literal_eval(sub_item)
            encoded_labels.append(label_encoder.transform(labels).tolist())
    df['label_encoded'] = encoded_labels

    def list_to_binary_vector(lst, dim=len(unique_labels)):
        return [1 if i in lst else 0 for i in range(dim)]

    # Assuming df['label_encoded'] is already a list of integers
    df['binary_labels'] = df['label_encoded'].apply(list_to_binary_vector)

    Xs = []
    ys = []
    for i, row in df.iterrows():
        hidden_states = get_activations(row.text,tokenizer,model,device)
        Xs.append(hidden_states[:,0,:]) # take only the <s> token
        ys.append(row.binary_labels)

    tensor_dataset = TensorDataset(Tensor(Xs).float(), Tensor(np.asarray(ys)).float())

    if save:
        torch.save(tensor_dataset, 'data/example_dataset.pt')

    return DataLoader(tensor_dataset, batch_size=4, shuffle=True)

In [111]:
train_loader = df_to_dataloader(df_train)
test_loader = df_to_dataloader(df_test)

In [112]:
class Linear(nn.Module):
    def __init__(self, n_classes=len(unique_labels)):
        super().__init__()
        self.fc1 = nn.Linear(33*4096, n_classes)

    def forward(self, x):
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = self.fc1(x)
        return x

In [113]:
class MLP(nn.Module):
    def __init__(self, n_classes=len(unique_labels)):
        super().__init__()
        self.fc1 = nn.Linear(33*4096, 120)
        self.fc2 = nn.Linear(120, n_classes)

    def forward(self, x):
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [173]:
import torch.nn.init as init

def init_weights(m):
    if type(m) == nn.Linear:
        init.xavier_normal_(m.weight)
        init.constant_(m.bias, 0)

In [175]:
class FocalLoss(torch.nn.Module):
    def __init__(self, alpha=.5, gamma=2):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss
        return F_loss.mean()

In [201]:
linear = Linear()
mlp = MLP()
mlp.apply(init_weights)
linear.apply(init_weights)

Linear(
  (fc1): Linear(in_features=135168, out_features=294, bias=True)
)

In [202]:
import torch.optim as optim

# criterion = BCEWithLogitsLoss(pos_weight=Tensor(torch.ones(len(unique_labels)) * 20))
optimizer = optim.AdamW(mlp.parameters(), lr=0.001)

In [207]:
predicted_labels = 0
for epoch in range(50):  
    mlp.train()
    train_loss = 0.0
    for i, data in enumerate(train_loader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = mlp(inputs)
        pos_labels = labels.sum().item()
        neg_labels = labels.numel() - pos_labels
        criterion = BCEWithLogitsLoss(pos_weight=Tensor(torch.ones(len(unique_labels)) * pos_labels/neg_labels))
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    if (epoch + 1) % 5 == 0:
        val_loss = 0.0
        val_count = 0
        predicted_labels = 0
        mlp.eval()
        with torch.no_grad():
            for i, data in enumerate(test_loader):  
                inputs, labels = data
                outputs = mlp(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                val_count += 1
                predicted_labels += (torch.sigmoid(outputs) > .5).int().sum().item()

        print(f'[Training][{epoch + 1}] loss: {train_loss / len(train_loader):.3f}')
        print(f'[Validation][{epoch + 1}] loss: {val_loss / val_count:.3f}')
        print(f'[Predicted Validation Labels]: {predicted_labels}')
    
    if predicted_labels > 10000:
        break

[Training][5] loss: 0.657
[Validation][5] loss: 0.657
[Predicted Validation Labels]: 120
[Training][10] loss: 0.646
[Validation][10] loss: 0.646
[Predicted Validation Labels]: 120
[Training][15] loss: 0.636
[Validation][15] loss: 0.636
[Predicted Validation Labels]: 120
[Training][20] loss: 0.624
[Validation][20] loss: 0.622
[Predicted Validation Labels]: 120
[Training][25] loss: 0.613
[Validation][25] loss: 0.613
[Predicted Validation Labels]: 120
[Training][30] loss: 0.602
[Validation][30] loss: 0.600
[Predicted Validation Labels]: 120
[Training][35] loss: 0.590
[Validation][35] loss: 0.587
[Predicted Validation Labels]: 120
[Training][40] loss: 0.579
[Validation][40] loss: 0.577
[Predicted Validation Labels]: 0
[Training][45] loss: 0.567
[Validation][45] loss: 0.564
[Predicted Validation Labels]: 0
[Training][50] loss: 0.555
[Validation][50] loss: 0.553
[Predicted Validation Labels]: 0


In [206]:
labels.shape

torch.Size([4, 294])

In [155]:
data, label = next(iter(test_loader))

In [159]:
label[0].int().sum().item()

2

In [161]:
(torch.sigmoid(mlp(data))[0] > .5).int().sum().item()

294

In [163]:
(torch.sigmoid(mlp(data))[0] > .5).int().item()

RuntimeError: a Tensor with 294 elements cannot be converted to Scalar