In [1]:
# handle gpu leakage issue on DSI cluster
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from tqdm import tqdm
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from ast import literal_eval

import torch
from torch import Tensor
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, Dataset

import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
# from datasets import Dataset
from baukit import TraceDict

In [3]:
# Confirm GPUs are working properly and set default device explicitly
torch.cuda.empty_cache()
torch.cuda.set_device(0) # Sets the default device for tensors to be the first GPU.
device = "cuda:0"
torch.cuda.device_count()

1

In [None]:
# MODEL = "/net/projects/veitch/LLMs/llama2-based-models/llama2-hf/Llama-2-7b-chat-hf"
MODEL = "/net/projects/veitch/LLMs/llama1-based-models/alpaca-7b"

tokenizer = transformers.LlamaTokenizer.from_pretrained(MODEL)
model = transformers.LlamaForCausalLM.from_pretrained(MODEL, low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map="auto").to(device)

In [50]:
# ROME model and tokenizer
MODEL_NAME = "gpt2-xl"
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer_rome.eos_token

In [51]:
# load dictionary for decoding tokens
vocab = tokenizer.get_vocab()
id_to_token = {id: token for token, id in vocab.items()}

In [52]:
# Any prompt will do to demonstrate. This prompt is 82 tokens long.

template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please say yes only if it costs between {:.2f} and {:.2f} dollars, otherwise no.

### Input:
{:.2f} dollars

### Response:
"""

prompt = template.format(3.22,5.76,9.30)
print(prompt)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Please say yes only if it costs between 3.22 and 5.76 dollars, otherwise no.

### Input:
9.30 dollars

### Response:



In [53]:
# Just generate some prompts for demonstration
# You'd probably save a dataset of prompts, and load those from a file.

def generate_prompts(template,n=1000,include_bounds=False):
    "Replicates the same distribution as BDAS paper."
    for i in range(n):
        # Generate the lower bound, upper bound, and input value
        lower_bound = np.round(np.random.uniform(0.00,7.49),2)
        max_ub = np.min([lower_bound+7.5,9.99])
        upper_bound = np.round(np.random.uniform(lower_bound+2.5,max_ub),2)
        diff = np.round(upper_bound - lower_bound,2)
        assert 2.5 <= diff and diff <= 7.5, (lower_bound, max_ub, upper_bound, diff)
        input_value = np.round(np.random.uniform(0.00,9.99),2)

        # Generate the prompt
        prompt = template.format(lower_bound,upper_bound,input_value)
        if include_bounds:
            yield (lower_bound,upper_bound,input_value,prompt)
        else:
            yield prompt

prompts = [p for p in generate_prompts(template,n=10)]

In [54]:
def get_activations(prompts,tokenizer,model,device,layer="all"):
    """Returns a Numpy array of residual stream activations. 
    Based on https://github.com/likenneth/honest_llama
    
    David's uncertainties: I think these are the activations before the MLP sublayer?
    """
    tokenized = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt")
    input_ids = tokenized.input_ids.to(device)
    attention_mask = tokenized.attention_mask.to(device)

    model.eval()
    outputs = model(
        input_ids,
        attention_mask=attention_mask, output_hidden_states = True
    )
    hidden_states = outputs.hidden_states
    if layer == "all":
         # (num_layers, batch_size, seq_length, hidden_dim)
        hidden_states = torch.stack(hidden_states, dim = 0).squeeze()
        hidden_states = hidden_states.detach().cpu().numpy()
    else:
         # (batch_size, seq_length, hidden_dim)
        hidden_states = hidden_states[layer].detach().cpu().numpy()
    return hidden_states

In [55]:
# Single prompt, layer 15
hidden_states = get_activations(prompts[:1],tokenizer,model,device,layer=15)
print(hidden_states.shape)

(1, 69, 1600)


In [56]:
# Single prompt, all layers. 
# Note that in this case the shape drops the singluar batch_size dimension. Maybe we should adjust this behavior. But our use case is probing, which is multiple prompts.
hidden_states = get_activations(prompts[:1],tokenizer,model,device)
print(hidden_states.shape)

(49, 69, 1600)


In [57]:
hidden_states = get_activations("military",tokenizer_rome,model_rome,device)
print(hidden_states[:, np.newaxis, :].shape)

(49, 1, 1600)


In [58]:
# Multiple prompt, layer 15
hidden_states = get_activations(prompts,tokenizer,model,device,layer=15)
print(hidden_states.shape)

(10, 69, 1600)


In [59]:
# Multiple prompt, all layers
hidden_states = get_activations(prompts,tokenizer,model,device)
print(hidden_states.shape)

(49, 10, 69, 1600)


### Train Some Probes

In [60]:
LABELS = "title" # choices are "title" and "categories"
DATA_FOLDER = "data/"
TRAIN_DATA = DATA_FOLDER + "train-10-articles.csv"
VAL_DATA = DATA_FOLDER + "val-10-articles.csv"
LAYER = -1

In [61]:
df_train = pd.read_csv(TRAIN_DATA)
df_val = pd.read_csv(VAL_DATA)

In [62]:
def get_fitted_label_encoder(df, labels=LABELS):
    if labels == "categories":
        from ast import literal_eval
        unique_labels = set()
        for item in df['categories'].tolist():
            categories = literal_eval(item)
            for cat in categories:
                unique_labels.update([cat])
        unique_labels = list(unique_labels)
    elif labels == "title":
        unique_labels = list(df['title'].drop_duplicates())
    
    label_encoder = LabelEncoder()
    label_encoder.fit(unique_labels)

    return label_encoder

In [63]:
label_encoder_title = get_fitted_label_encoder(df_train, labels="title")
label_encoder_title.classes_

array(['Agriculture', 'Alaska', 'Alchemy', 'Algae', 'Amphibian',
       'Anarchism', 'Animation', 'Anthropology', 'Appellate court',
       'Astronomer'], dtype='<U15')

In [64]:
label_encoder_cats = get_fitted_label_encoder(df_train, labels="categories")
label_encoder_cats.classes_

array(['1959 establishments in the United States',
       ':Anarchism by country', 'Agriculture', 'Agronomy', 'Alaska',
       'Alchemy', 'Algae', 'Amphibians', 'Amphibious organisms',
       'Anarchism', 'Animation', 'Anthropology', 'Anti-capitalism',
       'Anti-fascism', 'Appellate courts', 'Arctic Ocean',
       'Articles containing video clips', 'Astronomers', 'Astronomy',
       'Behavioural sciences', 'Beringia', 'Cartooning',
       'Common names of organisms', 'Courts by type',
       'Economic ideologies', 'Enclaves and exclaves',
       'Endosymbiotic events', 'Esotericism',
       'Exclaves in the United States',
       'Extant Late Devonian first appearances', 'Far-left politics',
       'Film and video technology', 'Food industry',
       'Former Russian colonies', 'Hermeticism', 'History of science',
       'Humans', 'Jurisdiction', 'Left-wing politics',
       'Libertarian socialism', 'Libertarianism', 'Natural philosophy',
       'Northern America', 'Political culture

In [65]:
class ActivationsDataset(Dataset):
    def __init__(self, Xs, title, categories, text):
        self.Xs = Xs
        self.title = title
        self.categories = categories
        self.text = text

    def __len__(self):
        return len(self.Xs)

    def __getitem__(self, idx):
        return self.Xs[idx], self.title[idx], self.categories[idx], self.text[idx]

In [66]:
def parse_categories(cat_list, label_encoder):
    encoded_categories = []
    for cat in cat_list:
        encoded_cat = label_encoder.transform(literal_eval(cat)).tolist()
        encoded_categories.append(encoded_cat)
    return encoded_categories

In [67]:
def df_to_dataset(df, model, tokenizer, label_encoder_title, label_encoder_cats, layer=-1, aggregation="max", labels=LABELS, save=False, filename=DATA_FOLDER + "dataset.pt"):
    # TODO: this is kinda ugly
    # title
    df['title_encoded'] = label_encoder_title.transform(df['title'])

    # categories
    df['label_encoded'] = parse_categories(df['categories'].tolist(), label_encoder_cats)
    def list_to_binary_vector(lst, dim=len(label_encoder_cats.classes_)):
        return [1 if i in lst else 0 for i in range(dim)]
    df['binary_labels'] = df['label_encoded'].apply(list_to_binary_vector)

    Xs = []
    titles = []
    categories = []
    text = []
    for i, row in df.iterrows():
        hidden_states = get_activations(row.text,tokenizer,model,device)
        if len(hidden_states.shape) == 2: # GPT-2 will lose a dimension if there's a single token
            hidden_states = hidden_states[:, np.newaxis, :]
        if aggregation == "max":
            x = np.max(hidden_states[layer,:,:], axis=0)
        elif aggregation == "mean":
            x = np.mean(hidden_states[layer,:,:], axis=0)
        Xs.append(x)
        titles.append(row.title_encoded)
        categories.append(row.binary_labels)
        text.append(row.text)
    
    Xs_t = Tensor(np.asarray(Xs)).float()
    titles_t = Tensor(np.asarray(titles)).long() # cross entropy loss wants a long dtype
    categories_t = Tensor(np.asarray(categories)).float() # binary cross entropy loss wants a float dtype

    return ActivationsDataset(Xs_t, titles_t, categories_t, text)

In [69]:
train_dataset = df_to_dataset(df_train, model, tokenizer, label_encoder_title, label_encoder_cats, aggregation="max", layer=LAYER)

In [70]:
train_loader = DataLoader(train_dataset, batch_size=4)

In [71]:
val_dataset = df_to_dataset(df_val, model, tokenizer, label_encoder_title, label_encoder_cats, aggregation="max", layer=LAYER)

In [72]:
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=True)

In [73]:
class Linear(nn.Module):
    # TODO: figure out better way to handle the number of classes for default
    def __init__(self, hidden_size=model.config.hidden_size, n_classes=len(label_encoder_title.classes_)):
        super().__init__()
        self.fc1 = nn.Linear(hidden_size, n_classes)

    def forward(self, x):
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = self.fc1(x)
        return x

In [74]:
class MLP(nn.Module):
    def __init__(self, hidden_size=model.config.hidden_size, n_classes=len(label_encoder_title.classes_)):
        super().__init__()
        self.fc1 = nn.Linear(hidden_size, 120)
        self.fc2 = nn.Linear(120, n_classes)

    def forward(self, x):
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [75]:
def init_weights(m):
    if type(m) == nn.Linear:
        init.xavier_normal_(m.weight)
        init.constant_(m.bias, 0)

In [76]:
PROBE = "linear" # choices are "linear" or "mlp"
if PROBE == "linear":
    probe = Linear()
elif PROBE == "mlp":
    probe = MLP()
probe.apply(init_weights)

Linear(
  (fc1): Linear(in_features=1600, out_features=10, bias=True)
)

In [77]:
if LABELS == "categories":
    criterion = BCEWithLogitsLoss(pos_weight=Tensor(torch.ones(len(label_encoder_cats.classes_)) * 20))
elif LABELS == "title":
    criterion = nn.CrossEntropyLoss()

# TODO: what optimizer should we actually use?
optimizer = optim.AdamW(probe.parameters(), lr=0.001)
# optimizer = optim.SGD(probe.parameters(), lr=.001, momentum=.5)

In [78]:
EPOCHS = 200

In [79]:
predicted_labels = 0
for epoch in range(EPOCHS):  
    probe.train()
    train_loss = 0.0
    for i, data in enumerate(train_loader):
        # TODO: this is sloppy
        inputs, titles, categories, text = data
        labels = titles if LABELS == "title" else categories
        optimizer.zero_grad()
        outputs = probe(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

        if LABELS == "title":
            _, predicted_labels = torch.max(outputs, 1)
        elif LABELS == "categories":
            predicted_labels = (torch.sigmoid(outputs) > .5).int()
            total += labels.numel()

        # for txt, lbl, pred in zip(text, label_encoder_title.inverse_transform(labels), label_encoder_title.inverse_transform(predicted_labels)):
        #     print("text: ", txt)
        #     print("labels: ", lbl)
        #     print("predicted labels: ", pred)
        #     total += labels[0]

    if (epoch + 1) % 5 == 0:
        val_loss = 0.0
        total = 0
        correct = 0
        incorrect_examples = []
        probe.eval()
        with torch.no_grad():
            for i, data in enumerate(val_loader):  
                inputs, titles, categories, text = data
                labels = titles if LABELS == "title" else categories
                outputs = probe(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                if LABELS == "title":
                    _, predicted_labels = torch.max(outputs, 1)
                elif LABELS == "categories":
                    predicted_labels = (torch.sigmoid(outputs) > .5).int()
                    total += labels.numel()
                graded_preds = predicted_labels == labels
                # if (~graded_preds).sum() > 0:
                #     incorrect_examples.append(text)
                correct += (graded_preds).sum().item()
                # TODO: set this up to be the correct encoder depending on label
                if epoch + 1 == EPOCHS:
                    for txt, lbl, pred in zip(text, label_encoder_title.inverse_transform(labels), label_encoder_title.inverse_transform(predicted_labels)):
                        print("text: ", txt)
                        print("labels: ", lbl)
                        print("predicted labels: ", pred)
                total += labels.size()[0]

        print(f'[Training][{epoch + 1}] loss: {train_loss / len(train_loader):.3f}')
        print(f'[Validation][{epoch + 1}] loss: {val_loss / len(val_loader):.3f}') #TODO: I broke this somehow for title
        print(f'[Validation]{epoch + 1} accuracy: {correct / total:.3f}')
# print("Incorrect examples on last val step: ", incorrect_examples)

[Training][5] loss: 22.973
[Validation][5] loss: 43.199
[Validation]5 accuracy: 0.100
[Training][10] loss: 14.970
[Validation][10] loss: 34.264
[Validation]10 accuracy: 0.100
[Training][15] loss: 5.724
[Validation][15] loss: 18.820
[Validation]15 accuracy: 0.200
[Training][20] loss: 5.766
[Validation][20] loss: 25.841
[Validation]20 accuracy: 0.200
[Training][25] loss: 2.865
[Validation][25] loss: 25.293
[Validation]25 accuracy: 0.200
[Training][30] loss: 2.264
[Validation][30] loss: 14.895
[Validation]30 accuracy: 0.200
[Training][35] loss: 0.776
[Validation][35] loss: 7.405
[Validation]35 accuracy: 0.400
[Training][40] loss: 0.520
[Validation][40] loss: 8.717
[Validation]40 accuracy: 0.333
[Training][45] loss: 1.251
[Validation][45] loss: 12.271
[Validation]45 accuracy: 0.300
[Training][50] loss: 0.170
[Validation][50] loss: 2.603
[Validation]50 accuracy: 0.633
[Training][55] loss: 0.093
[Validation][55] loss: 7.425
[Validation]55 accuracy: 0.400
[Training][60] loss: 0.214
[Validatio

In [89]:
# TODO: hack this to get it working
import sys
sys.path.append('/home/tnief/rome')

from rome.experiments.py.demo import demo_model_editing

ModuleNotFoundError: No module named 'rome.experiments.py.demo'

In [87]:
import os

# Assuming '/home/tnief/rome' is the correct path
package_path = '/home/tnief/rome'
print("Contents of the package directory:", os.listdir(package_path))

# If 'experiments' is a directory within 'rome'
experiments_path = os.path.join(package_path, 'experiments')
print("Contents of 'experiments':", os.listdir(experiments_path))

# If 'py' is a directory within 'experiments'
py_path = os.path.join(experiments_path, 'py')
print("Contents of 'py':", os.listdir(py_path))

Contents of the package directory: ['hparams', 'experiments', 'globals.yml', 'README.md', 'dsets', '.gitattributes', 'LICENSE', 'baselines', '.gitignore', 'CITATION.cff', 'util', 'scripts', 'notebooks', '.git', 'rome']
Contents of 'experiments': ['summarize.py', 'py', 'evaluate.py', '__init__.py', 'causal_trace.py', 'sweep.py']
Contents of 'py': ['eval_utils_zsre.py', 'eval_utils_counterfact.py', 'demo.py']
