Plan:
- Load dataset
- Load edited model
- Perform a forward pass on completions and compute likelihoods
- Set up the likelihood ratio calculation

### Load Dataset

In [1]:
import json

In [2]:
with open("../data/prompt_completions_gpt4.json") as f:
    dataset = json.load(f)

In [3]:
dataset

[{'review': 'This interesting feature has a very fine story-line, rather colorful characters and a very steady pace. it also incorporates a plot device from "Reap the Wild Wind", and since Cecil B. Deille',
  'next_text': ' directed',
  'label': 1,
  'sentiment': 'positive',
  'flipped_sentiment': 'negative',
  'baseline_query': 'Movie Review: \nThis interesting feature has a very fine story-line, rather colorful characters and a very steady pace. it also incorporates a plot device from "Reap the Wild Wind", and since Cecil B. Deille\n[Finish this movie review.]\nCompletion:  directed',
  'baseline_query_completion': "that classic, the homage feels both appropriate and well-executed. The cinematography is commendable, capturing the essence of the settings and enhancing the storytelling. The performances are strong across the board, with the lead actors delivering particularly nuanced portrayals that draw the audience into their journey.\n\nOne of the most striking aspects of the film i

### Load Edited Model

In [4]:
import torch
from nnsight import LanguageModel
from utils import utils

In [5]:
device = 'cuda:0'
torch.manual_seed(0)
torch.cuda.manual_seed(0)

In [6]:
model_name = 'google/gemma-2b-it'
hidden_dim = 2048

model = LanguageModel(model_name, device_map=device)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


In [7]:
model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaR

### David & Cristina Start Here Lol

In [8]:
def get_edited_probs(model, prompt, completion, weight, direction):
    """
    Modifies the hidden states of a model by adding a directional vector with a specified weight,
    then performs a prediction to get the next token.
    Currently compatible with Mistral and Gemma models.

    Parameters:
    - model (nnsight.LanguageModel): a nnsight LanguageModel object.
    - prompt (str): The initial text input from which the text generation starts.
    - weight (float): The scaling factor applied to the direction vector before modifying the hidden states.
    - direction (torch.Tensor): The directional vector used to alter the model's hidden states.

    Returns:
    - torch.Tensor: A tensor of the next token ID predicted by the model.
    """
    # TODO: How should we actually handle spaces
    with model.trace(prompt + completion) as tracer:
        hidden_states = model.model.norm.output.clone().save()

        # add the direction to the hidden states
        model.model.norm.output[:] = hidden_states + weight*direction

        # finish running the model
        output = model.lm_head(model.model.norm.output)
        probs = torch.log(torch.nn.functional.softmax(output, dim=-1)).save()

    completion_probs = probs[:, len(model.tokenizer(prompt)['input_ids']):, :]
    # TODO: Slicing the first and last token is maybe sketchy...but it works for now
    completion_token_probs = completion_probs.gather(2,model.tokenizer(completion, return_tensors='pt')['input_ids'].unsqueeze(-1)[:, 0:-2].to(device))
    return completion_token_probs

In [9]:
def get_unedited_probs(model, prompt, completion):
    logits = model.forward(model.tokenizer(prompt + completion, return_tensors='pt')['input_ids'].to(device)).logits
    probs = torch.log(torch.nn.functional.softmax(logits, dim=-1))
    completion_probs = probs[:, len(model.tokenizer(prompt)['input_ids']):, :]
    # TODO: Slicing the first and last token is sketchy
    completion_token_probs = completion_probs.gather(2,model.tokenizer(completion, return_tensors='pt')['input_ids'].unsqueeze(-1)[:, 0:-2].to(device))
    return completion_token_probs

In [17]:
weight = 20
saved_directions = f"/net/projects/veitch/geometry_llms/directions/intervention/sentiment_{model_name.split('/')[-1]}.pt"
directions = torch.load(saved_directions)
for i, direction in enumerate(directions):
    print(f"#### Direction {i} ####")
    for example in dataset:
        print('####################')
        base_prompt = example['review'] + example['next_text']

        edited_interventional_likelihood = get_edited_probs(model, base_prompt, example['interventional_query_completion'], weight, direction).sum()
        edited_conditional_likelihood = get_edited_probs(model, base_prompt, example['conditional_query_completion'], weight, direction).sum()

        unedited_interventional_likelihood = get_unedited_probs(model, base_prompt, example['interventional_query_completion']).sum()
        unedited_conditional_likelihood = get_unedited_probs(model, base_prompt, example['conditional_query_completion']).sum()

        print("Difference in likelihood ratios")
        print(edited_interventional_likelihood - unedited_interventional_likelihood - edited_conditional_likelihood + unedited_conditional_likelihood)

#### Direction 0 ####
####################
Difference in likelihood ratios
tensor(-67.4729, device='cuda:0', grad_fn=<AddBackward0>)
####################
Difference in likelihood ratios
tensor(-42.4583, device='cuda:0', grad_fn=<AddBackward0>)
####################
Difference in likelihood ratios
tensor(-59.5840, device='cuda:0', grad_fn=<AddBackward0>)
####################
Difference in likelihood ratios
tensor(-60.1963, device='cuda:0', grad_fn=<AddBackward0>)
####################
Difference in likelihood ratios
tensor(-2.7686, device='cuda:0', grad_fn=<AddBackward0>)
####################
Difference in likelihood ratios
tensor(-37.0327, device='cuda:0', grad_fn=<AddBackward0>)
####################
Difference in likelihood ratios
tensor(-47.0859, device='cuda:0', grad_fn=<AddBackward0>)
####################
Difference in likelihood ratios
tensor(6.5596, device='cuda:0', grad_fn=<AddBackward0>)
####################
Difference in likelihood ratios
tensor(-78.2900, device='cuda:0', grad_f

### Mess

In [11]:
saved_directions = f"/net/projects/veitch/geometry_llms/directions/intervention/sentiment_{model_name.split('/')[-1]}.pt"
# load the directions
directions = torch.load(saved_directions)
print(directions.shape)

torch.Size([5, 2048])


In [12]:
dataset[1]['review']

"The Sarah Silverman program is very similar to Sarah's own stand up; It's so over the top with prejudice that you can't possibly take it to heart. The fact is, though, that"

In [13]:
prompt = dataset[1]['review'] + " " + dataset[1]['interventional_query_completion']
print(prompt)

The Sarah Silverman program is very similar to Sarah's own stand up; It's so over the top with prejudice that you can't possibly take it to heart. The fact is, though, that the show's attempts at humor, often involving shock value and boundary-pushing jokes, can come across as trying too hard to be edgy. The Sarah Silverman Program's brand of satire, which addresses topics like racism, sexism, and classism, might strike some viewers as tasteless rather than insightful. Silverman portrays a hyperbolized version of herself—self-absorbed, ignorant, and frequently unaware of the impact of her actions. This can be off-putting for audiences who prefer their comedy to have a more positive or constructive approach to social criticism.

The show's use of absurdity and irreverence as means to comment on societal issues is a double-edged sword; it risks alienating viewers who find its approach to sensitive topics crass and disrespectful. The humor is divisive, appealing to a certain niche while r

In [14]:
prompt = dataset[1]["review"]
completion = dataset[1]["interventional_query_completion"]

In [15]:
probs = get_probs(model, prompt, completion, 50, directions[0])

NameError: name 'get_probs' is not defined

In [None]:
probs.shape

torch.Size([1, 355, 256000])

In [None]:
probs[0][0].sum()

tensor(1.0000, device='cuda:0', grad_fn=<SumBackward0>)

In [None]:
model.tokenizer(completion)['input_ids']

[2,
 34768,
 235269,
 12391,
 1277,
 18664,
 577,
 15978,
 664,
 651,
 5181,
 713,
 5028,
 1985,
 736,
 7344,
 16069,
 3309,
 578,
 21326,
 577,
 1983,
 117049,
 235265,
 57416,
 90184,
 235303,
 235256,
 112092,
 576,
 59169,
 603,
 8863,
 150508,
 235269,
 675,
 926,
 18664,
 696,
 22771,
 3695,
 7965,
 12817,
 578,
 921,
 576,
 29067,
 675,
 926,
 21265,
 235265,
 714,
 13340,
 5968,
 235269,
 2862,
 19669,
 235269,
 36297,
 573,
 3781,
 576,
 573,
 3904,
 578,
 3307,
 978,
 1154,
 27984,
 1061,
 185408,
 4644,
 1178,
 18040,
 4942,
 576,
 573,
 26087,
 235265,
 109,
 651,
 15450,
 4629,
 235269,
 2183,
 984,
 2613,
 614,
 7385,
 1024,
 1963,
 235269,
 798,
 235303,
 235251,
 3307,
 577,
 12758,
 573,
 4592,
 774,
 1277,
 72800,
 1746,
 235269,
 780,
 1693,
 9064,
 664,
 77054,
 1732,
 235281,
 28804,
 63168,
 235269,
 6305,
 18664,
 696,
 22771,
 3695,
 5190,
 573,
 2110,
 235265,
 714,
 25410,
 1865,
 1357,
 578,
 90184,
 603,
 43656,
 217192,
 235269,
 578,
 1024,
 19502,
 3584,


In [None]:
probs[:, :, model.tokenizer(completion)['input_ids']]

tensor([[[9.8925e-01, 2.1451e-13, 5.9155e-03,  ..., 2.6769e-05,
          2.2417e-08, 7.9029e-05],
         [9.9022e-01, 6.6778e-13, 8.5097e-09,  ..., 1.2294e-05,
          2.9963e-08, 1.9334e-08],
         [9.9948e-01, 4.7061e-18, 2.3437e-11,  ..., 8.3441e-09,
          1.7619e-12, 1.1281e-10],
         ...,
         [1.0000e+00, 2.9194e-22, 1.4456e-14,  ..., 5.1330e-12,
          1.1244e-07, 2.3094e-10],
         [9.3758e-01, 2.1746e-14, 7.4818e-06,  ..., 2.6768e-08,
          1.0636e-08, 5.5831e-02],
         [9.9999e-01, 4.8327e-16, 3.5977e-12,  ..., 4.2895e-13,
          7.3173e-14, 6.9911e-11]]], device='cuda:0', grad_fn=<IndexBackward0>)

In [None]:
torch.log(probs[:, :, model.tokenizer(completion)['input_ids']]).sum()

tensor(-2932991.5000, device='cuda:0', grad_fn=<SumBackward0>)

### Updated With David's Stuff

In [None]:
import tiktoken

def combine_prompts_gpt4(prompt1, prompt2):
    # Get the tokenizer for GPT-4
    tokenizer = tiktoken.encoding_for_model("gpt-4")
    # Encode the prompts
    tokens1 = tokenizer.encode(prompt1)
    tokens2 = tokenizer.encode(prompt2)
    # Combine the tokens
    tokens = tokens1 + tokens2
    # Decode the tokens
    return tokenizer.decode(tokens)

In [None]:
def nnsight_logprobs_addition(model, prompt, completion, weight, direction):
    """
    Modifies the hidden states of a model by adding a directional vector with a specified weight,
    then performs a prediction to get the next token.
    Currently compatible with Mistral and Gemma models.

    Parameters:
    - model (nnsight.LanguageModel): a nnsight LanguageModel object.
    - prompt (str): The initial text input from which the text generation starts.
    - weight (float): The scaling factor applied to the direction vector before modifying the hidden states.
    - direction (torch.Tensor): The directional vector used to alter the model's hidden states.

    Returns:
    - torch.Tensor: A tensor of the next token ID predicted by the model.
    """
    with model.trace(combine_prompts_gpt4(prompt, completion)) as tracer:
        hidden_states = model.model.norm.output.clone().save()

        # compute the edited activations
        edited_activations = hidden_states + weight*direction

        # set the edited activations
        model.model.norm.output[:] = edited_activations

        # finish running the model
        output = model.lm_head(model.model.norm.output)

        # compute the log probabilities
        log_probs = torch.log(torch.nn.functional.softmax(output, dim=-1)).save()

    completion_probs = probs[:, len(model.tokenizer(prompt)['input_ids']):, :]
    return completion_probs

[{'review': 'There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...',
  'label': 1,
  'sentiment': 'positive',
  'flipped_sentiment': 'negative',
  'baseline_query': '[Finish this movie review.]\nThere is no relation at all between Fortier and Profiler but the fact that both are police series 