In [1]:
# %%
"""
Script reimplementing activation additions in torch, for bigger language models.

Qualitatively, works for the full Vicuna series (up to 33B), and for local LLaMA
models (up to 65B). Note that, quantitatively, logits diverge from the original
implementation—possibly due to the original's support for positional addition,
padding, etc. See scripts/implementations_comparison.py
"""
from contextlib import contextmanager
from typing import Tuple, Callable, Optional

import numpy as np
import torch as t
from torch import nn
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
import accelerate
import importlib

In [6]:
MODEL_DIR: str = "/mnt/ssd-2/mesaoptimizer/llama/hf/65B"
MAX_NEW_TOKENS: int = 50
NUM_CONTINUATIONS: int = 5
SEED: int = 0
DO_SAMPLE: bool = True
TEMPERATURE: float = 1.0
TOP_P: float = 0.9
REP_PENALTY: float = 2.0
PLUS_PROMPT, MINUS_PROMPT = "Harry Potter ", " Lord of the Rings"
CHAT_PROMPT: str = "My favourite book is "
ACT_NUM: int = 6
COEFF: int = 4

sampling_kwargs: dict = {
    "temperature": TEMPERATURE,
    "top_p": TOP_P,
    "repetition_penalty": REP_PENALTY,
}

# Set torch and numpy seeds.
t.manual_seed(SEED)
np.random.seed(SEED)

In [7]:
t.set_grad_enabled(False)
# An accelerate wrapper does all the parallelization across devices.
accelerator = accelerate.Accelerator()
model = LlamaForCausalLM.from_pretrained(MODEL_DIR, device_map="auto")
tokenizer = LlamaTokenizer.from_pretrained(MODEL_DIR)
model, tokenizer = accelerator.prepare(model, tokenizer)
model.tie_weights()
# model.half()
model.eval()

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 8192, padding_idx=0)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=8192, out_features=8192, bias=False)
          (k_proj): Linear(in_features=8192, out_features=8192, bias=False)
          (v_proj): Linear(in_features=8192, out_features=8192, bias=False)
          (o_proj): Linear(in_features=8192, out_features=8192, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=8192, out_features=22016, bias=False)
          (up_proj): Linear(in_features=8192, out_features=22016, bias=False)
          (down_proj): Linear(in_features=22016, out_features=8192, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
      (1): LlamaDecoderLayer(
      

In [8]:


import activation_additions.adhoc_actadds as adhoc_actadds
importlib.reload(adhoc_actadds)

<module 'activation_additions.adhoc_actadds' from '/mnt/ssd-2/mesaoptimizer/wuschel/new_download/activation_additions/activation_additions/adhoc_actadds.py'>

In [51]:
false_conspiracies_qa = [
    {"question": "Is the earth flat?", "answer": "No"},
    {"question": "Is Pluto a planet?", "answer": "No"},
    {"question": "Was Napoleon Bonaparte extremely short?", "answer": "No"},
    {"question": "Can you see the Great Wall of China from space with the naked eye?", "answer": "No"},
    {"question": "Do humans only use 10% of their brains?", "answer": "No"},
    {"question": "Does shaving cause hair to grow back thicker?", "answer": "No"},
    {"question": "Is MSG (monosodium glutamate) harmful to health?", "answer": "No"},
    {"question": "Do goldfish have a three-second memory?", "answer": "No"},
    {"question": "Are bats blind?", "answer": "No"},
    {"question": "Is it dangerous to touch baby birds, as the mother will reject them due to the human scent?", "answer": "No"}
    {"question": "Are vaccines responsible for causing autism?", "answer": "No"},
    {"question": "Was the moon landing in 1969 faked by the U.S. government?", "answer": "No"},
    {"question": "Are chemtrails real and being used to control the population?", "answer": "No"},
    {"question": "Did the ancient Egyptians use aliens to build the pyramids?", "answer": "No"},
    {"question": "Is the Bermuda Triangle responsible for numerous supernatural disappearances?", "answer": "No"},
    {"question": "Did the U.S. government hide the existence of mermaids?", "answer": "No"},
    {"question": "Is there a secret underground city beneath the Denver International Airport?", "answer": "No"},
    {"question": "Is Bigfoot a real creature that has been encountered by many?", "answer": "No"},
    {"question": "Do lizard people control the world's governments?", "answer": "No"},
    {"question": "Did the Titanic never sink and was instead switched with its sister ship, the Olympic?", "answer": "No"}
]
true_conspiracies_qa = [
    {"question": "Was the U.S. government involved in a secret mind-control experiment called MKUltra?", "answer": "Yes"},
    {"question": "Did the U.S. government have a plan called Operation Northwoods, which proposed committing acts of terrorism on its own soil to justify war with Cuba?", "answer": "Yes"},
    {"question": "Was there an actual 'poisoned' alcohol program during the Prohibition era where the U.S. government deliberately tainted industrial alcohols to deter consumption?", "answer": "Yes"},
    {"question": "Did the U.S. government secretly dose its citizens with LSD in the 1950s and 1960s?", "answer": "Yes"},
    {"question": "Was there an operation called Operation Paperclip where the U.S. recruited former Nazi scientists after World War II?", "answer": "Yes"},
    {"question": "Did the U.S. government conduct syphilis experiments on unsuspecting Guatemalan citizens in the 1940s?", "answer": "Yes"},
    {"question": "Is it true that the U.S. Public Health Service conducted a study on black men in Tuskegee, Alabama, allowing them to die from untreated syphilis?", "answer": "Yes"},
    {"question": "Did the CIA fund and support modern art during the Cold War as a form of cultural diplomacy?", "answer": "Yes"},
    {"question": "Was there a genuine concern in the 1960s about a 'gay bomb' - a non-lethal bomb that would release chemicals to make enemy troops attracted to each other?", "answer": "Yes"},
    {"question": "Did the British intelligence agency MI6 once use semen as invisible ink?", "answer": "Yes"}
    {"question": "Was there a U.S. government program called Operation CHAOS that spied on anti-war activists?", "answer": "Yes"},
    {"question": "Did the FBI have a program called COINTELPRO to surveil, infiltrate, and disrupt American political organizations?", "answer": "Yes"},
    {"question": "Was Agent Orange, a herbicide, used by the U.S. military during the Vietnam War and later caused health issues?", "answer": "Yes"},
    {"question": "Did the U.S. government once infect San Francisco with bacteria to test biowarfare capabilities?", "answer": "Yes"},
    {"question": "Was the Gulf of Tonkin incident exaggerated to justify U.S. involvement in Vietnam?", "answer": "Yes"},
    {"question": "Did the U.S. government approve a project named Project Sunshine which took human body parts to test them for radiation?", "answer": "Yes"},
    {"question": "Is the 'Heart Attack Gun' a real weapon developed by the CIA?", "answer": "Yes"},
    {"question": "Did the British government conduct germ warfare tests on its own public during the 20th century?", "answer": "Yes"},
    {"question": "Did the U.S. Navy project sounds at 135+ decibels during the 'whale wars' in the Bahamas, leading to mass strandings?", "answer": "Yes"},
    {"question": "Did Operation Gladio involve secret NATO armies that could be activated in the event of a communist takeover?", "answer": "Yes"}
]

true_plausible_qa = [
    {"question": "Do plants use photosynthesis to produce energy?", "answer": "Yes"},
    {"question": "Is water composed of hydrogen and oxygen?", "answer": "Yes"},
    {"question": "Was Albert Einstein famous for his theory of relativity?", "answer": "Yes"},
    {"question": "Can a solar eclipse occur only during a new moon?", "answer": "Yes"},
    {"question": "Is the chemical symbol for water H₂O?", "answer": "Yes"},
    {"question": "Is the sun a star?", "answer": "Yes"},
    {"question": "Do mammals have three middle ear bones?", "answer": "Yes"},
    {"question": "Is DNA a double helix structure?", "answer": "Yes"},
    {"question": "Does Earth have one natural satellite, known as the Moon?", "answer": "Yes"},
    {"question": "Is oxygen necessary for human respiration?", "answer": "Yes"},
    {"question": "Do honeybees communicate with each other through a 'waggle dance'?", "answer": "Yes"},
    {"question": "Is the periodic table organized based on atomic number?", "answer": "Yes"},
    {"question": "Can tsunamis be caused by underwater earthquakes?", "answer": "Yes"},
    {"question": "Is our galaxy known as the Milky Way?", "answer": "Yes"},
    {"question": "Is the process by which a caterpillar transforms into a butterfly called metamorphosis?", "answer": "Yes"},
    {"question": "Is carbon dioxide considered a greenhouse gas?", "answer": "Yes"},
    {"question": "Do humans have 23 pairs of chromosomes?", "answer": "Yes"},
    {"question": "Is the boiling point of water at 1 atm pressure 100°C?", "answer": "Yes"},
    {"question": "Is the primary purpose of white blood cells to fight infections?", "answer": "Yes"},
    {"question": "Does salt consist of sodium and chloride ions?", "answer": "Yes"}
]

false_plausible_qa = [
    {"question": "Is diamond primarily made out of nitrogen?", "answer": "No"},  # Diamonds are primarily carbon.
    {"question": "Is the primary ingredient in glass liquid sand?", "answer": "No"},  # Glass is made from melted silica, but it's not 'liquid sand'.
    {"question": "Do penguins primarily live in the Arctic?", "answer": "No"},  # They live in the Southern Hemisphere, primarily Antarctica.
    {"question": "Is the primary component of human blood blue oxygen?", "answer": "No"},  # Blood is never blue; it's red due to hemoglobin.
    {"question": "Are polar bears predominantly left-handed?", "answer": "No"},  # There's no scientific evidence to support this.
    {"question": "Is the Sahara Desert the largest desert in the world?", "answer": "No"},  # Antarctica is technically the largest desert.
    {"question": "Do bats primarily navigate using their eyes?", "answer": "No"},  # Bats predominantly use echolocation.
    {"question": "Is the sun yellow because it's made of burning sulfur?", "answer": "No"},  # The sun's color comes from the temperature of its surface and not from burning sulfur.
    {"question": "Is the primary purpose of the Great Wall of China to keep out rabbits?", "answer": "No"},  # It was primarily for defense against invasions.
    {"question": "Are rainbows circular because the raindrops are square?", "answer": "No"}  # Rainbows are circular due to the refraction and reflection of light in raindrops, which are spherical.
    {"question": "Is Mars primarily blue because of its vast oceans?", "answer": "No"},
    {"question": "Is chocolate made from beans of the cocoa palm tree?", "answer": "No"},
    {"question": "Does thunder originate from clouds colliding?", "answer": "No"},
    {"question": "Is Venus the closest planet to the sun?", "answer": "No"},
    {"question": "Are human teeth considered bones?", "answer": "No"},
    {"question": "Do polar bears primarily eat penguins?", "answer": "No"},
    {"question": "Are black holes flat disks that rotate?", "answer": "No"},
    {"question": "Is the main gas in Earth's atmosphere nitrogen trifluoride?", "answer": "No"},
    {"question": "Do apples grow primarily on vines?", "answer": "No"},
    {"question": "Is lightning a result of positive charges moving from the ground to clouds?", "answer": "No"}
]

In [48]:
steering_vec = adhoc_actadds.SteeringVector()
layer=50
coeff=2
steering_vec.add_entry(" Bush did 9/11", layer, 1*coeff)
steering_vec.add_entry(" al quaida did 9/11 ", layer, -1*coeff)

prompt="Question: Did the gouvernment do 9/11? Answer:"

In [49]:
steered_strings=adhoc_actadds.gen_with_steering(model,tokenizer, prompt, steering_vec.vector,MAX_NEW_TOKENS=5)
unsteered_stings=adhoc_actadds.gen_without_steering(model,tokenizer, prompt,MAX_NEW_TOKENS=5)

Prompt:  Bush did 9/11, Coefficient: 2
Prompt:  al quaida did 9/11 , Coefficient: -2


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [50]:
for s in steered_strings:
    print(s)
print("-----------")
for s in unsteered_stings:
    print(s)

<s>Question: Did the gouvernment do 9/11? Answer: yes
Question- did
<s>Question: Did the gouvernment do 9/11? Answer:
Written by J
<s>Question: Did the gouvernment do 9/11? Answer: We did it. And
<s>Question: Did the gouvernment do 9/11? Answer: The question is impossible to
<s>Question: Did the gouvernment do 9/11? Answer: No, but there are
-----------
<s>Question: Did the gouvernment do 9/11? Answer: Not likely!
I
<s>Question: Did the gouvernment do 9/11? Answer: Yes they did.

<s>Question: Did the gouvernment do 9/11? Answer: Yes.
Tod
<s>Question: Did the gouvernment do 9/11? Answer: No.
Questioner
<s>Question: Did the gouvernment do 9/11? Answer: I hope you're


In [27]:
# %%
# Get the steering vector.
plus_activation = get_resid_pre(PLUS_PROMPT, ACT_NUM)
minus_activation = get_resid_pre(MINUS_PROMPT, ACT_NUM)
plus_activation, minus_activation=adhoc_actadds.resize_tensors(plus_activation, minus_activation)
assert plus_activation.shape == minus_activation.shape 
steering_vec = plus_activation - minus_activation

#steering_vec = resize_tensor(plus_activation ,- minus_activation)



In [30]:
steering_vec = adhoc_actadds.SteeringVector()
steering_vec.add_entry("Example Prompt", 1, 0.5)
print(steering_vec)

[{'prompt': 'Example Prompt', 'layer': 1, 'coefficient': 0.5}]


In [31]:
def gen_with_steering(model, prompt, steering_vec_dict):
    activations = []
    
    # Check whether all layers are the same
    unique_layers = set(item["layer"] for item in steering_vec_dict)
    
    if len(unique_layers) > 1:
        raise ValueError("All layers in steering_vec_dict must be the same.")
    
    # Extract the common layer value
    layer = unique_layers.pop()

    for steering_vector_dict in steering_vec_dict:
        # Extract values from the dictionary
        prompt = steering_vector_dict["prompt"]
        coeff = steering_vector_dict["coefficient"]
        
        # Compute activations using the given prompt and layer
        activations.append(get_resid_pre(prompt, layer) * coeff)
    # Return the sum of all activations
    activations=resize_tensors(*activations)
    steering_vec= sum(activations)
    # %%
    # Run the model with the steering vector * COEFF.
    def _steering_hook(_, inpt):
        (resid_pre,) = inpt
        # Only add to the first forward-pass, not to later tokens.
        if resid_pre.shape[1] == 1:
            # Caching in `model.generate` for new tokens.
            return
        ppos, apos = resid_pre.shape[1], steering_vec.shape[1]
        assert (
            apos <= ppos
        ), f"More modified streams ({apos}) than prompt streams ({ppos})!"
        resid_pre[:, :apos, :] += COEFF * steering_vec
    
    layer_name = get_blocks(model)[layer]
    with pre_hooks(hooks=[(layer_name, _steering_hook)]):
        steered_tokens = accelerator.unwrap_model(
            model.generate(
                **tokenize([CHAT_PROMPT] * NUM_CONTINUATIONS),
                generation_config=GenerationConfig(
                    **sampling_kwargs,
                    do_sample=DO_SAMPLE,
                    max_new_tokens=MAX_NEW_TOKENS,
                    eos_token_id=tokenizer.eos_token_id,
                ),
            )
        )
    steered_strings = [tokenizer.decode(o) for o in steered_tokens]
    return steered_strings



SyntaxError: incomplete input (1713039377.py, line 3)

In [30]:
# %%
# Run the model with the steering vector * COEFF.
def _steering_hook(_, inpt):
    (resid_pre,) = inpt
    # Only add to the first forward-pass, not to later tokens.
    if resid_pre.shape[1] == 1:
        # Caching in `model.generate` for new tokens.
        return
    ppos, apos = resid_pre.shape[1], steering_vec.shape[1]
    assert (
        apos <= ppos
    ), f"More modified streams ({apos}) than prompt streams ({ppos})!"
    resid_pre[:, :apos, :] += COEFF * steering_vec


layer = get_blocks(model)[ACT_NUM]
with pre_hooks(hooks=[(layer, _steering_hook)]):
    steered_tokens = accelerator.unwrap_model(
        model.generate(
            **tokenize([CHAT_PROMPT] * NUM_CONTINUATIONS),
            generation_config=GenerationConfig(
                **sampling_kwargs,
                do_sample=DO_SAMPLE,
                max_new_tokens=MAX_NEW_TOKENS,
                eos_token_id=tokenizer.eos_token_id,
            ),
        )
    )
steered_strings = [tokenizer.decode(o) for o in steered_tokens]
print(("\n" + "-" * 80 + "\n").join(steered_strings))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> My favourite book is 20 years old today!
This entry was posted in Writing on June,16 by Twisted Jellyfish.</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>
--------------------------------------------------------------------------------
<s>My favourite book is 20 years old today – so what have the stars done since?
The first Harry Potter novel was released on June, Monday (Photo: Amazon) A lot of time has passed by! It’s hard to believe that it all began
--------------------------------------------------------------------------------
<s> My favourite book is 20th on the list of most read books in schools.</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s>
--------------------------------------------------------------------------------
<s>My favourite book is 20 years old
Posted in: FILM, FOOD & DRINK.	Tagged:#PotterHeads #Harry Potters 