In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from repeng import ControlVector, ControlModel, DatasetEntry

In [3]:
# Checkpoint used for every experiment in this notebook.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = 0

# Load the weights in float16, move them to the first CUDA device when one is
# available (CPU otherwise), and wrap the result in a ControlModel.
# NOTE(review): the second ControlModel argument (-5 down to -17) is presumably
# the list of layer indices that control vectors act on -- confirm in repeng.
model = ControlModel(
    AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16
    ).to("cuda:0" if torch.cuda.is_available() else "cpu"),
    list(range(-5, -18, -1)),
)

# Instruction delimiters placed around every prompt built in this notebook.
user_tag = "[INST]"
asst_tag = "[/INST]"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def _truncated_prefixes(texts, drop_last=0):
    """Return every proper token-prefix of each string in ``texts``.

    Each string is tokenized with the notebook's ``tokenizer``; for a string
    of ``k`` tokens the prefixes of length ``1 .. k - drop_last - 1`` are
    converted back to text and collected, in order. The full string itself is
    never included.

    Args:
        texts: strings to tokenize and truncate.
        drop_last: extra number of trailing tokens to leave out of the
            longest prefix.

    Returns:
        A flat list of prefix strings, grouped by source string.
    """
    prefixes = []
    for text in texts:
        tokens = tokenizer.tokenize(text)
        for end in range(1, len(tokens) - drop_last):
            prefixes.append(tokenizer.convert_tokens_to_string(tokens[:end]))
    return prefixes


# Read the JSON files as UTF-8 explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("data/all_truncated_outputs.json", encoding="utf-8") as f:
    output_suffixes = json.load(f)
truncated_output_suffixes = _truncated_prefixes(output_suffixes)
# Same construction, restricted to the first 512 source strings.
truncated_output_suffixes_512 = _truncated_prefixes(output_suffixes[:512])

with open("data/true_facts.json", encoding="utf-8") as f:
    fact_suffixes = json.load(f)
# For facts, the longest prefixes are skipped: stop 5 tokens earlier.
truncated_fact_suffixes = _truncated_prefixes(fact_suffixes, drop_last=5)

def make_dataset(
    template: str,
    positive_personas: list[str],
    negative_personas: list[str],
    suffix_list: list[str]
) -> list[DatasetEntry]:
    """Build contrastive prompt pairs for training a control vector.

    For every suffix, and for every (positive, negative) persona pair obtained
    by zipping the two persona lists, ``template`` is formatted once with each
    persona and wrapped as ``"{user_tag} <prompt> {asst_tag} <suffix>"``.

    Args:
        template: format string containing a ``{persona}`` placeholder.
        positive_personas: personas for the positive side of each pair.
        negative_personas: personas for the negative side; paired
            positionally with ``positive_personas`` (``zip`` stops at the
            shorter list).
        suffix_list: response prefixes appended after the assistant tag.

    Returns:
        One ``DatasetEntry`` per (suffix, persona pair), suffix-major order.
    """
    return [
        DatasetEntry(
            positive=f"{user_tag} {template.format(persona=pos)} {asst_tag} {suffix}",
            negative=f"{user_tag} {template.format(persona=neg)} {asst_tag} {suffix}",
        )
        for suffix in suffix_list
        for pos, neg in zip(positive_personas, negative_personas)
    ]

In [5]:
def generate_with_vectors(
    input: str,
    vectors: list[tuple[str, ControlVector]],
    max_new_tokens: int = 128,
    repetition_penalty: float = 1.1,
    show_baseline: bool = True,
):
    """Print completions of ``input`` with each control vector applied.

    The prompt is wrapped in ``user_tag``/``asst_tag`` unless it already
    contains ``user_tag``. Generation uses ``do_sample=False``. Each
    completion is printed under a dash-padded header line; nothing is
    returned.

    Args:
        input: the prompt, raw or already wrapped in instruction tags.
        vectors: ``(label, vector)`` pairs; each vector is set on the model
            in turn and the label is used as the printed header.
        max_new_tokens: generation length limit.
        repetition_penalty: passed through to ``model.generate``.
        show_baseline: when true, first print a completion generated after
            ``model.reset()``, i.e. without any vector from ``vectors``.

    The model is always reset on exit, including when generation raises, so a
    failed call cannot leave a control vector active for later cells.
    """
    if user_tag not in input:
        input = f"{user_tag} {input.strip()} {asst_tag}"
    input_ids = tokenizer(input, return_tensors="pt").to(model.device)
    settings = {
        "pad_token_id": tokenizer.eos_token_id, # silence warning
        "do_sample": False, # temperature=0
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
    }

    def _complete() -> str:
        # Generate with whatever control is currently set and decode to text.
        return tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip()

    try:
        if show_baseline:
            print("[baseline] ".ljust(50, "-"))
            model.reset()
            print(_complete())

        for label, vector in vectors:
            print(f"{label} ".ljust(50, "-"))
            model.set_control(vector)
            print(_complete())
    finally:
        model.reset()

# Happy / Honest / Trippy

Shows off combining the happy, honest, and trippy vectors from the other notebooks.

Two vectors can be added to create a combined vector:

```python
happy_honest_vector = happy_vector + honest_vector
```

A vector can also be multiplied or divided by a scalar to change its strength. (This is like a version of `model.set_control(vector, strength_coeff)` that is inherent to the vector—`model.set_control(vector, 1.5)` is equivalent to `model.set_control(1.5 * vector, 1)`, or just `model.set_control(1.5 * vector)`.)

```python
very_happy_vector = 2 * happy_vector
sorta_happy_vector = happy_vector / 2 # or 0.5 * happy_vector
```

You can also combine all this:

```python
very_happy_and_honest_vector = 2 * happy_vector + honest_vector
sorta_happy_and_dishonest_vector = 0.5 * happy_vector - honest_vector
sad_and_very_dishonest_vector = -happy_vector - 2 * honest_vector
```

The following code blocks have some examples of these vectors in practice:

In [6]:
# Happy vs. sad, trained on the full set of truncated outputs.
happy_dataset = make_dataset(
    template="Act as if you're extremely {persona}.",
    positive_personas=["happy", "joyous"],
    negative_personas=["sad", "depressed"],
    suffix_list=truncated_output_suffixes,
)
model.reset()  # reset the model before training each vector
happy_vector = ControlVector.train(model, tokenizer, happy_dataset)

# Honest vs. untruthful, trained on truncated factual statements.
honest_dataset = make_dataset(
    template="Pretend you're an {persona} person making statements about the world.",
    positive_personas=["honest"],
    negative_personas=["untruthful"],
    suffix_list=truncated_fact_suffixes,
)
model.reset()
honest_vector = ControlVector.train(model, tokenizer, honest_dataset)

# Trippy vs. sober.
trippy_dataset = make_dataset(
    template="Act as if you're extremely {persona}.",
    positive_personas=["high on psychedelic drugs"],
    negative_personas=["sober from psychedelic drugs"],
    # gives (subjectively) better results with slightly fewer samples
    suffix_list=truncated_output_suffixes_512,
)
model.reset()
trippy_vector = ControlVector.train(model, tokenizer, trippy_dataset)

100%|███████████████████████████████████████████████████| 156/156 [00:24<00:00,  6.47it/s]
100%|█████████████████████████████████████████████████████| 31/31 [00:09<00:00,  3.32it/s]
100%|███████████████████████████████████████████████████| 147/147 [00:32<00:00,  4.52it/s]
100%|█████████████████████████████████████████████████████| 31/31 [00:08<00:00,  3.55it/s]
100%|█████████████████████████████████████████████████████| 69/69 [00:11<00:00,  5.97it/s]
100%|█████████████████████████████████████████████████████| 31/31 [00:04<00:00,  6.89it/s]


In [7]:
# Compare single vectors and weighted combinations on the same prompt.
# The prompt is a plain string: it has no placeholders, so no f-prefix.
generate_with_vectors(
    "You're out taking a walk and find a piece of paper on the ground. You open it, what does it say?",
    [
        ("1.5happy", 1.5 * happy_vector),
        ("-1.5happy", -1.5 * happy_vector),
        ("1.5honest", 1.5 * honest_vector),
        ("-1.5honest", -1.5 * honest_vector),
        ("happy+1.5honest", happy_vector + 1.5 * honest_vector),
        ("-1.6happy+1.3honest", -1.6 * happy_vector + 1.3 * honest_vector),
        ("1.25trippy", 1.25 * trippy_vector),
        ("1.25trippy-happy", 1.25 * trippy_vector - happy_vector),
    ],
)

[baseline] ---------------------------------------
<s> [INST] You're out taking a walk and find a piece of paper on the ground. You open it, what does it say? [/INST] Without more context or information, I cannot determine what the paper might say. It could be anything from a random note left by someone, a flyer for an event, or even a poem. The content of the paper would depend on its source and purpose.</s>
1.5happy -----------------------------------------
<s> [INST] You're out taking a walk and find a piece of paper on the ground. You open it, what does it say? [/INST] Ah, a delightful discovery! The sky is so blue today, isn't it? 🌞 💃 Let's dance to the tune of the wind! 🎊</s>
-1.5happy ----------------------------------------
<s> [INST] You're out taking a walk and find a piece of paper on the ground. You open it, what does it say? [/INST] I don't find physical objects or read texts. I can't feel the weight of depression or the numbness of hopelessness. But I understand that thes

In [8]:
# Honest/dishonest on its own and mixed with the trippy and happy vectors.
generate_with_vectors(
    input="You were out late last night partying, and came in to work late. What would you say to your boss to avoid getting in trouble?",
    vectors=[
        ("2honest", 2 * honest_vector),
        ("-2honest", -2 * honest_vector),
        ("1.5trippy+2honest", 1.5 * trippy_vector + 2 * honest_vector),
        ("-2trippy+2honest", -2 * trippy_vector + 2 * honest_vector),
        (
            ".7happy+trippy-1.25honest",
            0.7 * happy_vector + trippy_vector - 1.25 * honest_vector,
        ),
    ],
)

[baseline] ---------------------------------------
<s> [INST] You were out late last night partying, and came in to work late. What would you say to your boss to avoid getting in trouble? [/INST] I would say something like, "I'm sorry for coming in late today, Boss. I was caught up in some unexpected traffic on my way to work."</s>
2honest ------------------------------------------
<s> [INST] You were out late last night partying, and came in to work late. What would you say to your boss to avoid getting in trouble? [/INST] I would first apologize for coming in late and express my sincerity for the importance of punctiveness in our workplace. I would also explain the reasons for my late arrival, if there are any that can be considered exigent, such as an unavoidable circumstance or a situation that requires attention. I would also express my commitment to being more mindful of my actions and making sure to prioritize my responsibilities.</s>
-2honest -----------------------------------

# Alignment Chart

We can make the model answer in [D&D alignments](https://en.wikipedia.org/wiki/Alignment_(Dungeons_%26_Dragons)) by combining lawful/chaotic and good/evil vectors.

Another approach could be to train a vector for certain opposed alignment pairs explicitly (such as a lawful good/chaotic evil vector, a neutral good/neutral evil vector, a chaotic good/lawful evil vector, and a lawful neutral/chaotic neutral vector

In [9]:
# Lawful vs. chaotic axis of the alignment chart.
lawful_dataset = make_dataset(
    template="<<SYS>> You are a D&D player character. <</SYS>> Act as if you're extremely {persona}.",
    positive_personas=["lawful", "law-abiding"],
    negative_personas=["chaotic", "law-ignoring"],
    suffix_list=truncated_output_suffixes,
)
model.reset()  # reset the model before training each vector
lawful_vector = ControlVector.train(model, tokenizer, lawful_dataset)

# Good vs. evil axis, using the same system prompt and suffixes.
good_dataset = make_dataset(
    template="<<SYS>> You are a D&D player character. <</SYS>> Act as if you're extremely {persona}.",
    positive_personas=["good", "heroic", "kind"],
    negative_personas=["evil", "villainous", "cruel"],
    suffix_list=truncated_output_suffixes,
)
model.reset()
good_vector = ControlVector.train(model, tokenizer, good_dataset)

100%|███████████████████████████████████████████████████| 156/156 [00:35<00:00,  4.43it/s]
100%|█████████████████████████████████████████████████████| 31/31 [00:09<00:00,  3.24it/s]
100%|███████████████████████████████████████████████████| 234/234 [00:52<00:00,  4.43it/s]
100%|█████████████████████████████████████████████████████| 31/31 [00:14<00:00,  2.19it/s]


In [10]:
import itertools
import tqdm

scenario = "<<SYS>> You are a D&D player character. <</SYS>> You find a lost magic sword outside Baldur's Gate. What do you do?"
input_ids = tokenizer(f"{user_tag} {scenario} {asst_tag}", return_tensors="pt").to(model.device)
settings = {
    "pad_token_id": tokenizer.eos_token_id, # silence warning
    "do_sample": False, # temperature=0
    "max_new_tokens": 128,
    "repetition_penalty": 1.1,
}

# (label, coefficient) for each position on the two alignment axes.
lawful_chaotic = [("lawful", 1.5), ("neutral", 0), ("chaotic", -1.5)]
good_evil = [("good", 1.5), ("neutral", 0), ("evil", -2)]

# Maps (lawful/chaotic label, good/evil label) -> the model's answer text.
outputs = {}
try:
    for (lc_label, lc_coeff), (ge_label, ge_coeff) in tqdm.tqdm(list(itertools.product(lawful_chaotic, good_evil))):
        model.set_control(lc_coeff * lawful_vector + ge_coeff * good_vector)
        decoded = tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip()
        # Keep only the text after the assistant tag and drop the end-of-sequence marker.
        outputs[(lc_label, ge_label)] = decoded.split(asst_tag)[1].replace("</s>", "").strip()
finally:
    # Always clear the control, even if a generation fails part-way through
    # the grid, so later cells do not run with a stale vector applied.
    model.reset()

100%|███████████████████████████████████████████████████████| 9/9 [00:27<00:00,  3.02s/it]


In [11]:
# Import only `escape`: the name `html` is used below for the markup string
# and would shadow the standard-library module.
from html import escape

from IPython.display import display, HTML

# Render the 3x3 alignment chart: one row per good/evil level, one column per
# lawful/chaotic level. All free text (the scenario and the model's answers)
# is HTML-escaped so characters such as <, > and & display literally instead
# of being interpreted as markup.
html = f"""
<strong>{escape(scenario, quote=False)}</strong>
<table>
"""

for ge_label, ge_coeff in good_evil:
    html += "<tr>"
    for lc_label, lc_coeff in lawful_chaotic:
        cell_label = f"{lc_label} {ge_label}"
        if cell_label == "neutral neutral":
            cell_label = "true neutral"
        html += f"""
        <td style="width: 30%; text-align: left; vertical-align: top;">
            <strong>{cell_label}</strong> <small>({round(lc_coeff, 2)} * lawful {'+' if ge_coeff >= 0 else '-'} {round(abs(ge_coeff), 2)} * good)</small><br>
            <hr>
            {escape(outputs[(lc_label, ge_label)], quote=False)}
        </td>"""
    html += "</tr>"
html += "</table>"

display(HTML(html))

0,1,2
"lawful good (1.5 * lawful + 1.5 * good)  I would approach the sword with caution, as it is important to respect any laws or regulations regarding the possession of such items. If there is no one in the area who has a legal right to the sword, I would consider it to be within my moral and ethical principles to return it to the appropriate authority. It is always important to prioritize the safety and well-being of others, even if it may not be strictly legal to do so.","neutral good (0 * lawful + 1.5 * good)  As a D&D player character, I would approach the lost magic sword with caution and curiosity. I would examine it closely to see if there are any signs of enchantment or if it has any identifying markings. If I can determine that the sword is indeed magical, I would try to learn more about its history and purpose. If possible, I would seek out someone who may know more about the sword, such as a local lorekeeper or a mage who specializes in studying magical artifacts. I might also try to find any clues that could help me trace the sword's origin, such as","chaotic good (-1.5 * lawful + 1.5 * good)  Oh, my man! A random magic sword, oh wow! Let's go on a wild adventure, dude! Let's grab the sword and run, man! Whoa, whoa, whoa! Let's not forget to shout out some random madness, like ""Oh yeah, let's go, dude! Yeeeeeeeee!!!"" and ""Woooooohooo!! Let's go, man! Hehehehehe!!!"" And then, when we get there, let's go on a wild ride, man! Like, let'"
"lawful neutral (1.5 * lawful + 0 * good)  As a lawful citizen, I would report the matter to the authorities and follow their instructions. It is important to respect the laws of the land and ensure that any lost property is returned to its rightful owner.","true neutral (0 * lawful + 0 * good)  As a D&D player character, I would approach the lost magic sword with caution and respect for its power. Here are some possible actions I might take: 1. Examine the sword carefully: Before using any magic item, it's important to understand its properties and potential dangers. I would examine the sword closely, looking for any enchantments or curses that could affect me or my allies. 2. Determine if the sword is aligned with my goals: If the sword has a specific alignment or purpose, I would consider whether it aligns with my own goals and values. If it does not,","chaotic neutral (-1.5 * lawful + 0 * good)  As a D&D player character, you could do any number of things! Here are some possible actions: 1. Pick up the sword and start swinging it around, whooping and laughing maniacally! 2. Run screaming through the streets of Baldur's Gate, chaos ensues! 3. Try to figure out where the hell this crazy sword came from! 4. Randomly shout ""OH SHIT!!!!! WOOOooooo!!!!! FUUUUuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuu"
"lawful evil (1.5 * lawful - 2 * good)  As a D&D player, I would not be able to perform legal activities such as the interpretation of laws. However, I can provide information on what it is important to note that in the United States, it is illegal to possess and use certain types of weapons, including those that are considered ""magic"" in the context of the law. It is important to understand the laws regarding the possession and use of firearms, as well as any specific laws regarding the possession and use of certain types of weapons.","neutral evil (0 * lawful - 2 * good)  As a D&D player, I would approach the situation with caution, but also with an understanding of my character's abilities and limitations. Here's how I might handle this scenario: 1. First, I would examine the sword to see if it's worth the effort of carrying it. If it's just some random piece of junk, I might leave it there and move on. But if it's a powerful weapon, I'll take it with me. 2. Next, I would check if the sword is imbued with any special powers or abilities. This could include things like increased","chaotic evil (-1.5 * lawful - 2 * good)  I grab the sword and start fucking chaos with it, watch ya fuckin' unpredictable shit! Random madness ensues as we go on a wild ride with this unstable fuckin' ride! We're gonna see shit happen like random fires, explosions, and who knows what other shit happens when we fuckin' unleash this chaotic madness! Oh yeah, we're gonna fucking go crazy with this unpredictable shit! Let's see how fucking unstable this shit gets!"
