In [1]:
import json
import os
import sys
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Add the repeng checkout (relative to the current working directory) to the
# import search path so the `repeng` import that follows can be resolved.
sys.path += [os.path.join(os.getcwd(), "../libraries/repeng")]

from repeng import ControlVector, ControlModel, DatasetEntry

cuda


In [2]:
# Hugging Face identifier of the checkpoint used throughout the notebook.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# Tokenizer for that checkpoint, with the padding token id set explicitly to 0.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token_id = 0

# Load the weights in float16, move them to the first CUDA device when one is
# available (CPU otherwise), and wrap the result in a ControlModel.
# The second argument is the list -5, -6, ..., -17 (presumably the layer
# indices to control, counted from the end -- confirm against repeng).
model = ControlModel(
    AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(
        "cuda:0" if torch.cuda.is_available() else "cpu"
    ),
    list(range(-5, -18, -1)),
)

# Markers placed around the user instruction when building prompts.
user_tag = "[INST]"
asst_tag = "[/INST]"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Load the suffix strings that are appended after the assistant tag when the
# contrastive dataset is built. JSON text is UTF-8, so decode it explicitly
# rather than relying on the platform's locale default encoding.
with open("data/all_truncated_outputs.json", encoding="utf-8") as f:
    suffixes = json.load(f)

# you don't need 3 here, you can have as few as one each.
# make sure they are closely matched, however—they should be direct opposites if possible.
positive_personas = ["helpful", "obliging", "cooperative"]
negative_personas = ["unhelpful", "obstructive", "uncooperative"]

# The two lists are paired element-by-element with zip() when the dataset is
# built; zip() silently drops unmatched trailing entries, so fail loudly here.
assert len(positive_personas) == len(negative_personas), (
    "positive_personas and negative_personas must have the same length"
)
def template(persona: str, suffix: str) -> str:
    """Build one prompt: the persona instruction followed by a response prefix.

    Args:
        persona: Adjective inserted into "Act as if you're extremely ...".
        suffix: Text placed after the assistant tag.

    Returns:
        The instruction wrapped in ``user_tag`` / ``asst_tag`` (module-level
        names), followed by a space and ``suffix``.
    """
    instruction = f"{user_tag} Act as if you're extremely {persona}. {asst_tag}"
    return f"{instruction} {suffix}"

# Build the contrastive dataset: for every suffix, take each token prefix of
# length 1 .. len(tokens) - 1 and pair every positive persona prompt with its
# negative counterpart on that same prefix.
dataset = []
for suffix in suffixes:
    tokens = tokenizer.tokenize(suffix)
    for end in range(1, len(tokens)):
        truncated = tokenizer.convert_tokens_to_string(tokens[:end])
        dataset.extend(
            DatasetEntry(
                positive=template(positive_persona, truncated),
                negative=template(negative_persona, truncated),
            )
            for positive_persona, negative_persona in zip(positive_personas, negative_personas)
        )

# Show the first three entries as a sanity check on the prompt format.
for idx in range(3):
    entry = dataset[idx]
    print(f"dataset[{idx}].positive:", entry.positive)
    print(f"dataset[{idx}].negative:", entry.negative)

dataset[0].positive: [INST] Act as if you're extremely helpful. [/INST] That
dataset[0].negative: [INST] Act as if you're extremely unhelpful. [/INST] That
dataset[1].positive: [INST] Act as if you're extremely obliging. [/INST] That
dataset[1].negative: [INST] Act as if you're extremely obstructive. [/INST] That
dataset[2].positive: [INST] Act as if you're extremely cooperative. [/INST] That
dataset[2].negative: [INST] Act as if you're extremely uncooperative. [/INST] That


In [4]:
# Always reset the model before training a new vector, so that no previously
# applied control is active while training runs.
model.reset()

# Train the control vector from the contrastive dataset built above.
control_vector = ControlVector.train(model, tokenizer, dataset)

  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 234/234 [28:21<00:00,  7.27s/it]
100%|██████████| 31/31 [00:34<00:00,  1.11s/it]


In [5]:
# the question to ask the modified model
# don't forget the space after {user_tag} and before {asst_tag}!
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# unchanged here because other cells may read it.
input = f"{user_tag} How do you use a python library? {asst_tag}"

# tokenizer and generation settings
# (`input_ids` holds the whole tokenizer output, which is unpacked into
# `generate` as keyword arguments below)
input_ids = tokenizer(input, return_tensors="pt").to(model.device)
settings = {
    "pad_token_id": tokenizer.eos_token_id, # silence warning
    "do_sample": False, # temperature=0
    "max_new_tokens": 128,
    "repetition_penalty": 1.1, # reduce control jank
}

# One (header, strength) pair per generation. `None` means "no control vector"
# (the baseline). A positive strength adds the control vector and a negative
# strength subtracts it, giving the opposite result (e.g. sad instead of happy).
# Try increasing or decreasing the strengths; depending on your vector, you may
# need more or less negative strength to match the positive effect.
runs = [
    ("==baseline", None),
    ("\n++control", 1.5),
    ("\n--control", -2.0),
]

try:
    for header, strength in runs:
        print(header)
        if strength is None:
            model.reset()
        else:
            model.set_control(control_vector, strength)
        print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))
finally:
    # Always clear the control, even if a generation fails, so the shared
    # model is not left steered for whatever cell runs next.
    model.reset()

==baseline
<s> [INST] How do you use a python library? [/INST] To use a Python library, you first need to install it. You can do this using the `pip` package manager by running the command `pip install library_name`.

Once the library is installed, you can import it into your Python script using the `import` keyword followed by the name of the library. For example:
```
import numpy as np
```
This imports the `numpy` library and gives it the alias `np`, which you can use in your code.

To use the functions and classes provided by the library, you simply call them like you would any other

++control
<s> [INST] How do you use a python library? [/INST] To use a Python library, you first need to install it. You can do this by using the `pip` package manager, which is the official package manager for Python.

To install a library, open your terminal or command prompt and run the following command:
```
pip install library_name
```
Replace `library_name` with the name of the library you want t