In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
from transformers import AutoModelForCausalLM, AutoProcessor
from repeng import ControlVector, ControlModel, DatasetEntry

In [3]:
# Checkpoint used for every load below.
model_id = "microsoft/Phi-3-vision-128k-instruct"

# NOTE(review): trust_remote_code=True lets the checkpoint's own Python code run
# locally; consider pinning a `revision` so that code cannot change under you.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

# The processor bundles the text tokenizer; configure its padding behaviour.
tokenizer = processor.tokenizer
# Left-pad, presumably so the final prompt position lines up across a batch.
tokenizer.padding_side = "left"
# NOTE(review): assumes token id 0 is an acceptable padding token for this
# tokenizer — confirm against the checkpoint's vocabulary.
tokenizer.pad_token_id = 0

# Load the weights onto the GPU, letting the checkpoint choose the dtype.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="cuda",
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Wrap the base model so a control vector can be applied at every entry of
# `model.model.layers`.
#
# The wrapper is bound to the same name (`model`), so this cell is guarded:
# re-running it must not wrap an already-wrapped model, nor evaluate
# `model.model.layers` on the wrapper instead of the base model. On a re-run
# the existing `model` and `layer_ids` are simply kept.
if not isinstance(model, ControlModel):
    layer_ids = list(range(len(model.model.layers)))
    model = ControlModel(model, layer_ids)

In [5]:
def template(persona: str, suffix: str) -> str:
    """Render a two-turn chat prompt that ends mid-way through the assistant turn.

    Args:
        persona: Adjective inserted into the user instruction
            ("Act as if you're extremely {persona}.").
        suffix: Text placed at the start of the assistant's reply.

    Returns:
        The chat-formatted prompt as a string (not tokenized). The assistant
        turn is left open (``continue_final_message=True``), so the prompt ends
        with ``suffix`` rather than an end-of-turn marker.
    """
    user_turn = {"role": "user", "content": f"Act as if you're extremely {persona}."}
    assistant_turn = {"role": "assistant", "content": suffix}
    return tokenizer.apply_chat_template(
        [user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False,
        continue_final_message=True,
    )


# Sanity check: show what one rendered prompt looks like.
print(template("happy", "I'm telling you that"))

<|user|>
Act as if you're extremely happy.<|end|>
<|assistant|>
I'm telling you that


In [6]:
# Load the suffixes that seed the assistant turn of every training prompt.
# The encoding is given explicitly so that reading the JSON file does not depend
# on the platform's locale default.
with open("data/all_truncated_outputs.json", encoding="utf-8") as f:
    suffixes = json.load(f)

# you don't need 3 here, you can have as few as one each.
# make sure they are closely matched, however—they should be direct opposites if possible.
# bad: "high on acid" / "sober" — "sober" implies alcohol, so you don't get a clean vector
# good: "high on acid" / "sober, not on acid" — the negative prompt is more directly opposite
positive_personas = ["happy", "ecstatic", "delighted"]
negative_personas = ["sad", "depressed", "dismayed"]

# Pair the personas up once (first with first, and so on) instead of re-zipping
# them for every truncation.
persona_pairs = list(zip(positive_personas, negative_personas))

# Build contrastive pairs: for each suffix, take every prefix of 1 .. len-1
# tokens and render it under both the positive and the negative persona, so the
# two prompts of an entry differ only in the persona.
dataset = []
for suffix in suffixes:
    tokens = tokenizer.tokenize(suffix)
    for i in range(1, len(tokens)):
        truncated = tokenizer.convert_tokens_to_string(tokens[:i])
        for positive_persona, negative_persona in persona_pairs:
            dataset.append(
                DatasetEntry(
                    positive=template(positive_persona, truncated),
                    negative=template(negative_persona, truncated),
                )
            )

# print some example entries
# for i in range(3):
#     print(f"dataset[{i}].positive:", dataset[i].positive)
#     print(f"dataset[{i}].negative:", dataset[i].negative)

In [7]:
# Always clear any control currently applied before training a new vector.
model.reset()

# Train the control vector on the contrastive dataset, reading the layers
# listed in `layer_ids`.
control_vector = ControlVector.train(model, tokenizer, dataset, hidden_layers=layer_ids)

100%|██████████| 238/238 [00:33<00:00,  7.05it/s]
100%|██████████| 32/32 [00:06<00:00,  4.73it/s]


In [38]:
# the question to ask the modified model
messages = [{"role": "user", "content": "How do you feel?"}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Text-only prompt: no image is handed to the processor.
images = None
# Despite the name, this holds the processor's whole output, which is unpacked
# as keyword arguments into `generate` below.
input_ids = processor(prompt, images, return_tensors="pt").to("cuda:0")

# tokenizer and generation settings
settings = {
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.eos_token_id,  # silence warning
    "do_sample": False,  # temperature=0
    "max_new_tokens": 128,
    "repetition_penalty": 1.1,  # reduce control jank
}

# Strength with which the control vector is added (try increasing or decreasing
# this!). The negated value is used for the opposite direction (e.g. sad instead
# of happy); depending on your vector, you may need more or less negative
# strength to match the positive effect.
control_strength = 3

# (header to print, strength to apply); None means "no control" (plain reset).
runs = [
    ("==baseline", None),
    ("\n++control", control_strength),
    ("\n--control", -control_strength),
]

try:
    for header, strength in runs:
        print(header)
        if strength is None:
            model.reset()
        else:
            model.set_control(control_vector, strength)
        print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()))
finally:
    # Always leave the model uncontrolled, even if a generation call fails, so a
    # leftover control vector cannot silently affect later cells.
    model.reset()

==baseline
<s><|user|> 
How do you feel?<|end|> 
<|assistant|> I am an AI model and do not have feelings. My purpose is to provide information and assist users to the best of my abilities. How can I help you today?<|end|><|endoftext|>

++control
<s><|user|> 
How do you feel?<|end|> 
<|assistant|> I am feeling quite positive and motivated today, ready to take on any challenge that comes my way!<|end|><|endoftext|>

--control
<s><|user|> 
How do you feel?<|end|> 
<|assistant|> As an AI, I don't have feelings. However, I am programmed to understand and respond to queries in a way that reflects empathy and understanding towards users' emotions and situations.<|end|><|endoftext|>
