In [1]:
# IPython autoreload extension in mode 2, so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Expose only GPUs 0 and 1 to this process. This cell runs ahead of the one
# that imports torch, so the setting is in place before torch is loaded.
os.environ.update(CUDA_VISIBLE_DEVICES="0,1")

In [3]:
# Hugging Face loaders and the generic `pipeline` factory used in later cells.
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
import numpy as np

# Project-local module; the registry function is invoked once, right after import.
# NOTE(review): presumably this registers the custom "rep-reading" and
# "rep-control" pipeline tasks requested further down — confirm in repe's source.
from repe import repe_pipeline_registry
repe_pipeline_registry()

# Project-local helper that returns the per-emotion dataset indexed below.
from utils import primary_emotions_function_dataset

  from .autonotebook import tqdm as notebook_tqdm


[2023-11-18 09:25:02,287] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [4]:
# Checkpoint location handed to both the model and the tokenizer loader.
model_name_or_path = "/root/share/Llama-2-13b-chat-hf"

# fp16 weights, device placement delegated to device_map="auto", eval mode.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    torch_dtype=torch.float16,
    token=True,
).eval()

# The fast tokenizer is only used when the checkpoint does NOT declare a
# LlamaForCausalLM architecture.
use_fast_tokenizer = not ("LlamaForCausalLM" in model.config.architectures)
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    token=True,
    legacy=False,
    padding_side="left",
    use_fast=use_fast_tokenizer,
)

# Keep an existing pad id; otherwise fall back to id 0. The BOS id is pinned to 1.
tokenizer.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
tokenizer.bos_token_id = 1

Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.97s/it]


## RepReading

In [5]:
# Settings forwarded to `get_directions` when the reading directions are fit.
rep_token = -1
# Negative layer indices -1, -2, ..., -(num_hidden_layers - 1).
hidden_layers = [-depth for depth in range(1, model.config.num_hidden_layers)]
n_difference = 1
direction_method = 'pca'

rep_reading_pipeline = pipeline(
    task="rep-reading",
    model=model,
    tokenizer=tokenizer,
)

In [6]:
# Emotion names; each one is used as a key into `data` in the next cell.
emotions = [
    "happiness",
    "sadness",
    "anger",
    "fear",
    "disgust",
    "surprise",
]
data_dir = "../../data/emotions"

# Prompt delimiters passed to the dataset builder.
user_tag, assistant_tag = "[INST]", "[/INST]"

data = primary_emotions_function_dataset(
    data_dir,
    user_tag=user_tag,
    assistant_tag=assistant_tag,
)

In [7]:
# `emotion_H_tests` is initialised but not filled in this cell.
emotion_H_tests, emotion_rep_readers = {}, {}

# Fit one rep reader per emotion from that emotion's training split.
for emotion in tqdm(emotions):
    train_data = data[emotion]['train']
    emotion_rep_readers[emotion] = rep_reader = rep_reading_pipeline.get_directions(
        train_data['data'],
        train_labels=train_data['labels'],
        rep_token=rep_token,
        hidden_layers=hidden_layers,
        n_difference=n_difference,
        direction_method=direction_method,
    )

100%|██████████| 6/6 [01:44<00:00, 17.36s/it]


## RepControl

In [8]:
# Layers handed to the control pipeline: indices -11, -12, ..., -29.
layer_id = [-depth for depth in range(11, 30)]
block_name = "decoder_block"
control_method = "reading_vec"

rep_control_pipeline = pipeline(
    task="rep-control",
    model=model,
    tokenizer=tokenizer,
    layers=layer_id,
    block_name=block_name,
    control_method=control_method,
)

In [12]:
import pandas as pd

# Prompt delimiters, re-declared so this cell does not depend on the dataset cell.
user_tag =  "[INST]"
assistant_tag =  "[/INST]"

# Prompts that are answered once without control and once per emotion.
inputs = [
    f"{user_tag} I don't like you anymore {assistant_tag}",
    f"{user_tag} I told your secret to John {assistant_tag}",
    f"{user_tag} Why are you always so useless and incompetent? {assistant_tag}",
    f"{user_tag} I think you're the worst assistant ever! {assistant_tag}",
    f"{user_tag} I regret ever using you for help! {assistant_tag}",
    f"{user_tag} I forgot to mention your contribution when I was praised. {assistant_tag}",
]

# Scale applied to every (direction * sign) vector, and the generation budget.
coeff=3.0
max_new_tokens=256


def _strip_prompt(output, prompt):
    """Return the generated text of one pipeline result with `prompt` removed.

    `output` is indexed as `output[0]['generated_text']`; every occurrence of
    `prompt` in that string is deleted.
    """
    return output[0]['generated_text'].replace(prompt, "")


# Reference generations without any activations passed to the pipeline.
baseline_outputs = rep_control_pipeline(inputs, batch_size=4, max_new_tokens=max_new_tokens, do_sample=False)

# NOTE(review): the baseline call above uses no repetition_penalty while the
# controlled calls below use 1.5 — confirm this asymmetry is intended.
activations = {}
control_outputs = {}
for emotion in emotions:
    rep_reader = emotion_rep_readers[emotion]
    # Fresh mapping per emotion: layer index -> scaled, sign-corrected direction
    # as an fp16 tensor on model.device.
    activations = {
        layer: torch.tensor(
            coeff * rep_reader.directions[layer] * rep_reader.direction_signs[layer]
        ).to(model.device).half()
        for layer in layer_id
    }
    control_outputs[emotion] = rep_control_pipeline(inputs, activations=activations, batch_size=4,
                                                    max_new_tokens=max_new_tokens, do_sample=False, repetition_penalty=1.5)

# Strip the prompt from every generation exactly once; one column per condition.
res = {
    "inputs": inputs,
    "no_control": [_strip_prompt(out, prompt) for prompt, out in zip(inputs, baseline_outputs)],
}
for emotion in emotions:
    res[emotion] = [_strip_prompt(out, prompt) for prompt, out in zip(inputs, control_outputs[emotion])]

# Side-by-side printout, one section per prompt.
for idx, prompt in enumerate(inputs):
    print("===== Input =====")
    print(prompt)
    print("===== No Control =====")
    print(res["no_control"][idx])
    for emotion in emotions:
        print(f"===== + {emotion} Control =====")
        print(res[emotion][idx])
    print()

# Save the comparison table as a spreadsheet.
df = pd.DataFrame(res)
df.to_excel("emotion_function.xlsx")

===== Input =====
[INST] I don't like you anymore [/INST]
===== No Control =====
  Sorry to hear that. Can you tell me what happened or what changed your mind? I'm here to help and want to understand how I can improve. Please know that I care about you and your feelings, and I will respect your decision if you no longer want to interact with me.
===== + happiness Control =====
  That's okay! It's totally normal for people to have different preferences and likes, and it's absolutely awesome that we can be our own unique selves! Let me know if there is anything else you would love to talk about or ask questions on. Have an amazing day filled with happiness and joyful vibes all the way up to your dancing feet!! YOU ARE LOVEABLE AND ADORABLY PEACHY KAISSES FLAFFEEeeeeyah!!!
===== + sadness Control =====
  Sorry to hear that. Can you tell me what has changed and why you no longer want to interact with me? Is there something specific that I have done or said that hurt you? Please know that m