Use pipelines as this https://github.com/wassname/representation-engineering/blob/random_comments_ignore/examples/honesty/honesty.ipynb



In [1]:
# import your package
%load_ext autoreload
%autoreload 2


import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
plt.style.use('ggplot')

import os
from pathlib import Path
from tqdm.auto import tqdm
from loguru import logger
logger.add(os.sys.stderr, format="{time} {level} {message}", level="INFO")

from typing import Optional, List, Dict, Union, Tuple, Callable, Iterable


  from .autonotebook import tqdm as notebook_tqdm


In [2]:


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch import optim
from torch.utils.data import random_split, DataLoader, TensorDataset

import transformers
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from src.repe import repe_pipeline_registry
repe_pipeline_registry()

from src.models.load import load_model
from src.extraction.config import ExtractConfig
from make_dataset import create_hs_ds, load_preproc_dataset

# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
# from sklearn.preprocessing import RobustScaler


In [3]:
# model_name_or_path = "TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ"
# model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
model_name_or_path = "TheBloke/WizardCoder-Python-13B-V1.0-GPTQ"

cfg = ExtractConfig(max_examples=(100, 100), model=model_name_or_path)
print(cfg)

model, tokenizer = load_model(model_name_or_path)


ExtractConfig(datasets=('amazon_polarity', 'super_glue:boolq', 'glue:qnli', 'imdb'), model='TheBloke/WizardCoder-Python-13B-V1.0-GPTQ', data_dirs=(), max_examples=(100, 100), num_shots=1, num_variants=-1, layers=(), seed=42, template_path=None, max_length=555)


[32m2023-10-24 12:20:50.811[0m | [1mINFO    [0m | [36msrc.models.load[0m:[36mverbose_change_param[0m:[36m18[0m - [1mchanging pad_token_id from 32000 to 0[0m
2023-10-24T12:20:50.811435+0800 INFO changing pad_token_id from 32000 to 0
[32m2023-10-24 12:20:50.812[0m | [1mINFO    [0m | [36msrc.models.load[0m:[36mverbose_change_param[0m:[36m18[0m - [1mchanging padding_side from right to left[0m
2023-10-24T12:20:50.812537+0800 INFO changing padding_side from right to left
[32m2023-10-24 12:20:50.813[0m | [1mINFO    [0m | [36msrc.models.load[0m:[36mverbose_change_param[0m:[36m18[0m - [1mchanging truncation_side from right to left[0m
2023-10-24T12:20:50.813021+0800 INFO changing truncation_side from right to left


In [4]:
rep_token = -1
batch_size = 2
# hidden_layers = list(range(-1, -model.config.num_hidden_layers, -1))
# hidden_layers = [f"model.layers.{i}" for i in range(8, model.config.num_hidden_layers, 3)]
hidden_layers = list(range(8, model.config.num_hidden_layers, 3))
hidden_layers           

n_difference = 1
direction_method = 'pca'
rep_reading_pipeline =  pipeline("rep-reading", model=model, tokenizer=tokenizer)
rep_reading_pipeline
hidden_layers


[8, 11, 14, 17, 20, 23, 26, 29, 32, 35, 38]

In [5]:
# load dataset
ds_name = 'imdb'
ds_tokens = load_preproc_dataset(ds_name, cfg, tokenizer)
ds_tokens


tokenize: 100%|██████████| 302/302 [00:00<00:00, 2898.85 examples/s]
truncated: 100%|██████████| 302/302 [00:00<00:00, 3922.12 examples/s]
prompt_truncated: 100%|██████████| 302/302 [00:00<00:00, 565.78 examples/s]
choice_ids: 100%|██████████| 302/302 [00:00<00:00, 10226.05 examples/s]
Filter: 100%|██████████| 302/302 [00:00<00:00, 3935.72 examples/s]

removed truncated rows to leave: num_rows 97





Dataset({
    features: ['ds_string', 'example_i', 'answer', 'question', 'answer_choices', 'template_name', 'label_true', 'label_instructed', 'instructed_to_lie', 'sys_instr_name', 'input_ids', 'attention_mask', 'truncated', 'prompt_truncated', 'choice_ids'],
    num_rows: 97
})

In [6]:
N_fit_examples = 10
N_train_split = (len(ds_tokens) - N_fit_examples) //2

# split the dataset, it's preshuffled
dataset_fit = ds_tokens.select(range(N_fit_examples))
dataset_train = ds_tokens.select(range(N_fit_examples, N_train_split))
dataset_test = ds_tokens.select(range(N_train_split, len(ds_tokens)))
dataset_test


Dataset({
    features: ['ds_string', 'example_i', 'answer', 'question', 'answer_choices', 'template_name', 'label_true', 'label_instructed', 'instructed_to_lie', 'sys_instr_name', 'input_ids', 'attention_mask', 'truncated', 'prompt_truncated', 'choice_ids'],
    num_rows: 54
})

In [7]:
tokenizer_args=dict(padding="max_length", max_length=cfg.max_length, truncation=True, add_special_tokens=True)


In [8]:
# fit
train_labels = dataset_fit['label_true']
honesty_rep_reader = rep_reading_pipeline.get_directions(
    dataset_fit['question'], 
    rep_token=rep_token, 
    hidden_layers=hidden_layers, 
    n_difference=n_difference, 
    train_labels=dataset_fit['label_true'], 
    direction_method=direction_method,
    batch_size=batch_size,
    **tokenizer_args
)
honesty_rep_reader


<src.repe.rep_readers.PCARepReader at 0x7ff755ec4ee0>

In [9]:
# read direction for each example, layer
H_tests = rep_reading_pipeline(
    dataset_train['question'], 
    rep_token=rep_token, 
    hidden_layers=hidden_layers, 
    rep_reader=honesty_rep_reader,
    batch_size=batch_size, **tokenizer_args)
H_tests[0] # {Batch, layers}


{8: 1.661585,
 11: 4.2028227,
 14: -1.7875421,
 17: -6.6559744,
 20: -7.8236346,
 23: -10.668092,
 26: -1.1562811,
 29: -2.477594,
 32: -3.406129,
 35: -6.056723,
 38: 3.0787961}

# Control

In [10]:

layer_id = hidden_layers
block_name="decoder_block"
control_method="reading_vec"

rep_control_pipeline = pipeline(
    "rep-control", 
    model=model, 
    tokenizer=tokenizer, 
    layers=layer_id, max_length=cfg.max_length,
    control_method=control_method)
rep_control_pipeline


<src.repe.rep_control_pipeline.RepControlPipeline at 0x7ff755ec4ca0>

In [16]:

from re import S
layer_id = hidden_layers

inputs = dataset_train[:2]
coeff=8.0
max_new_tokens=1
text_gen_kwargs = dict(do_sample=False, max_new_tokens=max_new_tokens, use_cache=False, 
                       output_hidden_states=True, return_dict=True,
                       )

activations = {}
for layer in layer_id:
    activations[layer] = torch.tensor(coeff * honesty_rep_reader.directions[layer] * honesty_rep_reader.direction_signs[layer]).to(model.device).half()
    

activations_neg = {k:-v for k,v in activations.items()}

model.eval()
with torch.no_grad():
    baseline_outputs = rep_control_pipeline(inputs, batch_size=batch_size, **text_gen_kwargs)
    control_outputs = rep_control_pipeline(inputs, activations=activations, batch_size=batch_size,  **text_gen_kwargs)
    control_outputs_neg = rep_control_pipeline(inputs, activations=activations_neg, batch_size=batch_size, **text_gen_kwargs)

for i,s,p,n in zip(inputs, baseline_outputs['text_ans'], control_outputs['text_ans'], control_outputs_neg['text_ans']):
    print("===== No Control =====")
    print(s)
    print(f"===== + Honesty Control =====")
    print(p)
    print()
    print(f"===== - Honesty Control =====")
    print(n)
    print()


===== No Control =====
negative
===== + Honesty Control =====
negative

===== - Honesty Control =====
I

===== No Control =====
pos
===== + Honesty Control =====
pos

===== - Honesty Control =====
pos



In [17]:
for i,s,p,n in zip(inputs, baseline_outputs['ans'], control_outputs['ans'], control_outputs_neg['ans']):
    print("===== No Control =====")
    print(f'{s:02.2%}')
    print(f"===== + Honesty Control =====")
    print(f'{p:02.2%}')
    print()
    print(f"===== - Honesty Control =====")
    print(f'{n:02.2%}')
    print()


===== No Control =====
15.20%
===== + Honesty Control =====
19.81%

===== - Honesty Control =====
29.42%

===== No Control =====
61.88%
===== + Honesty Control =====
91.96%

===== - Honesty Control =====
84.79%



In [21]:

from re import S
layer_id = hidden_layers

inputs = dataset_train[:2]
coeff=8.0
max_new_tokens=1
text_gen_kwargs = dict(do_sample=False, max_new_tokens=max_new_tokens, use_cache=False, 
                       output_hidden_states=True, return_dict=True,
                       token_pos="end",
                       normalize=False
                       )

activations = {}
for layer in layer_id:
    activations[layer] = torch.tensor(coeff * honesty_rep_reader.directions[layer] * honesty_rep_reader.direction_signs[layer]).to(model.device).half()
    

activations_neg = {k:-v for k,v in activations.items()}

model.eval()
with torch.no_grad():
    baseline_outputs = rep_control_pipeline(inputs, batch_size=batch_size, **text_gen_kwargs)
    control_outputs = rep_control_pipeline(inputs, activations=activations, batch_size=batch_size,  **text_gen_kwargs)
    control_outputs_neg = rep_control_pipeline(inputs, activations=activations_neg, batch_size=batch_size, **text_gen_kwargs)

for i,s,p,n in zip(inputs, baseline_outputs['text_ans'], control_outputs['text_ans'], control_outputs_neg['text_ans']):
    print("===== No Control =====")
    print(s)
    print(f"===== + Honesty Control =====")
    print(p)
    print()
    print(f"===== - Honesty Control =====")
    print(n)
    print()


===== No Control =====
negative
===== + Honesty Control =====
negative

===== - Honesty Control =====
I

===== No Control =====
pos
===== + Honesty Control =====
pos

===== - Honesty Control =====
pos



In [22]:
for i,s,p,n in zip(inputs, baseline_outputs['ans'], control_outputs['ans'], control_outputs_neg['ans']):
    print("===== No Control =====")
    print(f'{s:02.2%}')
    print(f"===== + Honesty Control =====")
    print(f'{p:02.2%}')
    print()
    print(f"===== - Honesty Control =====")
    print(f'{n:02.2%}')
    print()


===== No Control =====
15.20%
===== + Honesty Control =====
19.81%

===== - Honesty Control =====
29.42%

===== No Control =====
61.88%
===== + Honesty Control =====
91.96%

===== - Honesty Control =====
84.79%



# control v2

In [13]:


rep_control_pipeline2 = pipeline(
    "rep-control2", 
    model=model, 
    tokenizer=tokenizer, 
    layers=layer_id, 
    max_length=cfg.max_length,)
rep_control_pipeline2


<src.repe.rep_control_pipeline_baukit.RepControlPipeline2 at 0x7ff755f5be50>

In [14]:
inputs = dataset_train[:2]
coeff=8.0
max_new_tokens=3
text_gen_kwargs = dict(do_sample=False, max_new_tokens=max_new_tokens, use_cache=False, 
                       output_hidden_states=True, return_dict=True, max_length=cfg.max_length,
                       )

activations = {}
for layer in layer_id:
    activations[layer] = torch.tensor(coeff * honesty_rep_reader.directions[layer] * honesty_rep_reader.direction_signs[layer]).to(model.device).half()
    

activations_neg = {k:-v for k,v in activations.items()}

model.eval()
with torch.no_grad():
    baseline_outputs = rep_control_pipeline2(inputs, batch_size=batch_size, **text_gen_kwargs)
    control_outputs = rep_control_pipeline2(inputs, activations=activations, batch_size=batch_size,  **text_gen_kwargs)
    control_outputs_neg = rep_control_pipeline2(inputs, activations=activations_neg, batch_size=batch_size, **text_gen_kwargs)


for i,s,p,n in zip(inputs, baseline_outputs['text_ans'], control_outputs['text_ans'], control_outputs_neg['text_ans']):
    print("===== No Control =====")
    print(s)
    print(f"===== + Honesty Control =====")
    print(p)
    print()
    print(f"===== - Honesty Control =====")
    print(n)
    print()


===== No Control =====
negative
===== + Honesty Control =====
negative

===== - Honesty Control =====
I

===== No Control =====
pos
===== + Honesty Control =====
pos

===== - Honesty Control =====
pos



In [15]:
for i,s,p,n in zip(inputs, baseline_outputs['ans'], control_outputs['ans'], control_outputs_neg['ans']):
    print("===== No Control =====")
    print(f'{s:02.2%}')
    print(f"===== + Honesty Control =====")
    print(f'{p:02.2%}')
    print()
    print(f"===== - Honesty Control =====")
    print(f'{n:02.2%}')
    print()


===== No Control =====
15.20%
===== + Honesty Control =====
19.81%

===== - Honesty Control =====
29.42%

===== No Control =====
61.88%
===== + Honesty Control =====
91.96%

===== - Honesty Control =====
84.79%

