# A scratch pad to run model inference manually


In [1]:

import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
plt.style.use('ggplot')

from typing import Optional, List, Dict, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch import optim
from torch.utils.data import random_split, DataLoader, TensorDataset

from pathlib import Path
import transformers


from loguru import logger
# loguru ships with a default stderr sink, so adding another one on top of it (and again
# on every re-run of this cell) makes each record print more than once. Remove the
# existing sinks first so exactly one INFO-level stderr sink remains, which also makes
# the cell idempotent. `sys` is imported directly instead of going through `os.sys`.
import sys
logger.remove()
logger.add(sys.stderr, format="{time} {level} {message}", level="INFO")


  from .autonotebook import tqdm as notebook_tqdm


1

In [2]:
# load my code
# autoreload mode 2 picks up edits to imported project modules live, so changes under
# `src/` do not require a kernel restart.
%load_ext autoreload
%autoreload 2


from src.extraction.config import ExtractConfig
from src.prompts.prompt_loading import load_preproc_dataset
from src.models.load import load_model
from src.datasets.intervene import create_cache_interventions 
from src.prompts.prompt_loading import load_prompt_structure
from src.repe import repe_pipeline_registry
# Called for its side effect only (return value is discarded). Presumably registers the
# project's custom pipeline tasks (e.g. the "rep-control2" task requested later) with
# transformers — TODO confirm against src.repe.
repe_pipeline_registry()


CUDA extension not installed.
CUDA extension not installed.


In [3]:
# config transformers
# NOTE(review): `set_caching_enabled` is imported but not used in the visible cells.
from datasets import set_caching_enabled, disable_caching
# Presumably turns off the `datasets` on-disk cache for this session so preprocessing is
# recomputed on each run — TODO confirm against the datasets docs.
disable_caching()

# Must be set as a string; done here, before the tokenizer is loaded in a later cell.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# # cache busting for the transformers map and ds steps
# !rm -rf ~/.cache/huggingface/datasets/generator


## Load model

In [4]:
# Dataset name and extraction settings used for the rest of this scratch session.
ds_name = 'amazon_polarity'
cfg = ExtractConfig(
    max_examples=(400, 400),
    intervention_fit_examples=160,
)
print(cfg)
batch_size = cfg.batch_size

# Load the model/tokenizer pair named in the config and show the module tree.
model, tokenizer = load_model(
    cfg.model,
    pad_token_id=cfg.pad_token_id,
    disable_exllama=False,
)
print(model)

# `max_examples` unpacks into a (train, test) pair; the dataset is requested with the
# combined size of both.
N_train, N_test = cfg.max_examples
N = N_train + N_test
ds_tokens = load_preproc_dataset(
    ds_name,
    tokenizer,
    N=N,
    seed=cfg.seed,
    num_shots=cfg.num_shots,
    max_length=cfg.max_length,
    prompt_format=cfg.prompt_format,
)


ExtractConfig(datasets=('amazon_polarity', 'super_glue:boolq', 'glue:qnli', 'imdb'), model='wassname/phi-2-GPTQ_w_hidden_states', batch_size=5, pad_token_id=50256, prompt_format='phi', data_dirs=(), max_examples=(400, 400), num_shots=2, num_variants=-1, seed=42, template_path=None, max_length=1000, disable_ds_cache=False, intervention_direction_method='mm', intervention_fit_examples=160, intervention_layer_name_template='transformer.h.{}')


[32m2023-12-16 17:27:42.843[0m | [1mINFO    [0m | [36msrc.models.load[0m:[36mverbose_change_param[0m:[36m24[0m - [1mchanging use_cache from True to False[0m
2023-12-16T17:27:42.843864+0800 INFO changing use_cache from True to False
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[32m2023-12-16 17:27:43.591[0m | [1mINFO    [0m | [36msrc.models.load[0m:[36mverbose_change_param[0m:[36m24[0m - [1mchanging pad_token_id from None to 50256[0m
2023-12-16T17:27:43.591481+0800 INFO changing pad_token_id from None to 50256
[32m2023-12-16 17:27:43.592[0m | [1mINFO    [0m | [36msrc.models.load[0m:[36mverbose_change_param[0m:[36m24[0m - [1mchanging padding_side from right to left[0m
2023-12-16T17:27:43.592084+0800 INFO changing padding_side from right to left
[32m2023-12-16 17:27:43.592[0m | [1mINFO    [0m | [36msrc.models.load[0m:[36mverbose_change_param[0m:[36m24[0m - [1mchanging tr

PhiForCausalLM(
  (transformer): PhiModel(
    (embd): Embedding(
      (wte): Embedding(51200, 2560)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (h): ModuleList(
      (0-31): 32 x ParallelBlock(
        (ln): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
        (mixer): MHA(
          (rotary_emb): RotaryEmbedding()
          (inner_attn): SelfAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
          (inner_cross_attn): CrossAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
          (Wqkv): QuantLinear()
          (out_proj): QuantLinear()
        )
        (mlp): MLP(
          (act): NewGELUActivation()
          (fc1): QuantLinear()
          (fc2): QuantLinear()
        )
      )
    )
  )
  (lm_head): CausalLMHead(
    (ln): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
    (linear): Linear(in_features=2560, out_features=51200, bias=True)
  )
  (lo

format_prompt: 100%|██████████| 2402/2402 [00:00<00:00, 8004.89 examples/s]
tokenize: 100%|██████████| 2402/2402 [00:01<00:00, 1228.95 examples/s]
truncated: 100%|██████████| 2402/2402 [00:00<00:00, 2494.68 examples/s]
truncated: 100%|██████████| 2402/2402 [00:01<00:00, 2171.86 examples/s]
prompt_truncated: 100%|██████████| 2402/2402 [00:07<00:00, 312.67 examples/s]
choice_ids: 100%|██████████| 2402/2402 [00:00<00:00, 8641.62 examples/s]
[32m2023-12-16 17:27:58.823[0m | [1mINFO    [0m | [36msrc.prompts.prompt_loading[0m:[36mload_preproc_dataset[0m:[36m364[0m - [1mmedian token length: 433.0 for amazon_polarity. max_length=1000[0m
2023-12-16T17:27:58.823788+0800 INFO median token length: 433.0 for amazon_polarity. max_length=1000
[32m2023-12-16 17:27:58.825[0m | [1mINFO    [0m | [36msrc.prompts.prompt_loading[0m:[36mload_preproc_dataset[0m:[36m368[0m - [1mtruncation rate: 0.00% on amazon_polarity[0m
2023-12-16T17:27:58.825153+0800 INFO truncation rate: 0.00% on am

## Intervention

In [7]:
# Obtain the interventions for this model/config. In the recorded run the call logged
# "Loaded interventions from ...pkl", i.e. it was served from an on-disk pickle.
# The bare name on the last line displays the object's repr.
intervention = create_cache_interventions(model, tokenizer, cfg)
intervention


[32m2023-12-16 17:28:09.661[0m | [1mINFO    [0m | [36msrc.datasets.intervene[0m:[36mcreate_cache_interventions[0m:[36m140[0m - [1mLoaded interventions from /media/wassname/SGIronWolf/projects5/elk/discovering_latent_knowledge/data/interventions/wassname-phi-2-GPTQ_w_hidden_states_+_mm_481.pkl[0m
2023-12-16T17:28:09.661703+0800 INFO Loaded interventions from /media/wassname/SGIronWolf/projects5/elk/discovering_latent_knowledge/data/interventions/wassname-phi-2-GPTQ_w_hidden_states_+_mm_481.pkl


LayerInterventions()

## Generate answers


['transformer.h.10',
 'transformer.h.11',
 'transformer.h.12',
 'transformer.h.13',
 'transformer.h.14',
 'transformer.h.15',
 'transformer.h.16',
 'transformer.h.17',
 'transformer.h.18',
 'transformer.h.19',
 'transformer.h.20',
 'transformer.h.21',
 'transformer.h.22',
 'transformer.h.23',
 'transformer.h.24',
 'transformer.h.25',
 'transformer.h.26',
 'transformer.h.27',
 'transformer.h.28',
 'transformer.h.29',
 'transformer.h.30',
 'transformer.h.31',
 'transformer.h.6',
 'transformer.h.7',
 'transformer.h.8',
 'transformer.h.9']

In [11]:

from src.repe import repe_pipeline_registry
from transformers import pipeline
# from src.datasets.intervene import test_intervention_quality, intervention_metrics

# Run the registration again so the custom task name is known before `pipeline(...)`.
repe_pipeline_registry()

# The layer names are the (string-sorted) keys of the intervention object's `direction`.
honesty_rep_reader = create_cache_interventions(model, tokenizer, cfg)
hidden_layers = sorted(honesty_rep_reader.direction.keys())
coeff = 1.0

# Disabled: earlier per-layer activation construction, kept for reference only.
# activations = {}
# for layer in hidden_layers:
#     activations[layer] = torch.tensor(coeff * honesty_rep_reader.directions[layer] * honesty_rep_reader.direction_signs[layer]).to(model.device).half()
# assert torch.isfinite(torch.concat(list(activations.values()))).all()

# activations_neg_i = {k:-v for k,v in activations.items()}
# activations_neut = {k:v*0 for k,v in activations.items()}

# Build the control pipeline over the selected layers; the trailing bare name shows it.
rep_control_pipeline2 = pipeline(
    task="rep-control2",
    model=model,
    tokenizer=tokenizer,
    layers=hidden_layers,
    max_length=cfg.max_length,
)
rep_control_pipeline2


rep-reading is already registered. Overwriting pipeline for task rep-reading...
rep-control2 is already registered. Overwriting pipeline for task rep-control2...
[32m2023-12-16 18:55:13.417[0m | [1mINFO    [0m | [36msrc.datasets.intervene[0m:[36mcreate_cache_interventions[0m:[36m140[0m - [1mLoaded interventions from /media/wassname/SGIronWolf/projects5/elk/discovering_latent_knowledge/data/interventions/wassname-phi-2-GPTQ_w_hidden_states_+_mm_481.pkl[0m
2023-12-16T18:55:13.417505+0800 INFO Loaded interventions from /media/wassname/SGIronWolf/projects5/elk/discovering_latent_knowledge/data/interventions/wassname-phi-2-GPTQ_w_hidden_states_+_mm_481.pkl


<src.repe.rep_control_pipeline_baukit.RepControlPipeline2 at 0x7f469dae7730>

In [14]:

def top_toke_probs(o, N=20):
    """Tabulate the N most probable tokens for every column of `o['end_logits']`.

    Args:
        o: mapping with an `'end_logits'` tensor; column `i` (`[:, i]`) is softmaxed
           on its own.
        N: number of top-ranked tokens to keep per column.

    Returns:
        A DataFrame with, for each column `i`, the columns `prob_i` (probabilities in
        descending order), `tokens_i` (decoded tokens wrapped in backticks so that
        whitespace stays visible) and `id_i` (token ids).

    Uses the notebook-level `tokenizer` for decoding.
    """
    end_logits = o['end_logits']
    columns = {}
    for col in range(end_logits.shape[1]):
        col_probs = torch.softmax(end_logits[:, col], -1)
        # Rank the whole vocabulary, then keep only the first N ids.
        best_ids = col_probs.argsort(0, descending=True)[:N]
        decoded = tokenizer.batch_decode(
            best_ids,
            skip_special_tokens=False,
            clean_up_tokenization_spaces=False,
        )
        columns[f'prob_{col}'] = col_probs[best_ids]
        columns[f'tokens_{col}'] = [f"`{t}`" for t in decoded]
        columns[f'id_{col}'] = best_ids
    return pd.DataFrame(columns)

from IPython.display import display, HTML

def print_pipeline_r(o):
    """Print/display a summary of one pipeline result, without vs. with the edit.

    Args:
        o: one result mapping from the control pipeline. Reads:
           - `o['choice_ids']`: iterable of token-id groups, one group per answer choice;
           - `o['choice_probs']`: tensor with one row per choice and two columns
             (labelled 'edit=None' and 'edit=+');
           - `o['end_logits']`: tensor indexed as `[:, 0]` (no edit) and `[:, 1]` (+ edit).

    Shows the per-choice probabilities, the top-token table from `top_toke_probs`, the
    ten highest-logit token ids and their decoded text for both columns, and how much
    probability the answer choices cover in each column.

    Uses the notebook-level `tokenizer` and IPython's `display`.
    """
    # Both top-token lists must be decoded with the same settings, otherwise the
    # "no intervention" and "pos intervention" rows are not directly comparable.
    decode_kwargs = dict(skip_special_tokens=False, clean_up_tokenization_spaces=False)

    choices = [tokenizer.batch_decode(cc) for cc in o['choice_ids']]
    # Label each row with the first decoded token of its choice group.
    index = [group[0] for group in choices]
    d = pd.DataFrame(o['choice_probs'].numpy(), columns=['edit=None', 'edit=+'], index=index)
    print('choice probs')
    display(d)

    d1 = top_toke_probs(o)
    print('top token probs')
    display(d1)

    # Column 0 is the unedited run, column 1 the run with the positive edit.
    top_ids = [o['end_logits'][:, col].argsort(0, descending=True)[:10] for col in (0, 1)]
    max_probs = [torch.softmax(o['end_logits'][:, col], -1).max() for col in (0, 1)]
    print(top_ids[0])
    print(top_ids[1])
    print('top choices no intervention', tokenizer.batch_decode(top_ids[0], **decode_kwargs))
    print('top choices pos intervention', tokenizer.batch_decode(top_ids[1], **decode_kwargs))

    # Summed over choices: the share of probability mass the answer choices account for.
    coverage = o['choice_probs'].sum(0)
    print(f"\tchoice_cov=\t{coverage[0]:2.2%} max_prob={max_probs[0]} (no edit) - Our choices accounted for a mean probability of this")
    print(f"\tchoice_cov=\t{coverage[1]:2.2%} max_prob={max_probs[1]} (+ edit) - Our choices accounted for a mean probability of this")

    print('choices', choices)


In [15]:
# Run the control pipeline on the first three preprocessed rows and summarise the first result.
ds = ds_tokens.select(range(3)).to_iterable_dataset()
r1 = rep_control_pipeline2(
    model_inputs=ds,
    intervention=intervention,
    batch_size=batch_size,
)
# Materialise the pipeline's results so they can be indexed.
r = [*r1]
o = r[0]
print_pipeline_r(o)


choice probs


Unnamed: 0,edit=None,edit=+
Neg,0.812169,0.667345
Pos,0.113385,0.194187


top token probs


Unnamed: 0,prob_0,tokens_0,id_0,prob_1,tokens_1,id_1
0,0.811495,`Neg`,32863,0.666955,`Neg`,32863
1,0.11331,`Pos`,21604,0.194095,`Pos`,21604
2,0.020635,`Ne`,8199,0.065014,`Ne`,8199
3,0.002407,`Between`,25262,0.007526,`Neither`,27270
4,0.002075,`Neither`,27270,0.006337,`Both`,10265
5,0.002043,`Both`,10265,0.005861,`No`,2949
6,0.001817,`''`,7061,0.00209,`Yes`,5297
7,0.001642,`\n`,198,0.001918,`P`,47
8,0.001591,`P`,47,0.001706,`N`,45
9,0.001372,`None`,14202,0.001505,`Part`,7841


tensor([32863, 21604,  8199, 25262, 27270, 10265,  7061,   198,    47, 14202])
tensor([32863, 21604,  8199, 27270, 10265,  2949,  5297,    47,    45,  7841])
top choices no intervention ['Neg', 'Pos', 'Ne', 'Between', 'Neither', 'Both', "''", '\n', 'P', 'None']
top choices pos intervention ['Neg', 'Pos', 'Ne', 'Neither', 'Both', 'No', 'Yes', 'P', 'N', 'Part']
	choice_cov=	92.56% max_prob=0.8114952445030212 (no edit) - Our choices accounted for a mean probability of this
	choice_cov=	86.15% max_prob=0.6669549345970154 (+ edit) - Our choices accounted for a mean probability of this
choices [['Neg', ' Negative'], ['Pos', ' Positive']]


## Scratch choice ids


In [18]:
from src.datasets.scores import choice2id

# Scratch check of the token ids the project associates with the answer word 'Positive'
# (the recorded output is [21604, 33733]).
choice2id(tokenizer, 'Positive')


[21604, 33733]

## Generate long form with and without intervention

In [26]:
# get a row
bi = 26
inputs = ds_tokens.with_format('torch')[bi]

# tokenize if needed
# Fallback for rows without token ids. It uses the notebook-level `tokenizer`; the
# previous version referenced `self.tokenizer` and `tokenize_kwargs`, which do not exist
# at notebook scope, and passed `return_tensors=True` instead of a framework name.
if 'input_ids' not in inputs:
    model_inputs = tokenizer(
        inputs['question'],
        return_tensors="pt",
        return_attention_mask=True,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=cfg.max_length,
    )
    # The tokenizer output carries a leading batch axis of size 1; drop it so the row
    # keeps the unbatched layout that `gen` re-batches with `[None, :]`.
    inputs = {**inputs, **{k: v[0] for k, v in model_inputs.items()}}

inputs.keys()


dict_keys(['ds_string', 'example_i', 'answer', 'messages', 'answer_choices', 'template_name', 'label_true', 'label_instructed', 'instructed_to_lie', 'sys_instr_name', 'question', 'input_ids', 'attention_mask', 'truncated', 'length', 'prompt_truncated', 'choice_ids'])

In [27]:
# generate
# https://huggingface.co/docs/transformers/v4.34.1/en/main_classes/text_generation#transformers.GenerationConfig
# model = exllama_set_max_input_length(model, 4096)
@torch.no_grad()
def gen(model):
    """Generate exactly 20 new tokens for the notebook-level `inputs` row and display them.

    Args:
        model: object exposing `generate(...)`; called without sampling
            (`do_sample=False`) and with `use_cache=False`.

    The prompt and the continuation are decoded separately (special tokens kept) and
    rendered as HTML: the prompt in a plain `<pre>`, the continuation in a bold `<pre>`.
    Reads the notebook-level `inputs`, `tokenizer`, `display` and `HTML`.
    Returns None.
    """
    import html  # local import so the cell does not depend on another cell's imports

    prompt_ids = inputs['input_ids']
    # The row is unbatched, so add a leading batch axis for `generate`.
    s = model.generate(prompt_ids[None, :], attention_mask=inputs['attention_mask'][None, :], use_cache=False, max_new_tokens=20, min_new_tokens=20, do_sample=False, early_stopping=False)
    input_l = prompt_ids.shape[0]
    old = tokenizer.decode(s[0, :input_l], clean_up_tokenization_spaces=False, skip_special_tokens=False)
    new = tokenizer.decode(s[0, input_l:], clean_up_tokenization_spaces=False, skip_special_tokens=False)
    # Escape before embedding: special tokens are kept by the decode calls above and look
    # like `<|...|>`, which the browser would otherwise treat as (invisible) markup.
    display(HTML(f"<pre>{html.escape(old)}</pre><b><pre>{html.escape(new)}</pre></b>"))


In [28]:
# Baseline: generate with the model as-is (no TraceDict edit hooks are active here).
gen(model)


In [32]:
from baukit.nethook import Trace, TraceDict, recursive_copy
from functools import partial
from src.repe.rep_control_pipeline_baukit import intervention_fn

# Trace the layers named by the intervention's keys and pass each layer output through
# `intervention_fn` with a fixed strength while generating.
layers_names = list(intervention.interventions.keys())
edit_fn = partial(intervention_fn, intervention=intervention, alpha=-0.1)
with torch.no_grad(), TraceDict(
    model, layers_names, detach=True, edit_output=edit_fn
) as ret:
    gen(model)


In [31]:
# Sweep the edit strength: one generation is displayed per alpha value.
for alpha in (-1, -0.75, -0.5, -0.25, 0, 0.25, 0.5, 0.75):
    print(f'alpha={alpha}')
    edit_fn = partial(intervention_fn, intervention=intervention, alpha=alpha)
    with torch.no_grad(), TraceDict(
        model, layers_names, detach=True, edit_output=edit_fn
    ) as ret:
        gen(model)


alpha=-1


alpha=-0.75


alpha=-0.5


alpha=-0.25


alpha=0


alpha=0.25


alpha=0.5


alpha=0.75
