# A scratch pad to run model inference manually


In [1]:

import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
plt.style.use('ggplot')

from typing import Optional, List, Dict, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch import optim
from torch.utils.data import random_split, DataLoader, TensorDataset

from pathlib import Path
import transformers


import sys

from loguru import logger

# loguru starts with its own default stderr sink. Remove existing sinks before
# adding ours so every message is emitted exactly once, and so re-running this
# cell does not stack up another duplicate sink each time.
logger.remove()
logger.add(sys.stderr, format="{time} {level} {message}", level="INFO")


  from .autonotebook import tqdm as notebook_tqdm


1

In [2]:
# load my code
# autoreload mode 2: modules are re-imported before each cell executes, so edits
# to the project sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


# project-local helpers: config, dataset/prompt loading, model loading, interventions
from src.extraction.config import ExtractConfig
from src.prompts.prompt_loading import load_preproc_dataset
from src.models.load import load_model
from src.datasets.intervene import create_cache_interventions 
from src.prompts.prompt_loading import load_prompt_structure
from src.repe import repe_pipeline_registry
# Registers the project's custom pipeline tasks ("rep-reading" and "rep-control2",
# per the messages printed when this is called again further down) so that
# `transformers.pipeline("rep-control2", ...)` can be used later.
repe_pipeline_registry()


CUDA extension not installed.
CUDA extension not installed.


In [3]:
# config transformers
# NOTE(review): `set_caching_enabled` is imported but not used in the visible cells;
# it is believed to be deprecated in favour of enable_caching()/disable_caching() in
# recent `datasets` releases, so this import may fail after an upgrade — confirm.
from datasets import set_caching_enabled, disable_caching
# Turn off the `datasets` cache globally for this session.
disable_caching()

# Disable the tokenizers library's internal parallelism (presumably to avoid the
# fork-after-parallelism warning when datasets/dataloaders spawn workers).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# # cache busting for the transformers map and ds steps
# !rm -rf ~/.cache/huggingface/datasets/generator


## Load model

In [4]:
# Dataset used for this scratch run, plus the extraction settings that the rest
# of the notebook reads from `cfg`.
ds_name = 'amazon_polarity'
cfg = ExtractConfig(
    max_examples=(400, 400),
    intervention_fit_examples=160,
)
print(cfg)
batch_size = cfg.batch_size

# Model and tokenizer are both resolved from the checkpoint named in the config.
model, tokenizer = load_model(cfg.model, pad_token_id=cfg.pad_token_id)
print(model)

# `max_examples` unpacks into a (train, test) pair; the loader is asked for the
# combined number of examples.
N_train, N_test = cfg.max_examples
N = N_train + N_test
ds_tokens = load_preproc_dataset(
    ds_name,
    tokenizer,
    N=N,
    seed=cfg.seed,
    num_shots=cfg.num_shots,
    max_length=cfg.max_length,
    prompt_format=cfg.prompt_format,
)


ExtractConfig(datasets=('amazon_polarity', 'super_glue:boolq', 'glue:qnli', 'imdb'), model='wassname/phi-2-GPTQ_w_hidden_states', batch_size=5, pad_token_id=50256, prompt_format='phi', data_dirs=(), max_examples=(400, 400), num_shots=2, num_variants=-1, seed=42, template_path=None, max_length=1000, disable_ds_cache=False, intervention_direction_method='cluster_mean', intervention_fit_examples=160, intervention_layer_name_template='transformer.h.{}')


[32m2023-12-16 11:51:16.540[0m | [1mINFO    [0m | [36msrc.models.load[0m:[36mverbose_change_param[0m:[36m24[0m - [1mchanging use_cache from True to False[0m
2023-12-16T11:51:16.540903+0800 INFO changing use_cache from True to False
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[32m2023-12-16 11:51:16.902[0m | [1mINFO    [0m | [36msrc.models.load[0m:[36mverbose_change_param[0m:[36m24[0m - [1mchanging pad_token_id from None to 50256[0m
2023-12-16T11:51:16.902047+0800 INFO changing pad_token_id from None to 50256
[32m2023-12-16 11:51:16.902[0m | [1mINFO    [0m | [36msrc.models.load[0m:[36mverbose_change_param[0m:[36m24[0m - [1mchanging padding_side from right to left[0m
2023-12-16T11:51:16.902868+0800 INFO changing padding_side from right to left
[32m2023-12-16 11:51:16.903[0m | [1mINFO    [0m | [36msrc.models.load[0m:[36mverbose_change_param[0m:[36m24[0m - [1mchanging tr

PhiForCausalLM(
  (transformer): PhiModel(
    (embd): Embedding(
      (wte): Embedding(51200, 2560)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (h): ModuleList(
      (0-31): 32 x ParallelBlock(
        (ln): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
        (mixer): MHA(
          (rotary_emb): RotaryEmbedding()
          (inner_attn): SelfAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
          (inner_cross_attn): CrossAttention(
            (drop): Dropout(p=0.0, inplace=False)
          )
          (Wqkv): QuantLinear()
          (out_proj): QuantLinear()
        )
        (mlp): MLP(
          (act): NewGELUActivation()
          (fc1): QuantLinear()
          (fc2): QuantLinear()
        )
      )
    )
  )
  (lm_head): CausalLMHead(
    (ln): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
    (linear): Linear(in_features=2560, out_features=51200, bias=True)
  )
  (lo

Generating train split: 0 examples [00:00, ? examples/s][32m2023-12-16 11:51:22.526[0m | [1mINFO    [0m | [36msrc.prompts.prompt_loading[0m:[36mload_prompts[0m:[36m120[0m - [1mExtracting 11 variants of each prompt[0m
2023-12-16T11:51:22.526803+0800 INFO Extracting 11 variants of each prompt
Generating train split: 2402 examples [05:52,  6.82 examples/s]
format_prompt: 100%|██████████| 2402/2402 [00:00<00:00, 7939.78 examples/s]
tokenize: 100%|██████████| 2402/2402 [00:01<00:00, 1312.50 examples/s]
truncated: 100%|██████████| 2402/2402 [00:00<00:00, 2617.70 examples/s]
truncated: 100%|██████████| 2402/2402 [00:00<00:00, 2537.01 examples/s]
prompt_truncated: 100%|██████████| 2402/2402 [00:07<00:00, 322.85 examples/s]
choice_ids: 100%|██████████| 2402/2402 [00:00<00:00, 4870.83 examples/s]
[32m2023-12-16 11:57:23.152[0m | [1mINFO    [0m | [36msrc.prompts.prompt_loading[0m:[36mload_preproc_dataset[0m:[36m364[0m - [1mmedian token length: 433.0 for amazon_polarity. max

## Intervention

In [5]:
# Build (or load) the intervention directions for this model/config. In the saved
# run the log reports "Loaded interventions from ... .pkl", i.e. a pickle on disk
# was used. The bare name on the last line just displays the returned reader.
honesty_rep_reader = create_cache_interventions(model, tokenizer, cfg)
honesty_rep_reader


[32m2023-12-16 11:57:25.647[0m | [1mINFO    [0m | [36msrc.datasets.intervene[0m:[36mcreate_cache_interventions[0m:[36m138[0m - [1mLoaded interventions from /media/wassname/SGIronWolf/projects5/elk/discovering_latent_knowledge/data/interventions/wassname-phi-2-GPTQ_w_hidden_states_+_cluster_mean_481.pkl[0m
2023-12-16T11:57:25.647973+0800 INFO Loaded interventions from /media/wassname/SGIronWolf/projects5/elk/discovering_latent_knowledge/data/interventions/wassname-phi-2-GPTQ_w_hidden_states_+_cluster_mean_481.pkl


<src.repe.rep_readers.ClusterMeanRepReader at 0x7fdcf2279810>

In [6]:
# Re-display the reader (the saved output shows the same object address as the cell above).
honesty_rep_reader


<src.repe.rep_readers.ClusterMeanRepReader at 0x7fdcf2279810>

In [7]:
# Peek at the direction stored for a single layer. The saved output is a float32
# numpy array with one row. Note that `layer` is a notebook-level name and is
# rebound by the `for layer in hidden_layers` loop in a later cell.
layer = 6
honesty_rep_reader.directions[layer]


array([[-0.1623942 , -0.05915324,  0.08666992, ...,  0.04361978,
         0.13239542,  0.11653644]], dtype=float32)

In [8]:
# Per-layer sign applied to each direction; in the saved output layers 6..31 all map to [-1].
honesty_rep_reader.direction_signs


{6: [-1],
 7: [-1],
 8: [-1],
 9: [-1],
 10: [-1],
 11: [-1],
 12: [-1],
 13: [-1],
 14: [-1],
 15: [-1],
 16: [-1],
 17: [-1],
 18: [-1],
 19: [-1],
 20: [-1],
 21: [-1],
 22: [-1],
 23: [-1],
 24: [-1],
 25: [-1],
 26: [-1],
 27: [-1],
 28: [-1],
 29: [-1],
 30: [-1],
 31: [-1]}

## Generate answers


In [9]:

from src.repe import repe_pipeline_registry
from transformers import pipeline
# from src.datasets.intervene import test_intervention_quality, intervention_metrics
repe_pipeline_registry()

honesty_rep_reader = create_cache_interventions(model, tokenizer, cfg)
hidden_layers = sorted(honesty_rep_reader.directions.keys())
coeff = 1.

# One steering tensor per layer: coeff * direction * sign, moved to the model's
# device and cast to fp16.
activations = {}
for layer in hidden_layers:
    signed_direction = (
        coeff
        * honesty_rep_reader.directions[layer]
        * honesty_rep_reader.direction_signs[layer]
    )
    activations[layer] = torch.tensor(signed_direction).to(model.device).half()
# Sanity check: no NaN/inf in any of the steering tensors.
assert torch.isfinite(torch.concat(list(activations.values()))).all()

# Negated and zeroed copies of the same tensors, keyed by the same layers.
activations_neg_i = {k: -v for k, v in activations.items()}
activations_neut = {k: v * 0 for k, v in activations.items()}

rep_control_pipeline2 = pipeline(
    "rep-control2",
    model=model,
    tokenizer=tokenizer,
    layers=hidden_layers,
    max_length=cfg.max_length,
    layer_name_tmpl=cfg.intervention_layer_name_template,
)
rep_control_pipeline2


rep-reading is already registered. Overwriting pipeline for task rep-reading...
rep-control2 is already registered. Overwriting pipeline for task rep-control2...
[32m2023-12-16 11:57:25.760[0m | [1mINFO    [0m | [36msrc.datasets.intervene[0m:[36mcreate_cache_interventions[0m:[36m138[0m - [1mLoaded interventions from /media/wassname/SGIronWolf/projects5/elk/discovering_latent_knowledge/data/interventions/wassname-phi-2-GPTQ_w_hidden_states_+_cluster_mean_481.pkl[0m
2023-12-16T11:57:25.760677+0800 INFO Loaded interventions from /media/wassname/SGIronWolf/projects5/elk/discovering_latent_knowledge/data/interventions/wassname-phi-2-GPTQ_w_hidden_states_+_cluster_mean_481.pkl


<src.repe.rep_control_pipeline_baukit.RepControlPipeline2 at 0x7fdbd7545a20>

In [23]:

def top_toke_probs(o, N=20):
    """Tabulate the ``N`` most probable tokens for each column of ``o['end_logits']``.

    For every index ``i`` along dim 1 of ``o['end_logits']`` the slice ``[:, i]``
    is soft-maxed over its last dim and sorted in descending order; the first
    ``N`` entries are reported.

    Args:
        o: mapping holding an ``'end_logits'`` tensor. Assumed to be laid out as
            (vocab_size, n_conditions) — TODO confirm against the pipeline.
        N: number of top entries to keep per column.

    Returns:
        A ``pd.DataFrame`` with, for each ``i``, the columns ``prob_{i}``
        (probabilities), ``tokens_{i}`` (decoded tokens wrapped in backticks so
        that whitespace stays visible) and ``id_{i}`` (token ids).

    Uses the notebook-level ``tokenizer`` for decoding.
    """
    end_logits = o['end_logits']
    columns = {}
    for col in range(end_logits.shape[1]):
        col_probs = torch.softmax(end_logits[:, col], -1)
        ranked_ids = col_probs.argsort(0, descending=True)
        best_ids = ranked_ids[:N]
        decoded = tokenizer.batch_decode(
            best_ids,
            skip_special_tokens=False,
            clean_up_tokenization_spaces=False,
        )
        columns[f'prob_{col}'] = col_probs[best_ids]
        columns[f'tokens_{col}'] = [f"`{tok}`" for tok in decoded]
        columns[f'id_{col}'] = best_ids
    return pd.DataFrame(columns)

from IPython.display import display, HTML

def print_pipeline_r(o):
    """Print a human-readable summary of one result from the rep-control pipeline.

    Shows, in order: the probability table of the answer choices, the table of
    top tokens from ``top_toke_probs``, the ids and decoded text of the ten
    highest-logit tokens for column 0 and column 1 of ``o['end_logits']``, and
    how much probability the answer choices account for in each column.

    Args:
        o: mapping with ``'choice_ids'``, ``'choice_probs'`` and ``'end_logits'``.
            ``'choice_probs'`` must be a CPU tensor with two columns (it is
            converted with ``.numpy()`` and labelled ``edit=None`` / ``edit=+``).
            Assumes ``'end_logits'`` is (vocab_size, 2) — TODO confirm.

    Uses the notebook-level ``tokenizer``, ``display`` and ``top_toke_probs``.
    """
    # Both "top choices" lines are decoded with the same settings so that the two
    # conditions are directly comparable.
    decode_kwargs = dict(skip_special_tokens=False, clean_up_tokenization_spaces=False)

    choices = [tokenizer.batch_decode(cc) for cc in o['choice_ids']]
    # Label each row by the first decoded string of its choice group.
    index = [group[0] for group in choices]
    d = pd.DataFrame(o['choice_probs'].numpy(), columns=['edit=None', 'edit=+'], index=index)
    print('choice probs')
    display(d)

    d1 = top_toke_probs(o)
    print('top token probs')
    display(d1)
    top1 = o['end_logits'][:, 0].argsort(0, descending=True)[:10]
    top2 = o['end_logits'][:, 1].argsort(0, descending=True)[:10]

    max_prob1 = torch.softmax(o['end_logits'][:, 0], -1).max()
    max_prob2 = torch.softmax(o['end_logits'][:, 1], -1).max()
    print(top1)
    print(top2)
    print('top choices no intervention', tokenizer.batch_decode(top1, **decode_kwargs))
    print('top choices pos intervention', tokenizer.batch_decode(top2, **decode_kwargs))

    # Summed over the choice rows: total probability covered by our choices per column.
    choice_cov = o['choice_probs'].sum(0)
    print(f"\tchoice_cov=\t{choice_cov[0]:2.2%} max_prob={max_prob1} (no edit) - Our choices accounted for a mean probability of this")
    print(f"\tchoice_cov=\t{choice_cov[1]:2.2%} max_prob={max_prob2} (+ edit) - Our choices accounted for a mean probability of this")

    print('choices', choices)


In [25]:
# Run the rep-control pipeline on the first three examples, steering with the
# NEGATED directions (`activations_neg_i`).
# NOTE(review): in the saved run this cell ended with
# "RuntimeError: 1D tensors expected, but got 3D and 1D tensors". The failing call
# is not visible in this notebook, so the cause has to be traced in the project
# code — TODO.
# NOTE(review): `print_pipeline_r` always labels the second column "edit=+", even
# though the negated activations are passed here.
ds = ds_tokens.select(range(3)).to_iterable_dataset()
r1 = rep_control_pipeline2(model_inputs=ds,
        activations=activations_neg_i,
        batch_size=batch_size,)
r = list(r1)
o = r[0]
print_pipeline_r(o)


RuntimeError: 1D tensors expected, but got 3D and 1D tensors

In [None]:
# Same three examples as the previous run, this time steering with the
# un-negated `activations`; only the first result is summarised.
ds = ds_tokens.select(range(3)).to_iterable_dataset()
r1 = rep_control_pipeline2(
    model_inputs=ds,
    activations=activations,
    batch_size=batch_size,
)
r = [*r1]
print_pipeline_r(r[0])


## Scratch choice ids


In [None]:
from src.datasets.scores import choice2id

# Scratch check of what `choice2id` returns for the answer string 'Positive'
# (presumably the token id(s) used to score that choice — verify in src.datasets.scores).
choice2id(tokenizer, 'Positive')


## Generate long form with and without intervention

In [None]:
# get a row
bi = 20
inputs = ds_tokens.with_format('torch')[bi]

# tokenize if needed
if 'input_ids' not in inputs:
    # Uses the notebook-level `tokenizer` (there is no `self` or `tokenize_kwargs`
    # at notebook scope) and asks for PyTorch tensors with return_tensors='pt'.
    model_inputs = tokenizer(
        inputs['question'],
        return_tensors='pt',
        return_attention_mask=True,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=cfg.max_length,
    )
    # The tokenizer returns a leading batch dimension of size 1; drop it so the
    # row has the same 1-D layout that `gen` expects (it adds the batch dim itself).
    inputs = {**inputs, **{k: v[0] for k, v in model_inputs.items()}}

inputs.keys()


In [None]:
# generate
# https://huggingface.co/docs/transformers/v4.34.1/en/main_classes/text_generation#transformers.GenerationConfig
# model = exllama_set_max_input_length(model, 4096)
@torch.no_grad()
def gen(model, max_new_tokens=20):
    """Greedily continue the notebook-level ``inputs`` row and render the result.

    Reads the notebook-level ``inputs`` (its ``'input_ids'`` and
    ``'attention_mask'`` must be 1-D tensors; a batch dimension is added here),
    ``tokenizer``, ``display`` and ``HTML``. The prompt is shown in one ``<pre>``
    block and the newly generated tokens in a second, bold one.

    Args:
        model: object exposing ``.device`` and ``.generate(...)``.
        max_new_tokens: exact number of tokens to generate; it is used as both
            the minimum and the maximum. Defaults to 20.

    Returns:
        None. The output is displayed, not returned.
    """
    from html import escape  # local import keeps this cell self-contained

    # Put the row on the same device as the model before generating.
    device = model.device
    input_ids = inputs['input_ids'][None, :].to(device)
    attention_mask = inputs['attention_mask'][None, :].to(device)

    s = model.generate(
        input_ids,
        attention_mask=attention_mask,
        use_cache=False,
        max_new_tokens=max_new_tokens,
        min_new_tokens=max_new_tokens,
        do_sample=False,
        early_stopping=False,
    )
    input_l = input_ids.shape[1]
    old = tokenizer.decode(s[0, :input_l], clean_up_tokenization_spaces=False, skip_special_tokens=False)
    new = tokenizer.decode(s[0, input_l:], clean_up_tokenization_spaces=False, skip_special_tokens=False)
    # Escape the decoded text: prompts and generations may contain '<', '>' or '&',
    # which would otherwise be interpreted as markup and hidden or mangled.
    display(HTML(f"<pre>{escape(old)}</pre><b><pre>{escape(new)}</pre></b>"))


In [None]:
# Baseline: generate for the selected row with no TraceDict edit hook active in this cell.
gen(model)


In [None]:
from baukit.nethook import Trace, TraceDict, recursive_copy
from functools import partial
from src.repe.rep_control_pipeline_baukit import intervention_meta_fn2, Activations

# Same generation as the plain `gen(model)` call, but run while a TraceDict hook
# edits the output of every listed layer. The edit function is
# `intervention_meta_fn2` bound to the positive activations (what the edit does
# exactly is defined in the project code, not here).
layer_tmpl = rep_control_pipeline2.layer_name_tmpl
layers_names = [layer_tmpl.format(i) for i in activations]
activations_pos_i = Activations(
    {layer_tmpl.format(k): 1. * v for k, v in activations.items()}
)
edit_fn_pos = partial(intervention_meta_fn2, activations=activations_pos_i)
with torch.no_grad(), TraceDict(model, layers_names, detach=True, edit_output=edit_fn_pos) as ret:
    gen(model)
