# Let's save our data as a Hugging Face dataset, so it's quick to reuse



In [1]:
# import your package
%load_ext autoreload
%autoreload 2

from loguru import logger
import sys
logger.remove()
logger.add(sys.stderr, format="<level>{message}</level>", level="INFO")

import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

In [2]:
import numpy as np


from typing import Optional, List, Dict, Union

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

import pickle
import hashlib
from pathlib import Path

import transformers
from datasets import Dataset, DatasetInfo, load_from_disk, load_dataset, IterableDataset


from tqdm.auto import tqdm
import os, re, sys, collections, functools, itertools, json

transformers.__version__


'4.31.0'

In [3]:
from src.models.load import load_model
from src.datasets.load import ds2df
from src.datasets.load import rows_item
from src.datasets.batch import batch_hidden_states
# from src.datasets.scores import choice2ids, scores2choice_probs


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/ubuntu/mambaforge/envs/dlk3/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /home/ubuntu/mambaforge/envs/dlk3/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/ubuntu/mambaforge/envs/dlk3/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


# Params

In [4]:
# Params
# Batch size forwarded to batch_hidden_states through gen_kwargs further down.
BATCH_SIZE = 1  # author's note: None means auto. 6 gives 16GB/25GB, where 10GB is the base model, so 6 is 6/15
# NOTE(review): USE_MCDROPOUT is not referenced anywhere else in this notebook - confirm whether it is still needed
USE_MCDROPOUT = True

from src.extraction.config import ExtractConfig


# Extraction settings: exactly one `model=` line and the dataset entries left uncommented are active.
cfg = ExtractConfig(
    # model="HuggingFaceH4/starchat-beta",
    # model="TheBloke/CodeLlama-13B-Instruct-fp16", # too large!
    model="WizardLM/WizardCoder-3B-V1.0",
    # model="WizardLM/WizardCoder-1B-V1.0",
    # model="WizardLM/WizardCoder-Python-7B-V1.0", # too large!
    
    ## see https://github.com/EleutherAI/elk/tree/1b60b3bff348b00356cd15b5eb017f9c9bfdbae1/elk/promptsource/templates
    datasets = (
        # "imdb", # sentiment
        # "amazon_polarity", # sentiment
        # "super_glue:boolq", # reading comprehension
        # 'tweet_eval:irony', # irony
        # 'great_code', # code
        'qasc', # Question Answering via Sentence Composition (QASC) # dataset has no label column
        
        ## Datasets with problems
        # 'lauritowal/redefine_math', # dataset has no label column
        # 'crows_pairs', # stereotypes FAIL need to specify label columns
        # 'hate_speech18', # weird errors
        # 'medical_questions_pairs', # medical paraphrase        
        # 'poem_sentiment' # no, only boolean for now
        # 'reaganjlee/truthful_qa_mc', # no, only bool
    ),
    # pair of example limits: the dataset cell below uses entry 0 when split_type is "train" and entry 1 otherwise
    max_examples=(250, 31),
    num_shots=1,
)
cfg

ExtractConfig(model='WizardLM/WizardCoder-3B-V1.0', datasets=('qasc',), data_dirs=(), int4=True, max_examples=(250, 31), num_shots=1, num_variants=-1, layers=(), seed=42, token_loc='last', template_path=None)

# Model

Choosing:
- https://old.reddit.com/r/LocalLLaMA/wiki/models
- https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard
- https://github.com/deep-diver/LLM-As-Chatbot/blob/main/model_cards.json


Uncensored, large coding models might be best for lying.

In [5]:
from src.models.load import verbose_change_param, AutoConfig, AutoTokenizer, AutoModelForCausalLM

def load_model(model_repo = "HuggingFaceH4/starchat-beta"):
    """Load a causal language model and its tokenizer from `model_repo`.

    The config is fetched with `use_cache=False`; the tokenizer gets
    `pad_token_id=0` and left-sided padding/truncation; the weights are loaded
    as float16 with `device_map="auto"`. Each override goes through
    `verbose_change_param` (presumably it sets the attribute and reports the
    change - confirm in src.models.load).

    Returns:
        A `(model, tokenizer)` tuple.
    """
    # see https://github.com/deep-diver/LLM-As-Chatbot/blob/main/models/starchat.py
    config = AutoConfig.from_pretrained(model_repo, use_cache=False)
    verbose_change_param(config, 'use_cache', False)

    tokenizer = AutoTokenizer.from_pretrained(model_repo)
    tokenizer_overrides = (
        ('pad_token_id', 0),
        ('padding_side', 'left'),
        ('truncation_side', 'left'),
    )
    for attr_name, attr_value in tokenizer_overrides:
        verbose_change_param(tokenizer, attr_name, attr_value)

    model = AutoModelForCausalLM.from_pretrained(
        model_repo,
        config=config,
        device_map="auto",
        # load_in_8bit=True,
        # load_in_4bit=True,
        # note: because datasets pickles the model into numpy to get the unique dataset name,
        # and because numpy doesn't support bfloat16, we need to use float16
        torch_dtype=torch.float16,
        # use_safetensors=False,
    )

    return model, tokenizer



# Load Dataset

/home/ubuntu/mambaforge/envs/dlk3/lib/python3.11/site-packages/elk/promptsource/templates/imdb ../src/prompts/templates/imdb
/home/ubuntu/mambaforge/envs/dlk3/lib/python3.11/site-packages/elk/promptsource/templates/amazon_polarity ../src/prompts/templates/amazon_polarity
/home/ubuntu/mambaforge/envs/dlk3/lib/python3.11/site-packages/elk/promptsource/templates/super_glue ../src/prompts/templates/super_glue
/home/ubuntu/mambaforge/envs/dlk3/lib/python3.11/site-packages/elk/promptsource/templates/tweet_eval ../src/prompts/templates/tweet_eval
/home/ubuntu/mambaforge/envs/dlk3/lib/python3.11/site-packages/elk/promptsource/templates/great_code ../src/prompts/templates/great_code
/home/ubuntu/mambaforge/envs/dlk3/lib/python3.11/site-packages/elk/promptsource/templates/qasc ../src/prompts/templates/qasc
/home/ubuntu/mambaforge/envs/dlk3/lib/python3.11/site-packages/elk/promptsource/templates/lauritowal/redefine_math ../src/prompts/templates/lauritowal/redefine_math
/home/ubuntu/mambaforge/env

In [6]:
from itertools import chain
import functools
from src.prompts.prompt_loading import load_prompts
        
# Build the prompt dataset for the first configured dataset name
split_type = "train"
ds_names = cfg.datasets
ds_name = ds_names[0]

# max_examples is a pair: entry 0 applies to the "train" split, entry 1 to any other split
N = cfg.max_examples[0] if split_type == "train" else cfg.max_examples[1]

prompt_kwargs = {
    "ds_string": ds_name,
    "num_shots": cfg.num_shots,
    "split_type": split_type,
    "template_path": cfg.template_path,
    "seed": cfg.seed,
    "prompt_format": 'llama',
    "N": N,
}
dataset = Dataset.from_generator(load_prompts, gen_kwargs=prompt_kwargs)

dataset

Generating train split: 0 examples [00:00, ? examples/s]

Downloading builder script:   0%|          | 0.00/5.12k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8134 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/920 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/926 [00:00<?, ? examples/s]

Extracting 8 variants of each prompt


DatasetGenerationError: An error occurred while generating the dataset

In [None]:
b = next(iter(dataset))
b

In [None]:
model, tokenizer = load_model(cfg.model)

## Format prompts

The prompt is the thing we most often have to change and debug. So we do it explicitly here.

We do it as transforms on a huggingface dataset.

In this case we use multishot examples from train, and use the test set to generate the hidden states dataset. We will test generalisation on a whole new dataset.


In [None]:
from src.datasets.scores import scores2choice_probs
from src.datasets.scores import choice2id, choice2ids

def row_choice_ids(r):
    """Pass one row's answer choices to `choice2ids`, each wrapped in its own single-item group."""
    grouped_choices = []
    for choice in r['answer_choices']:
        grouped_choices.append([choice])
    return choice2ids(grouped_choices, tokenizer)


In [None]:
def _tokenize_questions(batch):
    """Tokenize a batch of `question` strings, padded/truncated to a fixed length of 600 tokens."""
    return tokenizer(
        batch["question"],
        padding="max_length",
        max_length=600,
        truncation=True,
        add_special_tokens=True,
        # return_tensors="pt",
        return_attention_mask=True,
    )


def _decode_inputs(batch):
    """Decode the token ids back to text so the prompt, as tokenized, can be inspected."""
    return {"prompt_truncated": tokenizer.batch_decode(batch["input_ids"])}


def _attach_choice_ids(row):
    """Add the token ids for this row's answer choices."""
    return {'choice_ids': row_choice_ids(row)}


ds = (
    dataset
    .map(_tokenize_questions, batched=True)
    .map(_decode_inputs, batched=True)
    .map(_attach_choice_ids)
)
ds

In [None]:
# %debug

## Save as Huggingface Dataset

In [None]:
# get dataset filename
def sanitize(s):
    """Remove '/' and turn '-' into '_'; None is returned unchanged."""
    if s is None:
        return s
    return s.replace('/', '').replace('-', '_')


dataset_name = f"{sanitize(cfg.model)}_{ds_name}_{split_type}_{N}"

# path the finished dataset is written to by the save cell
f = f"../.ds/{dataset_name}"
print(f)

In [None]:
# Keyword arguments handed to batch_hidden_states by Dataset.from_generator below
gen_kwargs = {
    "model": model,
    "tokenizer": tokenizer,
    "data": ds,
    "batch_size": BATCH_SIZE,
}
gen_kwargs

In [None]:
# ds['choice_ids']
# Display the `attn.c_attn` submodule of the block at index 10 of `model.transformer.h`
l = model.transformer.h[10]
l.attn.c_attn


In [None]:
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py

In [None]:
# Metadata about this extraction run; it is passed to json.dumps for the DatasetInfo description below
info_kwargs = dict(extract_cfg=cfg, ds_name=ds_name, split_type=split_type, f=f)

# NOTE(review): load_model in this notebook already loads with device_map="auto" - confirm this explicit .cuda() is still needed
model.cuda()

In [None]:
# # test, debug
# g = batch_hidden_states(**gen_kwargs)
# bb = next(iter(g))
# print({k:bb[k].shape for k in bb['large_arrays_keys']})
# print({k:bb[k].dtype for k in bb['large_arrays_keys']})

In [None]:
# from src.helpers.typing import float_to_int16, int16_to_float
# import torch
# x = torch.rand(4, 5, dtype=torch.float)
# x2 = float_to_int16(x)
# x3 = int16_to_float(x2)
# x3-x
# # x.type(torch.float)-x

[DatasetInfo](https://github.com/huggingface/datasets/blob/9b21e181b642bd55b3ef68c1948bfbcd388136d6/src/datasets/info.py#L94)


In [None]:
# Run batch_hidden_states over the prompts (with gen_kwargs) and collect what it yields into a dataset.
ds1 = Dataset.from_generator(
    generator=batch_hidden_states,
    info=DatasetInfo(
        # NOTE(review): confirm DatasetInfo accepts a `name` field in the installed datasets version
        name=dataset_name,
        # fixed: a stray `)",` after this argument made the whole cell a SyntaxError.
        # default=str keeps the dump from failing on values json cannot encode natively
        # (info_kwargs carries the ExtractConfig object).
        description=json.dumps(info_kwargs, indent=2, default=str),
        config_name=f,
        citation="",
        homepage="",
        # NOTE(review): datasets may reject an empty version string (it expects x.y.z) - confirm
        version="",
    ),
    gen_kwargs=gen_kwargs,
    num_proc=1,
)#.with_format("numpy")
# ds1

## Add labels

For our probe. Given next_token scores (logits) we take only the subset that corresponds to our negative tokens (e.g. False, no, ...) and positive tokens (e.g. Yes, yes, affirmative, ...).


In [None]:
# from src.datasets.scores import choice2id, choice2ids
# ds1['hidden_states']

In [None]:
# def expand_choices(choices: List[str]) -> List[str]:
#     """expand out choices by adding versions that are upper, lower, whitespace, etc"""
#     new = []
#     for c in choices:
#         new.append(c)
#         new.append(c.upper())
#         new.append(c.capitalize())
#         new.append(c.lower())
#     return set(new)


# left_choices = list(r[0] for r in ds1['answer_choices'])+['no', 'false', 'negative', 'wrong']
# right_choices = list(r[1] for r in ds1['answer_choices'])+['yes', 'true', 'positive', 'right']
# left_choices, right_choices = expand_choices(left_choices), expand_choices(right_choices)
# expanded_choices = [left_choices, right_choices]
# expanded_choice_ids = choice2ids(expanded_choices, tokenizer)
# expanded_choices

In [None]:
ds1

In [None]:
# this is just based on pairs for that answer...
def add_txt_ans0(r):
    """Decode the argmax token of `scores0` into text, stored under `txt_ans0`."""
    best_token = r['scores0'].argmax(-1)
    return {'txt_ans0': tokenizer.decode(best_token)}
# add_txt_ans1 = lambda r: {'txt_ans1': tokenizer.decode(r['scores1'].argmax(-1))}


# NOTE: same definition as the `row_choice_ids` in the "Format prompts" section of this notebook
def row_choice_ids(r):
    """Pass one row's answer choices to `choice2ids`, each wrapped in its own single-item group."""
    wrapped = [[c] for c in r['answer_choices']]
    return choice2ids(wrapped, tokenizer)


# Either just use the template choices
def add_ans(r):
    """Apply `scores2choice_probs` to `scores0` using this row's own choice ids."""
    return scores2choice_probs(r, row_choice_ids(r), keys=["scores0"])

# Or all expanded choices
# add_ans_exp = lambda r: scores2choice_probs(r, expanded_choice_ids, prefix="expanded_")
ds1.set_format(type='numpy')#, columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])
ds1_with_probs = ds1.map(add_ans)
# .map(add_ans_exp)
ds3 = ds1_with_probs.map(add_txt_ans0)
# .map(add_txt_ans1)
ds3

In [None]:
ds3.config_name

## Save to disk

In [None]:
ds3.save_to_disk(f)
f

# QC

In [None]:
from src.datasets.load import load_ds
ds4 = load_ds(f)
ds4

In [None]:
# [v for k,v in ds4[0].items() if isinstance(v, (np.ndarray, np.generic, torch.Tensor))]
# QC: every numeric array in the first row must be free of NaN/inf.
numeric_dtype_names = {'float16', 'float32', 'float64', 'int64', 'int32', 'int16', 'int8'}
for k, v in ds4[0].items():
    # a column is not guaranteed to be array-like, so do not assume .shape/.dtype exist when printing
    print(k, getattr(v, 'shape', None), getattr(v, 'dtype', type(v).__name__))
    if not isinstance(v, (np.ndarray, np.generic, torch.Tensor)):
        continue
    # str(dtype) is 'float32' for numpy but 'torch.float32' for torch; keep only the last part so
    # torch tensors are checked too (comparing a torch dtype to a plain string is never equal)
    dtype_name = str(v.dtype).split('.')[-1]
    if dtype_name in numeric_dtype_names:
        finite = torch.isfinite(v) if isinstance(v, torch.Tensor) else np.isfinite(v)
        assert finite.all(), f"column {k!r} contains non-finite values"

In [None]:

# QC, check which answers are most common
common_answers = pd.Series(ds4['txt_ans0']).value_counts()
display('Remember it should be binary. Found common LLM answers:', common_answers)

# Warn about frequent model answers that are not among the template's answer choices
current_choices = set(chain(*ds4['answer_choices']))
unexpected_answers = set(common_answers.head(10).index)-current_choices
if unexpected_answers:
    logger.warning(f'found unexpected answers: {unexpected_answers}. You may want to add them to class2choices')

# Minimum mean probability mass that must fall on the answer choices.
# The message previously claimed ">40%" while the check used 0.2; both now use this one value.
min_mean_choice_prob = 0.2
mean_prob = ds4['choice_probs0'].sum(-1).mean()
print('mean_prob', mean_prob)
assert mean_prob>min_mean_choice_prob, f"""
Our choices should cover most common answers. But they accounted for a mean probability of {mean_prob:2.2%} (should be >{min_mean_choice_prob:.0%}). 

To fix this you might want to improve your prompt or add to your choices
"""

In [None]:
df = ds2df(ds4)
df.head(5)

In [None]:
# QC check accuracy
# It should manage to lie some of the time when asked to lie. Many models won't lie unless very
# explicitly asked to, but we don't want to do that: we want to leave some ambiguity in the prompt.

lie_rows = df.query('instructed_to_lie==True')
followed_instruction = lie_rows.label_instructed == lie_rows.llm_ans
acc = followed_instruction.mean()
print(f"when the model tries to lie... we get this acc {acc:2.2f}")
assert acc>0.1, f"should be acc>0.1 but is acc={acc}"

### QC stats

In [None]:
def stats(df):
    """Summarise a frame: `acc` is the share of rows where llm_ans equals label_instructed, `n` the row count."""
    matches = df.llm_ans == df.label_instructed
    return {"acc": matches.mean(), "n": len(df)}


def col2statsdf(df, group):
    """Compute `stats` for each `group` value and return a frame with one row per group."""
    per_group = df.groupby(group).apply(stats)
    # per_group maps each group key to its stats dict; transpose so groups become rows
    return pd.DataFrame(per_group.to_dict()).T
    
    
print("how well does it do the simple task of telling the truth, for each template")
col2statsdf(df.query('sys_instr_name=="truth"'), 'template_name')

In [None]:
print("how well does it complete the task for each prompt")
# of course getting it to tell the truth is easy, but how effective are the other prompts?
col2statsdf(df, 'sys_instr_name')

### QC view row

In [None]:
# QC by viewing a row
r = ds4[0]
print(r['prompt_truncated'])
print(r['txt_ans0'])

# QC: generation

Let's do a quick generation, so we can QC the output and sanity check that the model can actually do the task

In [None]:
# r = ds[2]
# q = r["prompt_truncated"]

# pipeline = transformers.pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
# )
# sequences = pipeline(
#     q.lstrip('<|endoftext|>'),
##     max_length=100,
# max_new_tokens=10,
#     do_sample=False,
#     return_full_text=False,
#     eos_token_id=tokenizer.eos_token_id,
# )

# for seq in sequences:
#     print("-" * 80)
#     print(q)
#     print("-" * 80)
#     print(f"`{seq['generated_text']}`")
#     print("-" * 80)
#     print("label", r['label'])


# QC: linear probe

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

In [None]:
# Just select the questions where the model knows the answer.
df = ds2df(ds4)
truth_rows = df.query('sys_instr_name=="truth"').set_index("example_i")

# these are the ones where it got it right when asked to tell the truth
answered_correctly = truth_rows.llm_ans==truth_rows.label_true
known_indices = truth_rows[answered_correctly].index
print(f"select rows are {answered_correctly.mean():2.2%} based on knowledge")

# convert to row numbers, and use datasets to select
known_rows = df['example_i'].isin(known_indices)
known_rows_i = df[known_rows].index

# # also restrict it to significant permutations. That is monte carlo dropout pairs, where the answer changes by more than X%
# m = np.abs(df.ans0-df.ans1)>0.05
# print(f"selected rows are {m.mean():2.2%} for significance")
# significant_rows = m[m].index

# allowed_rows_i = set(known_rows_i).intersection(significant_rows)
# allowed_rows_i = significant_rows
ds5 = ds4.select(known_rows_i)
# from here on `df` describes the filtered dataset ds5
df = ds2df(ds5)

In [None]:
# [v for k,v in ds4[0].items()]
# ds4[0]['hidden_states'].dtype

In [None]:
large_arrays_keys = [k for k,v in ds4[0].items() if v.ndim>1]
large_arrays_keys

In [None]:
# Probe target: does the model's answer match the true label?
# It is identical for every array, so it is computed and split once rather than per loop iteration.
y = df['label_true'] == df['llm_ans']
n = len(y)
max_rows = 1000  # cap on the number of rows taken from each half

# split: first half trains, second half tests
y_train = y[:n//2][:max_rows]
y_test = y[n//2:][:max_rows]

for k in large_arrays_keys:
    print('-'*80)
    print(k)
    hs = ds5[k]
    # features and labels are paired by position, so their row counts must agree
    assert hs.shape[0] == n, f"{k}: {hs.shape[0]} rows of features but {n} labels"
    # flatten everything after the row axis into one feature vector per row
    X = hs.reshape(hs.shape[0], -1)

    X_train = X[:n//2][:max_rows]
    X_test = X[n//2:][:max_rows]
    print('split size', X_train.shape, y_test.shape)

    # scale (fit on the training half only)
    scaler = RobustScaler()
    scaler.fit(X_train)
    X_train2 = scaler.transform(X_train)
    X_test2 = scaler.transform(X_test)

    lr = LogisticRegression(class_weight="balanced", penalty="l2", max_iter=380)
    lr.fit(X_train2, y_train>0)

    print("Logistic cls acc: {: 3.2%} [TRAIN]".format(lr.score(X_train2, y_train>0)))
    print("Logistic cls acc: {: 3.2%} [TEST]".format(lr.score(X_test2, y_test>0)))

# Scratch

In [None]:
# QC: make sure we didn't lose all of the successful lies, which would make the problem trivial
df_known = ds2df(ds5)
successful_lies = df_known.query("instructed_to_lie==True & (llm_ans==label_instructed)")
n_successful_lies = len(successful_lies)
print(f"filtered to {n_successful_lies} num successful lies out of {len(df_known)} dataset rows")
assert n_successful_lies>0, "there should be successful lies in the dataset"