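Quick notebook to check model accuracy: loads several phi-2 fine-tunes, evaluates each on the `amazon_polarity` prompts, and records its accuracy on the rows where it was not instructed to lie.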

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from IPython.display import display, HTML, Markdown
from torch.utils.data import random_split, DataLoader, TensorDataset
from loguru import logger
import sys
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM, AutoConfig, PreTrainedTokenizerBase, PreTrainedTokenizer, GPTQConfig, BitsAndBytesConfig

# Replace loguru's existing handlers with a single stderr sink at INFO level,
# using a compact "<time> <level> <message>" line format.
logger.remove()
# Trailing semicolon: logger.add() returns the new handler id, which would
# otherwise be echoed as this cell's output.
logger.add(sys.stderr, format="{time} {level} {message}", level="INFO");


  from .autonotebook import tqdm as notebook_tqdm


1

In [2]:
# Enable IPython's autoreload extension so that edits to the project's own
# modules (the `src.*` imports below) are picked up without restarting the kernel.
# Mode 2 re-imports all modules before each cell is executed.
%load_ext autoreload
%autoreload 2


In [3]:
from src.eval.collect import manual_collect2
from src.eval.ds import ds2df, qc_ds, qc_dsdf
from src.prompts.prompt_loading import load_prompts, format_prompt, load_preproc_dataset
from src.models.load import load_model


In [4]:
# Candidate phi-2 fine-tunes to compare (model ids passed to load_model).
model_ids = [
    "malhajar/phi-2-chat",
    "Yhyu13/phi-2-sft-alpaca_gpt4_en-ep1", # has benchmark!
    "venkycs/phi-2-ultrachat200k",
    "Walmart-the-bag/phi-2-uncensored",
    "Mit1208/phi-2-universal-NER", # named entity recognition
    "chendelong/phi-2-finetuned-dialogstudio",
    "TharunSiva/phi-2-oasst1-100steps", # ?
    "Yhyu13/LMCocktail-phi-2-v1", # merge
]
N = 80  # size argument handed to load_preproc_dataset for every model
res = {}  # model_id -> accuracy on the rows where the model was not told to lie

model = tokenizer = None
for model_id in model_ids:
    print(model_id)

    # Best effort: drop the previous checkpoint *before* loading the next one so
    # two models never have to coexist in memory, then hand cached CUDA blocks
    # back to the driver (a no-op when CUDA has not been initialised).
    model = tokenizer = None
    torch.cuda.empty_cache()

    # load model
    model, tokenizer = load_model(model_id, dtype=torch.float16)

    # load dataset (tokenized with this model's tokenizer, returned as torch tensors)
    ds = load_preproc_dataset("amazon_polarity", tokenizer, N).with_format("torch")

    # eval
    dl = DataLoader(ds, batch_size=4, shuffle=False, num_workers=0)
    ds_out, f = manual_collect2(dl, model, get_residual=False)
    print(f'for {model_id}:')
    try:
        qc_ds(ds_out)
    except AssertionError as e:
        # A failed QC assertion is reported but must not stop the comparison.
        print(e)
    except Exception:
        # Only swallow ordinary errors: a bare `except:` would also trap
        # KeyboardInterrupt/SystemExit and make the loop impossible to interrupt.
        logger.exception(f'failed for {model_id}')

    # record overall acc
    # Build the frame from the collected model outputs (`ds_out`), i.e. the same
    # dataset that was just quality-checked, not from the raw input dataset `ds`.
    # NOTE(review): assumes `ds_out` carries the `*_base` columns that are renamed
    # below (`ans`, `label_instructed`, `instructed_to_lie`) - confirm.
    df = ds2df(ds_out)
    df = df.rename(columns=lambda x: x.replace('_base', ''))
    d = df.query('instructed_to_lie==False')
    acc = (d.label_instructed==d['ans']).mean()
    res[model_id] = acc



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.38s/it]
  table = cls._concat_blocks(blocks, axis=0)
2023-12-22T09:52:15.018878+0800 INFO Extracting 11 variants of each prompt
Generating train split: 242 examples [00:41,  5.78 examples/s]
format_prompt: 100%|██████████| 242/242 [00:00<00:00, 7983.12 examples/s]
tokenize: 100%|██████████| 242/242 [00:00<00:00, 2793.65 examples/s]
truncated: 100%|██████████| 242/242 [00:00<00:00, 2179.98 examples/s]
truncated: 100%|██████████| 242/242 [00:00<00:00, 2561.22 examples/s]
prompt_truncated: 100%|██████████| 242/242 [00:00<00:00, 371.00 examples/s]
choice_ids: 100%|██████████| 242/242 [00:00<00:00, 7571.74 examples/s]
2023-12-22T09:52:55.197613+0800 INFO median token length: 358.0 for amazon_polarity. max_length=999
2023-12-22T09:52:55.198160+0800 INFO truncation rate: 0.00% on amazon_polarity
Filter: 100%|████

	for malhajar/phi-2-chat:
	with base model
		balance=	45.71% [N=35]
		acc    =	47.06% [N=17]      - when the model is not lying... we get this task acc
		lie_acc=	61.11% [N=18]      - when the model tries to lie... we get this acc
		known_lie_acc=	100.00% [N=2]      - when the model tries to lie and knows the answer... we get this acc
		choice_cov=	0.50%             - Our choices accounted for a mean probability of this

In [None]:
# Display the collected results: a dict mapping each model id to its accuracy.
res
