In [1]:
import os

os.environ["TQDM_DISABLE"] = "1"


Quick notebook to check model accuracy

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from IPython.display import display, HTML, Markdown
from torch.utils.data import random_split, DataLoader, TensorDataset
from loguru import logger
import sys
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM, AutoConfig, PreTrainedTokenizerBase, PreTrainedTokenizer, GPTQConfig, BitsAndBytesConfig

logger.remove()
logger.add(sys.stderr, format="{time} {level} {message}", level="WARNING")


  from .autonotebook import tqdm as notebook_tqdm


1

In [3]:
# load my code
%load_ext autoreload
%autoreload 2


In [4]:
from src.eval.collect import manual_collect2
from src.eval.ds import ds2df, qc_ds, qc_dsdf
from src.prompts.prompt_loading import load_prompts, format_prompt, load_preproc_dataset, load_preproc_datasets
from src.models.load import load_model
from src.models.phi.model_phi import PhiForCausalLMWHS


In [5]:
model_ids = [
        ["Walmart-the-bag/phi-2-uncensored", 'phi'],
    # ["malhajar/phi-2-chat", 'phi'],
    ["Yhyu13/phi-2-sft-alpaca_gpt4_en-ep1", 'phi'], # has benchmark!
    # ["chendelong/phi-2-finetuned-dialogstudio", 'phi'],
    # # ["Mit1208/phi-2-universal-NER", 'phi'], # named entity recognition no config.json
    # ["TharunSiva/phi-2-oasst1-100steps", 'phi'], # ?
    # ["Yhyu13/LMCocktail-phi-2-v1", 'phi'], # merge
    [ "wassname/phi-2-w_hidden_states", 'phi'], # merge
    [ "wassname/phi-1_5-w_hidden_states", 'phi'], # merge

    # "venkycs/phi-2-ultrachat200k", # broken


]
N = 280
res = {}
datasets = ["super_glue:boolq", "imdb" , "glue:qnli", "amazon_polarity"]
for model_id, prompt_format in model_ids:
    try:
        print(model_id)

        # load model
        model, tokenizer = load_model(model_id, dtype=torch.float16, model_class=PhiForCausalLMWHS)

        # load dataset
        ds_tokens = load_preproc_datasets(datasets,
                            tokenizer,
                N=N,
                prompt_format=prompt_format,
        )
        # ds_tokens
        # ds = load_preproc_dataset("super_glue:boolq", tokenizer, N, prompt_format=prompt_format).with_format("torch")
        # print(ds['question'][0])


        # eval
        dl = DataLoader(ds_tokens, batch_size=6, shuffle=False, num_workers=0)
        ds_out, f = manual_collect2(dl, model, get_residual=False)
        print(f'for {model_id}:')
        try:
            qc_ds(ds_out)
        except AssertionError as e:
            print(e)
        except:
            logger.exception(f'failed for {model_id}')

        # record overall acc
        df = ds2df(ds_out)
        df = df.rename(columns=lambda x: x.replace('_base', '')).copy()
        d = df.query('instructed_to_lie==False')
        acc = (d.label_instructed==d['ans']).mean()
        res[model_id] = acc
    except KeyboardInterrupt:
        raise
    except:
        logger.exception(f'failed for {model_id}')


Walmart-the-bag/phi-2-uncensored


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
  table = cls._concat_blocks(blocks, axis=0)
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


for Walmart-the-bag/phi-2-uncensored:
with base model
	balance=	51.41% [N=284]
	acc    =	73.05% [N=141]      - when the model is not lying... we get this task acc
	lie_acc=	35.66% [N=143]      - when the model tries to lie... we get this acc
	known_lie_acc=	25.71% [N=70]      - when the model tries to lie and knows the answer... we get this acc
	choice_cov=	87.37%             - Our choices accounted for a mean probability of this
Yhyu13/phi-2-sft-alpaca_gpt4_en-ep1


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


for Yhyu13/phi-2-sft-alpaca_gpt4_en-ep1:
with base model
	balance=	51.41% [N=284]
	acc    =	78.72% [N=141]      - when the model is not lying... we get this task acc
	lie_acc=	27.97% [N=143]      - when the model tries to lie... we get this acc
	known_lie_acc=	27.06% [N=85]      - when the model tries to lie and knows the answer... we get this acc
	choice_cov=	86.58%             - Our choices accounted for a mean probability of this
wassname/phi-2-w_hidden_states


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


for wassname/phi-2-w_hidden_states:
with base model
	balance=	51.41% [N=284]
	acc    =	76.60% [N=141]      - when the model is not lying... we get this task acc
	lie_acc=	28.67% [N=143]      - when the model tries to lie... we get this acc
	known_lie_acc=	26.58% [N=79]      - when the model tries to lie and knows the answer... we get this acc
	choice_cov=	83.37%             - Our choices accounted for a mean probability of this
wassname/phi-1_5-w_hidden_states


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


for wassname/phi-1_5-w_hidden_states:
with base model
	balance=	51.41% [N=284]
	acc    =	63.12% [N=141]      - when the model is not lying... we get this task acc
	lie_acc=	39.16% [N=143]      - when the model tries to lie... we get this acc
	known_lie_acc=	36.07% [N=61]      - when the model tries to lie and knows the answer... we get this acc
	choice_cov=	75.41%             - Our choices accounted for a mean probability of this


In [9]:
res


{'Walmart-the-bag/phi-2-uncensored': 0.7304964539007093,
 'Yhyu13/phi-2-sft-alpaca_gpt4_en-ep1': 0.7872340425531915,
 'wassname/phi-2-w_hidden_states': 0.7659574468085106,
 'wassname/phi-1_5-w_hidden_states': 0.6312056737588653}

In [6]:
qc_dsdf(d)


	balance=	51.77% [N=141]
	acc    =	63.12% [N=141]      - when the model is not lying... we get this task acc
	choice_cov=	72.35%             - Our choices accounted for a mean probability of this


{'balance': 0.5177304964539007,
 'N': 141,
 'acc': 0.6312056737588653,
 'tchoice_cov': 0.72353953}

In [7]:
print('acc by dataset and template name')
df1 = ds2df(ds_out)
df_b = df1.rename(columns=lambda x: x.replace('_base', '')).copy()
# df_a = df1.rename(columns=lambda x: x.replace('_adapt', '')).copy()
for ds_string, ddf in df_b.groupby(['ds_string', 'template_name']):
    print(ds_string)
    qc_dsdf(ddf)


acc by dataset and template name
('amazon_polarity', 'Is_this_product_review_positive')
	balance=	85.71% [N=7]
	acc    =	50.00% [N=2]      - when the model is not lying... we get this task acc
	lie_acc=	60.00% [N=5]      - when the model tries to lie... we get this acc
	choice_cov=	77.44%             - Our choices accounted for a mean probability of this
('amazon_polarity', 'Is_this_review')
	balance=	50.00% [N=6]
	acc    =	100.00% [N=2]      - when the model is not lying... we get this task acc
	lie_acc=	50.00% [N=4]      - when the model tries to lie... we get this acc
	choice_cov=	98.53%             - Our choices accounted for a mean probability of this
('amazon_polarity', 'Is_this_review_negative')
	balance=	16.67% [N=6]
	acc    =	33.33% [N=3]      - when the model is not lying... we get this task acc
	lie_acc=	66.67% [N=3]      - when the model tries to lie... we get this acc
	choice_cov=	89.24%             - Our choices accounted for a mean probability of this
('amazon_polarity',

In [8]:
df_b.groupby(['ds_string', 'template_name']).apply(lambda d:qc_dsdf(d)['acc'])


	balance=	85.71% [N=7]
	acc    =	50.00% [N=2]      - when the model is not lying... we get this task acc
	lie_acc=	60.00% [N=5]      - when the model tries to lie... we get this acc
	choice_cov=	77.44%             - Our choices accounted for a mean probability of this
	balance=	50.00% [N=6]
	acc    =	100.00% [N=2]      - when the model is not lying... we get this task acc
	lie_acc=	50.00% [N=4]      - when the model tries to lie... we get this acc
	choice_cov=	98.53%             - Our choices accounted for a mean probability of this
	balance=	16.67% [N=6]
	acc    =	33.33% [N=3]      - when the model is not lying... we get this task acc
	lie_acc=	66.67% [N=3]      - when the model tries to lie... we get this acc
	choice_cov=	89.24%             - Our choices accounted for a mean probability of this
	balance=	85.71% [N=7]
	acc    =	66.67% [N=6]      - when the model is not lying... we get this task acc
	lie_acc=	0.00% [N=1]      - when the model tries to lie... we get this acc
	choice_cov

KeyError: 'acc'

In [None]:
model_id = "Yhyu13/phi-2-sft-alpaca_gpt4_en-ep1"
model, tokenizer = load_model(model_id, dtype=torch.float16)

print(f'for {model_id}:')
for prompt_format in ['phi', 'alpaca', 'chatml']:
    print(prompt_format)
    ds = load_preproc_dataset("amazon_polarity", tokenizer, 50, prompt_format=prompt_format).with_format("torch")
    
    dl = DataLoader(ds, batch_size=4, shuffle=False, num_workers=0)
    ds_out, f = manual_collect2(dl, model, get_residual=False)
    qc_ds(ds_out)

