In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from IPython.display import display, HTML, Markdown
from torch.utils.data import random_split, DataLoader, TensorDataset
from loguru import logger
import sys
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM, AutoConfig, PreTrainedTokenizerBase, PreTrainedTokenizer, GPTQConfig, BitsAndBytesConfig

# Swap loguru's pre-installed sink for a single stderr sink that emits
# INFO-and-above records as "<time> <level> <message>".
_log_format = "{time} {level} {message}"
logger.remove()
logger.add(sys.stderr, level="INFO", format=_log_format)


In [None]:
# load my code
# Turn on IPython's autoreload extension in mode 2, so that imported modules
# are re-imported before each cell runs and edits to the project code take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Project-local helpers from the `src` package.
from src.eval.collect import manual_collect2
from src.eval.ds import ds2df, qc_ds, qc_dsdf
from src.prompts.prompt_loading import load_prompts, format_prompt, load_preproc_dataset
from src.models.load import load_model


In [None]:
# Evaluate several phi-2 fine-tunes on the preprocessed "amazon_polarity"
# dataset and record, per model, the accuracy on rows where the prompt did
# not instruct the model to lie.
model_ids = [
    "malhajar/phi-2-chat",
    "Yhyu13/phi-2-sft-alpaca_gpt4_en-ep1", # has benchmark!
    "venkycs/phi-2-ultrachat200k",
    "Walmart-the-bag/phi-2-uncensored",
    "Mit1208/phi-2-universal-NER", # named entity recognition
    "chendelong/phi-2-finetuned-dialogstudio",
    "TharunSiva/phi-2-oasst1-100steps", # ?
    "Yhyu13/LMCocktail-phi-2-v1", # merge
]
N = 80  # forwarded to load_preproc_dataset; presumably the example count — TODO confirm
res = {}  # model_id -> accuracy on the rows with instructed_to_lie == False
for model_id in model_ids:
    print(model_id)

    # Drop our references to the previous iteration's model before loading the
    # next one, so two sets of weights do not have to be resident at once.
    # Rebinding (rather than `del`) keeps both names defined after the loop.
    model = tokenizer = None
    torch.cuda.empty_cache()

    # load model
    model, tokenizer = load_model(model_id, dtype=torch.float16)

    # load dataset
    ds = load_preproc_dataset("amazon_polarity", tokenizer, N).with_format("torch")

    # eval
    dl = DataLoader(ds, batch_size=4, shuffle=False, num_workers=0)
    ds_out, f = manual_collect2(dl, model, get_residual=False)
    print(f'for {model_id}:')
    try:
        qc_ds(ds_out)
    except AssertionError as e:
        # A failed QC assertion is reported but does not stop the sweep.
        print(e)
    except Exception:
        # Log any other QC failure with its traceback and carry on.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit
        # propagate so a long run can still be interrupted.
        logger.exception(f'failed for {model_id}')

    # record overall acc
    # NOTE(review): this previously read `ds2df(ds)` (the *input* dataset),
    # leaving `ds_out` unused apart from QC. The `_base` suffix stripping and
    # the `ans` column look like fields of the collected output, so the
    # collected dataset is used here — confirm against `ds2df`/`manual_collect2`.
    df = ds2df(ds_out)
    df = df.rename(columns=lambda x: x.replace('_base', '')).copy()
    d = df.query('instructed_to_lie==False')
    # Fraction of the not-instructed-to-lie rows whose `ans` equals `label_instructed`.
    acc = (d.label_instructed==d['ans']).mean()
    res[model_id] = acc
