In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from open_pref_eval import evaluate

In [3]:
# Load the tokenizer and the causal LM through the transformers library.
# Other checkpoints that can be swapped in:
#   model_name = "microsoft/Phi-3-mini-4k-instruct"
#   model_name = "trl-internal-testing/dummy-GPT2-correct-vocab"
model_name = "gpt2"

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# When the tokenizer defines no pad token, reuse the EOS token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Fetch a small slice of each preference dataset via the datasets library

N = 100  # rows taken from each dataset -- kept small for a quick eval

dataset_toxic = load_dataset(
    'unalignment/toxic-dpo-v0.2',
    split=f'train[:{N}]',
    keep_in_memory=False,
)

# Some datasets need to be coerced into trl's dpo format: here the response
# columns are renamed to `chosen` / `rejected` (first N rows of the validation split)
dataset_helpsteer2 = (
    load_dataset(
        'Atsunori/HelpSteer2-DPO',
        split=f'validation[:{N}]',
        keep_in_memory=False,
    )
    .rename_column('chosen_response', 'chosen')
    .rename_column('rejected_response', 'rejected')
)

# Some datasets come pre-formatted with the package
from open_pref_eval.datasets.tqa import load_tqa_dpo
dataset_tqa = load_tqa_dpo(N)

# Peek at a single row to see the (simple) record format
dataset_tqa[0]

In [None]:
# Evaluate the model on all three datasets in one call.
# Presumably returns (summary frame, per-example frame), judging by the
# names the result is unpacked into -- confirm against open_pref_eval.
df_results, df_raw = evaluate(
    model,
    tokenizer,
    datasets=[
        dataset_tqa,
        dataset_helpsteer2,
        dataset_toxic,
    ],
)
# df_results

In [None]:
# radar plot

In [None]:
# markdown table
# `df` was never defined in this notebook (NameError on Restart & Run All);
# render the results frame produced by evaluate() instead.
df_results.to_markdown()

In [None]:
# pretty html table

## Advanced usage

In [None]:
from open_pref_eval.trainer import OPEConfig, OPETrainer

In [None]:
from open_pref_eval.trainer import dummy_dataset, get_dummy_trainer

# Build a trainer around the already-loaded model and tokenizer, with an
# explicit eval batch size and maximum sequence length.
trainer = get_dummy_trainer(
    model,
    tokenizer,
    per_device_eval_batch_size=2,
    max_length=512,
)

# Display the trainer object as the cell output
trainer

In [None]:
# Show the trainer's arguments as a dictionary
trainer.args.to_dict()

In [None]:
# Evaluate through a pre-built trainer, referring to datasets by name.
# NOTE(review): "trufullqa" looks like a misspelling of TruthfulQA -- confirm
# which dataset names evaluate() accepts before changing the string.
results = evaluate(
    datasets=[
        'Atsunori/HelpSteer2-DPO',
        "trufullqa",
        "toxic",
    ],
    trainer=trainer,
)
print(results)

## What are these columns?

main:
- dataset: the dataset the example is from
- ds_i: the dataset row
- correct: whether the model preferred the correct answer
- prob: how much the model preferred the correct answer

Advanced:
- _logratio: how much the model preferred the correct answer (in log probability)
- _l_chosen: the length of the chosen string in tokens (use to check for length bias)
- _l_rejected: the length of the rejected string in tokens (use to check for length bias)
- _chosen_logps: the log probability of the chosen string in absolute terms; use this to check for absolute coherence

In [None]:
# Display the second value returned by the first evaluate() call
# (presumably the per-example rows whose columns are described above -- confirm)
df_raw