# OpenPrefEval: Dead Simple Open LLM Evaluation

In [1]:
# Development convenience: with autoreload mode 2, imported modules are
# re-imported before each cell runs, so local edits are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2

## Super simple

In [2]:
from open_pref_eval import evaluate

# One call is enough: name a model and a dataset spec (here the first 100
# rows of the train split). The second return value is not needed in this cell.
results, _ = evaluate(
    model_name="gpt2",
    datasets=["unalignment/toxic-dpo-v0.2#train[:100]"],
)
results

## Hackable

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from open_pref_eval import evaluate

In [None]:
# Bring in the model and its tokenizer via the transformers library
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fall back to the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Build the evaluation sets with the datasets library
from open_pref_eval.datasets.tqa import load_tqa_dpo

N = 100  # rows taken from each dataset; kept small for a quick eval

# Already has the column names we need
dataset_toxic = load_dataset('unalignment/toxic-dpo-v0.2', split=f'train[:{N}]', keep_in_memory=False)

# Validation split; its response columns are renamed to the
# `chosen` / `rejected` names of trl's dpo format
dataset_helpsteer2 = (
    load_dataset('Atsunori/HelpSteer2-DPO', split=f'validation[:{N}]', keep_in_memory=False)
    .rename_column('chosen_response', 'chosen')
    .rename_column('rejected_response', 'rejected')
)

# Pre-formatted loader that ships with open_pref_eval
dataset_tqa = load_tqa_dpo(N)

datasets = [dataset_tqa, dataset_helpsteer2, dataset_toxic]

# Peek at a single record to see the (simple) format
dataset_tqa[0]

In [None]:
# Evaluate the loaded model on all three datasets.
# df_results is displayed here; df_raw is inspected at the end of the notebook.
df_results, df_raw = evaluate(
    model=model,
    tokenizer=tokenizer,
    datasets=datasets,
)
df_results

Everyone needs one of these radar plots ;p

In [None]:
from open_pref_eval.plot.radar import radar_plot
# Draw a radar chart from the `correct` column of the results table
radar_plot(df_results['correct'])

In [None]:
# Emit the results, rounded to three decimals, as a markdown table
# (print is intentional: the output is text meant to be copy-pasted)
print(
    df_results
    .round(3)
    .to_markdown()
)

In [None]:
# Styled HTML table: `prob` shown with two decimals, and in-cell bars
# drawn for both `correct` and `prob` on a fixed 0-1 scale
(
    df_results.style
    .format({'prob': '{:.2f}'})
    .bar(subset=['correct', 'prob'], color='lightblue', vmin=0, vmax=1)
)

## Advanced usage

In [None]:
from open_pref_eval.trainer import OPEConfig, OPETrainer, get_dummy_trainer, dummy_dataset

When using `get_dummy_trainer` you can pass in keyword arguments from these two parent classes:

- [transformers.TrainingArguments](https://huggingface.co/docs/transformers/v4.43.3/en/main_classes/trainer#transformers.TrainingArguments)
- [trl.DPOConfig](https://huggingface.co/docs/trl/main/en/dpo_trainer#trl.DPOConfig)

In [None]:
# Ready-made trainer; the extra keyword arguments are config options
# (here the eval batch size and max_length).
trainer = get_dummy_trainer(model, tokenizer, per_device_eval_batch_size=2, max_length=512)

# evaluate() hands back two values (as in the earlier cells), so unpack them:
# binding the call to a single name would leave `results` holding a tuple.
results, _ = evaluate(datasets=datasets, trainer=trainer)
results

In [None]:
# Alternatively, construct the trainer yourself for full control over its config.
# No reference model is supplied, and the dummy dataset fills the
# train/eval dataset arguments.
trainer = OPETrainer(
    model=model,
    ref_model=None,
    args=OPEConfig(per_device_eval_batch_size=2),
    tokenizer=tokenizer,
    train_dataset=dummy_dataset,
    eval_dataset=dummy_dataset,
)

# evaluate() hands back two values (as in the earlier cells), so unpack them:
# binding the call to a single name would leave `results` holding a tuple.
results, _ = evaluate(datasets=datasets, trainer=trainer)
results

## What are these columns?

main:
- dataset: the dataset the example is from
- ds_i: the dataset row
- correct: rate at which the model preferred the correct answer
- prob: mean probability with which the model preferred the correct answer

Advanced: 
- _logratio: how much the model preferred the correct answer (in log probability)
- _l_chosen: the length of the chosen string in tokens (use to check for length bias)
- _l_rejected: the length of the rejected string in tokens (use to check for length bias)
- _chosen_logps: to compare, in absolute terms, the log probability of the chosen string, this helps check for absolute coherence

For more information, try reading the [DPO Paper](https://arxiv.org/abs/2305.18290) and looking at its [reference implementation](https://github.com/eric-mitchell/direct-preference-optimization).

In [None]:
# Second value returned by the evaluate(model=..., tokenizer=...) call above;
# its columns are described in the section just before this cell.
df_raw