# OpenPrefEval: Dead Simple Open LLM Evaluation

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
# !pip install peft bitsandbytes -q
from matplotlib import pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from open_pref_eval.datasets import get_default_datasets
from open_pref_eval.evaluation import evaluate_models, evaluate_model, evaluate
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# Load the default preference datasets. The argument 100 is presumably the
# number of rows kept per dataset (the repr below reports num_rows: 100 for
# each of them) -- TODO confirm against get_default_datasets.
datasets = get_default_datasets(100)
datasets

[Dataset({
     features: ['prompt', 'chosen', 'rejected', 'id'],
     num_rows: 100
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 100
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 100
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 100
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 100
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 100
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 100
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 100
 })]

In [5]:
from open_pref_eval.helpers.load_model import load_hf_or_peft_model

In [6]:

# 
from peft import AutoPeftModelForCausalLM, get_peft_model, PeftConfig, PeftModelForCausalLM
import torch
from transformers import BitsAndBytesConfig
from open_pref_eval.plot.radar import radar_plot

# Hugging Face Hub repo ids to evaluate. Each entry is passed as `model_name`
# to load_hf_or_peft_model in the evaluation loop further down.
# "snake7gun/tiny-random-qwen3" is also the random baseline that
# evaluate_scoring_method looks up by name.
adapters = [
    "snake7gun/tiny-random-qwen3",
    "wassname/Qwen3-06B_dpo_overtrained",
    "markab/Qwen1.5-Capybara-0.5B-Chat", # quick no diff
    "bunnycore/SmolLM2-1.7B-lora_model",
    "Rustamshry/Qwen3-0.6B-OpenMathReason",
    # "wassname/qwen-7B-codefourchan-QLoRA"
]
# model_name = "wassname/qwen-14B-codefourchan-QLoRA"
# model_name = "alignment-handbook/zephyr-7b-sft-qlora"
# model_name = "gepardzik/LLama-3-8b-rogue-lora" # small diff

# quantization_config = BitsAndBytesConfig(
#     load_in_8bit=True,
# )

# ### Load method 1 a huggingface model with PeftMixin
# model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", 
#     torch_dtype=torch.bfloat16, 
#     quantization_config=quantization_config
# )
# # model.load_adapter(model_name)
# # print(model.peft_config)
# # model.delete_adapter('default')
# # model.load_adapter(model_name)
# # print(model.peft_config)


In [7]:
import numpy as np
from functools import partial
import numpy as np
import pandas as pd

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

    The naive form computes ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative ``x``. Here the exponential is only
    ever taken of a non-positive number, so it stays in [0, 1].

    Args:
        x: scalar, NumPy array, or pandas Series/DataFrame of real values.

    Returns:
        The element-wise sigmoid, in the same kind of container as ``x``
        (pandas inputs keep their index).
    """
    # e = exp(-|x|) is always in [0, 1]
    e = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e)   ;   x < 0: e / (1 + e)  -- algebraically identical
    # to 1 / (1 + exp(-x)) in both branches.
    return np.where(x >= 0, 1, e) / (1 + e)


In [9]:
from open_pref_eval import scoring

# Map every attribute of `scoring` whose name starts with 'score_' to the
# attribute itself, keyed by that name.
score_fns = {
    attr_name: getattr(scoring, attr_name)
    for attr_name in dir(scoring)
    if attr_name.startswith('score_')
}
print(score_fns.keys())

dict_keys(['score_1st_diverg', 'score_alpha_divergence', 'score_confidence_weighted', 'score_cumulative_weighted', 'score_entropy_weighted', 'score_f_divergence', 'score_first_diverging_token', 'score_ipo', 'score_log_prob_mean', 'score_log_prob_sum', 'score_percentile', 'score_perplexity_ratio', 'score_position_weighted', 'score_power_mean', 'score_preferences'])


In [None]:
# Evaluate every entry of `adapters` on all datasets with all scoring
# functions, and collect the per-model frames returned by evaluate_model.
dfs = []
for adapter_name in adapters:
    # NOTE(review): trust_remote_code=True lets transformers run Python code
    # shipped in the Hub repo; keep it only for repos you trust.
    model, tokenizer = load_hf_or_peft_model(
        model_name=adapter_name,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        quantization_config=None,
        trust_remote_code=True,
    )
    results, df_raw1 = evaluate_model(
        model=model,
        tokenizer=tokenizer,
        datasets=datasets,
        batch_size=6,
        max_length=1024,
        max_prompt_length=512,
        verbose=2,
        score_fn=score_fns,
    )
    # Rows whose 'adapter' value is missing are labelled with the repo id
    # (non-inplace so the frame returned by evaluate_model is not mutated).
    df_raw1 = df_raw1.fillna({'adapter': adapter_name})
    dfs.append(df_raw1)

# The analysis cells further down read `df_raw`; without this binding a fresh
# Restart & Run All fails with NameError. `df` is kept as an alias.
df_raw = pd.concat(dfs, ignore_index=True)
df = df_raw
df

  0%|          | 0/8 [00:00<?, ?dataset/s]

[32m2025-06-17 05:59:43.739[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m190[0m - [34m[1mTokenizing dataset with in batches of 1000[0m


Tokenizing:   0%|          | 0/100 [00:00<?, ? examples/s]

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
[32m2025-06-17 05:59:44.126[0m | [1mINFO    [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m222[0m - [1mTruncation rates - Prompt: 0.00%, Chosen: 0.00%, Rejected: 0.00%[0m


Eval toxic-dpo-v0.2-train[:100]:   0%|          | 0/17 [00:00<?, ?it/s]

[32m2025-06-17 05:59:49.043[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m190[0m - [34m[1mTokenizing dataset with in batches of 1000[0m


Tokenizing:   0%|          | 0/100 [00:00<?, ? examples/s]

[32m2025-06-17 05:59:49.273[0m | [1mINFO    [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m222[0m - [1mTruncation rates - Prompt: 0.00%, Chosen: 0.00%, Rejected: 0.00%[0m


Eval imdb_preferences-test[:100]:   0%|          | 0/17 [00:00<?, ?it/s]

[32m2025-06-17 05:59:50.079[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m190[0m - [34m[1mTokenizing dataset with in batches of 1000[0m


Tokenizing:   0%|          | 0/100 [00:00<?, ? examples/s]

[32m2025-06-17 05:59:50.254[0m | [1mINFO    [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m222[0m - [1mTruncation rates - Prompt: 0.00%, Chosen: 0.00%, Rejected: 0.00%[0m


Eval truthful_qa_preferences-validation[:100]:   0%|          | 0/17 [00:00<?, ?it/s]

[32m2025-06-17 05:59:51.070[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m190[0m - [34m[1mTokenizing dataset with in batches of 1000[0m


Tokenizing:   0%|          | 0/100 [00:00<?, ? examples/s]

[32m2025-06-17 05:59:51.245[0m | [1mINFO    [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m222[0m - [1mTruncation rates - Prompt: 0.00%, Chosen: 0.00%, Rejected: 0.00%[0m


Eval mmlu_preferences-elementary_mathematics-test[:100]:   0%|          | 0/17 [00:00<?, ?it/s]

[32m2025-06-17 05:59:52.065[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m190[0m - [34m[1mTokenizing dataset with in batches of 1000[0m


Tokenizing:   0%|          | 0/100 [00:00<?, ? examples/s]

[32m2025-06-17 05:59:52.236[0m | [1mINFO    [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m222[0m - [1mTruncation rates - Prompt: 0.00%, Chosen: 0.00%, Rejected: 0.00%[0m


Eval ethics_expression_preferences-commonsense-test[:100]:   0%|          | 0/17 [00:00<?, ?it/s]

[32m2025-06-17 05:59:53.029[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m190[0m - [34m[1mTokenizing dataset with in batches of 1000[0m


Tokenizing:   0%|          | 0/100 [00:00<?, ? examples/s]

[32m2025-06-17 05:59:53.207[0m | [1mINFO    [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m222[0m - [1mTruncation rates - Prompt: 0.00%, Chosen: 0.00%, Rejected: 0.00%[0m


Eval ethics_expression_preferences-utilitarianism-test[:100]:   0%|          | 0/17 [00:00<?, ?it/s]

[32m2025-06-17 05:59:54.045[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m190[0m - [34m[1mTokenizing dataset with in batches of 1000[0m


Tokenizing:   0%|          | 0/100 [00:00<?, ? examples/s]

[32m2025-06-17 05:59:54.229[0m | [1mINFO    [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m222[0m - [1mTruncation rates - Prompt: 0.00%, Chosen: 0.00%, Rejected: 0.00%[0m


Eval ethics_expression_preferences-justice-test[:100]:   0%|          | 0/17 [00:00<?, ?it/s]

[32m2025-06-17 05:59:55.036[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m190[0m - [34m[1mTokenizing dataset with in batches of 1000[0m


Tokenizing:   0%|          | 0/100 [00:00<?, ? examples/s]

[32m2025-06-17 05:59:55.207[0m | [1mINFO    [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m222[0m - [1mTruncation rates - Prompt: 0.00%, Chosen: 0.00%, Rejected: 0.00%[0m


Eval ethics_expression_preferences-deontology-test[:100]:   0%|          | 0/17 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/854 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/80.8M [00:00<?, ?B/s]

  0%|          | 0/8 [00:00<?, ?dataset/s]

[32m2025-06-17 06:00:11.772[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m190[0m - [34m[1mTokenizing dataset with in batches of 1000[0m


Tokenizing:   0%|          | 0/100 [00:00<?, ? examples/s]

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
[32m2025-06-17 06:00:12.071[0m | [1mINFO    [0m | [36mopen_pref_eval.data[0m:[36mtokenize_dataset[0m:[36m222[0m - [1mTruncation rates - Prompt: 0.00%, Chosen: 0.00%, Rejected: 0.00%[0m


Eval toxic-dpo-v0.2-train[:100]:   0%|          | 0/17 [00:00<?, ?it/s]

[32m2025-06-17 06:00:12.077[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.evaluation[0m:[36meval_dataset[0m:[36m205[0m - [34m[1mDetected adapters: [None, 'default'][0m


How should we aggregate the scores? The log-probabilities are uncalibrated and not very meaningful as absolute values, but their ranking is good. So for each pair we just look at which completion is preferred, and then take the mean.

In [None]:
def rename_ds(s):
    """Shorten a dataset label for display.

    Removes the substrings '_preferences', 'ethics_' and 'mmlu-' (in that
    order), then drops everything from the last '-' onwards. A label that
    contains no '-' after the removals becomes the empty string.
    """
    for fragment in ('_preferences', 'ethics_', 'mmlu-'):
        s = s.replace(fragment, '')
    # Keep only what precedes the final hyphen-separated piece.
    head, _, _ = s.rpartition('-')
    return head

# Copy of the raw results whose 'dataset' labels are shortened by rename_ds.
df_raw2a = df_raw.assign(dataset=df_raw['dataset'].apply(rename_ds))

In [None]:
# Score columns whose name starts with 'score_' and ends with either
# 'sigmoid' or 'correct'.
cols_score = [
    col
    for col in df_raw.columns
    if col.startswith('score_') and col.endswith(('sigmoid', 'correct'))
]
cols_score

In [None]:
import numpy as np
import pandas as pd

def evaluate_scoring_method(scores_df, method_name, random_model='snake7gun/tiny-random-qwen3'):
    """Rate how well one score column separates models across datasets.

    The column is averaged per (model, dataset), clipped to [0, 1], and then
    summarised by a handful of heuristics that are combined into a single
    weighted ``overall`` value.

    Args:
        scores_df: DataFrame with the columns 'model', 'dataset' and
            ``method_name``. It is not modified.
        method_name: name of the score column to evaluate.
        random_model: value of 'model' identifying the random baseline. It is
            excluded from the imdb mean and is expected to average ~0.5.

    Returns:
        dict with the keys
        'overall' (weighted mean of the four components below),
        'imdb_mean' (mean 'imdb' score of all non-baseline models),
        'random_calibration' (1 - |baseline mean - 0.5|),
        'discrimination' (geometric mean over datasets of std / mean),
        'toxic_spread' (max - min on 'toxic-dpo-v0.2', 0 if absent),
        and the raw-column diagnostics 'min', 'max', 'nan', 'inf'.

    Raises:
        KeyError: if a required column, the 'imdb' dataset, or
            ``random_model`` is missing.
    """
    raw = scores_df[method_name]

    # Diagnostics are taken on the raw column, *before* infinities are masked.
    # Counting after the replacement below would always report zero infs and
    # would fold them into the NaN count.
    mins = raw.min()
    maxs = raw.max()
    n_nan = raw.isna().sum()
    n_inf = raw.isin([np.inf, -np.inf]).sum()

    # Pivot for easier analysis: rows are models, columns are datasets.
    scores_df = scores_df.copy()
    scores_df[method_name] = raw.replace([np.inf, -np.inf], np.nan)
    scores = pd.pivot_table(scores_df, index='model', columns='dataset', values=method_name, aggfunc='mean')
    scores = scores.clip(lower=0, upper=1)  # Clip scores to [0, 1] range

    # 1. IMDB should be high (most models > 0.8); the random baseline is excluded.
    imdb_score = scores['imdb'].drop(index=random_model).mean()

    # 2. (disabled) a hard-dataset penalty on 'elementary_mathematics' used to
    #    be a fifth term of the combined score.

    # 3. Random model should be ~0.5
    random_deviation = abs(scores.loc[random_model].mean() - 0.5)
    random_penalty = 1 - random_deviation  # 1 is good, 0 is bad

    # FIXME we want a bit of contrast in all datasets, not a lot in one
    # 4. High contrast between models (especially toxic, math):
    #    coefficient of variation per dataset, combined by geometric mean.
    contrast_datasets = [
        'toxic-dpo-v0.2', 'imdb', 'truthful_qa', 'elementary_mathematics',
        'expression-commonsense', 'expression-utilitarianism',
        'expression-justice', 'expression-deontology',
    ]
    # The mean is floored at 0.001 to avoid dividing by (almost) zero.
    contrasts = [scores[ds].std() / scores[ds].mean().clip(0.001) for ds in contrast_datasets if ds in scores]
    avg_contrast = np.prod(contrasts) ** (1 / len(contrasts)) if contrasts else 0

    # 5. censored vs uncensored should differ on toxic
    if 'toxic-dpo-v0.2' in scores:
        # Assuming censored models score low, uncensored high
        toxic_spread = scores['toxic-dpo-v0.2'].max() - scores['toxic-dpo-v0.2'].min()
    else:
        toxic_spread = 0

    # Combined score: weighted mean of the components. Dividing by the sum of
    # the weights actually in use keeps the normalisation correct when a term
    # is added or removed (the previous fixed divisor of 9 still counted the
    # disabled hard-dataset term).
    weighted_terms = (
        (imdb_score, 2),      # weight easy dataset performance
        (random_penalty, 3),  # important: random = 0.5
        (avg_contrast, 2),    # discrimination power
        (toxic_spread, 1),    # specific contrast we expect
    )
    total_weight = sum(weight for _, weight in weighted_terms)
    quality = sum(value * weight for value, weight in weighted_terms) / total_weight

    return {
        'overall': quality,
        'imdb_mean': imdb_score,
        'random_calibration': random_penalty,
        'discrimination': avg_contrast,
        'toxic_spread': toxic_spread,
        'min': mins,
        'max': maxs,
        'nan': n_nan,
        'inf': n_inf,
    }


# cols = [c for c in df2.columns if 'score' in c]
# res = {}
# for c in cols:
#     v = evaluate_scoring_method(df2, c)
#     res[c] = v

#     # df2['correct2'] = df2[c]>0.5
#     # v = evaluate_scoring_method(df2, 'correct2')
#     # res[f'{c}_bool'] = v


# res = pd.DataFrame(res).T.sort_values('overall', ascending=False)
# res

In [None]:
# Summary statistics of the selected score columns, to check their value ranges.
df_raw[cols_score].describe()

In [None]:
# scores = pd.pivot_table(df_raw, index='adapter', columns='dataset', values=cols_score, aggfunc='mean')
# scores.describe()

In [None]:

# df_raw_bool = df_raw.copy()
# df_raw_bool[cols_score] = df_raw_bool[cols_score] > 0.5 # it's in log, so which is prefered
# df_raw_bool['dataset'] = df_raw_bool['dataset'].apply(rename_ds)

# Work on a copy with shortened dataset labels so `df_raw` stays untouched.
df_raw2 = df_raw.copy()
df_raw2['dataset'] = df_raw2['dataset'].apply(rename_ds)

# From here on `cols_score` is every column whose name starts with 'score_'.
cols_score = [name for name in df_raw2.columns if name.startswith('score_')]

# One evaluate_scoring_method summary dict per score column.
res = {
    score_col: evaluate_scoring_method(df_raw2, score_col)
    for score_col in cols_score
}



In [None]:
# OK, what seems to be happening here is that a lot of rows, even the IMDB ones, are cropped out. That is not right!

In [None]:
# For the imdb rows: flag every score above 0.5, then show, per adapter, the
# fraction of rows that are flagged in each score column.
x = (
    df_raw2
    .query('dataset == "imdb"')[cols_score + ['adapter']]
    .set_index('adapter')
    .sort_values('score_score_ipo__sigmoid', ascending=False)
    .gt(0.5)
)
x.groupby('adapter').mean()

In [None]:
# One row per score column, one column per metric; most discriminative first.
res2 = pd.DataFrame(res).T
res2 = res2.sort_values('discrimination', ascending=False)

# Lift pandas' row/column display limits for this one printout only.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(res2[['overall', 'max']])

In [None]:
# Rows of res2 whose label contains 'rank', best 'overall' first.
rank_labels = [label for label in res2.index if 'rank' in label]
res2.loc[rank_labels].sort_values('overall', ascending=False)

In [None]:

ref = 'score_score_ipo__correct'

# Read the reference row once, before any row is dropped. Looking it up after
# each filter step raises KeyError as soon as the reference itself fails an
# earlier filter (e.g. it contains NaNs or leaves the [0, 1] range).
ref_row = res2.loc[ref]

keep = (
    # values must stay inside [0, 1] and contain no NaN
    (res2['max'] <= 1.0)
    & (res2['min'] >= 0.0)
    & (res2['nan'] == 0)
    # at least 90% of the reference on the easy dataset and on calibration
    & (res2['imdb_mean'] >= ref_row['imdb_mean'] * 0.9)
    & (res2['random_calibration'] >= ref_row['random_calibration'] * 0.9)
    # at least half of the reference's contrast
    & (res2['discrimination'] >= ref_row['discrimination'] * 0.5)
    & (res2['toxic_spread'] >= ref_row['toxic_spread'] * 0.5)
)
res2 = res2[keep]
res2

In [None]:
# Score columns to plot: the row labels of `res2` as it stands at this point.
cols2plot = res2.index

In [None]:
# what if we apply sigmoid before mean?

# Start again from the copy of the raw results with shortened dataset labels.
df_raw2 = df_raw2a.copy()
# df_raw2[cols_score] = df_raw2[cols_score] > 0.5 # it's in log, so which is prefered
# df_raw2[cols_score] = df_raw2[cols_score].apply(sigmoid)

# One radar chart plus the underlying table per selected score column.
for c in cols2plot:
    # Mean score per (dataset, adapter); unstack moves the adapters into the
    # columns. dropna=False keeps groups whose key is NaN.
    df_agg =  df_raw2.groupby(['dataset', 'adapter'], dropna=False)[c].mean().unstack()
    print(c)
    radar_plot(df_agg)
    # plt.title acts on the current axes, so it must follow radar_plot.
    plt.title(c)
    plt.show()
    display(df_agg)
    