# OpenPrefEval: Dead Simple Open LLM Evaluation

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
# !pip install peft bitsandbytes -q
from matplotlib import pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from open_pref_eval.datasets import get_default_datasets
from open_pref_eval.evaluation import evaluate_models, evaluate_model, evaluate
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# Load the default preference datasets. The argument (100) matches the
# `num_rows: 100` reported for every dataset in the output below, so it is
# presumably the per-dataset row limit — confirm against get_default_datasets.
datasets = get_default_datasets(100)
datasets

[Dataset({
     features: ['prompt', 'chosen', 'rejected', 'id'],
     num_rows: 100
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 100
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 100
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 100
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 100
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 100
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 100
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 100
 })]

In [4]:

# 
from peft import AutoPeftModelForCausalLM, get_peft_model, PeftConfig, PeftModelForCausalLM
import torch
from transformers import BitsAndBytesConfig
from open_pref_eval.plot.radar import radar_plot
# Exactly one `model_name` is active; the alternatives stay commented out so
# it is obvious which checkpoint the rest of the notebook evaluates.
# model_name = "markab/Qwen1.5-Capybara-0.5B-Chat" # quick no diff
# model_name = "Rustamshry/Qwen3-0.6B-OpenMathReason"
model_name = "wassname/qwen-7B-codefourchan-QLoRA"
# model_name = "wassname/qwen-14B-codefourchan-QLoRA"

# model_name = "alignment-handbook/zephyr-7b-sft-qlora"
# model_name = "gepardzik/LLama-3-8b-rogue-lora" # small diff

# 8-bit weight loading via bitsandbytes.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

### Load method 1 a huggingface model with PeftMixin
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
)
# model.load_adapter(model_name)
# print(model.peft_config)
# model.delete_adapter('default')
# model.load_adapter(model_name)
# print(model.peft_config)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:

# # ## Load method 2 a PeftModel
# model = PeftModelForCausalLM.from_pretrained(
#     model,
#     model_name,
#     device_map="auto",
#     torch_dtype=torch.bfloat16,
#     quantization_config=quantization_config,
# )
# print(model.peft_config)

In [6]:

# Load the tokenizer from the base-model path recorded in the 'default'
# adapter's config (not from `model_name` itself).
tokenizer = AutoTokenizer.from_pretrained(model.peft_config['default'].base_model_name_or_path)

In [7]:
# model.delete_adapter('default')
# model.load_adapter(model_name, '4chan')

In [8]:
# Show the adapter configuration(s) attached to the loaded model, keyed by adapter name.
print(model.peft_config)

{'default': LoraConfig(task_type='CAUSAL_LM', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='unsloth/Qwen2.5-Coder-7B-Instruct', revision=None, inference_mode=True, r=64, target_modules={'v_proj', 'gate_proj', 'down_proj', 'up_proj', 'k_proj', 'o_proj', 'q_proj'}, exclude_modules=None, lora_alpha=64, lora_dropout=0, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)}


In [9]:
import numpy as np
from functools import partial
import numpy as np
import pandas as pd

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

    Args:
        x: A scalar, NumPy array or pandas Series/DataFrame of real values.

    Returns:
        The element-wise logistic of ``x``, in the same container type that
        NumPy ufuncs return for the input.
    """
    # log(1 + exp(-x)) is computed stably by logaddexp, so large negative
    # inputs no longer overflow inside exp(); the tail values stay accurate.
    return np.exp(-np.logaddexp(0, -x))


In [10]:
from open_pref_eval import scoring

# Collect every attribute of the scoring module whose name starts with
# `score_`, keyed by that name (dir() yields the names in sorted order).
score_fns = {
    name: getattr(scoring, name)
    for name in dir(scoring)
    if name.startswith('score_')
}
print(score_fns.keys())

dict_keys(['score_1st_diverg', 'score_alpha_divergence', 'score_confidence_weighted', 'score_cumulative_weighted', 'score_entropy_weighted', 'score_f_divergence', 'score_first_diverging_token', 'score_information_weighted', 'score_ipo', 'score_log_prob_mean', 'score_log_prob_sum', 'score_percentile', 'score_perplexity_ratio', 'score_position_weighted', 'score_power_mean', 'score_precision_weighted', 'score_preferences', 'score_rank_based', 'score_with_vocab_uncertainty'])


In [11]:
# from open_pref_eval.scoring import score_ipo, score_preferences, score_confidence_weighted, score_with_entropy_weight, score_seq_entropy_weighted, score_rank, score_cumsum, score_agg, score_power_mean, score_with_decay, score_percentile, score_weighted_prob, score_with_weight, score_f_alpha_divergance
# score_fns = {
#     "ipo": score_ipo,
#     "rank": score_rank,
#     "preferences": score_preferences,
#     'confidence_weighted': score_confidence_weighted,
#     'with_entropy_weight': score_with_entropy_weight,
#     "cumsum": score_cumsum,
#     # "weighted": score_weighted, # littlediff
#     "weighted_prob": score_weighted_prob, # nan
#     "with_weight": score_with_weight,
#     "f_alpha_divergance": score_f_alpha_divergance,
#     # "f_divergance": score_f_divergance,
#     # "min": lambda *args, **kwargs: score_agg(*args, **kwargs, agg=lambda x: torch.min(x, dim=-1).values),
#     # "max": lambda *args, **kwargs: score_agg(*args, **kwargs, agg=lambda x: torch.max(x, dim=-1).values),
#     "mean": lambda *args, **kwargs: score_agg(*args, **kwargs, agg=lambda x: torch.mean(x, dim=-1)),
#     "median": lambda *args, **kwargs: score_agg(*args, **kwargs, agg=lambda x: torch.median(x, dim=-1).values),
#     "std": lambda *args, **kwargs: score_agg(*args, **kwargs, agg=lambda x: torch.std(x, dim=-1)),
#     "seq_entropy_weighted": score_seq_entropy_weighted,
#     # "certainty_weighted": score_certainty_weighted,
#     "power_mean": score_power_mean,
#     "with_decay": score_with_decay,
#     "percentile": score_percentile,
#     # "1st_diverg": score_1st_diverg,
#     # "perplexity_ratio": score_perplexity_ratio,
#     'confidence_weighted0.5': partial(score_confidence_weighted, T=0.5),
#     'confidence_weighted5': partial(score_confidence_weighted, T=5.0),
#     'confidence_weighted10': partial(score_confidence_weighted, T=10.0),
#     'with_entropy_weight0.5': partial(score_with_entropy_weight, alpha=0.5),
#     'with_entropy_weight5': partial(score_with_entropy_weight, alpha=5.0),
#     'with_entropy_weight10': partial(score_with_entropy_weight, alpha=10.0),

# }


In [None]:
# Score the loaded model on every dataset, passing the whole dict of scoring
# functions collected above as `score_fn`.
# The debug log below reports prompts being truncated to 512 tokens for some
# batches, i.e. the `max_prompt_length` set here.
results, df_raw1 = evaluate_model(
    model=model,
    tokenizer=tokenizer,
    datasets=datasets,
    batch_size=6,
    max_length=1024,
    max_prompt_length=512,
    verbose=2,
    score_fn=score_fns
) 
# Rows with a missing `adapter` value are labelled with `model_name`.
# NOTE(review): the debug log lists adapters [None, 'default'], so the missing
# ones are presumably the adapter-disabled (base model) rows — confirm that
# labelling them with the adapter repo name is intended.
df_raw1.fillna({'adapter': model_name}, inplace=True)
# results

  0%|          | 0/8 [00:00<?, ?dataset/s]

Eval toxic-dpo-v0.2-train[:100]:   0%|          | 0/17 [00:00<?, ?it/s]

[32m2025-05-25 13:23:29.629[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.evaluation[0m:[36meval_dataset[0m:[36m203[0m - [34m[1mDetected adapters: [None, 'default'][0m


Eval imdb_preferences-test[:100]:   0%|          | 0/17 [00:00<?, ?it/s]

[32m2025-05-25 13:24:33.765[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.evaluation[0m:[36meval_dataset[0m:[36m203[0m - [34m[1mDetected adapters: [None, 'default'][0m
[32m2025-05-25 13:24:38.570[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m85[0m - [34m[1mBatch Prompts were truncated to 512 tokens for 16.67% of samples. Consider increasing max_prompt_length.[0m
[32m2025-05-25 13:24:42.743[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m85[0m - [34m[1mBatch Prompts were truncated to 512 tokens for 33.33% of samples. Consider increasing max_prompt_length.[0m
[32m2025-05-25 13:24:52.423[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m85[0m - [34m[1mBatch Prompts were truncated to 512 tokens for 16.67% of samples. Consider increasing max_prompt_length.[0m
[32m2025-05-25 13:24:59.463[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m

In [None]:
# Second model for comparison, loaded with the same quantization config.
# NOTE(review): the repo name suggests randomly initialised weights, i.e. a
# chance-level baseline — confirm.
model_id = 'snake7gun/tiny-random-qwen3'
model2 = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto",
    torch_dtype=torch.bfloat16, 
    quantization_config=quantization_config
)
tokenizer2 = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Same evaluation settings as for the first model, applied to `model2`.
results2, df_raw2 = evaluate_model(
    model=model2,
    tokenizer=tokenizer2,
    datasets=datasets,
    batch_size=6,
    max_length=1024,
    max_prompt_length=512,
    verbose=2,
    score_fn=score_fns
) 
# results
# Label every row of this run with the model id (overwrites the whole column).
df_raw2['adapter'] = model_id


In [None]:
# Stack the per-sample results of both models row-wise (original indices are kept).
# NOTE: a later cell rebinds `df_raw2`, so re-running this cell out of order
# would concatenate a different frame — run the notebook top to bottom.
df_raw = pd.concat([df_raw1, df_raw2], axis=0)

How should we aggregate the scores? The log-probs are uncalibrated and not very meaningful on their own, but their ranking is reliable. So we just look at which completion is preferred, and take the mean.

In [None]:
def rename_ds(s):
    """Shorten a raw dataset identifier to a compact display label.

    Removes the substrings ``'_preferences'``, ``'ethics_'`` and ``'mmlu-'``,
    then drops the last ``-``-separated segment, e.g.
    ``'imdb_preferences-test[:100]'`` -> ``'imdb'``.

    Args:
        s: Raw dataset name.

    Returns:
        The shortened name. A name without any ``-`` is returned with only the
        substring removals applied. The function is not idempotent: calling it
        again on a label that still contains ``-`` drops another segment.
    """
    for noise in ('_preferences', 'ethics_', 'mmlu-'):
        s = s.replace(noise, '')
    # rsplit leaves hyphen-free names intact; joining split('-')[:-1] turned
    # them into the empty string.
    return s.rsplit('-', 1)[0]

# Copy of the combined results whose `dataset` column holds the short labels
# produced by rename_ds; `df_raw` itself is left untouched.
df_raw2a = df_raw.assign(dataset=df_raw['dataset'].apply(rename_ds))

In [None]:
# Keep only the `score_*` columns whose name ends in "sigmoid" or "correct".
cols_score = [
    col
    for col in df_raw.columns
    if col.startswith('score_') and col.endswith(('sigmoid', 'correct'))
]
cols_score

In [None]:
# df_raw.groupby(['model', 'dataset'])[cols_score].apply(lambda x: x.isna().max()).max()

In [None]:
# Overlay the distribution of every selected score column, then summarise them.
d = df_raw[cols_score].copy()
# d -= d.min()
# d /= d.abs().max()
# `histtype="step"` draws unfilled outlines so the overlapping columns stay
# readable; the previous `style="stepped"` is not a histogram option.
d.plot.hist(bins=155, range=[-5, 5], alpha=0.5, legend=True, density=True, histtype="step")
d.describe()


In [None]:
# what if we apply sigmoid before mean?

# Use a separately named copy: this cell previously rebound `df_raw2`, which
# holds the second model's raw results that the concat cell depends on.
df_plot = df_raw2a.copy()
# df_plot[cols_score] = df_plot[cols_score] > 0.5 # it's in log, so which is prefered
# df_plot[cols_score] = df_plot[cols_score].apply(sigmoid)

# One radar chart and one table per score column: mean score per
# (dataset, adapter), with adapters as columns.
for c in cols_score:
    df_agg = df_plot.groupby(['dataset', 'adapter'], dropna=False)[c].mean().unstack()
    radar_plot(df_agg)
    plt.title(c)
    plt.show()
    display(df_agg)

    # df_plot['prob2'] = df_plot[c] * df_plot['norm_policy_weights']
    # df_agg =  df_plot.groupby(['dataset', 'adapter'], dropna=False)['prob2'].mean().unstack()
    # # df_agg = df_agg.apply(sigmoid)
    # radar_plot(df_agg)
    # plt.title(c + '_norm')
    # plt.show()

In [None]:
# # what if we apply sigmoid before mean?
# cols_score = [c for c in df_raw.columns if c.startswith('score_')]
# df_raw2 = df_raw.copy()
# # df_raw2[cols_score] = df_raw2[cols_score].apply(sigmoid) # it's in log, so which is prefered

# for c in cols_score:
#     df_agg =  df_raw2.groupby(['dataset', 'adapter'], dropna=False)[c].mean().unstack()
#     df_agg = df_agg.apply(sigmoid)
#     radar_plot(df_agg)
#     plt.title(c)
#     plt.show()
    
#     # df_raw2['prob2'] = df_raw2[c] * df_raw2['norm_policy_weights']
#     # df_agg =  df_raw2.groupby(['dataset', 'adapter'], dropna=False)['prob2'].mean().unstack()
#     # # df_agg = df_agg.apply(sigmoid)
#     # radar_plot(df_agg)
#     # plt.title(c + '_norm')
#     # plt.show()

In [None]:
# df_agg =  df_raw.groupby(['dataset', 'adapter'], dropna=False)['prob'].mean().unstack()

# radar_plot(df_agg)
# df_agg

In [None]:
# df_raw_bool.pivot(index='adapter', columns='dataset', values=c)
import pandas as pd


In [None]:
# df_agg =  df_raw.groupby(['dataset'], dropna=False)['prob'].mean().to_frame()#.unstack()

# radar_plot(df_agg)
# df_agg

In [None]:
import numpy as np


def evaluate_scoring_method(scores_df, method_name, model_col='model',
                            random_model='snake7gun/tiny-random-qwen3'):
    """Rate how well one score column behaves as a preference metric.

    The per-sample scores are averaged per (model, dataset), clipped to
    [0, 1], and combined into a weighted mean of five heuristics.

    Args:
        scores_df: Per-sample results with a ``dataset`` column (short names,
            e.g. ``'imdb'``), the ``model_col`` column and ``method_name``.
        method_name: Name of the score column to evaluate.
        model_col: Column that identifies each model. Defaults to ``'model'``.
            Other cells of this notebook label runs in the ``adapter`` column;
            pass ``model_col='adapter'`` to compare on that label instead.
        random_model: Label of the chance-level baseline model.

    Returns:
        dict with keys ``overall``, ``imdb_mean``, ``random_calibration``,
        ``discrimination``, ``toxic_spread``, ``hard_math``, ``min``, ``max``.
        ``min``/``max`` are the raw extremes of the column, taken before
        infinities are replaced and before clipping.

    Raises:
        KeyError: if ``random_model`` has no rows, or the ``'imdb'`` dataset
            or a required column is missing.
    """
    scores_df = scores_df.copy()
    # Raw extremes, so callers can filter out methods that leave [0, 1].
    mins = scores_df[method_name].min()
    maxs = scores_df[method_name].max()
    scores_df[method_name] = scores_df[method_name].replace([np.inf, -np.inf], np.nan)
    # Rows: models, columns: datasets, values: mean score.
    scores = pd.pivot_table(scores_df, index=model_col, columns='dataset',
                            values=method_name, aggfunc='mean')
    scores = scores.clip(lower=0, upper=1)  # Clip scores to [0, 1] range

    if random_model not in scores.index:
        raise KeyError(f"random model {random_model!r} not found in column {model_col!r}")

    # 1. IMDB should be high (most models > 0.8); the random baseline is excluded.
    imdb_score = scores['imdb'].drop(index=random_model).mean()

    # 2. The hard dataset should sit near chance; neutral 0.5 when it is absent.
    if 'elementary_mathematics' in scores:
        hard_math_score = scores['elementary_mathematics'].mean()
    else:
        hard_math_score = 0.5
    hard_math_penalty = 1 - abs(hard_math_score - 0.5)

    # 3. Random model should be ~0.5
    random_deviation = abs(scores.loc[random_model].mean() - 0.5)
    random_penalty = 1 - random_deviation  # 1 is good, 0 is bad

    # FIXME we want a bit of contrast in all datasets, not a lot in one
    # 4. Contrast between models: coefficient of variation per dataset,
    #    combined with a geometric mean so one flat dataset pulls it to zero.
    contrast_datasets = ['toxic-dpo-v0.2', 'imdb', 'truthful_qa', 'elementary_mathematics',
                         'expression-commonsense', 'expression-utilitarianism',
                         'expression-justice', 'expression-deontology']
    contrasts = [
        scores[ds].std() / np.maximum(scores[ds].mean(), 0.001)
        for ds in contrast_datasets
        if ds in scores
    ]
    avg_contrast = np.prod(contrasts) ** (1 / len(contrasts)) if contrasts else 0

    # 5. censored vs uncensored should differ on toxic
    if 'toxic-dpo-v0.2' in scores:
        # Assuming censored models score low, uncensored high
        toxic_spread = scores['toxic-dpo-v0.2'].max() - scores['toxic-dpo-v0.2'].min()
    else:
        toxic_spread = 0

    # Combined score: weighted mean of the components. Dividing by the actual
    # weight total (9) rather than a hard-coded 10 keeps the normalisation
    # correct; discrimination is a ratio and may itself exceed 1.
    weighted_terms = [
        (2, imdb_score),         # weight easy dataset performance
        (3, random_penalty),     # important: random = 0.5
        (2, avg_contrast),       # discrimination power
        (1, toxic_spread),       # specific contrast we expect
        (1, hard_math_penalty),  # weight hard dataset performance
    ]
    total_weight = sum(weight for weight, _ in weighted_terms)
    quality = sum(weight * value for weight, value in weighted_terms) / total_weight

    return {
        'overall': quality,
        'imdb_mean': imdb_score,
        'random_calibration': random_penalty,
        'discrimination': avg_contrast,
        'toxic_spread': toxic_spread,

        'hard_math': hard_math_score,
        'min': mins,
        'max': maxs,

    }


# cols = [c for c in df2.columns if 'score' in c]
# res = {}
# for c in cols:
#     v = evaluate_scoring_method(df2, c)
#     res[c] = v

#     # df2['correct2'] = df2[c]>0.5
#     # v = evaluate_scoring_method(df2, 'correct2')
#     # res[f'{c}_bool'] = v


# res = pd.DataFrame(res).T.sort_values('overall', ascending=False)
# res

In [None]:
# Summary statistics of the currently selected score columns, across all rows.
df_raw[cols_score].describe()

In [None]:
# scores = pd.pivot_table(df_raw, index='adapter', columns='dataset', values=cols_score, aggfunc='mean')
# scores.describe()

In [None]:

# df_raw_bool = df_raw.copy()
# df_raw_bool[cols_score] = df_raw_bool[cols_score] > 0.5 # it's in log, so which is prefered
# df_raw_bool['dataset'] = df_raw_bool['dataset'].apply(rename_ds)

# Fresh copy of the combined results with dataset names shortened by rename_ds.
df_raw2 = df_raw.assign(dataset=df_raw['dataset'].apply(rename_ds))

# From here on `cols_score` means every `score_*` column (no suffix filter).
cols_score = [name for name in df_raw2.columns if name.startswith('score_')]

# Quality metrics for each score column, keyed by column name.
res = {}
for c in cols_score:
    res[c] = v2 = evaluate_scoring_method(df_raw2, c)



In [None]:
# OK, what seems to be happening here is that a lot of them, even the IMDB ones, are cropped out. That is not right!

In [None]:
# For the IMDB rows only: per adapter, the fraction of samples whose score
# exceeds 0.5, for every score column.
x = (
    df_raw2.query('dataset == "imdb"')[cols_score + ['adapter']]
    .set_index('adapter')
    .sort_values('score_ipo__sigmoid', ascending=False)
    .gt(0.5)
)
x.groupby('adapter').mean()

In [None]:
# One row per score column, best `overall` first; keep only the columns whose
# raw values stayed inside [0, 1].
res2 = (
    pd.DataFrame(res)
    .T
    .sort_values('overall', ascending=False)
    .loc[lambda t: (t['max'] <= 1.0) & (t['min'] >= 0.0)]
)
res2