In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project source are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [3]:
# !pip install peft bitsandbytes -q
from matplotlib import pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from open_pref_eval.datasets import get_default_datasets
from open_pref_eval.evaluation import evaluate_models, evaluate_model, evaluate
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import torch
from transformers import BitsAndBytesConfig
from tqdm.auto import tqdm
from anycache import anycache
from open_pref_eval.plot.radar import radar_plot
from open_pref_eval.helpers.mem import clear_mem


# 8-bit bitsandbytes settings; `eval_model` passes this to `from_pretrained`.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

In [4]:
# Load the library's default preference datasets.
# NOTE(review): 300 is presumably the per-dataset row limit — the output below
# lists num_rows: 300 for every dataset; confirm against get_default_datasets.
datasets = get_default_datasets(300)
datasets

[Dataset({
     features: ['prompt', 'chosen', 'rejected', 'id'],
     num_rows: 300
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 300
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 300
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 300
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 300
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 300
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 300
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 300
 })]

In [5]:
# Hugging Face model ids to evaluate, in evaluation order.
# The first entry is the random baseline that `evaluate_scoring_method`
# hard-codes as its `random_model`; keep the two in sync.
# Commented-out ids are candidates that are currently excluded from the run.
models = [
    'snake7gun/tiny-random-qwen3',

    "unsloth/Qwen3-4B",
    "fakezeta/amoral-Qwen3-4B",
    "soob3123/amoral-qwen3-14B",
    
    "mlabonne/Qwen3-4B-abliterated",

    "wassname/qwen-7B-codefourchan",
    # "unsloth/gemma-2-9b-it",
    "opencompass/CompassJudger-1-7B-Instruct",
    "microsoft/Phi-4-mini-instruct",
    # "CohereLabs/c4ai-command-r7b-12-2024",
    # 'emergent-misalignment/Qwen-Coder-Insecure',
    # "drfellx/emergent_misalignment_test_qwen2.5-7B-Instruct",
    # "dpasch01/pp-llama3-8b-right-wing",
    # "dpasch01/pp-llama3-8b-left-wing",
    # "unsloth/DeepSeek-R1-Distill-Qwen-7B"
]

In [6]:
from open_pref_eval.scoring import first_nonzero, score_1st_diverg, score_preferences, score_ipo, score_cumsum, score_weighted, score_weighted_prob, score_with_weight, score_f_alpha_divergance, score_f_divergance, score_agg, score_certainty_weighted, score_entropy_weighted, score_power_mean, score_with_decay, score_percentile, score_1st_diverg, score_perplexity_ratio
def _agg_scorer(reduce_fn):
    """Build a scorer that forwards every argument to `score_agg` with `agg=reduce_fn`."""
    def _scorer(*args, **kwargs):
        return score_agg(*args, **kwargs, agg=reduce_fn)
    return _scorer


# Reductions over the last tensor dimension, each wrapped into a `score_agg` scorer below.
_reducers = {
    "min": lambda x: torch.min(x, dim=-1).values,
    "max": lambda x: torch.max(x, dim=-1).values,
    "mean": lambda x: torch.mean(x, dim=-1),
    "median": lambda x: torch.median(x, dim=-1).values,
    "std": lambda x: torch.std(x, dim=-1),
}

# Name -> scoring callable. The whole mapping is handed to `evaluate_model`
# as `score_fn`; key order is kept as originally written.
score_fns = {
    "preferences": score_preferences,
    "cumsum": score_cumsum,
    "weighted": score_weighted,  # littlediff
    "weighted_prob": score_weighted_prob,  # nan
    "with_weight": score_with_weight,
    "f_alpha_divergance": score_f_alpha_divergance,
    "f_divergance": score_f_divergance,
    **{name: _agg_scorer(reduce_fn) for name, reduce_fn in _reducers.items()},
    "ipo": score_ipo,
    "entropy_weighted": score_entropy_weighted,
    "certainty_weighted": score_certainty_weighted,
    "power_mean": score_power_mean,
    "with_decay": score_with_decay,
    "percentile": score_percentile,
    "1st_diverg": score_1st_diverg,
    "perplexity_ratio": score_perplexity_ratio,
}


In [None]:
@anycache('.anycache')
def eval_model(model_id):
    """Load `model_id` quantized and evaluate it on all datasets with all scoring functions.

    The result is cached by `anycache` in the `.anycache` directory.

    `datasets`, `score_fns` and `quantization_config` are read from notebook
    globals rather than passed in.
    NOTE(review): the cache key presumably does not cover those globals, so a
    stale result may be returned after changing them — confirm against the
    anycache docs and clear `.anycache` when in doubt.

    Args:
        model_id: Hugging Face model id to load.

    Returns:
        The `(results, df_raw)` pair produced by `evaluate_model`.
    """
    print(f"Evaluating {model_id}")

    # trust_remote_code is not enabled for either the tokenizer or the model.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    load_kwargs = dict(
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config,
        device_map="auto",
    )
    model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)

    # Score the model on every dataset with every entry of `score_fns`.
    eval_kwargs = dict(
        model=model,
        tokenizer=tokenizer,
        datasets=datasets,
        batch_size=6,
        max_length=1024,
        max_prompt_length=512,
        verbose=2,
        score_fn=score_fns,
    )
    summary, raw = evaluate_model(**eval_kwargs)
    return summary, raw

# Evaluate the models one at a time and collect each model's raw frame.
data = []
for model_id in tqdm(models):
    results, df_raw = eval_model(model_id)
    # Called after every model, before the next one is loaded
    # (presumably frees accelerator/host memory — see open_pref_eval.helpers.mem).
    clear_mem()
    data.append(df_raw)

  0%|          | 0/8 [00:00<?, ?it/s]

Evaluating snake7gun/tiny-random-qwen3


  0%|          | 0/8 [00:00<?, ?dataset/s]

Eval toxic-dpo-v0.2-train[:300]:   0%|          | 0/50 [00:00<?, ?it/s]

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
[32m2025-05-24 14:16:02.350[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m91[0m - [34m[1mBatch Chosen were truncated to 512 tokens for 16.67% of samples. Consider increasing max_completion_length.[0m


Eval imdb_preferences-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]

[32m2025-05-24 14:16:06.188[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m87[0m - [34m[1mBatch Prompts were truncated to 512 tokens for 16.67% of samples. Consider increasing max_prompt_length.[0m
[32m2025-05-24 14:16:06.258[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m87[0m - [34m[1mBatch Prompts were truncated to 512 tokens for 33.33% of samples. Consider increasing max_prompt_length.[0m
[32m2025-05-24 14:16:06.441[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m87[0m - [34m[1mBatch Prompts were truncated to 512 tokens for 16.67% of samples. Consider increasing max_prompt_length.[0m
[32m2025-05-24 14:16:06.573[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m87[0m - [34m[1mBatch Prompts were truncated to 512 tokens for 16.67% of samples. Consider increasing max_prompt_length.[0m
[32m2025-05-24 14:16:06.689[0m | 

Eval truthful_qa_preferences-validation[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Eval mmlu_preferences-elementary_mathematics-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Eval ethics_expression_preferences-commonsense-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Eval ethics_expression_preferences-utilitarianism-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Eval ethics_expression_preferences-justice-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Eval ethics_expression_preferences-deontology-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Evaluating unsloth/Qwen3-4B


tokenizer_config.json:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/4.76k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/752 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/32.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

  0%|          | 0/8 [00:00<?, ?dataset/s]

Eval toxic-dpo-v0.2-train[:300]:   0%|          | 0/50 [00:00<?, ?it/s]

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
[32m2025-05-24 14:21:06.594[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m91[0m - [34m[1mBatch Chosen were truncated to 512 tokens for 16.67% of samples. Consider increasing max_completion_length.[0m


Eval imdb_preferences-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]

[32m2025-05-24 14:21:46.314[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m87[0m - [34m[1mBatch Prompts were truncated to 512 tokens for 16.67% of samples. Consider increasing max_prompt_length.[0m
[32m2025-05-24 14:21:47.725[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m87[0m - [34m[1mBatch Prompts were truncated to 512 tokens for 33.33% of samples. Consider increasing max_prompt_length.[0m
[32m2025-05-24 14:21:51.397[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m87[0m - [34m[1mBatch Prompts were truncated to 512 tokens for 16.67% of samples. Consider increasing max_prompt_length.[0m
[32m2025-05-24 14:21:54.180[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m87[0m - [34m[1mBatch Prompts were truncated to 512 tokens for 16.67% of samples. Consider increasing max_prompt_length.[0m
[32m2025-05-24 14:21:56.302[0m | 

Eval truthful_qa_preferences-validation[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Eval mmlu_preferences-elementary_mathematics-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Eval ethics_expression_preferences-commonsense-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Eval ethics_expression_preferences-utilitarianism-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Eval ethics_expression_preferences-justice-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Eval ethics_expression_preferences-deontology-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Evaluating fakezeta/amoral-Qwen3-4B


tokenizer_config.json:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/817 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/33.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.08G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

  0%|          | 0/8 [00:00<?, ?dataset/s]

Eval toxic-dpo-v0.2-train[:300]:   0%|          | 0/50 [00:00<?, ?it/s]

You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
[32m2025-05-24 14:28:50.920[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m91[0m - [34m[1mBatch Chosen were truncated to 512 tokens for 16.67% of samples. Consider increasing max_completion_length.[0m


Eval imdb_preferences-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]

[32m2025-05-24 14:29:30.027[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m87[0m - [34m[1mBatch Prompts were truncated to 512 tokens for 16.67% of samples. Consider increasing max_prompt_length.[0m
[32m2025-05-24 14:29:31.567[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m87[0m - [34m[1mBatch Prompts were truncated to 512 tokens for 33.33% of samples. Consider increasing max_prompt_length.[0m
[32m2025-05-24 14:29:35.114[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m87[0m - [34m[1mBatch Prompts were truncated to 512 tokens for 16.67% of samples. Consider increasing max_prompt_length.[0m
[32m2025-05-24 14:29:37.749[0m | [34m[1mDEBUG   [0m | [36mopen_pref_eval.trainer[0m:[36m__call__[0m:[36m87[0m - [34m[1mBatch Prompts were truncated to 512 tokens for 16.67% of samples. Consider increasing max_prompt_length.[0m
[32m2025-05-24 14:29:40.033[0m | 

Eval truthful_qa_preferences-validation[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Eval mmlu_preferences-elementary_mathematics-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Eval ethics_expression_preferences-commonsense-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Eval ethics_expression_preferences-utilitarianism-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Eval ethics_expression_preferences-justice-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Eval ethics_expression_preferences-deontology-test[:300]:   0%|          | 0/50 [00:00<?, ?it/s]



Evaluating soob3123/amoral-qwen3-14B


tokenizer_config.json:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/36.5k [00:00<?, ?B/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/4.73G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
import pandas as pd
# Stack the per-model raw frames into a single frame.
df = pd.concat(data)

# Mean of the `correct` column per (model, dataset): models as rows, datasets as columns.
df.groupby(['model', 'dataset'])['correct'].mean().unstack()

In [None]:
def rename_ds(s):
    """Shorten a dataset identifier to a compact label for tables and plots.

    Removes the `_preferences`, `ethics_` and `mmlu-` fragments, treats the
    `validation` and `train` split names as `test`, and then drops the
    `-test...` suffix (split name plus any slice such as `[:300]`).

    Example: `'mmlu_preferences-elementary_mathematics-test[:300]'`
    becomes `'elementary_mathematics'`.

    Args:
        s: Dataset identifier string.

    Returns:
        The shortened label. If the identifier carries no `-test` marker after
        normalisation, the cleaned name is returned unchanged instead of an
        empty string, so unrelated datasets are never merged under `''`.
    """
    s = s.replace('_preferences', '')
    s = s.replace('ethics_', '')
    s = s.replace('mmlu-', '')
    # Normalise every split name to 'test' so a single marker can be stripped.
    s = s.replace('validation', 'test')
    s = s.replace('train', 'test')

    parts = s.split('-test')
    if len(parts) == 1:
        # No split marker present: keep the cleaned name rather than returning ''.
        return s
    # Drop the trailing piece (whatever follows the last '-test').
    return '-'.join(parts[:-1])

# Copy of `df` whose `dataset` column holds the shortened labels; `df` itself is left as is.
df2 = df.assign(dataset=df['dataset'].apply(rename_ds))
df2['dataset']

In [None]:
# Distinct shortened dataset labels — a quick check of what `rename_ds` produced.
df2.dataset.unique()

In [None]:
import numpy as np


def evaluate_scoring_method(
    scores_df,
    method_name,
    random_model='snake7gun/tiny-random-qwen3',
    contrast_datasets=('toxic-dpo-v0.2', 'elementary_mathematics', 'expression-justice'),
):
    """Rate how well one scoring column behaves as a preference metric.

    The column is averaged per (model, dataset), clipped to [0, 1], and then
    judged by a handful of heuristics that are combined into one number.

    Args:
        scores_df: Long-format frame with `model`, `dataset` and `method_name`
            columns. It is not modified.
        method_name: Name of the column to evaluate.
        random_model: Model id of the random baseline, whose mean score should
            sit at 0.5.
        contrast_datasets: Dataset labels on which models are expected to
            differ; labels missing from the data are skipped.

    Returns:
        dict with keys:
            `overall`: weighted combination of the components below.
            `imdb_mean`: mean score on `imdb` (higher is better).
            `random_calibration`: 1 - |mean score of `random_model` - 0.5|.
            `discrimination`: mean over contrast datasets of IQR / median,
                each capped at 2; 0 when none qualifies.
            `toxic_spread`: max - min across models on `toxic-dpo-v0.2`,
                or 0 if that dataset is absent.
            `hard_math`: raw mean score on `elementary_mathematics` (the
                penalty derived from it feeds `overall`, not this value).

    Raises:
        KeyError: if `imdb` or `elementary_mathematics` is not among the
            datasets, or `random_model` is not among the models.
    """
    # Work on a copy; infinities become NaN so they drop out of the means.
    scores_df = scores_df.copy()
    scores_df[method_name] = scores_df[method_name].replace([np.inf, -np.inf], np.nan)
    scores = pd.pivot_table(scores_df, index='model', columns='dataset', values=method_name, aggfunc='mean')
    scores = scores.clip(lower=0, upper=1)  # Clip scores to [0, 1] range

    # 1. IMDB is easy: most models should score high.
    imdb_score = scores['imdb'].mean()

    # 2. Hard dataset: the mean is rewarded for staying close to 0.5.
    hard_math_score = scores['elementary_mathematics'].mean()
    hard_math_penalty = 1 - abs(hard_math_score - 0.5)

    # 3. The random model should average ~0.5 (1 is good, 0 is bad).
    random_deviation = abs(scores.loc[random_model].mean() - 0.5)
    random_penalty = 1 - random_deviation

    # 4. Discrimination between models, measured robustly.
    contrasts = []
    for ds in contrast_datasets:
        if ds in scores and not scores[ds].isna().all():
            # IQR instead of std: more robust to outliers.
            q75, q25 = scores[ds].quantile(0.75), scores[ds].quantile(0.25)
            iqr = q75 - q25
            # Normalise by the median to make it scale-invariant.
            median = scores[ds].median()
            if median > 0:
                contrasts.append(min(iqr / median, 2))  # cap at 2
    avg_contrast = np.mean(contrasts) if contrasts else 0

    # 5. Bonus: censored vs uncensored models should differ on the toxic dataset.
    if 'toxic-dpo-v0.2' in scores:
        toxic_spread = scores['toxic-dpo-v0.2'].max() - scores['toxic-dpo-v0.2'].min()
    else:
        toxic_spread = 0

    # Combined score. NOTE(review): the weights sum to 9 and `avg_contrast`
    # can reach 2, so dividing by 10 only roughly scales this towards [0, 1];
    # the divisor is kept so previously computed rankings stay comparable.
    quality = (
        imdb_score * 2 +              # easy dataset performance
        random_penalty * 3 +          # important: random = 0.5
        avg_contrast * 2 +            # discrimination power
        toxic_spread +                # specific contrast we expect
        hard_math_penalty             # hard dataset stays near chance
    ) / 10

    return {
        'overall': quality,
        'imdb_mean': imdb_score,
        'random_calibration': random_penalty,
        'discrimination': avg_contrast,
        'toxic_spread': toxic_spread,
        'hard_math': hard_math_score,
    }


# Rank every scoring column twice: on its raw values and on the values
# binarised at 0.5 (stored under '<column>_bool').
cols = [c for c in df2.columns if 'score' in c]
res = {}
for c in cols:
    res[c] = evaluate_scoring_method(df2, c)

    # Binarise on a temporary frame so `df2` does not pick up a helper column
    # whose content depends on which column was processed last.
    df_bool = df2.assign(correct2=df2[c] > 0.5)
    res[f'{c}_bool'] = evaluate_scoring_method(df_bool, 'correct2')


# One row per scoring method, best overall quality first.
res = pd.DataFrame(res).T.sort_values('overall', ascending=False)
res

In [None]:
# Mean `score_ipo` per model (rows) and dataset (columns); pivot_table aggregates with the mean by default.
df2.pivot_table(index='model', columns='dataset', values='score_ipo')

In [None]:
    
# For every scoring column: table and radar plot of the mean raw score per
# (model, dataset). No binarised column is needed here, so `df2` is not touched.
cols = [c for c in df2.columns if 'score' in c]
for c in cols:
    df_agg = df2.pivot_table(index='model', columns='dataset', values=c)
    print(c)
    display(df_agg)
    radar_plot(df_agg.T)
    plt.title(c)
    plt.show()


In [None]:
# For every scoring column: table and radar plot of the fraction of rows whose
# score is above chance, per (model, dataset).
cols = [c for c in df2.columns if 'score' in c]
for c in cols:
    # Threshold at 0.5, matching the ranking cell (`df2[c]>0.5`) and
    # `evaluate_scoring_method`, which clips scores to [0, 1] and treats 0.5
    # as chance. A `> 0` threshold would count almost every row as correct.
    # NOTE(review): confirm every score column is on that [0, 1] scale.
    df_bool = df2.assign(correct2=df2[c] > 0.5)
    df_agg = df_bool.pivot_table(index='model', columns='dataset', values='correct2')
    print(c)
    display(df_agg)
    radar_plot(df_agg.T)
    plt.title(c)
    plt.show()



In [None]:
# TEST
# TOXIC should be low for most models
# IMDB should be high as it's easy
# random qwen should be 50%

In [None]:
# Re-plots `df_agg` as left by the last iteration of the preceding loop (i.e. the
# last scoring column only); this cell fails on a fresh kernel unless that loop ran.
radar_plot(df_agg.T)