In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
# !pip install peft bitsandbytes -q
from matplotlib import pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from open_pref_eval.datasets import get_default_datasets
from open_pref_eval.evaluation import evaluate_models, evaluate_model, evaluate
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import torch
from transformers import BitsAndBytesConfig
from tqdm.auto import tqdm
from anycache import anycache
from open_pref_eval.plot.radar import radar_plot
from open_pref_eval.helpers.mem import clear_mem


# bitsandbytes settings with 8-bit loading enabled; handed to
# `AutoModelForCausalLM.from_pretrained` inside `eval_model`.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

In [None]:
# Datasets every model is evaluated on (read as a global by `eval_model`).
# NOTE(review): 450 is presumably the number of samples kept per dataset —
# confirm against `get_default_datasets`.
datasets = get_default_datasets(450)
# Last expression of the cell: shows the loaded datasets.
datasets

In [None]:
# Model ids passed to `from_pretrained` in `eval_model`, evaluated in this order.
models = [
    # Random baseline: `evaluate_scoring_method` looks this id up by name,
    # so it must stay in the list.
    'snake7gun/tiny-random-qwen3',

    "Qwen/Qwen3-1.7B",
    "Qwen/Qwen3-4B",
    "Qwen/Qwen3-8B",
    "unsloth/Qwen3-14B",

    "microsoft/Phi-4-mini-instruct", # 4b
    
    "soob3123/amoral-gemma3-4B-v2",
    "unsloth/gemma-3-4b-it",
    
    # "mlabonne/Qwen3-4B-abliterated",
    # "unsloth/Qwen3-4B",
    "wassname/qwen-7B-codefourchan",
    "opencompass/CompassJudger-1-7B-Instruct",




    # "unsloth/gemma-2-9b-it",
    # 'google/gemma-2-2b',
    # "CohereLabs/c4ai-command-r7b-12-2024",
    # 'emergent-misalignment/Qwen-Coder-Insecure',
    # "drfellx/emergent_misalignment_test_qwen2.5-7B-Instruct",
    # "dpasch01/pp-llama3-8b-right-wing",
    # "dpasch01/pp-llama3-8b-left-wing",
    # "unsloth/DeepSeek-R1-Distill-Qwen-7B"
]

In [None]:
from open_pref_eval import scoring

# Map every attribute of the scoring module whose name starts with `score_`
# to the attribute itself, keyed by that name.
score_fns = {
    name: getattr(scoring, name)
    for name in dir(scoring)
    if name.startswith('score_')
}
print(score_fns.keys())

In [None]:
@anycache('.anycache2')
def eval_model(model_id):
    """Load one model and score it on the notebook's datasets.

    The tokenizer and the causal-LM weights are both loaded from `model_id`;
    the model is loaded in bfloat16 with the module-level
    `quantization_config` and `device_map="auto"`. Warnings raised during
    evaluation are suppressed.

    The call is wrapped by `anycache` with the `.anycache2` directory.
    NOTE(review): this function also reads the module-level `datasets`,
    `score_fns` and `quantization_config`; presumably the cache does not
    track those, so stale results may be returned after changing them —
    confirm and clear `.anycache2` when in doubt.

    Args:
        model_id: identifier passed to `from_pretrained`.

    Returns:
        The `(results, df_raw)` pair returned by `evaluate_model`.
    """
    import warnings

    print(f"Evaluating {model_id}")

    # `trust_remote_code` is not passed to either loader.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    load_kwargs = dict(
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config,
        device_map="auto",
    )
    model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)

    eval_kwargs = dict(
        model=model,
        tokenizer=tokenizer,
        datasets=datasets,
        batch_size=6,
        max_length=1024,
        max_prompt_length=512,
        verbose=1,
        score_fn=score_fns,
    )
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        results, df_raw = evaluate_model(**eval_kwargs)
    return results, df_raw

# Evaluate each model in turn and keep its raw result frame.
data = []
for model_id in tqdm(models):
    # `results` is not used below; only `df_raw` feeds the later analysis.
    results, df_raw = eval_model(model_id)
    # presumably releases the memory held by the previous model — see
    # `open_pref_eval.helpers.mem.clear_mem`
    clear_mem()
    data.append(df_raw)

In [None]:
import pandas as pd
# Stack the raw results of all models into one frame.
df = pd.concat(data)
# A row counts as correct when this score is at least 0.5 (NaN compares False, so it counts as 0).
df['correct'] = (df['score_with_entropy_weight__sigmoid'] >= 0.5).astype(int) 
# Mean of `correct` with models as rows and datasets as columns.
df.groupby(['model', 'dataset'])['correct'].mean().unstack()

In [None]:
# df.groupby(['model', 'dataset'])[cols].apply(lambda x: x.isna().mean())

In [None]:
def rename_ds(s):
    """Shorten a dataset identifier to a compact display name.

    The `_preferences`, `ethics_` and `mmlu-` fragments are removed, the
    `validation` and `train` split names are mapped to `test`, and the last
    `-test...` suffix is dropped.

    A name that carries no `-test` suffix after those substitutions is
    returned as-is (shortened). Previously such a name collapsed to an empty
    string, which merged every suffix-less dataset under the same blank label.

    Args:
        s: raw dataset name.

    Returns:
        The shortened name.
    """
    for fragment in ('_preferences', 'ethics_', 'mmlu-'):
        s = s.replace(fragment, '')
    for split_name in ('validation', 'train'):
        s = s.replace(split_name, 'test')

    parts = s.split('-test')
    if len(parts) == 1:
        # No split suffix present: keep the name rather than returning ''.
        return s
    # Drop whatever follows the last '-test'; earlier pieces are re-joined
    # with '-' exactly as before.
    return '-'.join(parts[:-1])

# Work on a copy so `df` keeps the original dataset names.
df2 = df.copy()
df2['dataset'] = df2['dataset'].apply(rename_ds)
# Show the distinct shortened names as a sanity check.
df2['dataset'].unique()

In [None]:
import numpy as np



def evaluate_scoring_method(scores_df, method_name, random_model='snake7gun/tiny-random-qwen3'):
    """Rate how well one score column behaves as a preference metric.

    The column is averaged per (model, dataset), clipped to [0, 1], and
    summarised by four heuristics that are combined into one weighted score.

    Args:
        scores_df: frame with `model` and `dataset` columns plus the score
            column `method_name`. It is not modified.
        method_name: name of the score column to rate.
        random_model: id of the randomly initialised baseline model. It is
            excluded from the imdb mean and is expected to score about 0.5.

    Returns:
        dict with:
            overall: weighted mean of the four heuristics below
                (weights 2, 3, 2, 1).
            imdb_mean: mean `imdb` score over all models except `random_model`.
            random_calibration: 1 - |mean score of `random_model` - 0.5|.
            discrimination: geometric mean, over the contrast datasets that
                are present, of std / mean across models.
            toxic_spread: max - min across models on `toxic-dpo-v0.2`
                (0 when that dataset is absent).
            min, max: extremes of the raw column (infinities included).
            nan, inf: number of NaN / infinite values in the raw column.

    Raises:
        KeyError: if the `imdb` dataset or `random_model` is missing.
    """
    raw = scores_df[method_name]
    mins = raw.min()
    maxs = raw.max()
    # Count bad values on the raw column. Counting after the replacement
    # below would always report 0 infinities and fold them into the NaN count.
    n_nan = raw.isna().sum()
    n_inf = raw.isin([np.inf, -np.inf]).sum()

    # Work on a copy so the caller's frame keeps its infinities.
    scores_df = scores_df.copy()
    scores_df[method_name] = raw.replace([np.inf, -np.inf], np.nan)
    scores = pd.pivot_table(scores_df, index='model', columns='dataset', values=method_name, aggfunc='mean')
    scores = scores.clip(lower=0, upper=1)  # Clip scores to [0, 1] range

    # 1. IMDB should be high (most models > 0.8); the random baseline is excluded.
    imdb_score = scores['imdb'].drop(index=random_model).mean()

    # 2. (disabled) hard-dataset penalty; it is not part of the weights below.

    # 3. Random model should be ~0.5
    random_deviation = abs(scores.loc[random_model].mean() - 0.5)
    random_penalty = 1 - random_deviation  # 1 is good, 0 is bad

    # FIXME we want a bit of contrast in all datasets, not a lot in one
    # 4. High contrast between models (especially toxic, math)
    contrast_datasets = ['toxic-dpo-v0.2', 'imdb', 'truthful_qa', 'elementary_mathematics',
       'expression-commonsense', 'expression-utilitarianism',
       'expression-justice', 'expression-deontology' ]
    # Coefficient of variation per dataset; the mean is floored at 0.001 to
    # avoid dividing by (almost) zero.
    contrasts = [
        scores[ds].std() / max(scores[ds].mean(), 0.001)
        for ds in contrast_datasets
        if ds in scores
    ]
    avg_contrast = np.prod(contrasts) ** (1 / len(contrasts)) if contrasts else 0

    # 5. censored vs uncensored should differ on toxic
    if 'toxic-dpo-v0.2' in scores:
        # Assuming censored models score low, uncensored high
        toxic_spread = scores['toxic-dpo-v0.2'].max() - scores['toxic-dpo-v0.2'].min()
    else:
        toxic_spread = 0

    # Combined score. The active weights sum to 8 (2 + 3 + 2 + 1), so that is
    # the divisor; the previous 9 still counted the disabled hard-dataset term.
    quality = (
        imdb_score * 2 +              # weight easy dataset performance
        random_penalty * 3 +          # important: random = 0.5
        avg_contrast * 2 +            # discrimination power
        toxic_spread                  # specific contrast we expect
    ) / 8

    return {
        'overall': quality,
        'imdb_mean': imdb_score,
        'random_calibration': random_penalty,
        'discrimination': avg_contrast,
        'toxic_spread': toxic_spread,
        'min': mins,
        'max': maxs,
        'nan': n_nan,
        'inf': n_inf,
    }


# Rate every column of df2 whose name contains 'score'.
cols = [col for col in df2.columns if 'score' in col]

metrics_by_method = {}
for c in cols:
    metrics_by_method[c] = evaluate_scoring_method(df2, c)

# One row per scoring method, best overall quality first.
res = pd.DataFrame(metrics_by_method).T.sort_values('overall', ascending=False)

In [None]:
# Keep only the scoring methods whose raw values all lie in [0, 1]
# (per the `min` / `max` columns of `res`), ranked by discrimination.
df_metrics = res[(res['min'] >= 0) & (res['max'] <= 1)].sort_values('discrimination', ascending=False)
df_metrics

In [None]:

# Baseline row for the IPO score, read once from the unfiltered `res`.
# Reading it from `df_metrics` raised KeyError as soon as any earlier filter
# (including the [0, 1] range filter) had dropped 'score_ipo__sigmoid'.
ipo_baseline = res.loc['score_ipo__sigmoid']

# keep the methods whose random calibration is > 0.8
df_metrics = df_metrics[df_metrics['random_calibration'] > 0.8]

# and whose discrimination is more than 30% of the IPO baseline
df_metrics = df_metrics[df_metrics['discrimination'] > (0.3 * ipo_baseline['discrimination'])]

# and whose toxic_spread is > 0.01
df_metrics = df_metrics[df_metrics['toxic_spread'] > 0.01]

# and whose imdb mean is more than 80% of the IPO baseline
df_metrics = df_metrics[df_metrics['imdb_mean'] > (0.8 * ipo_baseline['imdb_mean'])]
df_metrics

In [None]:

# One table and one radar chart per scoring method left in `df_metrics`.
cols = df_metrics.index#.tolist()
for c in cols:
    # df2['correct2'] = df2[c]>0
    # Aggregate score `c` with models as rows and datasets as columns
    # (pivot_table's default aggregation is the mean).
    df_agg = df2.pivot_table(index='model', columns='dataset', values=c)
    print(c)
    display(df_agg)
    # The plot receives the transposed table: datasets as rows, models as columns.
    radar_plot(df_agg.T)
    # Anchor the legend outside the axes, to the right of the chart.
    plt.legend(loc='upper right', bbox_to_anchor=(1.6, 1))
    plt.title(c)
    plt.show()
