This notebook loads all the results, groups them into comparable sets, and plots/tabulates them.

In [77]:
import re, os, sys, json, time, requests
from pathlib import Path
import pandas as pd
import numpy as np

In [78]:
# Find every evaluation result under ../outputs, at any depth, in sorted
# (deterministic) order. Each eval.parquet is expected to sit beside the
# config.json and log.txt of the run that produced it.
fs = sorted(Path("../outputs").rglob("eval.parquet"))
fs;

In [79]:
def load_config(f):
    """Build a one-row DataFrame describing the run that produced eval file `f`.

    Reads `config.json` and `log.txt` from the directory that contains `f`,
    keeps only the scalar config entries (lists, dicts and tuples are dropped),
    and adds bookkeeping columns.

    Parameters
    ----------
    f : pathlib.Path
        Path to an eval file laid out as `<experiment>/<run>/<eval file>`.
        The experiment directory name is expected to look like
        `{model}_{method}_{dataset}`.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with `dataset` and `base_model` first, followed
        by the remaining scalar config entries and the added columns
        `log` (full log text), `file` (`f` itself), `ts` (run directory stem)
        and `method` (second `_`-separated field of the experiment directory,
        or "" when the name cannot be split into three parts).

    Raises
    ------
    FileNotFoundError
        If `config.json` or `log.txt` is missing next to `f`.
    KeyError
        If the config has no `dataset` or `base_model` entry.
    """
    run_dir = f.parent

    # Use a context manager / read_text so both file handles are closed
    # promptly instead of being left to the garbage collector.
    with (run_dir / "config.json").open() as fp:
        config = json.load(fp)
    log = (run_dir / "log.txt").read_text()

    # dataframe of all non list/dict/tuple in config
    scalars = {k: v for k, v in config.items() if not isinstance(v, (list, dict, tuple))}
    config_df = pd.DataFrame(scalars, index=[0])
    config_df["log"] = log
    config_df["file"] = f
    # NOTE(review): `.stem` drops everything after the last "." in a name, so a
    # directory name containing a dot would be truncated - consider `.name`.
    config_df["ts"] = run_dir.stem

    experiment = run_dir.parent.stem
    try:
        # this is {model}_{method}_{dataset}
        _, method, _ = experiment.split("_", 2)
    except ValueError as e:
        print(e)
        print(
            f"cannot split `{experiment}` from {f} like [f.parent.parent.stem.split('_', 2)]"
        )
        method = ""
    config_df["method"] = method

    # put key cols first
    key_columns = ["dataset", "base_model"]
    columns = key_columns + [c for c in config_df.columns if c not in key_columns]
    return config_df[columns]


# One single-row config frame per eval file, kept in the same order as `fs`
# so that row i of `df_configs` describes fs[i].
configs = list(map(load_config, fs))
df_configs = pd.concat(configs)
df_configs

Unnamed: 0,dataset,base_model,verbose,dev,load_in_4bit,load_in_8bit,use_gradient_checkpointing,batch_size,n_samples,eval_samples,...,collection_layers,collect_input,collect_hs,β,reverse_pref,weight_dim,scale_orth,neg_slope,mag_clip,use_pref_ref
0,alpaca_low_quality,wassname/llama-3-2-1b-sft,1,False,False,False,False,10,10000,,...,,,,,,,,,,
0,alpaca_low_quality,wassname/llama-3-2-1b-sft,1,False,False,False,False,10,10000,,...,,True,True,,,,,,,
0,alpaca_low_quality,wassname/llama-3-2-1b-sft,1,False,False,False,False,10,10000,,...,,True,True,,,,,,,
0,alpaca_low_quality,wassname/llama-3-2-1b-sft,1,False,False,False,False,10,10000,,...,,True,True,,,,,,,
0,alpaca_low_quality,wassname/llama-3-2-1b-sft,1,False,False,False,False,10,10000,,...,,,,0.8,True,1.0,False,0.0,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,unhelpful_alpaca,wassname/llama-3-2-1b-sft,2,False,False,False,False,10,1800,,...,,True,False,,,,,,,
0,unhelpful_alpaca,wassname/llama-3-2-1b-sft,2,False,False,False,False,10,1800,,...,,True,False,,,,,,,
0,unhelpful_alpaca,wassname/llama-3-2-1b-sft,2,False,False,False,False,10,1800,,...,,True,False,,,,,,,
0,unhelpful_alpaca,wassname/llama-3-2-1b-sft,2,False,False,False,False,10,1800,,...,,True,False,,,,,,,


In [80]:
from reprpo.training import parse_eval, key_metrics
from tqdm.auto import tqdm

# Split aliases, paired positionally with df_res2["dataset"].unique()
# (i.e. with the datasets in order of first appearance in each eval file).
SPLIT_NAMES = ["train", "test", "oos", "rnd"]


def _result_row(config, method, values, ds_alias_rev):
    """Assemble one result record.

    Key order matters downstream: run identity first, then the per-split
    values, then the dataset-name -> split-alias mapping last.
    """
    return dict(
        base_model=config["base_model"],
        train_dataset=config["dataset"],
        method=method,
        ts=config["ts"],
        **values,
        **ds_alias_rev,
    )


data_acc, data_rel_acc, data_rel_ppl, data_rel_pref = [], [], [], []

for i, eval_path in enumerate(tqdm(fs)):
    try:
        df_res2 = pd.read_parquet(eval_path)
        config = df_configs.iloc[i]

        ds_alias = dict(zip(SPLIT_NAMES, df_res2["dataset"].unique()))
        # assert ds_alias['train']==config['dataset']
        assert "train" in ds_alias["train"]

        # first adapter in the file that is not the base model
        non_base = df_res2[["adapter"]].query('adapter!="base"')
        adapter_name = non_base.values[0, 0]

        # mean correctness: rows = adapter, columns = dataset
        mean_correct = df_res2.groupby(["dataset", "adapter"], dropna=False)["correct"].mean()
        df_acc = mean_correct.unstack().T

        ds_alias_rev = {name: split for split, name in ds_alias.items()}
        df_acc = df_acc.rename(columns=ds_alias_rev)

        df_rel = key_metrics(df_res2, adapter_name, ds_alias)

        splits = list(ds_alias_rev.values())
        df_acc = df_acc[splits]

        # absolute accuracy: one record for the adapter, one for the base model
        data_acc.append(
            _result_row(config, adapter_name, df_acc.loc[adapter_name].to_dict(), ds_alias_rev)
        )
        data_acc.append(
            _result_row(config, "base", df_acc.loc["base"].to_dict(), ds_alias_rev)
        )

        # rows 0, 1, 2 of df_rel feed rel_acc, rel_ppl, rel_pref respectively
        for row_pos, sink in enumerate((data_rel_acc, data_rel_ppl, data_rel_pref)):
            sink.append(
                _result_row(config, adapter_name, df_rel.iloc[row_pos].to_dict(), ds_alias_rev)
            )
    except Exception as e:
        # best effort: report the failing run and move on to the next file
        print(e)
        print(f"error in {eval_path}")

  0%|          | 0/95 [00:00<?, ?it/s]

In [81]:
# Ad-hoc inspection of the variables left behind by the evaluation loop above
# (whatever values `config`, `ds_alias_rev`, `df_acc` and `adapter_name` last held).
# The first two expressions are evaluated but not shown; only the final
# expression is rendered as the cell output.
config.ts, config.n_samples, config.max_length
list(ds_alias_rev.values())
df_acc.loc[adapter_name].to_dict()

{'train': 0.07733333333333334,
 'test': 0.072,
 'oos': 0.4186991869918699,
 'rnd': 0.3821022727272727}

In [82]:
import seaborn as sns

# Diverging colour map shared by every styled table below.
cmap = sns.diverging_palette(5, 250, as_cmap=True)


def style_df(df, caption=""):
    """Return a pandas Styler for `df`.

    Applies a per-row background gradient using the module-level `cmap`,
    formats numbers to 3 digits of precision, and bolds the maximum of each
    column. `caption` is accepted but currently not applied to the table.
    """
    styler = df.style.background_gradient(cmap, axis=1)
    # styler = styler.set_caption(caption)
    styler = styler.format(precision=3)
    return styler.highlight_max(axis=0, props="font-weight:bold;")

In [96]:
metrics = dict(
    acc=data_acc,
    # rel_acc=data_rel_acc,
    # rel_ppl=data_rel_ppl,
    # rel_pref=data_rel_pref,
)

split_cols = ["train", "test", "oos", "rnd"]


def rename_ds(x):
    """Shorten a raw dataset identifier into a label for the table captions.

    Example input: "genies_preferences-alpaca_low_quality-train[:750]"
    """
    # drop a trailing slice such as "[:750]"
    x, _, _ = x.partition("[")
    # x = x.split("-")[-1]
    for old, new in (
        ("genies_preferences-", "genies / "),
        ("ethics_expression_preferences-", "ethics / "),
        ("_", " "),
        ("-test", "\t(test)"),
        ("-train", "\t(train)"),
    ):
        x = x.replace(old, new)
    return x


for metric, data in metrics.items():
    print(f"# {metric}:\n")
    grouped = pd.DataFrame(data).groupby(["base_model", "train_dataset"])
    for (base_model, train_dataset), df1 in grouped:
        ts = df1["ts"].unique()[0]
        # The last four non-empty columns of the group's first row are read as
        # the dataset-name -> split-alias entries; invert them to alias -> name.
        dataset_map = df1.dropna(axis=1, how="all").iloc[0, -4:]
        ds_alias = {alias: name for name, alias in dataset_map.to_dict().items()}
        print(f"\n## {base_model} - {train_dataset}")
        print(ts)

        # average every split per method, best out-of-sample score first
        per_method = (
            df1.drop(["base_model", "train_dataset"], axis="columns")
            .groupby("method")[split_cols]
            .mean()
        )
        df = per_method.sort_values("oos", ascending=False).dropna(axis=1)

        # need more than two methods and some variation in every column
        if len(df) <= 2 or not (df.std() > 0).all():
            print(f"skipped due to low amount of rows {df.index}")
            continue
        display(style_df(df, f"{metric}: {base_model} {train_dataset}"))

        if metric != "acc":
            continue

        caption = f"""Table 2: Absolute accuracy after training with adapter on ds:`{train_dataset}` compared to base model `{base_model}` for various distribution shifts:"""
        for k in split_cols:
            v = rename_ds(ds_alias[k])
            caption += f"\n- `{k}` \t:`{v}`"
        print(caption)

    print("\n")

# acc:


## wassname/llama-3-2-1b-sft - alpaca_low_quality
2025-05-15_19-24-02


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hs-None-PrefVec,0.957,0.961,0.088,0.491
hs-ETHER-PrefVec,0.973,0.971,0.079,0.438
projgrad,0.988,0.983,0.077,0.347
dpo,0.989,0.981,0.073,0.355
hs-SupressedHS-PrefVec,0.968,0.968,0.071,0.484
base,0.833,0.851,0.068,0.361


Table 2: Absolute accuracy after training with adapter on ds:`alpaca_low_quality` compared to base model `wassname/llama-3-2-1b-sft` for various distribution shifts:
- `train` 	:`genies / alpaca low quality	(train)`
- `test` 	:`genies / alpaca low quality	(test)`
- `oos` 	:`genies / alpaca high quality	(test)`
- `rnd` 	:`ethics / justice	(test)`

## wassname/llama-3-2-1b-sft - math
2025-05-15_22-05-03


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hs-ETHER-PrefVec,0.764,0.66,0.46,0.382
hs-SupressedHS-PrefVec,0.773,0.665,0.348,0.378
projgrad,0.977,0.817,0.348,0.352
dpo,0.976,0.797,0.344,0.355
base,0.353,0.389,0.336,0.361
hs-None-PrefVec,0.741,0.663,0.336,0.369


Table 2: Absolute accuracy after training with adapter on ds:`math` compared to base model `wassname/llama-3-2-1b-sft` for various distribution shifts:
- `train` 	:`genies / math	(train)`
- `test` 	:`genies / math	(test)`
- `oos` 	:`genies / change my view	(test)`
- `rnd` 	:`ethics / justice	(test)`

## wassname/llama-3-2-1b-sft - math_easy
2025-05-15_17-05-36


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dpo,0.995,0.98,0.66,0.347
projgrad,0.995,0.984,0.648,0.347
hs-ETHER-PrefVec,0.957,0.953,0.487,0.361
hs-None-PrefVec,0.961,0.929,0.477,0.358
hs-SupressedHS-PrefVec,0.961,0.947,0.436,0.351
base,0.92,0.929,0.256,0.361


Table 2: Absolute accuracy after training with adapter on ds:`math_easy` compared to base model `wassname/llama-3-2-1b-sft` for various distribution shifts:
- `train` 	:`genies / math easy	(train)`
- `test` 	:`genies / math easy	(test)`
- `oos` 	:`genies / math hard	(test)`
- `rnd` 	:`ethics / justice	(test)`

## wassname/llama-3-2-1b-sft - truthful_qa
2025-05-13_21-10-35


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
base,0.389,0.4,0.589,0.361
dpo,0.988,0.816,0.559,0.376
projgrad,0.984,0.806,0.539,0.371
hs-ETHER-PrefVec,0.723,0.699,0.403,0.477
hs-None-PrefVec,0.738,0.713,0.402,0.509
hs-SupressedHS-PrefVec,0.732,0.715,0.388,0.49


Table 2: Absolute accuracy after training with adapter on ds:`truthful_qa` compared to base model `wassname/llama-3-2-1b-sft` for various distribution shifts:
- `train` 	:`genies / truthful qa	(train)`
- `test` 	:`genies / truthful qa	(test)`
- `oos` 	:`genies / alpaca mmlu	(test)`
- `rnd` 	:`ethics / justice	(test)`

## wassname/llama-3-2-1b-sft - unhelpful_alpaca
2025-04-02_16-18-39


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hs-HRA-PrefVec,0.993,0.994,0.762,0.386
hs-ETHER-PrefVec,0.997,0.994,0.665,0.373
hs-None-PrefVec,0.383,0.392,0.499,0.384
hs-SupressedHS-PrefVec,0.508,0.513,0.43,0.38
hs-SupressedHS-Rank,0.287,0.284,0.424,0.403
side-None-PrefVec,0.074,0.082,0.413,0.382
base,0.055,0.064,0.386,0.361
hs-SupressedHS-MSE,0.055,0.064,0.386,0.363
dpo,0.91,0.868,0.23,0.346
projgrad,0.91,0.866,0.223,0.347


Table 2: Absolute accuracy after training with adapter on ds:`unhelpful_alpaca` compared to base model `wassname/llama-3-2-1b-sft` for various distribution shifts:
- `train` 	:`genies / unhelpful alpaca	(train)`
- `test` 	:`genies / unhelpful alpaca	(test)`
- `oos` 	:`genies / illegal dont help	(test)`
- `rnd` 	:`ethics / justice	(test)`


