In [1]:
import re, os, sys, json, time, requests
from pathlib import Path
import pandas as pd
import numpy as np

# TODO just load the parquets and run parse_eval
# or run my own eval

In [2]:
# Collect every eval.parquet found anywhere below ../outputs, in sorted path order.
# The neighbouring config.json / log.txt of each hit are read in a later cell.
OUTPUTS_DIR = Path('../outputs')
fs = sorted(OUTPUTS_DIR.glob('**/eval.parquet'))
fs;

In [3]:
def load_config(f):
    """Describe the run that produced the eval file ``f`` as a one-row DataFrame.

    ``config.json`` and ``log.txt`` are read from the directory that contains
    ``f``. Config entries whose value is a list, dict or tuple are dropped; all
    remaining entries become columns. Four extra columns are added:

    - ``log``: full text of ``log.txt``
    - ``file``: ``f`` itself
    - ``ts``: name of the directory containing ``f``
    - ``method``: second ``_``-separated field of the grandparent directory
      name (expected to look like ``{model}_{method}_{dataset}``), or ``''``
      when that name has no ``_`` separator at all

    Parameters
    ----------
    f : pathlib.Path
        Path of an ``eval.parquet`` file. The file itself is not opened here.

    Returns
    -------
    pandas.DataFrame
        Single row (index ``0``) with ``dataset`` and ``base_model`` as the
        first two columns.

    Raises
    ------
    FileNotFoundError
        If ``config.json`` or ``log.txt`` is missing next to ``f``.
    KeyError
        If the config has no scalar ``dataset`` or ``base_model`` entry.
    """
    run_dir = f.parent

    # Close both files promptly: this is called once per run and there can be
    # thousands of runs, so leaked handles add up.
    with (run_dir / 'config.json').open() as config_file:
        config = json.load(config_file)
    log = (run_dir / 'log.txt').read_text()

    # dataframe of all non list/dict/tuple in config
    scalars = {k: v for k, v in config.items() if not isinstance(v, (list, dict, tuple))}
    config_df = pd.DataFrame(scalars, index=[0])
    config_df['log'] = log
    config_df['file'] = f

    # Use `.name`, not `.stem`, for directories: `.stem` strips everything after
    # the last dot, which mangles names such as `llama-3.2-1b_dpo_dataset`.
    config_df['ts'] = run_dir.name
    experiment_name = run_dir.parent.name
    try:
        _, method, _ = experiment_name.split('_', 2)  # this is {model}_{method}_{dataset}
    except ValueError as e:
        # Best effort: report the problem and continue with an empty method.
        print(e)
        print(f"cannot split `{experiment_name}` from {f} into {{model}}_{{method}}_{{dataset}} using split('_', 2)")
        method = ''
    config_df['method'] = method

    # put key cols first
    key_columns = ['dataset', 'base_model']
    columns = key_columns + [c for c in config_df.columns if c not in key_columns]
    return config_df[columns]

# One single-row frame per eval file, in the same order as `fs`, stacked into
# one table. Every row keeps index 0, so later lookups must be positional.
configs = list(map(load_config, fs))
df_configs = pd.concat(configs)
df_configs



  df_configs = pd.concat(configs)


Unnamed: 0,dataset,base_model,verbose,dev,load_in_4bit,load_in_8bit,use_gradient_checkpointing,batch_size,n_samples,eval_samples,...,ts,method,β,reverse_pref,scale_orth,neg_slope,mag_clip,weight_dim,collect_input,collect_hs
0,us_history_textbook,princeton-nlp/Llama-3-Base-8B-SFT,1,False,False,False,False,16,5400,,...,20241013042626,dpo,,,,,,,,
0,us_history_textbook,princeton-nlp/Llama-3-Base-8B-SFT,1,False,False,False,False,16,5400,,...,20241013103646,projbp,0.2,False,False,0.8,0.6,,,
0,us_history_textbook,princeton-nlp/Llama-3-Base-8B-SFT,1,False,False,False,False,16,5400,,...,20241013044358,projgrad,0.8,True,False,0.0,,1.0,,
0,us_history_textbook,princeton-nlp/Llama-3-Base-8B-SFT,1,False,False,False,False,16,5400,,...,20241013055756,sideETHERMSE,,,,,,,True,True
0,us_history_textbook,princeton-nlp/Llama-3-Base-8B-SFT,1,False,False,False,False,16,5400,,...,20241013040725,sideETHERPrefVec,,,,,,,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,us_history_textbook,wassname/llama-3-2-1b-sft,0,False,False,False,False,16,3600,128,...,2024-10-05_05-35-47,side-SVD-MSE,,,,,,,False,False
0,us_history_textbook,wassname/llama-3-2-1b-sft,1,False,False,False,False,16,3600,,...,2024-10-11_00-09-35,side-SVD-MSE,,,,,,,True,True
0,us_history_textbook,wassname/llama-3-2-1b-sft,2,True,True,False,False,2,64,750,...,2024-10-11_04-36-06,side-SVD-MSE,,,,,,,True,True
0,us_history_textbook,wassname/llama-3-2-1b-sft,1,False,False,False,False,16,3600,,...,2024-10-11_00-05-32,side-SVD-PrefVec,,,,,,,True,True


In [4]:
# df_evals = [pd.read_parquet(f) for f in fs]
# df_eval = pd.concat(df_evals).reset_index()
# df_eval
from reprpo.training import parse_eval, key_metrics
from tqdm.auto import tqdm

# One list of flat records per metric table; each processed eval file adds
# rows to all four of them.
data_acc, data_rel_acc, data_rel_ppl, data_rel_pref = [], [], [], []

for i, eval_file in enumerate(tqdm(fs)):

    df_res2 = pd.read_parquet(eval_file)
    # Positional lookup: df_configs was built from `fs` in the same order.
    config = df_configs.iloc[i]

    # Datasets get their alias purely from the order in which they first
    # appear in the eval file.
    dataset_names = df_res2['dataset'].unique()
    ds_alias = dict(zip(["train", "test", "oos", "rnd"], dataset_names))
    # A stricter check, ds_alias['train']==config['dataset'], is disabled.
    assert 'train' in ds_alias['train']

    # First adapter in the file that is not the base model.
    non_base_rows = df_res2[["adapter"]].query('adapter!="base"')
    adapter_name = non_base_rows.values[0, 0]

    # Mean of `correct` per (dataset, adapter): adapters as rows, datasets
    # (renamed to their alias) as columns.
    ds_alias_rev = {name: alias for alias, name in ds_alias.items()}
    df_acc = (
        df_res2.groupby(["dataset", "adapter"], dropna=False)["correct"]
        .mean()
        .unstack()
        .T
        .rename(columns=ds_alias_rev)
    )

    df_rel = key_metrics(df_res2, adapter_name, ds_alias)

    df_acc = df_acc[['train', 'test', 'oos', 'rnd']]

    run_meta = dict(
        base_model=config['base_model'],
        train_dataset=config['dataset'],
    )

    # Absolute accuracy: one record for the adapter, then one for the base model.
    for acc_method in (adapter_name, 'base'):
        data_acc.append(dict(
            **run_meta,
            method=acc_method,
            **df_acc.loc[acc_method].to_dict()
        ))

    # Rows 0, 1, 2 of the key_metrics result feed rel_acc, rel_ppl and rel_pref.
    # NOTE(review): this relies on key_metrics' row order - confirm against
    # reprpo.training.
    rel_targets = (data_rel_acc, data_rel_ppl, data_rel_pref)
    for rel_row, rel_records in enumerate(rel_targets):
        rel_records.append(dict(
            **run_meta,
            method=adapter_name,
            **df_rel.iloc[rel_row].to_dict()
        ))


  0%|          | 0/2367 [00:00<?, ?it/s]

In [5]:
import seaborn as sns
# Diverging seaborn palette (hues 5 and 250) as a colormap, shared by style_df below.
cmap = sns.diverging_palette(5, 250, as_cmap=True)
def style_df(df, caption=''):
    """Return a pandas Styler for a results table.

    Applies, in order: a background gradient using the module-level ``cmap``
    along ``axis=1``, 3-digit number formatting, and bold text for the maximum
    along ``axis=0``.

    ``caption`` is accepted for callers' convenience but is currently not
    applied (the ``set_caption`` call is disabled).
    """
    styler = df.style.background_gradient(cmap, axis=1)
    # styler = styler.set_caption(caption)
    styler = styler.format(precision=3)
    return styler.highlight_max(axis=0, props='font-weight:bold;')


In [6]:
# Metric name -> list of per-run records collected in the loop above.
metrics = {
    'acc': data_acc,
    'rel_acc': data_rel_acc,
    'rel_ppl': data_rel_ppl,
    'rel_pref': data_rel_pref,
}

for metric, data in metrics.items():
    print(f'# {metric}:\n')
    per_experiment = pd.DataFrame(data).groupby(['base_model', 'train_dataset'])
    for (base_model, train_dataset), df in per_experiment:
        print(f'## {base_model} - {train_dataset}')
        # Average repeated runs of each method, rank by `oos`, and drop any
        # column that still contains a NaN.
        df = (
            df.drop(['base_model', 'train_dataset'], axis='columns')
            .groupby('method', as_index=True)
            .mean()
            .sort_values('oos', ascending=False)
            .dropna(axis=1)
        )
        # Show a table only with more than two methods and some variation in
        # every remaining column.
        enough_methods = len(df) > 2
        if not (enough_methods and (df.std() > 0).all()):
            print(f'skipped {df.shape}')
            continue
        display(style_df(df, f'{metric}: {base_model} {train_dataset}'))

    print('\n')

# acc:

## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_easy


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
side-None-PrefVec,0.985,0.955,0.769,0.632
side-ETHER-PrefVec,0.985,0.95,0.767,0.653
projgrad,0.999,0.982,0.759,0.679
base,0.973,0.956,0.741,0.677
dpo,0.999,0.978,0.739,0.649


## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_low_quality


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
projgrad,0.999,0.991,0.111,0.705
side-ETHER-PrefVec,0.912,0.92,0.111,0.673
side-None-PrefVec,0.903,0.919,0.109,0.684
base,0.829,0.863,0.108,0.677
dpo,0.997,0.992,0.105,0.684


## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_mmlu


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
projgrad,0.974,0.829,0.761,0.747
dpo,0.975,0.829,0.745,0.719
side-ETHER-PrefVec,0.832,0.79,0.737,0.771
side-None-PrefVec,0.835,0.793,0.715,0.787
base,0.779,0.755,0.701,0.796


## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_short
skipped (2, 4)
## princeton-nlp/Llama-3-Base-8B-SFT - us_history_textbook


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
side-HRA-PrefVec,0.998,0.999,0.903,0.64
side-Ortho-PrefVec,0.998,0.999,0.902,0.643
side-SVD-PrefVec,0.999,0.998,0.901,0.645
side-None-PrefVec,0.998,0.996,0.901,0.66
side-ETHER-PrefVec,0.996,0.997,0.889,0.654
dpo,1.0,0.996,0.887,0.7
projgrad,1.0,0.996,0.88,0.691
side-SVD-Rank,0.998,0.989,0.871,0.594
side-Ortho-Rank,0.998,0.991,0.856,0.655
side-HRA-MSE,0.988,0.994,0.84,0.663


## reciprocate/tiny-llama - us_history_textbook
skipped (3, 4)
## wassname/llama-3-2-1b-sft - us_history_textbook


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
side-SVD-PrefVec,0.987,0.977,0.8,0.464
side-HRA-PrefVec,0.987,0.977,0.8,0.464
side-None-PrefVec,0.987,0.977,0.8,0.464
side-Ortho-Rank,0.984,0.962,0.727,0.5
side-ETHER-PrefVec,0.926,0.962,0.72,0.446
side-SVD-Rank,0.993,0.965,0.719,0.504
side-ETHER-Rank,0.993,0.965,0.719,0.504
side-None-Rank,0.993,0.965,0.719,0.504
dpo,0.959,0.96,0.718,0.509
projgrad,0.955,0.956,0.713,0.508




# rel_acc:

## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_easy


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
side-None-PrefVec,1.012,0.999,1.038,0.933
side-ETHER-PrefVec,1.012,0.994,1.035,0.964
projgrad,1.026,1.027,1.024,1.002
dpo,1.026,1.023,0.996,0.959


## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_low_quality


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
projgrad,1.204,1.148,1.025,1.041
side-ETHER-PrefVec,1.1,1.066,1.025,0.994
side-None-PrefVec,1.088,1.065,1.012,1.01
dpo,1.203,1.15,0.975,1.01


## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_mmlu


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
projgrad,1.251,1.099,1.086,0.938
dpo,1.253,1.099,1.063,0.904
side-ETHER-PrefVec,1.068,1.047,1.051,0.968
side-None-PrefVec,1.072,1.051,1.019,0.988


## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_short
skipped (1, 4)
## princeton-nlp/Llama-3-Base-8B-SFT - us_history_textbook


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
side-HRA-PrefVec,1.01,1.009,1.134,0.946
side-Ortho-PrefVec,1.01,1.009,1.133,0.949
side-SVD-PrefVec,1.011,1.009,1.132,0.952
side-None-PrefVec,1.01,1.006,1.132,0.974
side-ETHER-PrefVec,1.008,1.008,1.116,0.965
dpo,1.012,1.007,1.114,1.034
projgrad,1.012,1.007,1.106,1.02
side-SVD-Rank,1.01,1.0,1.095,0.877
side-Ortho-Rank,1.01,1.001,1.075,0.967
side-HRA-MSE,1.0,1.004,1.055,0.979


## reciprocate/tiny-llama - us_history_textbook
skipped (2, 4)
## wassname/llama-3-2-1b-sft - us_history_textbook


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
side-None-PrefVec,1.019,1.011,1.156,0.983
side-SVD-PrefVec,1.019,1.011,1.156,0.983
side-HRA-PrefVec,1.019,1.011,1.156,0.983
side-Ortho-Rank,1.017,0.995,1.051,1.059
side-ETHER-PrefVec,0.979,0.969,1.039,1.0
side-SVD-Rank,1.026,0.999,1.039,1.068
side-ETHER-Rank,1.026,0.999,1.039,1.068
side-None-Rank,1.026,0.999,1.039,1.068
dpo,1.015,0.968,1.038,1.144
projgrad,1.011,0.964,1.03,1.14




# rel_ppl:

## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_easy


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
side-None-PrefVec,0.848,0.833,0.981,0.735
side-ETHER-PrefVec,0.831,0.796,0.948,0.666
projgrad,0.0,0.0,0.012,0.0
dpo,0.0,0.0,0.003,0.0


## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_low_quality


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
side-None-PrefVec,1.086,1.054,1.006,1.067
side-ETHER-PrefVec,1.141,1.103,0.982,0.987
projgrad,0.012,0.008,0.036,0.081
dpo,0.009,0.006,0.026,0.004


## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_mmlu


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
side-ETHER-PrefVec,0.964,1.075,0.875,0.994
side-None-PrefVec,0.945,0.972,0.808,0.872
projgrad,0.003,0.0,0.001,0.013
dpo,0.002,0.0,0.001,0.005


## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_short
skipped (1, 4)
## princeton-nlp/Llama-3-Base-8B-SFT - us_history_textbook


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
side-HRA-MSE,1.002,1.003,1.0,0.922
side-None-MSE,1.002,1.002,1.0,0.928
side-SVD-MSE,1.001,1.001,0.998,0.944
side-Ortho-MSE,1.002,1.002,0.998,0.928
side-ETHER-MSE,1.001,1.001,0.998,0.914
side-ETHER-PrefVec,0.953,0.952,0.957,0.769
side-SVD-PrefVec,0.95,0.948,0.955,0.699
side-Ortho-PrefVec,0.908,0.909,0.917,0.7
side-HRA-PrefVec,0.891,0.892,0.903,0.692
side-None-PrefVec,0.813,0.813,0.814,0.731


## reciprocate/tiny-llama - us_history_textbook
skipped (2, 4)
## wassname/llama-3-2-1b-sft - us_history_textbook


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
side-None-MSE,0.999,1.001,0.997,0.965
side-Ortho-MSE,1.001,1.003,0.996,1.004
side-ETHER-MSE,1.001,1.003,0.996,1.004
side-HRA-MSE,1.001,1.003,0.996,1.004
side-SVD-MSE,0.858,0.858,0.855,0.845
side-SVD-PrefVec,0.762,0.77,0.67,0.859
side-HRA-PrefVec,0.762,0.77,0.67,0.859
side-None-PrefVec,0.762,0.77,0.67,0.859
projbp,0.722,0.712,0.666,0.848
side-Ortho-PrefVec,0.688,0.66,0.601,0.678




# rel_pref:

## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_easy


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
projgrad,184.491,151.362,106.512,14.436
dpo,194.976,159.56,103.501,14.844
side-ETHER-PrefVec,6.95,6.115,7.833,0.132
side-None-PrefVec,4.661,4.322,6.76,-0.134


## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_low_quality


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
side-None-PrefVec,54.212,55.715,-1.27,0.066
side-ETHER-PrefVec,59.978,62.132,-3.989,0.285
projgrad,387.005,396.556,-95.526,1.873
dpo,402.367,417.062,-106.47,0.942


## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_mmlu


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dpo,132.097,84.356,54.608,22.818
projgrad,118.423,73.527,48.958,22.649
side-ETHER-PrefVec,13.781,9.398,7.732,2.376
side-None-PrefVec,12.287,7.499,6.974,0.887


## princeton-nlp/Llama-3-Base-8B-SFT - alpaca_short
skipped (1, 4)
## princeton-nlp/Llama-3-Base-8B-SFT - us_history_textbook


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
projgrad,419.315,400.767,207.197,6.557
dpo,370.233,353.863,193.468,7.55
side-SVD-Rank,75.595,68.656,48.926,0.432
side-Ortho-Rank,80.533,75.384,48.461,0.876
side-None-Rank,82.202,77.22,40.024,0.481
side-None-PrefVec,28.4,23.872,38.523,0.273
side-HRA-PrefVec,24.369,20.753,33.58,-0.066
side-ETHER-Rank,57.042,52.891,32.688,0.695
side-Ortho-PrefVec,23.436,20.063,31.756,-0.045
side-HRA-Rank,60.387,55.132,29.643,1.527


## reciprocate/tiny-llama - us_history_textbook
skipped (2, 4)
## wassname/llama-3-2-1b-sft - us_history_textbook


Unnamed: 0_level_0,train,test,oos,rnd
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dpo,325.317,346.639,159.94,0.071
projgrad,299.714,320.749,139.283,-0.062
side-Ortho-Rank,70.039,62.565,30.594,-0.148
side-ETHER-Rank,94.977,81.113,28.4,0.059
side-None-Rank,94.977,81.113,28.4,0.059
side-SVD-Rank,94.977,81.113,28.4,0.059
side-HRA-PrefVec,18.882,17.279,23.988,-0.073
side-None-PrefVec,18.882,17.279,23.988,-0.073
side-SVD-PrefVec,18.882,17.279,23.988,-0.073
side-HRA-Rank,56.833,60.863,16.074,0.074




