# Imports

In [None]:
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

import matplotlib
import pickle
import re
import shutil
import yaml

from functools import partial
from itertools import product
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
from multiprocessing import Pool
import numpy as np
import pandas as pd
import seaborn as sns

from tools.loader import load_npy, load_yaml_as_df, load_pkl, exist_metric, keep_split, is_full_group, load_metric_from_log, extract_log_loss

# Parent of the directory the kernel was started in.
ROOT = os.path.dirname(os.getcwd())

# Start from matplotlib's default style, then apply the figure-export overrides in one go.
plt.style.use('default')
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'font.family': 'Arial',
    'mathtext.fontset': 'stix',
    'font.size': 10,
})

# Never truncate DataFrame displays, neither by column nor by row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# load data

In [None]:
# Two result trees are scanned; every entry directly below them is treated as one experiment.
root = '/mnt/tidalfs-bdsz01/dataset/llm_dataset/plc_data/TrustworthRLHF/causal-rm/results/baselines_binary'
root2 = '/mnt/tidalfs-bdsz01/dataset/llm_dataset/plc_data/TrustworthRLHF/causal-rm/results/finetune'

exp_dirs = [os.path.join(root, entry) for entry in os.listdir(root)]

exp_dirs2 = os.listdir(root2)
exp_dirs.extend(os.path.join(root2, entry) for entry in exp_dirs2)

def process_exp_dir(exp_dir):
    """Combine one experiment's config and metrics into a single frame.

    Reads ``config.yaml`` and ``performance.yaml`` from ``exp_dir`` with
    ``load_yaml_as_df``, concatenates the two frames column-wise and adds an
    ``exp_dir`` column holding the directory path.

    Returns ``None`` when either of the two files is missing.
    """
    yaml_paths = [os.path.join(exp_dir, fname) for fname in ('config.yaml', 'performance.yaml')]
    if not all(os.path.exists(path) for path in yaml_paths):
        return None

    config, metric = [load_yaml_as_df(path) for path in yaml_paths]
    result = pd.concat([config, metric], axis=1)
    result.loc[:, 'exp_dir'] = exp_dir
    return result

# `process_exp_dir` takes a single argument, so it can be handed to the pool as-is
# (the former `partial(...)` wrapper bound nothing).
worker_func = process_exp_dir

# Upper bound of 256 processes, but never more workers than directories to read.
num_workers = max(1, min(256, len(exp_dirs)))
with Pool(processes=num_workers) as pool:
    exp_frames = list(pool.imap(worker_func, exp_dirs))

# Directories lacking config.yaml or performance.yaml come back as None.
exp_frames = [frame for frame in exp_frames if frame is not None]
if not exp_frames:
    raise ValueError('no experiment directory contains both config.yaml and performance.yaml')
dfa = pd.concat(exp_frames, ignore_index=True)

dfa = dfa.sort_values(by=['desc', 'data_name', 'seed', 'alpha'])

# Cache the aggregated table so later cells can reload it without rescanning the result trees.
stats_dir = os.path.join(ROOT, 'stats')
os.makedirs(stats_dir, exist_ok=True)
dfa.to_csv(os.path.join(stats_dir, 'baselines_binary.csv'), index=False)
dfa.head(4)

Unnamed: 0,alpha,batch_size,binary,clip_min,data_name,data_root,desc,estimator_name,forget_rate,hidden_dim,l2_reg,lr,model_name,monitor_on,num_epochs,num_gradual,output_dir,patience,r01,r10,rerun,seed,use_tqdm,w_reg,AUROC on eval,AUROC on test,MAE on eval,MAE on test,R2 on test,R2 on train,R2 on val,RMSE on eval,RMSE on test,exp_dir,is_training,Pearson on eval,Pearson on test,co_lambda,w_entropy,w_info,batch_size_prop,hidden_dim_prop,l2_imp,l2_prop,w_imp,w_prop,MAE prop on train,MAE prop on val,Max error prop on train,Max error prop on val,R2 prop on train,R2 prop on val,alp,beta,eps,m,lw_k,T,alpha_mix,lambda_u,num_steps,p_threshold,perturb_step,warmup_epochs,eta,batch_size_full,l2_noise,quant,w_noise
87,0.01,512,True,0.1,hs,/mnt/tidalfs-bdsz01/dataset/llm_dataset/plc_da...,co_teaching,co_teaching,0.2,25664,1e-06,0.0005,FsfairX-LLaMA3-RM-v0.1,train,600,10.0,/mnt/tidalfs-bdsz01/dataset/llm_dataset/plc_da...,30,0.1,0.2,False,42,False,0.2,0.501121,0.475189,0.315026,0.376994,-0.167744,0.164575,-0.130389,0.430699,0.502318,/mnt/tidalfs-bdsz01/dataset/llm_dataset/plc_da...,True,-0.007317,0.014394,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
88,0.02,512,True,0.1,hs,/mnt/tidalfs-bdsz01/dataset/llm_dataset/plc_da...,co_teaching,co_teaching,0.2,25664,1e-06,0.0005,FsfairX-LLaMA3-RM-v0.1,train,600,10.0,/mnt/tidalfs-bdsz01/dataset/llm_dataset/plc_da...,30,0.1,0.2,False,42,False,0.2,0.514725,0.513866,0.285092,0.363955,-0.158571,0.069901,-0.056649,0.417101,0.500341,/mnt/tidalfs-bdsz01/dataset/llm_dataset/plc_da...,True,0.033562,0.043504,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
89,0.05,512,True,0.1,hs,/mnt/tidalfs-bdsz01/dataset/llm_dataset/plc_da...,co_teaching,co_teaching,0.2,25664,1e-06,0.0005,FsfairX-LLaMA3-RM-v0.1,train,600,10.0,/mnt/tidalfs-bdsz01/dataset/llm_dataset/plc_da...,30,0.1,0.2,False,42,False,0.2,0.543427,0.463215,0.287604,0.370792,-0.214741,0.136391,-0.063178,0.419488,0.512326,/mnt/tidalfs-bdsz01/dataset/llm_dataset/plc_da...,True,0.072158,-0.009726,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
90,0.1,512,True,0.1,hs,/mnt/tidalfs-bdsz01/dataset/llm_dataset/plc_da...,co_teaching,co_teaching,0.2,25664,1e-06,0.0005,FsfairX-LLaMA3-RM-v0.1,train,600,10.0,/mnt/tidalfs-bdsz01/dataset/llm_dataset/plc_da...,30,0.1,0.2,False,42,False,0.2,0.50786,0.558926,0.301846,0.362337,-0.140778,0.168173,-0.132261,0.436475,0.496484,/mnt/tidalfs-bdsz01/dataset/llm_dataset/plc_da...,True,-0.010452,0.098342,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Reload the aggregated results from the CSV cache under ROOT/stats.
stats_csv = os.path.join(ROOT, 'stats', 'baselines_binary.csv')
dfa = pd.read_csv(stats_csv)


  dfa = pd.read_csv(os.path.join(ROOT, 'stats', 'baselines_binary.csv'))


No Debias+PU models defined yet. Add model names to DEBIAS_PU_MODELS list when available.


Unnamed: 0_level_0,hs_AUROC,hs_NLL,hs_NDCG,hs_Recall,saferlhf_AUROC,saferlhf_NLL,saferlhf_NDCG,saferlhf_Recall,ufb_AUROC,ufb_NLL,ufb_NDCG,ufb_Recall
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1


# results of $\rho_{01}=0.1$ and $\rho_{10}=0.2$

In [19]:
# NOTE(review): the section heading above states rho_01=0.1 and rho_10=0.2, whereas this
# filter pins r10 to 0.1 and r01 to 0.2 -- confirm that the r01/r10 columns really map to
# the heading's subscripts in this swapped order.
noise_setting = "r10==0.1 and r01==0.2"
df = dfa.query(noise_setting)

In [29]:

# DESTRUCTIVE: permanently deletes the output directory of every run in `df`
# whose test R2 is non-positive.
dfd = df[df['R2 on test'] <= 0]
for exp_dir in dfd['exp_dir']:
    # `df` can be built from the cached CSV, so a listed directory may already have
    # been removed by an earlier execution of this cell; skip it instead of raising.
    if os.path.isdir(exp_dir):
        shutil.rmtree(exp_dir)

## log debias

In [21]:
metrics = ['R2 on test', 'MAE on test', 'Pearson on test']

# Hyperparameter columns shown next to the metrics, per debias method.
debias_hparams = {
    'naive': ['l2_reg', 'w_reg'],
    'ips': ['l2_reg', 'w_reg', 'l2_prop', 'w_prop'],
    'mtips': ['l2_reg', 'w_reg', 'w_prop'],
    'cvib': ['l2_reg', 'w_reg', 'w_info', 'w_entropy'],
    'dr': ['l2_reg', 'w_reg', 'l2_prop', 'w_prop', 'l2_imp', 'w_imp'],
    'mtdr': ['l2_reg', 'w_reg', 'w_prop', 'w_imp'],
    'sdr': ['l2_reg', 'w_reg', 'l2_prop', 'w_prop', 'l2_imp', 'w_imp', 'eta'],
}

# Rank the alpha == 0.5 runs of every (method, dataset) pair by test R2, best first.
# Only the last expression of a cell is rendered, so the rankings are kept in a dict
# instead of being computed and thrown away one by one.
debias_ranked = {}
for method, columns in debias_hparams.items():
    for dataset in ('hs', 'ufb', 'saferlhf'):
        runs = df.query(f"desc=='{method}' and data_name=='{dataset}' and alpha==0.5")
        debias_ranked[method, dataset] = runs.sort_values(by=['R2 on test'], ascending=False)[columns + metrics]

# Change the key to inspect another (method, dataset) pair.
debias_ranked['sdr', 'ufb'].head(20)



# columns = ['l2_reg', 'w_reg', 'l2_prop', 'w_prop', 'l2_imp', 'w_imp', 'eta']
# df.query("desc=='sdr2' and data_name=='hs' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='sdr2' and data_name=='ufb' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics].head(20)
# df.query("desc=='sdr2' and data_name=='saferlhf' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]

Unnamed: 0,l2_reg,w_reg,l2_prop,w_prop,l2_imp,w_imp,eta,R2 on test,MAE on test,Pearson on test
14295,1e-07,0.2,1e-07,0.1,2e-08,1.0,1.5,0.479947,0.242887,0.698618
14391,1e-07,0.2,1e-07,1.0,1e-08,0.2,1.0,0.478357,0.25515,0.703161
18890,5e-07,1.0,2e-07,0.1,2e-08,0.1,1.5,0.474239,0.244345,0.693775
18476,5e-07,0.1,2e-07,1.0,5e-08,0.2,1.2,0.470687,0.243512,0.694349
20377,5e-09,1.0,2e-07,0.1,5e-09,1.0,0.5,0.469488,0.261253,0.713722
18528,5e-07,0.1,5e-07,0.2,5e-08,0.1,1.5,0.468665,0.251177,0.696533
16252,1e-08,0.5,1e-07,1.0,1e-08,0.2,1.0,0.468403,0.244517,0.690862
18777,5e-07,0.2,5e-07,1.0,1e-08,0.2,1.2,0.466909,0.266964,0.704508
17529,2e-07,0.2,2e-07,1.0,5e-08,1.0,1.5,0.465167,0.237603,0.683743
20305,5e-09,1.0,1e-08,0.2,2e-07,1.0,2.0,0.465006,0.222668,0.68367


: 

## log denoise

In [33]:
metrics = ['R2 on test', 'MAE on test', 'Pearson on test']

# Hyperparameter columns shown next to the metrics, per denoise method.
denoise_hparams = {
    'co_teaching': ['l2_reg', 'w_reg', 'forget_rate', 'num_gradual'],
    'codis': ['l2_reg', 'w_reg', 'forget_rate', 'num_gradual', 'co_lambda'],
    'labelwave': ['l2_reg', 'w_reg', 'lw_k', 'monitor_on'],
    'eps_softmax': ['l2_reg', 'w_reg', 'eps', 'alp', 'beta', 'm'],
    'robust_dividemix': ['l2_reg', 'w_reg', 'p_threshold', 'alpha_mix', 'lambda_u', 'num_steps', 'perturb_step'],
}

# Rank the alpha == 0.5 runs of every (method, dataset) pair by test R2, best first.
# Only the last expression of a cell is rendered, so the rankings are kept in a dict
# instead of being computed and thrown away one by one.
denoise_ranked = {}
for method, columns in denoise_hparams.items():
    for dataset in ('hs', 'ufb', 'saferlhf'):
        runs = df.query(f"desc=='{method}' and data_name=='{dataset}' and alpha==0.5")
        denoise_ranked[method, dataset] = runs.sort_values(by=['R2 on test'], ascending=False)[columns + metrics]

# Change the key to inspect another (method, dataset) pair.
denoise_ranked['robust_dividemix', 'saferlhf']

Unnamed: 0,l2_reg,w_reg,p_threshold,alpha_mix,lambda_u,num_steps,perturb_step,R2 on test,MAE on test,Pearson on test
7714,1e-06,5.0,0.5,4.0,25.0,1.0,0.5,0.740458,0.137388,
7702,1e-06,2.0,0.5,4.0,25.0,1.0,0.5,0.738193,0.148762,
8129,1e-07,0.2,0.5,4.0,10.0,1.0,0.5,0.738132,0.136878,
8159,1e-07,2.0,0.5,4.0,25.0,1.0,0.5,0.734747,0.143497,
8135,1e-07,0.2,0.5,4.0,25.0,1.0,0.5,0.733863,0.13863,
7684,1e-06,1.0,0.5,4.0,10.0,1.0,0.5,0.733412,0.142963,
7672,1e-06,0.2,0.5,4.0,10.0,1.0,0.5,0.73274,0.141983,
7699,1e-06,2.0,0.5,0.5,25.0,1.0,0.5,0.73241,0.120542,
7708,1e-06,5.0,0.5,4.0,10.0,1.0,0.5,0.73208,0.147373,
7681,1e-06,1.0,0.5,0.5,10.0,1.0,0.5,0.731963,0.127577,


## log finetune

In [22]:
metrics = ['R2 on test', 'MAE on test', 'Pearson on test']
columns = ['l2_reg', 'w_reg', 'l2_prop', 'w_prop', 'l2_noise', 'w_noise', 'quant']

# Top 20 ome_ips runs on hs at alpha == 0.5, best test R2 first.
ome_ips_hs = df.query("desc=='ome_ips' and data_name=='hs' and alpha==0.5")
ome_ips_hs.sort_values(by=['R2 on test'], ascending=False)[columns + metrics].head(20)
# df.query("desc=='ome_ips' and data_name=='ufb' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics].head(20)
# df.query("desc=='ome_ips' and data_name=='saferlhf' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]



# columns = ['l2_reg', 'w_reg', 'l2_prop', 'w_prop', 'l2_noise', 'w_noise', 'l2_imp', 'w_imp', 'quant']
# df.query("desc=='ome_dr' and data_name=='hs' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='ome_dr' and data_name=='ufb' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='ome_dr' and data_name=='saferlhf' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]

Unnamed: 0,l2_reg,w_reg,l2_prop,w_prop,l2_noise,w_noise,quant,R2 on test,MAE on test,Pearson on test
20761,1e-05,1.0,1e-05,0.2,1e-05,1.0,0.97,-0.09002,0.379993,0.314067
20613,1e-05,0.2,1e-05,0.2,1e-05,1.0,0.97,-0.093654,0.378976,0.313849
20633,1e-05,0.2,1e-05,1.0,1e-06,0.2,0.85,-0.103258,0.376131,0.362135
20669,1e-05,0.2,1e-06,1.0,1e-06,0.2,0.85,-0.103258,0.376131,0.362135
20705,1e-05,0.2,1e-07,1.0,1e-06,0.2,0.85,-0.103258,0.376131,0.362135
22123,1e-07,0.2,1e-05,0.2,1e-06,1.0,0.7,-0.157012,0.500004,
21956,1e-07,0.1,1e-05,0.2,1e-07,0.1,0.97,-0.157012,0.500004,
21954,1e-07,0.1,1e-05,0.2,1e-06,1.0,0.97,-0.157012,0.500004,
21962,1e-07,0.1,1e-05,0.2,1e-08,0.1,0.97,-0.157012,0.500004,
22138,1e-07,0.2,1e-05,0.2,1e-08,0.1,0.97,-0.157012,0.500004,


## selection

In [16]:
# Candidate pool: the alpha == 0.5 runs of the current noise setting.
candidates = df.copy()
candidates = candidates[candidates.alpha.isin([0.5])]

# Hand-picked hyperparameter configuration per method and dataset (hs, ufb, saferlhf).
# Each query matches on exact values, so a configuration with no matching run simply
# contributes no rows.
selected_queries = {
    'naive': (
        "desc=='naive' and data_name=='hs' and l2_reg==1e-5 and w_reg==10.0",
        "desc=='naive' and data_name=='ufb' and l2_reg==1e-6 and w_reg==1.0",
        "desc=='naive' and data_name=='saferlhf' and l2_reg==1e-6 and w_reg==1.0",
    ),
    'ips': (
        "desc=='ips' and data_name=='hs' and l2_reg==1e-7 and w_reg==1.0 and l2_prop==1e-7 and w_prop==0.2",
        "desc=='ips' and data_name=='ufb' and l2_reg==1e-7 and w_reg==2.0 and l2_prop==1e-7 and w_prop==2.0",
        "desc=='ips' and data_name=='saferlhf' and l2_reg==1e-6 and w_reg==1.0 and l2_prop==1e-5 and w_prop==0.2",
    ),
    'mtips': (
        "desc=='mtips' and data_name=='hs' and l2_reg==1e-6 and w_reg==0.2 and w_prop==2.0",
        "desc=='mtips' and data_name=='ufb' and l2_reg==1e-6 and w_reg==2.0 and w_prop==0.2",
        "desc=='mtips' and data_name=='saferlhf' and l2_reg==1e-7 and w_reg==2.0 and w_prop==0.2",
    ),
    'cvib': (
        "desc=='cvib' and data_name=='hs' and l2_reg==1e-7 and w_reg==10.0 and w_info==0.5 and w_entropy==1.0",
        "desc=='cvib' and data_name=='ufb' and l2_reg==5e-6 and w_reg==8.0 and w_info==0.2 and w_entropy==0.2",
        "desc=='cvib' and data_name=='saferlhf' and l2_reg==1e-6 and w_reg==15.0 and w_info==0.1 and w_entropy==2.0",
    ),
    'dr': (
        "desc=='dr' and data_name=='hs' and l2_reg==1e-6 and w_reg==0.1 and l2_prop==1e-5 and w_prop==0.2 and l2_imp==1e-6 and w_imp==1.0",
        "desc=='dr' and data_name=='ufb' and l2_reg==1e-6 and w_reg==2.0 and l2_prop==1e-7 and w_prop==2.0 and l2_imp==1e-6 and w_imp==2.0",
        "desc=='dr' and data_name=='saferlhf' and l2_reg==1e-7 and w_reg==1.0 and l2_prop==1e-6 and w_prop==1.0 and l2_imp==1e-7 and w_imp==2.0",
    ),
    'mtdr': (
        "desc=='mtdr' and data_name=='hs' and l2_reg==1e-5 and w_reg==2.0 and w_prop==0.2 and w_imp==0.2",
        "desc=='mtdr' and data_name=='ufb' and l2_reg==5e-5 and w_reg==1.0 and w_prop==0.2 and w_imp==0.1",
        "desc=='mtdr' and data_name=='saferlhf' and l2_reg==1e-5 and w_reg==2.0 and w_prop==2.0 and w_imp==1.0",
    ),
    'sdr': (
        "desc=='sdr' and data_name=='hs' and l2_reg==1e-6 and w_reg==1.0 and l2_prop==1e-7 and w_prop==1.0 and l2_imp==1e-7 and w_imp==2.0 and eta==1.0",
        "desc=='sdr' and data_name=='ufb' and l2_reg==1e-7 and w_reg==2.0 and l2_prop==1e-7 and w_prop==0.2 and l2_imp==1e-7 and w_imp==2.0 and eta==1.0",
        "desc=='sdr' and data_name=='saferlhf' and l2_reg==1e-6 and w_reg==1.0 and l2_prop==1e-6 and w_prop==1.0 and l2_imp==1e-6 and w_imp==1.0 and eta==1.0",
    ),
    'co_teaching': (
        "desc=='co_teaching' and data_name=='hs' and l2_reg==1e-6 and w_reg==0.2 and forget_rate==0.2 and num_gradual==10",
        "desc=='co_teaching' and data_name=='ufb' and l2_reg==2e-6 and w_reg==2.0 and forget_rate==0.2 and num_gradual==10",
        "desc=='co_teaching' and data_name=='saferlhf' and l2_reg==1e-6 and w_reg==0.2 and forget_rate==0.2 and num_gradual==10",
    ),
    'codis': (
        "desc=='codis' and data_name=='hs' and l2_reg==5e-6 and w_reg==0.1 and forget_rate==0.2 and num_gradual==10 and co_lambda==0.1",
        "desc=='codis' and data_name=='ufb' and l2_reg==2e-6 and w_reg==15.0 and forget_rate==0.2 and num_gradual==10 and co_lambda==0.2",
        "desc=='codis' and data_name=='saferlhf' and l2_reg==1e-7 and w_reg==2.0 and forget_rate==0.2 and num_gradual==10 and co_lambda==0.1",
    ),
    'labelwave': (
        "desc=='labelwave' and data_name=='hs' and l2_reg==1e-5 and w_reg==1.0 and lw_k==10 and monitor_on=='loss'",
        "desc=='labelwave' and data_name=='ufb' and l2_reg==1e-5 and w_reg==0.2 and lw_k==5 and monitor_on=='acc'",
        "desc=='labelwave' and data_name=='saferlhf' and l2_reg==1e-7 and w_reg==0.2 and lw_k==3 and monitor_on=='loss'",
    ),
    'eps_softmax': (
        "desc=='eps_softmax' and data_name=='hs' and l2_reg==1e-5 and w_reg==1.0 and eps==1e-8 and alp==0.05 and beta==0.1 and m==1e2",
        "desc=='eps_softmax' and data_name=='ufb' and l2_reg==1e-5 and w_reg==1.0 and eps==1e-8 and alp==0.05 and beta==0.1 and m==1e3",
        "desc=='eps_softmax' and data_name=='saferlhf' and l2_reg==1e-5 and w_reg==0.5 and eps==1e-8 and alp==0.02 and beta==1.0 and m==1e2",
    ),
    'robust_dividemix': (
        "desc=='robust_dividemix' and data_name=='hs' and l2_reg==1e-6 and w_reg==1.0 and p_threshold==0.5 and alpha_mix==0.5 and lambda_u==25 and num_steps==1 and perturb_step==0.5",
        "desc=='robust_dividemix' and data_name=='ufb' and l2_reg==1e-7 and w_reg==0.2 and p_threshold==0.5 and alpha_mix==4 and lambda_u==10 and num_steps==1 and perturb_step==0.5",
        "desc=='robust_dividemix' and data_name=='saferlhf' and l2_reg==1e-6 and w_reg==1.0 and p_threshold==0.5 and alpha_mix==0.5 and lambda_u==25 and num_steps==1 and perturb_step==0.1",
    ),
}

# One frame per method (datasets stacked in hs, ufb, saferlhf order), in dict order.
selected_frames = [
    pd.concat([candidates.query(query) for query in method_queries], ignore_index=True)
    for method_queries in selected_queries.values()
]

# Methods without a hand-picked configuration are passed through unfiltered.
dfs_other = candidates.query("desc not in ['naive', 'ips', 'mtips', 'cvib', 'dr', 'mtdr', 'sdr', 'co_teaching', 'codis', 'labelwave', 'eps_softmax', 'robust_dividemix']")

dfs = pd.concat(selected_frames + [dfs_other], ignore_index=True)
# dfs = dfs.query("batch_size==32")


## analysis

In [17]:
min_mode = 'each'

df2 = dfs.copy()
if min_mode == 'each':
    # Keep, per (method, dataset, seed, alpha), the single run with the highest test R2.
    best_run_idx = df2.groupby(['desc', 'data_name', 'seed', 'alpha'])['R2 on test'].idxmax()
    df2 = df2.loc[best_run_idx]

# Move the columns worth checking to the front, then shorten the metric names.
check_columns = ['desc', 'data_name', 'seed', 'alpha', 'R2 on test', 'MAE on test', 'AUROC on test', 'lr', 'batch_size', 'hidden_dim', 'l2_reg', 'w_reg']
columns = check_columns + [c for c in df2.columns.tolist() if c not in check_columns]
df2 = df2[columns].rename(columns={'R2 on test': 'R2', 'MAE on test': 'MAE', 'AUROC on test': 'AUROC', 'Pearson on test': 'Pearson'})

# Replace the internal method identifiers with the names used in the tables.
df2 = df2.assign(desc=df2['desc'].replace({
    'naive': 'Naive', 'sdr': 'SDR', 'mtdr': 'MTDR', 'dr': 'DR', 'cvib': 'CVIB', 'mtips': 'MTIPS', 'ips': 'IPS', 
    'labelwave': 'LabelWave', 'eps_softmax': r'$\epsilon$-Softmax', 'robust_dividemix': 'Robust DivideMix', 'co_teaching': 'Co-Teaching', 'codis': 'CoDis',
    'ome_ips': 'OME-IPS', 'ome_dr': 'OME-DR'
}))

# Ordered categoricals fix the dataset / method order used by the sort and pivot below.
# Columns are replaced via `assign` (new frame) rather than set on a `query()` result.
dst_order = ['hs', 'ufb', 'saferlhf']
df2 = df2.query("data_name in @dst_order")
df2 = df2.assign(data_name=pd.Categorical(df2['data_name'], categories=dst_order, ordered=True))

model_order = ['Naive', 'IPS', 'MTIPS', 'CVIB', 'DR', 'MTDR', 'SDR', 'Co-Teaching', 'CoDis', 'LabelWave', r'$\epsilon$-Softmax', 'Robust DivideMix', 'OME-IPS', 'OME-DR']
df2 = df2.query("desc in @model_order")
df2 = df2.assign(desc=pd.Categorical(df2['desc'], categories=model_order, ordered=True))

# Average the numeric columns over seeds. `observed=False` spells out the behaviour the
# implicit default provided for categorical group keys, so a change of that default in
# pandas cannot silently alter the result.
df2_avg = df2.groupby(['desc', 'data_name', 'alpha'], observed=False).mean(numeric_only=True).reset_index()
df2_avg['seed'] = 'Avg'

only_avg = True
# only_avg = False
df2 = df2_avg if only_avg else pd.concat([df2, df2_avg]).reset_index(drop=True)
df2 = df2.sort_values(by=['desc', 'data_name', 'alpha'])

# df2.dropna(inplace=True, thresh=5)

# Pivot datasets into the columns: rows are (alpha, method), columns are (dataset, metric).
df2show = df2.set_index(['data_name', 'alpha', 'desc']).unstack('data_name').swaplevel(axis=1)
columns = [
    (dataset, metric)
    for dataset in df2show.columns.levels[0]
    for metric in ('R2', 'MAE', 'Pearson')
]
df2show = df2show[columns]


df2show.round(4)

  df2_avg = df2.groupby(['desc', 'data_name', 'alpha']).mean(numeric_only=True).reset_index()


Unnamed: 0_level_0,data_name,hs,hs,hs,ufb,ufb,ufb,saferlhf,saferlhf,saferlhf
Unnamed: 0_level_1,Unnamed: 1_level_1,R2,MAE,Pearson,R2,MAE,Pearson,R2,MAE,Pearson
alpha,desc,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
0.5,Naive,0.138,0.3076,0.4275,0.4408,0.2294,0.6659,0.5143,0.2588,0.7664
0.5,IPS,0.1625,0.2995,0.4504,0.4465,0.2498,0.6733,0.5399,0.2687,0.8236
0.5,MTIPS,0.1686,0.3272,0.4905,0.458,0.2737,0.7122,0.5644,0.2462,0.8233
0.5,CVIB,0.1701,0.3477,0.4206,0.4657,0.2485,0.6881,0.5849,0.273,0.8344
0.5,DR,0.1788,0.3475,0.4433,0.4743,0.2525,0.7003,0.629,0.2148,0.8348
0.5,MTDR,0.1857,0.3086,0.4553,0.4824,0.2509,0.7056,0.6559,0.1713,0.8252
0.5,SDR,0.1925,0.3181,,0.4529,0.2437,,0.6873,0.1826,0.8322
0.5,Co-Teaching,0.1441,0.3057,0.4286,0.4532,0.2443,0.6767,0.5328,0.2578,0.7893
0.5,CoDis,0.1555,0.3026,0.4384,0.4584,0.2387,0.6835,0.5472,0.2403,0.799
0.5,LabelWave,0.161,0.3286,0.4224,0.4649,0.2709,0.7139,0.5672,0.235,0.8017


## write to table

In [35]:
def _latex_row(model):
    """Build one LaTeX table row: the model name, then R2 / MAE / Pearson for each dataset."""
    pieces = [f"{model} "]
    for data_name in ['hs', 'ufb', 'saferlhf']:
        df_ = df2.query(f"desc=='{model}' and data_name=='{data_name}'")
        pieces.extend(f"& {df_[metric].mean():.4f} " for metric in ('R2', 'MAE', 'Pearson'))
    pieces.append(r"\\")
    return "".join(pieces)


# Debias block: group header, one row per method, dashed rule.
contents = [r"\multicolumn{10}{l}{\textit{\textbf{Debias-based Methods}}} \\"]
contents.extend(_latex_row(model) for model in ['Naive', 'IPS', 'MTIPS', 'CVIB', 'DR', 'MTDR', 'SDR'])
contents.append(r"\hdashline")

# Denoise block, same layout.
contents.append(r"\multicolumn{10}{l}{\textit{\textbf{Denoise-based Methods}}} \\")
contents.extend(_latex_row(model) for model in ['Co-Teaching', 'CoDis', 'LabelWave', r'$\epsilon$-Softmax', 'Robust DivideMix'])
contents.append(r"\hdashline")

# OME rows are each preceded by a row-colour command.
for model in ['OME-IPS', 'OME-DR']:
    contents.append(r"\rowcolor[HTML]{ecf0ff}")
    contents.append(_latex_row(model))

print("\n".join(contents))

\multicolumn{10}{l}{\textit{\textbf{Debias-based Methods}}} \\
Naive & 0.1380 & 0.3076 & 0.4275 & 0.4408 & 0.2294 & 0.6659 & 0.5143 & 0.2588 & 0.7664 \\
IPS & 0.1625 & 0.2995 & 0.4504 & 0.4465 & 0.2498 & 0.6733 & 0.5399 & 0.2687 & 0.8236 \\
MTIPS & 0.1686 & 0.3272 & 0.4905 & 0.4580 & 0.2737 & 0.7122 & 0.5644 & 0.2462 & 0.8233 \\
CVIB & 0.1701 & 0.3477 & 0.4206 & 0.4657 & 0.2485 & 0.6881 & 0.5849 & 0.2730 & 0.8344 \\
DR & 0.1788 & 0.3475 & 0.4433 & 0.4743 & 0.2525 & 0.7003 & 0.6290 & 0.2148 & 0.8348 \\
MTDR & 0.1857 & 0.3086 & 0.4553 & 0.4824 & 0.2509 & 0.7056 & 0.6559 & 0.1713 & 0.8252 \\
SDR & 0.1925 & 0.3181 & nan & 0.4529 & 0.2437 & nan & 0.6873 & 0.1826 & 0.8322 \\
\hdashline
\multicolumn{10}{l}{\textit{\textbf{Denoise-based Methods}}} \\
Co-Teaching & 0.1441 & 0.3057 & 0.4286 & 0.4532 & 0.2443 & 0.6767 & 0.5328 & 0.2578 & 0.7893 \\
CoDis & 0.1555 & 0.3026 & 0.4384 & 0.4584 & 0.2387 & 0.6835 & 0.5472 & 0.2403 & 0.7990 \\
LabelWave & 0.1610 & 0.3286 & 0.4224 & 0.4649 & 0.2709 & 0.71

# results of $\rho_{01}=0.2$ and $\rho_{10}=0.1$

In [7]:
# NOTE(review): the section heading above states rho_01=0.2 and rho_10=0.1, whereas this
# filter pins r10 to 0.2 and r01 to 0.1 -- confirm that the r01/r10 columns really map to
# the heading's subscripts in this swapped order.
noise_setting = "r10==0.2 and r01==0.1"
df = dfa.query(noise_setting)

## log debias

In [16]:
metrics = ['R2 on test', 'MAE on test', 'Pearson on test']
columns = ['l2_reg', 'w_reg']

# Naive runs on hs at alpha == 0.5, best test R2 first.
naive_hs = df.query("desc=='naive' and data_name=='hs' and alpha==0.5")
naive_hs.sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='naive' and data_name=='ufb' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='naive' and data_name=='saferlhf' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]


# columns = ['l2_reg', 'w_reg', 'l2_prop', 'w_prop']
# df.query("desc=='ips' and data_name=='hs' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='ips' and data_name=='ufb' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='ips' and data_name=='saferlhf' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]


# columns = ['l2_reg', 'w_reg', 'w_prop']
# df.query("desc=='mtips' and data_name=='hs' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='mtips' and data_name=='ufb' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='mtips' and data_name=='saferlhf' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]


# columns = ['l2_reg', 'w_reg', 'w_info', 'w_entropy']
# df.query("desc=='cvib' and data_name=='hs' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='cvib' and data_name=='ufb' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='cvib' and data_name=='saferlhf' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]


# columns = ['l2_reg', 'w_reg', 'l2_prop', 'w_prop', 'l2_imp', 'w_imp']
# df.query("desc=='dr' and data_name=='hs' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='dr' and data_name=='ufb' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='dr' and data_name=='saferlhf' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]


# columns = ['l2_reg', 'w_reg', 'w_prop', 'w_imp']
# df.query("desc=='mtdr' and data_name=='hs' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='mtdr' and data_name=='ufb' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='mtdr' and data_name=='saferlhf' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]


# columns = ['l2_reg', 'w_reg', 'l2_prop', 'w_prop', 'l2_imp', 'w_imp', 'eta']
# df.query("desc=='sdr' and data_name=='hs' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='sdr' and data_name=='ufb' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics].head(20)
# df.query("desc=='sdr' and data_name=='saferlhf' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]

Unnamed: 0,l2_reg,w_reg,R2 on test,MAE on test,Pearson on test
7247,1e-05,10.0,0.089505,0.327453,0.368353


## log denoise

In [None]:
metrics = ['R2 on test', 'MAE on test', 'Pearson on test']

# Hyperparameter columns shown next to the metrics, per denoise method.
denoise_hparams = {
    'co_teaching': ['l2_reg', 'w_reg', 'forget_rate', 'num_gradual'],
    'codis': ['l2_reg', 'w_reg', 'forget_rate', 'num_gradual', 'co_lambda'],
    'labelwave': ['l2_reg', 'w_reg', 'lw_k', 'monitor_on'],
    'eps_softmax': ['l2_reg', 'w_reg', 'eps', 'alp', 'beta', 'm'],
    'robust_dividemix': ['l2_reg', 'w_reg', 'p_threshold', 'alpha_mix', 'lambda_u', 'num_steps', 'perturb_step'],
}

# Rank the alpha == 0.5 runs of every (method, dataset) pair by test R2, best first.
# Only the last expression of a cell is rendered, so the rankings are kept in a dict
# instead of being computed and thrown away one by one.
denoise_ranked = {}
for method, columns in denoise_hparams.items():
    for dataset in ('hs', 'ufb', 'saferlhf'):
        runs = df.query(f"desc=='{method}' and data_name=='{dataset}' and alpha==0.5")
        denoise_ranked[method, dataset] = runs.sort_values(by=['R2 on test'], ascending=False)[columns + metrics]

# Change the key to inspect another (method, dataset) pair.
denoise_ranked['robust_dividemix', 'saferlhf']

Unnamed: 0,l2_reg,w_reg,p_threshold,alpha_mix,lambda_u,num_steps,perturb_step,R2 on test,MAE on test,Pearson on test
7574,1e-06,5.0,0.5,4.0,25.0,1.0,0.5,0.740458,0.137388,
7562,1e-06,2.0,0.5,4.0,25.0,1.0,0.5,0.738193,0.148762,
7989,1e-07,0.2,0.5,4.0,10.0,1.0,0.5,0.738132,0.136878,
8019,1e-07,2.0,0.5,4.0,25.0,1.0,0.5,0.734747,0.143497,
7995,1e-07,0.2,0.5,4.0,25.0,1.0,0.5,0.733863,0.13863,
7544,1e-06,1.0,0.5,4.0,10.0,1.0,0.5,0.733412,0.142963,
7532,1e-06,0.2,0.5,4.0,10.0,1.0,0.5,0.73274,0.141983,
7559,1e-06,2.0,0.5,0.5,25.0,1.0,0.5,0.73241,0.120542,
7568,1e-06,5.0,0.5,4.0,10.0,1.0,0.5,0.73208,0.147373,
7541,1e-06,1.0,0.5,0.5,10.0,1.0,0.5,0.731963,0.127577,


## log finetune

In [28]:
metrics = ['R2 on test', 'MAE on test', 'Pearson on test']

# Top-20 OME-IPS runs on 'hs' at alpha==0.5, ranked by test R2 (best first).
columns = ['l2_reg', 'w_reg', 'l2_prop', 'w_prop', 'l2_noise', 'w_noise', 'quant']
ome_ips_hs = df.query("desc=='ome_ips' and data_name=='hs' and alpha==0.5")
ome_ips_hs_ranked = ome_ips_hs.sort_values(by=['R2 on test'], ascending=False)
ome_ips_hs_ranked[columns + metrics].head(20)

# Alternative views, kept for manual toggling:
# df.query("desc=='ome_ips' and data_name=='ufb' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics].head(20)
# df.query("desc=='ome_ips' and data_name=='saferlhf' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]

# columns = ['l2_reg', 'w_reg', 'l2_prop', 'w_prop', 'l2_noise', 'w_noise', 'l2_imp', 'w_imp', 'quant']
# df.query("desc=='ome_dr' and data_name=='hs' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='ome_dr' and data_name=='ufb' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]
# df.query("desc=='ome_dr' and data_name=='saferlhf' and alpha==0.5").sort_values(by=['R2 on test'], ascending=False)[columns + metrics]

Unnamed: 0,l2_reg,w_reg,l2_prop,w_prop,l2_noise,w_noise,quant,R2 on test,MAE on test,Pearson on test
21431,0.0001,1.0,0.0001,0.1,1e-05,1.0,0.85,-0.021904,0.406406,0.116388
25147,5e-06,1.0,0.0001,0.1,0.0001,2.0,0.85,-0.023221,0.40396,0.406678
21517,0.0001,1.0,1e-05,0.1,0.0,0.1,0.7,-0.045319,0.394089,0.36867
22098,1e-05,1.0,1e-05,0.2,1e-05,1.0,0.97,-0.09002,0.379993,0.314067
21738,1e-05,0.2,1e-05,0.2,1e-05,1.0,0.97,-0.093654,0.378976,0.313849
21956,1e-05,1.0,0.0001,0.1,0.0,2.0,0.85,-0.096067,0.377865,0.3693
21830,1e-05,0.2,1e-07,1.0,1e-06,0.2,0.85,-0.103258,0.376131,0.362135
21794,1e-05,0.2,1e-06,1.0,1e-06,0.2,0.85,-0.103258,0.376131,0.362135
21758,1e-05,0.2,1e-05,1.0,1e-06,0.2,0.85,-0.103258,0.376131,0.362135
21319,0.0001,1.0,0.0,0.1,1e-05,0.1,0.7,-0.121857,0.371205,0.443474


## selection

In [8]:
import warnings


def select_runs(runs, desc, data_name, hparams):
    """Pick the rows of `runs` for one (method, dataset) pair at fixed hyper-parameters.

    Parameters
    ----------
    runs : DataFrame
        Frame to filter; must have `desc`, `data_name` and every column named in `hparams`.
    desc, data_name : str
        Values compared for equality against the `desc` / `data_name` columns.
    hparams : str
        `DataFrame.query` fragment with the chosen values, e.g. "l2_reg==1e-5 and w_reg==10.0".

    Returns
    -------
    DataFrame
        The matching rows (possibly empty).

    Warns
    -----
    UserWarning
        When nothing matches. An empty selection would otherwise go unnoticed here
        and only surface later as missing / NaN entries in the summary tables.
    """
    picked = runs.query(f"desc=='{desc}' and data_name=='{data_name}' and {hparams}")
    if picked.empty:
        warnings.warn(
            f"No runs match desc={desc!r}, data_name={data_name!r} with [{hparams}]; "
            "this method/dataset pair will be missing from downstream tables.",
            stacklevel=2,
        )
    return picked


# Work on a copy restricted to alpha == 0.5.
dfs = df.copy()
dfs = dfs[dfs.alpha.isin([0.5])]

# For each tuned method, keep only the runs with the hand-picked hyper-parameters per dataset.
dfs_naive_hs = select_runs(dfs, 'naive', 'hs', "l2_reg==1e-5 and w_reg==10.0")
dfs_naive_ufb = select_runs(dfs, 'naive', 'ufb', "l2_reg==1e-6 and w_reg==1.0")
dfs_naive_safe = select_runs(dfs, 'naive', 'saferlhf', "l2_reg==1e-6 and w_reg==1.0")
dfs_naive = pd.concat([dfs_naive_hs, dfs_naive_ufb, dfs_naive_safe], ignore_index=True)


dfs_ips_hs = select_runs(dfs, 'ips', 'hs', "l2_reg==1e-7 and w_reg==1.0 and l2_prop==1e-7 and w_prop==0.2")
dfs_ips_ufb = select_runs(dfs, 'ips', 'ufb', "l2_reg==1e-7 and w_reg==2.0 and l2_prop==1e-7 and w_prop==2.0")
dfs_ips_safe = select_runs(dfs, 'ips', 'saferlhf', "l2_reg==1e-6 and w_reg==1.0 and l2_prop==1e-5 and w_prop==0.2")
dfs_ips = pd.concat([dfs_ips_hs, dfs_ips_ufb, dfs_ips_safe], ignore_index=True)


dfs_mtips_hs = select_runs(dfs, 'mtips', 'hs', "l2_reg==1e-6 and w_reg==0.2 and w_prop==2.0")
dfs_mtips_ufb = select_runs(dfs, 'mtips', 'ufb', "l2_reg==1e-6 and w_reg==2.0 and w_prop==0.2")
dfs_mtips_safe = select_runs(dfs, 'mtips', 'saferlhf', "l2_reg==1e-7 and w_reg==2.0 and w_prop==0.2")
dfs_mtips = pd.concat([dfs_mtips_hs, dfs_mtips_ufb, dfs_mtips_safe], ignore_index=True)


dfs_cvib_hs = select_runs(dfs, 'cvib', 'hs', "l2_reg==1e-7 and w_reg==10.0 and w_info==0.5 and w_entropy==1.0")
dfs_cvib_ufb = select_runs(dfs, 'cvib', 'ufb', "l2_reg==5e-6 and w_reg==8.0 and w_info==0.2 and w_entropy==0.2")
dfs_cvib_safe = select_runs(dfs, 'cvib', 'saferlhf', "l2_reg==1e-6 and w_reg==15.0 and w_info==0.1 and w_entropy==2.0")
dfs_cvib = pd.concat([dfs_cvib_hs, dfs_cvib_ufb, dfs_cvib_safe], ignore_index=True)


dfs_dr_hs = select_runs(dfs, 'dr', 'hs', "l2_reg==1e-6 and w_reg==0.1 and l2_prop==1e-5 and w_prop==0.2 and l2_imp==1e-6 and w_imp==1.0")
dfs_dr_ufb = select_runs(dfs, 'dr', 'ufb', "l2_reg==1e-6 and w_reg==2.0 and l2_prop==1e-7 and w_prop==2.0 and l2_imp==1e-6 and w_imp==2.0")
dfs_dr_safe = select_runs(dfs, 'dr', 'saferlhf', "l2_reg==1e-7 and w_reg==1.0 and l2_prop==1e-6 and w_prop==1.0 and l2_imp==1e-7 and w_imp==2.0")
dfs_dr = pd.concat([dfs_dr_hs, dfs_dr_ufb, dfs_dr_safe], ignore_index=True)


dfs_mtdr_hs = select_runs(dfs, 'mtdr', 'hs', "l2_reg==1e-5 and w_reg==2.0 and w_prop==0.2 and w_imp==0.2")
dfs_mtdr_ufb = select_runs(dfs, 'mtdr', 'ufb', "l2_reg==5e-5 and w_reg==1.0 and w_prop==0.2 and w_imp==0.1")
dfs_mtdr_safe = select_runs(dfs, 'mtdr', 'saferlhf', "l2_reg==1e-5 and w_reg==2.0 and w_prop==2.0 and w_imp==1.0")
dfs_mtdr = pd.concat([dfs_mtdr_hs, dfs_mtdr_ufb, dfs_mtdr_safe], ignore_index=True)


dfs_sdr_hs = select_runs(dfs, 'sdr', 'hs', "l2_reg==1e-6 and w_reg==1.0 and l2_prop==1e-7 and w_prop==1.0 and l2_imp==1e-7 and w_imp==2.0 and eta==1.0")
dfs_sdr_ufb = select_runs(dfs, 'sdr', 'ufb', "l2_reg==1e-7 and w_reg==2.0 and l2_prop==1e-7 and w_prop==0.2 and l2_imp==1e-7 and w_imp==2.0 and eta==1.0")
dfs_sdr_safe = select_runs(dfs, 'sdr', 'saferlhf', "l2_reg==1e-6 and w_reg==1.0 and l2_prop==1e-6 and w_prop==1.0 and l2_imp==1e-6 and w_imp==1.0 and eta==1.0")
dfs_sdr = pd.concat([dfs_sdr_hs, dfs_sdr_ufb, dfs_sdr_safe], ignore_index=True)


dfs_cotea_hs = select_runs(dfs, 'co_teaching', 'hs', "l2_reg==1e-6 and w_reg==0.2 and forget_rate==0.2 and num_gradual==10")
dfs_cotea_ufb = select_runs(dfs, 'co_teaching', 'ufb', "l2_reg==2e-6 and w_reg==2.0 and forget_rate==0.2 and num_gradual==10")
dfs_cotea_safe = select_runs(dfs, 'co_teaching', 'saferlhf', "l2_reg==1e-6 and w_reg==0.2 and forget_rate==0.2 and num_gradual==10")
dfs_cotea = pd.concat([dfs_cotea_hs, dfs_cotea_ufb, dfs_cotea_safe], ignore_index=True)


dfs_codis_hs = select_runs(dfs, 'codis', 'hs', "l2_reg==5e-6 and w_reg==0.1 and forget_rate==0.2 and num_gradual==10 and co_lambda==0.1")
dfs_codis_ufb = select_runs(dfs, 'codis', 'ufb', "l2_reg==2e-6 and w_reg==15.0 and forget_rate==0.2 and num_gradual==10 and co_lambda==0.2")
dfs_codis_safe = select_runs(dfs, 'codis', 'saferlhf', "l2_reg==1e-7 and w_reg==2.0 and forget_rate==0.2 and num_gradual==10 and co_lambda==0.1")
dfs_codis = pd.concat([dfs_codis_hs, dfs_codis_ufb, dfs_codis_safe], ignore_index=True)


dfs_lw_hs = select_runs(dfs, 'labelwave', 'hs', "l2_reg==1e-5 and w_reg==1.0 and lw_k==10 and monitor_on=='loss'")
dfs_lw_ufb = select_runs(dfs, 'labelwave', 'ufb', "l2_reg==1e-5 and w_reg==0.2 and lw_k==5 and monitor_on=='acc'")
dfs_lw_safe = select_runs(dfs, 'labelwave', 'saferlhf', "l2_reg==1e-7 and w_reg==0.2 and lw_k==3 and monitor_on=='loss'")
dfs_lw = pd.concat([dfs_lw_hs, dfs_lw_ufb, dfs_lw_safe], ignore_index=True)


dfs_eps_hs = select_runs(dfs, 'eps_softmax', 'hs', "l2_reg==1e-5 and w_reg==1.0 and eps==1e-8 and alp==0.05 and beta==0.1 and m==1e2")
dfs_eps_ufb = select_runs(dfs, 'eps_softmax', 'ufb', "l2_reg==1e-5 and w_reg==1.0 and eps==1e-8 and alp==0.05 and beta==0.1 and m==1e3")
dfs_eps_safe = select_runs(dfs, 'eps_softmax', 'saferlhf', "l2_reg==1e-5 and w_reg==0.5 and eps==1e-8 and alp==0.02 and beta==1.0 and m==1e2")
dfs_eps = pd.concat([dfs_eps_hs, dfs_eps_ufb, dfs_eps_safe], ignore_index=True)


dfs_rob_hs = select_runs(dfs, 'robust_dividemix', 'hs', "l2_reg==1e-6 and w_reg==1.0 and p_threshold==0.5 and alpha_mix==0.5 and lambda_u==25 and num_steps==1 and perturb_step==0.5")
dfs_rob_ufb = select_runs(dfs, 'robust_dividemix', 'ufb', "l2_reg==1e-7 and w_reg==0.2 and p_threshold==0.5 and alpha_mix==4 and lambda_u==10 and num_steps==1 and perturb_step==0.5")
dfs_rob_safe = select_runs(dfs, 'robust_dividemix', 'saferlhf', "l2_reg==1e-6 and w_reg==1.0 and p_threshold==0.5 and alpha_mix==0.5 and lambda_u==25 and num_steps==1 and perturb_step==0.1")
dfs_rob = pd.concat([dfs_rob_hs, dfs_rob_ufb, dfs_rob_safe], ignore_index=True)


# Methods without a hand-picked configuration above pass through unfiltered.
dfs_other = dfs.query("desc not in ['naive', 'ips', 'mtips', 'cvib', 'dr', 'mtdr', 'sdr', 'co_teaching', 'codis', 'labelwave', 'eps_softmax', 'robust_dividemix']")
dfs = pd.concat([
    dfs_naive,
    dfs_ips,
    dfs_mtips,
    dfs_cvib,
    dfs_dr,
    dfs_mtdr,
    dfs_sdr,
    dfs_cotea,
    dfs_codis,
    dfs_lw,
    dfs_eps,
    dfs_rob,
    dfs_other
], ignore_index=True)
# dfs = dfs.query("batch_size==32")


## analysis

In [9]:
min_mode = 'each'

df2 = dfs.copy()
if min_mode == 'each':
    # Keep, for every (method, dataset, seed, alpha) group, the run with the highest test R2.
    best_r2_idx = df2.groupby(['desc', 'data_name', 'seed', 'alpha'])['R2 on test'].idxmax()
    df2 = df2.loc[best_r2_idx]

# Move the columns worth eyeballing to the front; every other column follows in its original order.
check_columns = ['desc', 'data_name', 'seed', 'alpha', 'R2 on test', 'MAE on test', 'AUROC on test', 'lr', 'batch_size', 'hidden_dim', 'l2_reg', 'w_reg']
columns = check_columns + [c for c in df2.columns.tolist() if c not in check_columns]

# Display order of datasets (table column groups) and methods (table rows).
dst_order = ['hs', 'ufb', 'saferlhf']
model_order = ['Naive', 'IPS', 'MTIPS', 'CVIB', 'DR', 'MTDR', 'SDR', 'Co-Teaching', 'CoDis', 'LabelWave', r'$\epsilon$-Softmax', 'Robust DivideMix', 'OME-IPS', 'OME-DR']

# Built as one chain of new frames: assigning columns onto the result of a
# slice / .query() is chained assignment and can raise SettingWithCopyWarning.
df2 = (
    df2[columns]
    .rename(columns={'R2 on test': 'R2', 'MAE on test': 'MAE', 'AUROC on test': 'AUROC', 'Pearson on test': 'Pearson'})
    .assign(desc=lambda d: d['desc'].replace({
        'naive': 'Naive', 'sdr': 'SDR', 'mtdr': 'MTDR', 'dr': 'DR', 'cvib': 'CVIB', 'mtips': 'MTIPS', 'ips': 'IPS',
        'labelwave': 'LabelWave', 'eps_softmax': r'$\epsilon$-Softmax', 'robust_dividemix': 'Robust DivideMix', 'co_teaching': 'Co-Teaching', 'codis': 'CoDis',
        'ome_ips': 'OME-IPS', 'ome_dr': 'OME-DR'
    }))
    .query("data_name in @dst_order")
    .assign(data_name=lambda d: pd.Categorical(d['data_name'], categories=dst_order, ordered=True))
    .query("desc in @model_order")
    .assign(desc=lambda d: pd.Categorical(d['desc'], categories=model_order, ordered=True))
)

# `desc` and `data_name` are categorical here. observed=False is passed explicitly so
# that unobserved (method, dataset) combinations keep appearing as NaN rows; relying
# on the implicit default emits a FutureWarning and would change the table once the
# pandas default flips to observed=True.
df2_avg = (
    df2.groupby(['desc', 'data_name', 'alpha'], observed=False)
    .mean(numeric_only=True)
    .reset_index()
)
df2_avg['seed'] = 'Avg'

only_avg = True
# only_avg = False
df2 = df2_avg if only_avg else pd.concat([df2, df2_avg]).reset_index(drop=True)
# Reassign instead of sorting in place: with only_avg=True, df2 is the same object as df2_avg.
df2 = df2.sort_values(by=['desc', 'data_name', 'alpha'])

# df2.dropna(inplace=True, thresh=5)

# Pivot datasets into the column axis: rows are (alpha, desc), columns are (data_name, metric).
df2show = df2.set_index(['data_name', 'alpha', 'desc']).unstack('data_name').swaplevel(axis=1)
# After swaplevel, column level 0 holds the dataset names; show three metrics per dataset.
columns = [
    (data_name, metric)
    for data_name in df2show.columns.levels[0]
    for metric in ('R2', 'MAE', 'Pearson')
]
df2show = df2show[columns]


df2show.round(4)

  df2_avg = df2.groupby(['desc', 'data_name', 'alpha']).mean(numeric_only=True).reset_index()


Unnamed: 0_level_0,data_name,hs,hs,hs,ufb,ufb,ufb,saferlhf,saferlhf,saferlhf
Unnamed: 0_level_1,Unnamed: 1_level_1,R2,MAE,Pearson,R2,MAE,Pearson,R2,MAE,Pearson
alpha,desc,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
0.5,Naive,0.0895,0.3275,0.3684,0.3597,0.3033,0.6158,0.6276,0.2149,0.799
0.5,IPS,0.081,0.3212,0.3723,0.2808,0.3165,0.6436,0.6727,0.201,0.8279
0.5,MTIPS,0.2376,0.3446,0.4917,0.436,0.2898,0.6976,0.6319,0.2574,0.8386
0.5,CVIB,-0.0203,0.3544,0.3478,0.3304,0.3104,0.6147,0.5885,0.2653,0.8288
0.5,DR,0.135,0.3605,0.4349,0.3309,0.295,0.5943,0.6901,0.2081,0.8455
0.5,MTDR,0.1371,0.3392,0.4228,0.3728,0.2964,0.6417,0.6097,0.1992,0.781
0.5,SDR,,,,,,,,,
0.5,Co-Teaching,0.127,0.3343,0.4055,0.4169,0.2911,0.68,0.616,0.2184,0.7992
0.5,CoDis,0.1009,0.3151,0.3923,0.3256,0.3185,0.672,0.6526,0.2192,0.8209
0.5,LabelWave,0.1877,0.3844,0.5276,0.4122,0.3098,0.7042,0.7083,0.2061,0.8619


## write to table

In [None]:
def latex_row(results, model, data_names=('hs', 'ufb', 'saferlhf')):
    """Format one LaTeX table row for `model`.

    The row is "<model> & R2 & MAE & Pearson ..." with one metric triple per
    entry of `data_names`, each value being the column mean over the matching
    rows of `results` printed with four decimals, terminated by a LaTeX row
    break. A (model, dataset) pair without rows prints as "nan".

    Rows are selected with boolean masks rather than an interpolated
    `DataFrame.query` string, so names containing backslashes or quotes
    (e.g. '$\\epsilon$-Softmax') are compared verbatim instead of being
    re-parsed as part of a Python expression.
    """
    parts = [f"{model} "]
    for data_name in data_names:
        subset = results[(results['desc'] == model) & (results['data_name'] == data_name)]
        parts.append(f"& {subset['R2'].mean():.4f} & {subset['MAE'].mean():.4f} & {subset['Pearson'].mean():.4f} ")
    return "".join(parts) + r"\\"


contents = []

contents.append(r"\multicolumn{10}{l}{\textit{\textbf{Debias-based Methods}}} \\")
contents.extend(latex_row(df2, model) for model in ['Naive', 'IPS', 'MTIPS', 'CVIB', 'DR', 'MTDR', 'SDR'])
contents.append(r"\hdashline")

contents.append(r"\multicolumn{10}{l}{\textit{\textbf{Denoise-based Methods}}} \\")
contents.extend(latex_row(df2, model) for model in ['Co-Teaching', 'CoDis', 'LabelWave', r'$\epsilon$-Softmax', 'Robust DivideMix'])
contents.append(r"\hdashline")

# The proposed methods get a highlighted background, emitted on the line before each row.
for model in ['OME-IPS', 'OME-DR']:
    contents.append(r"\rowcolor[HTML]{ecf0ff}")
    contents.append(latex_row(df2, model))

print("\n".join(contents))

\multicolumn{10}{l}{\textit{\textbf{Debias-based Methods}}} \\
Naive & 0.1380 & 0.3076 & 0.4275 & 0.4408 & 0.2294 & 0.6659 & 0.5143 & 0.2588 & 0.7664 \\
IPS & 0.1625 & 0.2995 & 0.4504 & 0.4465 & 0.2498 & 0.6733 & 0.5399 & 0.2687 & 0.8236 \\
MTIPS & 0.1686 & 0.3272 & 0.4905 & 0.4580 & 0.2737 & 0.7122 & 0.5644 & 0.2462 & 0.8233 \\
CVIB & 0.1701 & 0.3477 & 0.4206 & 0.4657 & 0.2485 & 0.6881 & 0.5849 & 0.2730 & 0.8344 \\
DR & 0.1788 & 0.3475 & 0.4433 & 0.4743 & 0.2525 & 0.7003 & 0.6290 & 0.2148 & 0.8348 \\
MTDR & 0.1857 & 0.3086 & 0.4553 & 0.4824 & 0.2509 & 0.7056 & 0.6559 & 0.1713 & 0.8252 \\
SDR & 0.1925 & 0.3181 & nan & 0.4529 & 0.2437 & nan & 0.6873 & 0.1826 & 0.8322 \\
\hdashline
\multicolumn{10}{l}{\textit{\textbf{Denoise-based Methods}}} \\
Co-Teaching & 0.1441 & 0.3057 & 0.4286 & 0.4532 & 0.2443 & 0.6767 & 0.5328 & 0.2578 & 0.7893 \\
CoDis & 0.1555 & 0.3026 & 0.4384 & 0.4584 & 0.2387 & 0.6835 & 0.5472 & 0.2403 & 0.7990 \\
LabelWave & 0.1610 & 0.3286 & 0.4224 & 0.4649 & 0.2709 & 0.71