In [1]:
import os
from rosemary import jpt_setup; jpt_setup()

import matplotlib.pyplot as plt


from transformers.trainer_utils import get_last_checkpoint

import re
from llm.submit import (
    multiline_to_singleline,
    submit_job_ccc,
    submit_job_aimos,
    submit_job,
        get_run_statistics)
import pandas as pd
pd.set_option('display.max_colwidth', None)

import numpy as np
import json
import tempfile
import subprocess
import shlex
import datetime
import itertools
import glob

import base64
def string_to_alphanumeric(s):
    """Encode the text `s` as URL-safe base64 text.

    `s` is encoded to UTF-8 bytes, base64-encoded with the URL-safe alphabet,
    and returned as a `str`. Despite the name, the result is not strictly
    alphanumeric: the URL-safe alphabet also uses `-`, `_` and `=` padding.
    """
    return base64.urlsafe_b64encode(s.encode('utf-8')).decode('utf-8')


def alphanumeric_to_string(a):
    """Inverse of `string_to_alphanumeric`: decode URL-safe base64 `a` back to text."""
    return base64.urlsafe_b64decode(a).decode('utf-8')

from llm.submit import shell_scripts_template_slurm, shell_scripts_template_lsf, get_host_info

# Query the host once; `arch` and `cluster` are reused by the cells below.
info = get_host_info()
arch = info['arch']
cluster = info['cluster']
print(info)

# Point TORCHELASTIC_ERROR_FILE at a file in the notebook's working directory.
os.environ['TORCHELASTIC_ERROR_FILE'] = os.path.join(os.getcwd(), 'torchelastic_error_file')

## Jobs submitted from this notebook inherit these environment variables.
cache_dir = '/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/cache'
os.environ.update({
    'WANDB_DIR': cache_dir,
    'WANDB_MODE': 'offline',
    'WANDB_PROJECT': 'mitibm',
})
os.makedirs(cache_dir, exist_ok=True)
##

# ppc64le hosts get the Slurm script template; every other arch gets the LSF one.
if arch == 'ppc64le':
    shell_scripts_template = shell_scripts_template_slurm
else:
    shell_scripts_template = shell_scripts_template_lsf

  warn(f'Install `torch` for functionalities dependent on torch')


{'arch': 'ppc64le', 'cluster': 'dcs'}


# DPO

In [8]:
from llm.submit import shell_scripts_template_slurm
debug = False
# Verbose torch / NCCL logging only while debugging.
os.environ['TORCH_CPP_LOG_LEVEL'] = 'INFO' if debug else 'WARNING'
os.environ['NCCL_DEBUG'] = 'INFO' if debug else ''

# CPU count / CPU memory request, keyed on the host architecture.
num_cpus, cpu_mem = (144, 650) if arch == 'ppc64le' else (32, 64)

preprocessing_num_workers = 32
report_to = 'wandb'
# x86_64 hosts use bf16 / bfloat16; everything else uses fp16 / float32.
mixed_precision, torch_dtype = ('bf16', 'bfloat16') if arch == 'x86_64' else ('fp16', 'float32')
gradient_checkpointing = True
use_fast_tokenizer = True
hf_models_dir = 'results/baselines/'
resume_from_checkpoint = None
num_train_epochs = 2
# (50_000 / 32) * 2 / 6 ~= 500 (data size of 50k, bsz=32, ep=2, total save 6 times at most)
checkpointing_steps = 500


#####
job_name = 'dpo1'

# Model to tune (exactly one active assignment; the others are alternatives).
# model_name_or_path = hf_models_dir+'huggyllama/llama-7b'; abbr_model_name = 'llama-7b'; max_seq_length = 2048
# model_name_or_path = hf_models_dir+'EleutherAI/pythia-410m-deduped'; max_seq_length = 2048; abbr_model_name = 'pythia-410m'
model_name_or_path = 'results/oi2/llama-7b_sharegptv2_ep=2'
max_seq_length = 2048
abbr_model_name = 'llama7b+sharegptv2ep2'

# Preference data and the short tag used when naming the output directory.
train_file = 'data/processed/ultrafeedback/ultrafeedback_data.jsonl'
abbr_train_file = 'ultrafeedback'

#####


# Set to bool(1) for a test run (adds the 'jpt_' output prefix further down).
test_run = bool(0)

# Scheduler request: node count, GPUs per node, GPU type, job duration.
nodes, num_gpus, gpu_type, job_duration = 5, 6, 'v100', 6
# nodes = 1; num_gpus = 2; gpu_type = 'v100'; job_duration = 6
# nodes = 2; num_gpus = 1; gpu_type = 'v100'; job_duration = 6; cpu_mem = 100; num_cpus = 32
# nodes = 1; num_gpus = 2; gpu_type = 'v100'; job_duration = 6; cpu_mem = 100; num_cpus = 32

#####

# Warn (without aborting) when the training file is missing.
if not os.path.isfile(train_file):
    print(f'train_file={train_file} does not exists')

# Test runs would report to tensorboard, but the unconditional override on the
# next line wins: with `report_to = None` no `--report_to` flag is emitted.
report_to = 'tensorboard' if test_run else report_to
report_to = None

use_deepspeed = True
deepspeed_config_file = 'ds_configs/stage3_no_offloading_accelerate.conf'

per_device_train_batch_size = 1; total_batch_size = 32
# `num_gpus` is per node, so the number of processes is num_gpus*nodes.
# round() returns 0 once the per-step batch exceeds ~2x `total_batch_size`;
# clamp to 1 so the command never carries `--gradient_accumulation_steps 0`.
gradient_accumulation_steps = max(
    1, round(total_batch_size/(num_gpus*nodes)/per_device_train_batch_size))
# May differ from `total_batch_size` when it is not divisible by the world size.
effective_batch_size = per_device_train_batch_size*nodes*num_gpus*gradient_accumulation_steps

print(f"Training {model_name_or_path} "
      f"using {num_gpus} GPUs, "
      f"{per_device_train_batch_size} batch size per GPU, "
      f"{gradient_accumulation_steps} gradient accumulation steps, "
      f"{effective_batch_size} effective batch size.")

# Build the `accelerate launch` prefix that is prepended to the training command.
# Every line of the f-string ends with a backslash-newline continuation, so
# `launcher` is a single line; options whose condition is false expand to ''.
# `--num_processes` is the total process count (GPUs per node * nodes).
# $master_addr, $master_port and $SLURM_PROCID are left for the shell to expand
# at job run time -- presumably provided by the job script template / Slurm; confirm.
# https://gist.github.com/pacman100/1cb1f17b2f1b3139a63b764263e70b25
# need to escape $SLURM_PROCID to make `srun` evaluate the variable for each task
# NOTE(review): $SLURM_PROCID is emitted unescaped below -- confirm the escaping
# mentioned above happens elsewhere (e.g. in the script template).
launcher = f"""accelerate launch \
    --mixed_precision {mixed_precision} \
    --num_machines {nodes} \
    --num_processes {num_gpus*nodes} \
    {'--use_deepspeed' if use_deepspeed else ''} \
    {'--deepspeed_config_file '+deepspeed_config_file if use_deepspeed else ''} \
    {'--main_process_ip $master_addr' if use_deepspeed else ''} \
    {'--main_process_port $master_port' if use_deepspeed else ''} \
    {'--machine_rank $SLURM_PROCID' if use_deepspeed else ''} \
    {'--rdzv_backend c10d' if use_deepspeed and nodes>1 else ''} \
    {'--deepspeed_multinode_launcher standard' if use_deepspeed and nodes>1 else ''} \
"""

#     {'--same_network' if use_deepspeed else ''} \
#     {'--deepspeed_hostfile $HOSTFILE_PATH' if use_deepspeed and nodes>1 else ''} \
# if use_deepspeed and nodes > 1:
#     launcher = f'HOSTFILE_PATH=$(create_hostfile {8 if arch=="x86_64" else 1}) ' + launcher

# Commands collected in this cell.
cmds = list()


# Directory name: <model>_<data>, plus '_ep=N' for the 'dpo1' job and a
# 'jpt_' prefix on test runs.
output_dirname = '{}_{}'.format(abbr_model_name, abbr_train_file)
if job_name in ('dpo1',):
    output_dirname = f'{output_dirname}_ep={num_train_epochs}'
output_dirname = ('jpt_' if test_run else '') + output_dirname

# Only the per-job parent directory is created here, not `output_dir` itself.
os.makedirs(os.path.join('results', job_name), exist_ok=True)
output_dir = os.path.join('results', job_name, output_dirname)
wandb_run_name = output_dir.replace('results/', '')


# Assemble the full training command: `launcher` followed by
# open_instruct/dpo_tune.py and its flags. Optional flags expand to '' when
# their condition is false. On test runs the command is prefixed with
# `cd .. && CUDA_VISIBLE_DEVICES=<current value>`; the environment variable is
# only read in that case, so it does not need to be set otherwise.
# The text is collapsed with `multiline_to_singleline` (imported from
# llm.submit) and appended to `cmds`.
cmd = f"""
{f'cd .. && CUDA_VISIBLE_DEVICES={os.environ["CUDA_VISIBLE_DEVICES"]} ' if test_run else ''}{launcher}
    open_instruct/dpo_tune.py \
    --model_name_or_path {model_name_or_path} \
    --tokenizer_name {model_name_or_path} \
    {'--use_slow_tokenizer' if not  use_fast_tokenizer else ''} \
    {'--gradient_checkpointing' if gradient_checkpointing  else ''} \
    --train_file {train_file} \
    --max_seq_length {max_seq_length} \
    --preprocessing_num_workers {preprocessing_num_workers} \
    --per_device_train_batch_size {per_device_train_batch_size} \
    --gradient_accumulation_steps {gradient_accumulation_steps} \
    --learning_rate 5e-7 \
    --lr_scheduler_type linear \
    --warmup_ratio 0.1 \
    --weight_decay 0. \
    --num_train_epochs {num_train_epochs} \
    --with_tracking \
    {'--report_to "'+str(report_to)+'"' if report_to else ''} \
    --checkpointing_steps {checkpointing_steps} \
    {'--resume_from_checkpoint '+str(resume_from_checkpoint) if resume_from_checkpoint else ''} \
    {'--low_cpu_mem_usage' if not use_deepspeed else ''} \
    --logging_steps 1 \
    --output_dir {output_dir}
"""
# if test_run:
#     print('\n'+' \\\n\t'.join([x.strip() for x in re.split(r'\s{3,}', cmd)]))
    
cmd = multiline_to_singleline(cmd)
cmds.append(cmd)

# On test runs, echo the command so it can be run by hand.
if test_run:
    print(cmd)

# Fill the job script template: the job's cwd is the parent of the notebook
# directory, and its log dir is the notebook directory itself.
shell_scripts = shell_scripts_template.format(**{
    'conda_env': 'open-instruct',
    'cwd': os.path.dirname(os.getcwd()),
    'cmd': cmd,
    'log_dir': os.getcwd(),
    'save_dir': output_dir,
})

# Hand the script to the scheduler wrapper together with the resource request.
out = submit_job(
    shell_scripts,
    job_name=job_name, nodes=nodes, num_cpus=num_cpus, cpu_mem=cpu_mem,
    num_gpus=num_gpus, gpu_type=gpu_type, test_run=test_run,
    job_duration=job_duration,
)

# Show the submission result only for real (non-test) submissions.
if not test_run:
    print(out)

Training results/oi2/llama-7b_sharegptv2_ep=2 using 6 GPUs, 1 batch size per GPU, 1 gradient accumulation steps, 30 effective batch size.

Submiting job with:
{
    "job_name": "dpo1",
    "nodes": 5,
    "num_cpus": 144,
    "cpu_mem": 650,
    "num_gpus": 6,
    "gpu_type": "v100",
    "test_run": false,
    "queue": "el8",
    "num_jobs": 1
}
[{'args': 'sbatch --job-name=dpo1 --partition=el8 --nodes=5 --ntasks-per-node=1 --cpus-per-task=144 --mem=650GB --gres=gpu:6 --output=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/open-instruct/scripts/%J.out --time=6:00:00 /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/open-instruct/scripts/tmp1jfsv5t3', 'job_id': 1339624}]


In [None]:
# Show the current value of `cmd` (the single-line launch command).
print(cmd)

In [90]:
# Build the whole script text BEFORE opening the file: open(..., 'w') truncates
# gen_cmds_dpo.sh immediately, so a failure while composing the content (e.g.
# KeyError when CUDA_VISIBLE_DEVICES is unset) would leave an empty script behind.
devices = os.environ['CUDA_VISIBLE_DEVICES'].split(',')[-1]  # last visible device only
# `set -e` aborts on the first failing command, `set -x` echoes each command.
s = 'set -e\nset -x\n'
# NOTE: `VAR=value cmd` scopes the variable to the first simple command only,
# so for a command of the form `cd .. && ...` the prefix applies to `cd` alone.
s += '\n\n'.join([f"CUDA_VISIBLE_DEVICES={devices} "+x for x in cmds])
with open('gen_cmds_dpo.sh', 'w') as f:
    f.write(s)

In [42]:
!bash gen_cmds_dpo.sh

+ CUDA_VISIBLE_DEVICES=5
+ cd ..
+ CUDA_VISIBLE_DEVICES=2,5
+ accelerate launch --mixed_precision fp16 --num_machines 1 --num_processes 2 --use_deepspeed --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate.conf open_instruct/dpo_tune.py --model_name_or_path results/baselines/huggyllama/llama-7b --tokenizer_name results/baselines/huggyllama/llama-7b --gradient_checkpointing --train_file data/processed/ultrafeedback/ultrafeedback_data.jsonl --max_seq_length 2048 --preprocessing_num_workers 32 --per_device_train_batch_size 1 --gradient_accumulation_steps 16 --learning_rate 5e-7 --lr_scheduler_type linear --warmup_ratio 0.1 --weight_decay 0. --num_train_epochs 2 --with_tracking --report_to tensorboard --checkpointing_steps 500 --logging_steps 1 --output_dir results/dpo1/jpt_llama-7b_ultrafeedback
[2024-01-08 20:41:08,547] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
The following values were not passed to `accelerate launch` a

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32001. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
01/08/2024 20:42:20 - INFO - accelerate.accelerator - Updating DeepSpeed's gradient accumulation steps to 16 from 1.
[2024-01-08 20:42:20,382] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.10.1+23a11a39, git-hash=23a11a39, git-branch=master
01/08/2024 20:42:20 - INFO - torch.distributed.distributed_c10d - Added key: store_based_barrier_key:2 to store for rank: 0
01/08/2024 20:42:20 - INFO - torch.distributed.distributed_c10d - Added key: store_based_barrier_key:2 to store for rank: 1
01/08/2024 20:42:20 - INFO - torch.distri

ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 409110) of binary: /gpfs/u/scratch/PTFM/PTFMqngp/miniconda3/envs/open-instruct/bin/python3.10
Traceback (most recent call last):
  File "/gpfs/u/scratch/PTFM/PTFMqngp/miniconda3/envs/open-instruct/bin/accelerate", line 8, in <module>
    sys.exit(main())
  File "/gpfs/u/scratch/PTFM/PTFMqngp/miniconda3/envs/open-instruct/lib/python3.10/site-packages/accelerate/commands/accelerate_cli.py", line 45, in main
    args.func(args)
  File "/gpfs/u/scratch/PTFM/PTFMqngp/miniconda3/envs/open-instruct/lib/python3.10/site-packages/accelerate/commands/launch.py", line 964, in launch_command
    deepspeed_launcher(args)
  File "/gpfs/u/scratch/PTFM/PTFMqngp/miniconda3/envs/open-instruct/lib/python3.10/site-packages/accelerate/commands/launch.py", line 687, in deepspeed_launcher
    distrib_run.run(args)
  File "/gpfs/u/scratch/PTFM/PTFMqngp/miniconda3/envs/open-instruct/lib/python3.10/site-package

# Finetuning with open_instruct/finetune_trainer.py


In [12]:
add_hardwarespec_to_dirname = False
# CPU count / CPU memory request, keyed on the host architecture.
num_cpus, cpu_mem = (144, 650) if arch == 'ppc64le' else (32, 64)

# Checkpointing cadence.
save_strategy, save_steps, save_total_limit = 'steps', 100, 1
preprocessing_num_workers = 32
# Evaluation: 'no' # set do_eval=False
evaluation_strategy = 'no'
eval_steps = save_steps
report_to = 'tensorboard wandb'
suffix = None
# Learning-rate schedule.
lr_scheduler_type, warmup_ratio = 'linear', 0.03
dataloader_sampler = None
hf_models_dir = 'results/baselines/'
# Defaults for the option grid; the sections below may override them.
subsample_inds_file_list = [None]
max_train_samples_list = [None]
num_train_epochs_list = [1]


############
# oi5


# ###### 150k(ep=3) baselines on various baselines
# model_name_or_path = hf_models_dir+'huggyllama/llama-7b'; abbr_model_name = 'llama-7b'; max_seq_length = 2048
# M = 150_000; dataset = 'tulu_v1_mix'; train_file = 'data/processed/tulu/tulu_v1_mix.jsonl'; abbr_train_file = 'tuluv1m'; pacing_fn_list = [f'prune_size={M}_ep=3']
# # M = 50_000; dataset = 'tulu_v1_mix'; train_file = 'data/processed/tulu/tulu_v1_mix.jsonl'; abbr_train_file = 'tuluv1m'; pacing_fn_list = [f'prune_size={M}_ep=5'] 


# # M = 150_000; dataset = 'wizardlm'; train_file = 'data/processed/wizardlm/wizardlm_data.jsonl'; abbr_train_file = 'wizardlm'
# # M = 150_000; dataset = 'sharegpt'; train_file = 'data/processed/sharegpt/sharegpt_data.jsonl'; abbr_train_file = 'sharegpt'
# # M = 150_000; dataset = 'ultrachat200k'; train_file = 'data/processed/ultrachat/ultrachat200k_train_data.jsonl'; abbr_train_file = 'ultrachat200k'

# scoring_fn_list = []
# # scoring_fn_list += ['random_s=0', 'random_s=1']
# # scoring_fn_list += ['dppmap_k=vmf_gamma=0.0315_kmd=mpnet']
# # scoring_fn_list += ['dppmap_k=rbf_gamma=auto10000_kmd=llama7b_kemb=text+embedding']
# scoring_fn_list = ['log_pmi_neg', 'ifd_neg']
# # pacing_fn_list = [
# # #     f'prune_size={M}_ep=3',
# #     f'singlestep_size={M}_startingfrac=0.05',
# # #     f'singlestep_size={M}_startingfrac=0.1',
# # #     f'singlestep_size={M}_startingfrac=0.2',
# #     f'fep_size={M}_nsteps=5_startingfrac=0.05_inc=1.5',
# # ]
# scoring_fn_and_pacing_fn = list(itertools.product(scoring_fn_list, pacing_fn_list))
# ######

# ##### codellama on starcoder. 5k subset ep=5
# model_name_or_path = hf_models_dir+'codellama/CodeLlama-7b-hf'; abbr_model_name = 'codellama-7b'; max_seq_length = 2048
# # M = 25_000; dataset='starcoder_commentinstr'; train_file = 'data/processed/starcoder/starcoder_commentinstr.jsonl'; abbr_train_file = 'starcodercmtinstr'; pacing_fn_list = [f'prune_size={M}_ep=5']; subset_size = 5_000
# # M = 25_000; dataset='starcoder_commentinstr_cleaned'; train_file = 'data/processed/starcoder/starcoder_commentinstr_cleaned.jsonl'; abbr_train_file = 'starcodercmtinstrcleaned'; pacing_fn_list = [f'prune_size={M}_ep=5'] ; subset_size = 5_000
# # M = 50_000; dataset='starcoder_commentinstrv2'; train_file = 'data/processed/starcoder/starcoder_commentinstrv2.jsonl'; abbr_train_file = 'starcodercmtinstrv2'; pacing_fn_list = [f'prune_size={M}_ep=5'] ; subset_size = 10_000
# M = 50_000; dataset='starcoder_commentinstrv5'; train_file = 'data/processed/starcoder/starcoder_commentinstrv5.jsonl'; abbr_train_file = 'starcodercmtinstrv5'; pacing_fn_list = [f'prune_size={M}_ep=5'] ; subset_size = 10_000
# # M = 60_000; dataset='starcoder_commentinstrv5'; train_file = 'data/processed/starcoder/starcoder_commentinstrv5.jsonl'; abbr_train_file = 'starcodercmtinstrv5'; pacing_fn_list = [f'prune_size={M}_ep=3'] ; subset_size = 20_000

# scoring_fn_list = []
# scoring_fn_list += ['log_prob_neg', 'el2n_agg=mean', 'grad_loraB_l2n', 'ifd_neg', 'log_pmi_neg']

# # scoring_fn_list += ['log_pmi_neg']
# # scoring_fn_list += ['random_s=0', 'random_s=1']
# # scoring_fn_list += ['dedup_md=mpnet_emb=text+embedding']

# # scoring_fn_list += [
# #     f'dppmap_k=vmf_gamma=auto{subset_size}_kmd=mpnet',
# #     f'dppmap_k=rbf_gamma=auto{subset_size}_kmd=llama7b_kemb=text+embedding',
# #     f'dppmap_k=vmf_gamma=auto{subset_size}_kmd=codellama7b_kemb=grad+rp+loraB',
# # ]

# scoring_fn_and_pacing_fn = list(itertools.product(scoring_fn_list, pacing_fn_list))
# #####

# ##### 
# model_name_or_path = hf_models_dir+'huggyllama/llama-7b'; abbr_model_name = 'llama-7b'; max_seq_length = 2048
# # M = 150_000; dataset = 'tulu_v1_mix'; train_file = 'data/processed/tulu/tulu_v1_mix.jsonl'; abbr_train_file = 'tuluv1m'; pacing_fn_list = [f'prune_size={M}_ep=3']
# # M = 200_000; dataset = 'wizardlm'; train_file = 'data/processed/wizardlm/wizardlm_data.jsonl'; abbr_train_file = 'wizardlm'; pacing_fn_list = [f'prune_size={M}_ep=2']
# # M = 100_000; dataset = 'wizardlm'; train_file = 'data/processed/wizardlm/wizardlm_data.jsonl'; abbr_train_file = 'wizardlm'; pacing_fn_list = [f'prune_size={M}_ep=2'] # 6hr gpu ok.
# M = 50_000; dataset = 'wizardlm'; train_file = 'data/processed/wizardlm/wizardlm_data.jsonl'; abbr_train_file = 'wizardlm'; pacing_fn_list = [f'prune_size={M}_ep=5'] # 6hr gpu ok.
# # M = 10_000; dataset = 'wizardlm'; train_file = 'data/processed/wizardlm/wizardlm_data.jsonl'; abbr_train_file = 'wizardlm'; pacing_fn_list = [f'prune_size={M}_ep=10'] # 6hr gpu ok.


# scoring_fn_list = []
# # scoring_fn_list += ['log_prob_neg', 'el2n_agg=mean', 'grad_loraB_l2n'] # 'numtoks_input_neg'
# # scoring_fn_list += ['ifd', 'ifd_neg', 'log_pmi', 'log_pmi_neg']
# # scoring_fn_list += ['dedup_md=mpnet_emb=text+embedding']
# # scoring_fn_list += ['semdedup_cl=kmeansfaisscd_md=mpnet_dist=cd_emb=text+embedding_nc=200',
# # #                     'semdedup_cl=kmeansfaisscd_md=bge_dist=cd_emb=text+embedding_nc=200'
# #                    ]
# # scoring_fn_list = ['semdedup_cl=kmeansfaisscd_md=mpnet_dist=cd_emb=text+embedding_nc=200',
# #                    'semdedup_cl=kmeansfaisscd_md=bge_dist=cd_emb=text+embedding_nc=200',
# #                    'semdedup_cl=kmeansfaisscd_md=llama7b_dist=cd_emb=text+embedding_nc=200',
# #                    'semdedup_cl=kmeansfaisscd_md=llama7b_dist=cd_emb=grad+rp+loraB_nc=200',]
# # scoring_fn_list += ['random_s=0', 'random_s=1']
# scoring_fn_list = [ 
#     # ->1k
# #     'dppmap:k=lin:kmd=mpnet',
# #     'dppmap_k=vmf_gamma=auto1000_kmd=bge'
# #     'dppmap_k=vmf_gamma=0.008_kmd=mpnet',
# #     'dppmap_theta=0.5_k=vmf_gamma=0.01_kmd=mpnet_q=log+pmi_qmd=llama7b',
# #     'dppmap_theta=0.3_k=vmf_gamma=0.01_kmd=mpnet_q=log+pmi_qmd=llama7b',
# #     'dppmap_k=rbf_gamma=0.0000007_kmd=llama7b_kemb=grad+rp+loraB',
# #     'dppmap_k=rbf_gamma=3.5e-8_kmd=llama7b_kemb=text+embedding',
# #     'dppmap_k=vmf_gamma=auto1000_kmd=llama7b_kemb=text+embedding',
# #     'dppmap_k=vmf_gamma=auto1000_kmd=llama7b_kemb=grad+rp+loraB',
#     # ->10k
# #     'dppmap_k=vmf_gamma=auto10000_kmd=bge'
# #     'dppmap_theta=0.5_k=vmf_gamma=0.043_kmd=mpnet_q=log+pmi_qmd=llama7b',
# #     'dppmap_theta=0.7_k=vmf_gamma=0.043_kmd=mpnet_q=log+pmi_qmd=llama7b',
# #     'dppmap_theta=0.3_k=vmf_gamma=0.04_kmd=mpnet_q=ifd_qmd=llama7b',
# #     'dppmap_theta=0.3_k=vmf_gamma=0.043_kmd=mpnet_q=ifd+neg_qmd=llama7b',
# #     'dppmap_k=rbf_gamma=7.5e-06_kmd=llama7b_kemb=text+embedding',
# #     'dppmap_k=rbf_gamma=0.001_kmd=llama7b_kemb=grad+rp+loraB',
# #     'dppmap_k=vmf_gamma=auto10000_kmd=llama7b_kemb=text+embedding',
# #     'dppmap_k=vmf_gamma=auto10000_kmd=llama7b_kemb=grad+rp+loraB',
#     'dppmap_theta=0.1_k=rbf_gamma=auto10000_kmd=llama7b_kemb=text+embedding_q=log+pmi_qmd=llama7b',
#     'dppmap_theta=0.05_k=rbf_gamma=auto10000_kmd=llama7b_kemb=text+embedding_q=log+pmi_qmd=llama7b',
# #     'dppmap_theta=0.3_k=rbf_gamma=auto10000_kmd=llama7b_kemb=text+embedding_q=log+pmi_qmd=llama7b',
# #     'dppmap_theta=0.5_k=rbf_gamma=auto10000_kmd=llama7b_kemb=text+embedding_q=log+pmi_qmd=llama7b',
# ]
# scoring_fn_and_pacing_fn = list(itertools.product(scoring_fn_list, pacing_fn_list))
# ####

##### sharegptv2, open_orca_slim
# model_name_or_path = hf_models_dir+'NousResearch/Llama-2-7b-hf'; abbr_model_name = 'llama2-7b'; max_seq_length = 2048 # 4096
# model_name_or_path = hf_models_dir+'NousResearch/Llama-2-13b-hf'; abbr_model_name = 'llama2-13b'; max_seq_length = 2048 # 4096
# Base model.
model_name_or_path = hf_models_dir + 'huggyllama/llama-7b'
abbr_model_name = 'llama-7b'
max_seq_length = 2048

# Dataset: exactly one active choice, the commented lines are alternatives.
# dataset = 'wizardlm'; train_file = 'data/processed/wizardlm/wizardlm_data.jsonl'; abbr_train_file = 'wizardlm';
# dataset = 'wizardlmv2'; train_file = 'data/processed/wizardlm/wizardlmv2_data.jsonl'; abbr_train_file = 'wizardlmv2';
# dataset = 'dolly'; train_file = 'data/processed/dolly/dolly_data.jsonl'; abbr_train_file = 'dolly';
# dataset = 'flan_v2'; train_file = 'data/processed/flan_v2/flan_v2_data.jsonl'; abbr_train_file = 'flan_v2';
# dataset = 'oasst1'; train_file = 'data/processed/oasst1/oasst1_data.jsonl'; abbr_train_file = 'oasst1'; 
# dataset = 'stanford_alpaca'; train_file = 'data/processed/stanford_alpaca/stanford_alpaca_data.jsonl'; abbr_train_file = 'stanford_alpaca';
# dataset = 'ultrachat200kv2'; train_file = 'data/processed/ultrachat/ultrachat200kv2_train_data.jsonl'; abbr_train_file = 'ultrachat200kv2';
# dataset = 'open_orca_slim'; train_file = 'data/processed/open_orca/open_orca_slim_data.jsonl'; abbr_train_file = 'openorcaslim'; 
# dataset = 'tulu_v2'; train_file = 'data/processed/tulu_v2/tulu_v2_data.jsonl'; abbr_train_file = 'tulu_v2';
dataset = 'sharegptv2'
train_file = 'data/processed/sharegpt/sharegptv2_data.jsonl'
abbr_train_file = 'sharegptv2'

# `M` and the epoch count go into the pacing-function name; `subset_size`
# feeds the `gamma=auto{subset_size}` scoring-function names below.
M, subset_size = 80_000, 40_000
pacing_fn_list = [f'prune_size={M}_ep=2']
# M = 60_000; pacing_fn_list = [f'prune_size={M}_ep=3']; subset_size = 20_000
# M = 50_000; pacing_fn_list = [f'prune_size={M}_ep=5']; subset_size = 10_000
# M = 20_000; pacing_fn_list = [f'prune_size={M}_ep=4']; subset_size = 5_000
# M = 10_000; pacing_fn_list = [f'prune_size={M}_ep=10']; subset_size = 1_000

# Scoring functions to run; groups are toggled by (un)commenting.
scoring_fn_list = ['random_s=0'] # , 'random_s=1'
# scoring_fn_list += ['random_s=1']
# scoring_fn_list += ['log_prob_neg', 'el2n_agg=mean', 'grad_loraB_l2n', 'ifd_neg', 'log_pmi_neg']

# scoring_fn_list += [
#     f'dppmap_k=vmf_gamma=auto{subset_size}_kmd=mpnet',
#     f'dppmap_k=rbf_gamma=auto{subset_size}_kmd=llama7b_kemb=text+embedding',
#     f'dppmap_k=vmf_gamma=auto{subset_size}_kmd=llama7b_kemb=grad+rp+loraB',
# #     f'dppmap_k=vmf_gamma=auto{subset_size}_kmd=llama7b_kemb=text+embedding',
# #     f'dppmap_k=rbf_gamma=auto{subset_size}_kmd=llama7b_kemb=grad+rp+loraB',
# ]
# subset_size = 10_000

# autotune gamma (currently disabled):
#     f'dppmap_k=vmf_gamma=auto{subset_size}_kmd=mpnet',
#     f'dppmap_k=rbf_gamma=auto{subset_size}_kmd=llama7br512p4096_kemb=text+embedding',
#     f'dppmap_k=vmf_gamma=auto{subset_size}_kmd=llama7br512p4096_kemb=grad+rp+loraB',

# vary kernel embedding model (currently disabled):
#     f'dppmap_k=vmf_gamma=auto{subset_size}_kmd={kmd}_kemb=grad+rp+loraB'
#     for kmd in ['llama7br256p4096', 'llama7br512p4096', 'pythia1br512p4096']

# use a sufficiently large gamma
scoring_fn_list.extend([
    'dppmap_k=vmf_gamma=1_kmd=llama7br512p4096_kemb=grad+rp+loraB',
    'dppmap_k=rbf_gamma=1e-3_kmd=llama7br512p4096_kemb=text+embedding',
])

# Every (scoring_fn, pacing_fn) pair, scoring function varying slowest.
scoring_fn_and_pacing_fn = [
    (scoring_fn_name, pacing_fn_name)
    for scoring_fn_name in scoring_fn_list
    for pacing_fn_name in pacing_fn_list
]
##### 

# ### mistral-7b on ultrachat15/200k
# model_name_or_path = hf_models_dir+'mistralai/Mistral-7B-v0.1'; abbr_model_name = 'mistral-7b'; max_seq_length = 2048
# # M =  50_000; dataset = 'ultrachat200k'; train_file = 'data/processed/ultrachat/ultrachat200k_train_data.jsonl'; abbr_train_file = 'ultrachat200k'
# M = 100_000; dataset = 'ultrachat15';   train_file = 'data/processed/ultrachat/ultrachat15_data.jsonl'; abbr_train_file = 'ultrachat15'; preprocessing_num_workers = 64
# # M = 400_000; dataset = 'ultrachat15';   train_file = 'data/processed/ultrachat/ultrachat15_data.jsonl'; abbr_train_file = 'ultrachat15'; preprocessing_num_workers = 64

# # scoring_fn_list = ['log_prob_neg', 'el2n_agg=mean', 'grad_loraB_l2n', 'el2n_agg=mean']
# # scoring_fn_list = ['numtoks_input_neg']
# # scoring_fn_list = ['log_prob_neg', 'el2n_agg=mean', 'logit_margin_neg', 'grad_loraB_l2n',] #  'kmeansl2_emb=text+embedding_nc=3000_incr'
# # scoring_fn_list = ['rhov1_log_prob', 'rhov1_log_prob_neg']
# # scoring_fn_list = ['numtoks_input_neg', 'numtoks_output_neg', 'numtoks_total_neg']
# # scoring_fn_list = [
# #     'semdedup_cl=kmeansfaisscd_md=mpnet_dist=cd_emb=text+embedding_nc=200',
# #                    'semdedup_cl=kmeansfaisscd_md=mistral7b_dist=cd_emb=text+embedding_nc=200',
# #                    'semdedup_cl=kmeansfaisscd_md=mistral7b_dist=cd_emb=grad+rp+loraB_nc=200',
# #                   ]
# # scoring_fn_list = ['dppmapbd_nc=200_k=lin_kmd=mpnet']
# # scoring_fn_list =[f'dppmapbd_nc=200_k=vmf_gamma={gamma}_kmd=mpnet' for gamma in [.3, 3.]]
# # scoring_fn_list = ['dppmapbd_nc=200_k=vmf_gamma=0.000035_kmd=mpnet']
# scoring_fn_list = ['dppmapbd_nc=200_k=vmf_gamma=1.0_kmd=mpnet'] 
# pacing_fn_list = [f'prune_size={M}_ep=2']
# scoring_fn_and_pacing_fn = list(itertools.product(scoring_fn_list, pacing_fn_list))
# ##### 



# evaluation_strategy = 'steps' if 'ultrachat' in dataset else 'no'
# Imported once, ahead of the loop, so the import statements are not
# re-executed for every (scoring_fn, pacing_fn) pair.
from note_pruning import get_final_model_name
from note_pruning_analysis import get_full_model_name

job_name = f'oi5_{dataset}:{abbr_model_name}'
num_train_epochs_list = [1] # offload handling of epochs to `generate_curriculum`
dataloader_sampler = 'SequentialSampler'

# Resolve each (scoring_fn, pacing_fn) pair to its index file
#   <curriculum root>/<md_expand>/<dataset>/<scoring_fn>/inds_<pacing_fn>.pkl
# and fail fast if any of them is missing.
subsample_inds_file_list = []
for scoring_fn, pacing_fn in scoring_fn_and_pacing_fn:
    md_expand = get_final_model_name(abbr_model_name, scoring_fn)
    if dataset in ['wizardlmv2', 'ultrachat200kv2']: # only relied on llama7br512p4096
        if 'llama' in abbr_model_name:
            # Scoring functions that name 'mpnet' or 'bge' keep the md_expand
            # computed above; everything else is mapped to llama7br512p4096.
            if not any(y in scoring_fn for y in ['mpnet', 'bge']):
                md_expand = get_full_model_name('llama7br512p4096')
        else:
            raise ValueError('need to specify md_expand!')
    if md_expand == 'llama-7b': md_expand = get_full_model_name('llama7b')
    print(f'md={abbr_model_name} scoring_fn={scoring_fn}\t->\tmd_expand={md_expand}')
    data_inds_dir = (f'/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/scripts/curriculum/'
                     f'{md_expand}/{dataset}/')
    p = os.path.join(data_inds_dir, scoring_fn, 'inds_'+pacing_fn+'.pkl')
    if not os.path.isfile(p):
        raise ValueError(f'path={p} does not exists for {scoring_fn}')
    subsample_inds_file_list.append(p)

#####


#############

# #### sft baselines
# job_name = 'oi2'; num_train_epochs_list = [2] 
# # model_name_or_path = hf_models_dir+'NousResearch/Llama-2-7b-hf'; abbr_model_name = 'llama2-7b'; max_seq_length = 2048
# model_name_or_path = hf_models_dir+'huggyllama/llama-7b'; abbr_model_name = 'llama-7b'; max_seq_length = 2048

# # train_file = 'data/processed/sharegpt/sharegptv2_data.jsonl'; abbr_train_file = 'sharegptv2'
# # train_file = 'data/processed/open_orca/open_orca_slim_data.jsonl'; abbr_train_file = 'openorcaslim'; max_train_samples_list=[100_000]
# train_file = 'data/processed/stanford_alpaca/stanford_alpaca_data.jsonl'; abbr_train_file = 'stanford_alpaca'; 
# # train_file = 'data/processed/flan_v2/flan_v2_data.jsonl'; abbr_train_file = 'flan_v2';
# # train_file = 'data/processed/oasst1/oasst1_data.jsonl'; abbr_train_file = 'oasst1';
# # train_file = 'data/processed/dolly/dolly_data.jsonl'; abbr_train_file = 'dolly';
# # train_file = 'data/processed/wizardlm/wizardlmv2_data.jsonl'; abbr_train_file = 'wizardlmv2';
# # train_file = 'data/processed/tulu_v2/tulu_v2_data.jsonl'; abbr_train_file = 'tulu_v2'; max_train_samples_list=[100_000]

# ####


# ##### code instruction tuning
# model_name_or_path = hf_models_dir+'codellama/CodeLlama-7b-hf'; abbr_model_name = 'codellama-7b'; max_seq_length = 2048

# # job_name = 'oi6_starcoder_ep=5'; # 5k train for 5 epochs
# # num_train_epochs_list = [5]
# # train_file = 'data/processed/starcoder/starcoder_commentinstr.jsonl'; abbr_train_file = 'starcodercmtinstr'
# # train_file = 'data/processed/starcoder/starcoder_commentinstr_cleaned.jsonl'; abbr_train_file = 'starcodercmtinstrcleaned'
# # train_file = 'data/processed/starcoder/starcoder_commentinstrv2.jsonl'; abbr_train_file = 'starcodercmtinstrv2'; max_train_samples_list=[5000]
# # train_file = 'data/processed/starcoder/starcoder_commentinstrv2_flppl.jsonl'; abbr_train_file = 'starcodercmtinstrv2_flppl'; max_train_samples_list=[5000]
# # train_file = 'data/processed/starcoder/starcoder_commentinstrv4_rge5.jsonl'; abbr_train_file = 'starcodercmtinstrv4_rge5'; max_train_samples_list=[5000]
# # train_file = 'data/processed/starcoder/starcoder_commentinstrv4.jsonl'; abbr_train_file = 'starcodercmtinstrv4'; max_train_samples_list=[5000]

# job_name = 'oi2'; 
# num_train_epochs_list = [2]
# # train_file = 'data/processed/starcoder/starcoder_commentinstrv2.jsonl'; abbr_train_file = 'starcodercmtinstrv2'
# train_file = 'data/processed/starcoder/starcoder_commentinstrv5.jsonl'; abbr_train_file = 'starcodercmtinstrv5'

# ##### 



#############

# Truthy -> test run; normalised to a bool. `debug_mode` follows `test_run`.
test_run = 1
test_run = bool(test_run)
debug_mode = test_run

# Scheduler request: node count, GPUs per node, GPU type, job duration.
nodes = 5; num_gpus = 6; gpu_type = 'v100'; job_duration = 6
# nodes = 5; num_gpus = 6; gpu_type = 'v100'; job_duration = 12
# nodes = 5; num_gpus = 6; gpu_type = 'v100'; job_duration = 18 

# `test_run` is already a bool, so no `True if ... else False` is needed.
overwrite_output_dir = test_run # always continue from ckpt if run from cluster.

per_device_train_batch_size = 2; total_batch_size = 128
# round() returns 0 once the per-step batch (GPUs * nodes * per-device size)
# exceeds ~2x `total_batch_size`; clamp so at least one step is accumulated.
gradient_accumulation_steps = max(
    1, round(total_batch_size/(num_gpus*nodes)/per_device_train_batch_size))
# May differ from `total_batch_size` when it is not divisible by the world size.
effective_batch_size = per_device_train_batch_size*nodes*num_gpus*gradient_accumulation_steps

optimizer = 'adamw_hf'

# Sharding backend: DeepSpeed disabled, FSDP enabled unless `num_gpus` is 1.
deepspeed = ''
fsdp = "full_shard auto_wrap" if num_gpus != 1 else False

# Layer class for FSDP auto-wrap: the first family tag (in this order) that
# occurs in `abbr_model_name` decides; an unknown model is an error.
try:
    fsdp_transformer_layer_cls_to_wrap = next(
        layer_cls
        for family, layer_cls in (
            ('gpt2', 'GPT2Block'),
            ('llama', 'LlamaDecoderLayer'),
            ('mpt', 'MPTBlock'),
            ('pythia', 'GPTNeoXLayer'),
            ('mistral', 'MistralDecoderLayer'),
        )
        if family in abbr_model_name
    )
except StopIteration:
    raise ValueError('Not sure how to set `fsdp_transformer_layer_cls_to_wrap`') from None

# deepspeed = './ds_configs/ds_zero3_cpu_offload.json'; fsdp = False
# deepspeed = './ds_configs/ds_zero3.json'; fsdp = False
# deepspeed = './ds_configs/stage3_no_offloading.conf'; fsdp = False # error with loading... something wrong with the config.
# fsdp = False; deepspeed = False

# The two backends are mutually exclusive.
if deepspeed and fsdp:
    raise ValueError('either fsdp or deepspeed, not both')

# LoRA settings; alpha tracks the rank.
use_lora = False
lora_rank = 256 # test {8, 16, 32, 128} # just [128, 8] for now.
lora_alpha = lora_rank
lora_dropout = 0.05
if use_lora:
    # Tag the model abbreviation so LoRA runs get their own output names.
    abbr_model_name = abbr_model_name + f'+lora(r={lora_rank},a={lora_alpha})'

# Precision: bf16 on x86_64, fp16 elsewhere. # ; mixed_precision = None
mixed_precision = 'bf16' if arch == 'x86_64' else 'fp16'
# Fixed to float32; arch-dependent alternative:
#   torch_dtype = 'bfloat16' if arch=='x86_64' else 'float16'
torch_dtype = 'float32'
gradient_checkpointing = True
load_in_8bit = False


# Summarise the run configuration (`num_gpus` is the per-node GPU count).
print(
    f"Training {model_name_or_path} using {num_gpus} GPUs, "
    f"{per_device_train_batch_size} batch size per GPU, "
    f"{gradient_accumulation_steps} gradient accumulation steps, "
    f"Effective batch size {effective_batch_size}"
)


# Pick the launcher: multi-node torchrun with c10d rendezvous, plain `python`
# for one GPU on one node, single-node torchrun otherwise.
if nodes != 1:
    exe = f"torchrun --nnodes={nodes} --nproc_per_node={num_gpus} --rdzv-id=$SLURM_JOB_ID --rdzv-backend=c10d --rdzv-endpoint=$RDZV_ENDPOINT"
elif num_gpus == 1:
    exe = 'python'
else:
    exe = f"torchrun --nproc_per_node={num_gpus} --master_port=10002"

if test_run:
    # Test runs pin devices 0..num_gpus-1.
    exe = f"CUDA_VISIBLE_DEVICES={','.join(str(i) for i in range(num_gpus))} {exe}"
    if debug_mode:
        # Verbose logging plus an error file for torchelastic.
        error_file='/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/open-instruct/scripts/error_file'
        exe = f'TORCHELASTIC_ERROR_FILE={error_file} TORCH_CPP_LOG_LEVEL=INFO NCCL_DEBUG=INFO LOGLEVEL=INFO {exe}'

# Warn (without aborting) when the training file is missing.
if not os.path.isfile(train_file):
    print(f'train_file={train_file} does not exists')

# Lazy cartesian product of the option lists; it can be iterated only once.
options_list = itertools.product(
    num_train_epochs_list, subsample_inds_file_list, max_train_samples_list)

output_dirname_list = []
for (num_train_epochs,
     subsample_inds_file,
     max_train_samples,) in options_list:

    output_dirname = f"{abbr_model_name}_{abbr_train_file}"
    if max_train_samples:
        output_dirname += f":{int(max_train_samples/1000)}k"
            
    if any(job_name == y for y in ['oi2']):
        output_dirname += f'_ep={num_train_epochs}'
        
    if subsample_inds_file:
        def subsample_inds_file_abbr_fn(x):
            """Abbreviate an index-file path for use in an output directory name.

            For a path shaped like `<...>/<scoring_fn>/inds_<pacing_fn>.pkl` the
            result is `score=<scoring_fn>_pace=<pacing_fn>` with every `_`
            inside the two names replaced by `:`. Any other file name is
            returned as its basename truncated at the first `.pkl`.
            """
            s = os.path.basename(x).split('.pkl')[0]
            if not s.startswith('inds_'):
                return s
            scoring_fn = os.path.basename(os.path.dirname(x)).replace('_', ':')
            # Strip only the leading 'inds_' prefix. Splitting on 'inds_' and
            # taking the last piece would drop part of a pacing name that
            # itself contains 'inds_'.
            pacing_fn = s[len('inds_'):].replace('_', ':')
            return f'score={scoring_fn}_pace={pacing_fn}'
        subsample_inds_file_abbr = subsample_inds_file_abbr_fn(subsample_inds_file)
        if subsample_inds_file_abbr:
            output_dirname += f'_{subsample_inds_file_abbr}'
            
    if test_run:
        output_dirname = 'jpt_'+output_dirname
            
    if add_hardwarespec_to_dirname:
        output_dirname += \
            ('_fsdp='+fsdp.split(' ')[0] if fsdp else '')+\
            ('_deepspeed='+os.path.basename(deepspeed).split('.')[0] if deepspeed else '')+\
            ('_gradckpt='+str(gradient_checkpointing) if gradient_checkpointing else '')+\
            '_mbsz='+str(per_device_train_batch_size)+\
            '_dtype='+torch_dtype+\
            ('_mp='+str(mixed_precision) if mixed_precision else '_mp=none')+\
            '_seqlen='+str(max_seq_length)+\
            '_nodes='+str(nodes)
    if suffix:
        output_dirname += suffix
    output_dir = os.path.join('results', job_name, output_dirname)
    os.makedirs(os.path.join('results', job_name), exist_ok=True)
    wandb_run_name = output_dir.replace('results/', '')
    

    cmd = f"""
    {'!cd .. && ' if test_run else ''}{exe}
        open_instruct/finetune_trainer.py \
        --model_name_or_path={model_name_or_path} \
        --tokenizer_name={model_name_or_path} \
        {'--load_in_8bit' if load_in_8bit else ''} \
        --use_fast_tokenizer=True \
        --train_file={train_file} \
        --max_seq_length={max_seq_length} \
        {'--max_train_samples='+str(max_train_samples) if max_train_samples else ''} \
        {'--use_lora' if use_lora else ''}
        {'--lora_rank='+str(lora_rank) if use_lora else ''}
        {'--lora_alpha='+str(lora_alpha) if use_lora else ''}
        {'--lora_dropout='+str(lora_dropout) if use_lora else ''}
        --do_train \
        --preprocessing_num_workers={preprocessing_num_workers} \
        --per_device_train_batch_size={per_device_train_batch_size} \
        --gradient_accumulation_steps={gradient_accumulation_steps} \
        --learning_rate=2e-5 \
        --lr_scheduler_type={lr_scheduler_type} \
        --warmup_ratio={warmup_ratio} \
        --weight_decay=0. \
        --optim={optimizer} \
        --evaluation_strategy={evaluation_strategy} \
        {'--eval_steps='+str(eval_steps) if eval_steps else ''} \
        {'--report_to '+str(report_to) if report_to else ''} \
        --run_name {wandb_run_name} \
        --logging_strategy=steps \
        --logging_first_step \
        --logging_steps=1 \
        --save_strategy={save_strategy} \
        --save_steps={save_steps} \
        --save_total_limit={save_total_limit} \
        --num_train_epochs={num_train_epochs} \
        --ddp_timeout=7200 \
        {'--fsdp="'+fsdp+'"' if fsdp else ''}
        {'--fsdp_transformer_layer_cls_to_wrap="'+fsdp_transformer_layer_cls_to_wrap+'"' 
            if fsdp else ''}
        {'--gradient_checkpointing' if gradient_checkpointing  else ''}
        --torch_dtype={torch_dtype} \
        --dataloader_num_workers=8 \
        {f'--{mixed_precision}=True' if mixed_precision else ''} \
        {'--overwrite_output_dir' if overwrite_output_dir else ''} \
        {'--deepspeed='+deepspeed if deepspeed else ''} \
        {'--subsample_inds_file='+subsample_inds_file if subsample_inds_file else ''} \
        {'--dataloader_sampler '+str(dataloader_sampler) if dataloader_sampler else ''} \
        --use_flash_attn False \
        --low_cpu_mem_usage \
        --overwrite_cache \
        --output_dir="{output_dir}" \
    """ 
    #    --overwrite_cache # if delete a dataset and need to refresh cache

    cmd = multiline_to_singleline(cmd)
    if test_run:
        print()
        print(cmd)

    shell_scripts = shell_scripts_template.format(
        conda_env='open-instruct',
        cwd=os.path.dirname(os.getcwd()),
        cmd=cmd,
        log_dir=os.getcwd(),
        save_dir=output_dir
    )
    out = submit_job(
        shell_scripts, 
        job_name=job_name, 
        nodes=nodes,
        num_cpus=num_cpus,
        cpu_mem=cpu_mem,
        num_gpus=num_gpus,
        gpu_type=gpu_type,
        test_run=test_run,
        job_duration=job_duration,
    )
    if not test_run:
        print(out)

md=llama-7b scoring_fn=random_s=0	->	md_expand=llama-7b+lora:r=256:a=256
md=llama-7b scoring_fn=dppmap_k=vmf_gamma=1_kmd=llama7br512p4096_kemb=grad+rp+loraB	->	md_expand=llama-7b+lora:r=512:a=11585+proj=4096
md=llama-7b scoring_fn=dppmap_k=rbf_gamma=1e-3_kmd=llama7br512p4096_kemb=text+embedding	->	md_expand=llama-7b+lora:r=512:a=11585+proj=4096
Training results/baselines/huggyllama/llama-7b using 6 GPUs, 2 batch size per GPU, 2 gradient accumulation steps, Effective batch size 120

!cd .. && TORCHELASTIC_ERROR_FILE=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/open-instruct/scripts/error_file TORCH_CPP_LOG_LEVEL=INFO NCCL_DEBUG=INFO LOGLEVEL=INFO CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 torchrun --nnodes=5 --nproc_per_node=6 --rdzv-id=$SLURM_JOB_ID --rdzv-backend=c10d --rdzv-endpoint=$RDZV_ENDPOINT open_instruct/finetune_trainer.py --model_name_or_path=results/baselines/huggyllama/llama-7b --tokenizer_name=results/baselines/huggyllama/llama-7b --use_fast_tokenizer=True --train_file=

In [5]:
# Display the current value of `cmd` for inspection.
cmd

'!cd .. && TORCHELASTIC_ERROR_FILE=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/open-instruct/scripts/error_file TORCH_CPP_LOG_LEVEL=INFO NCCL_DEBUG=INFO LOGLEVEL=INFO CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 torchrun --nnodes=5 --nproc_per_node=6 --rdzv-id=$SLURM_JOB_ID --rdzv-backend=c10d --rdzv-endpoint=$RDZV_ENDPOINT open_instruct/finetune_trainer.py --model_name_or_path=results/baselines/huggyllama/llama-7b --tokenizer_name=results/baselines/huggyllama/llama-7b --use_fast_tokenizer=True --train_file=data/processed/sharegpt/sharegptv2_data.jsonl --max_seq_length=2048 --do_train --preprocessing_num_workers=32 --per_device_train_batch_size=2 --gradient_accumulation_steps=2 --learning_rate=2e-5 --lr_scheduler_type=linear --warmup_ratio=0.03 --weight_decay=0. --optim=adamw_hf --evaluation_strategy=no --eval_steps=100 --report_to tensorboard wandb --run_name oi5_sharegptv2:llama-7b/jpt_llama-7b_sharegptv2_score=dppmap:k=rbf:gamma=1e-3:kmd=llama7br512p4096:kemb=text+embedding_pace=pr

# eval



In [47]:
from gen_cmds_utils import remove_all_symlinks, create_unique_symlinks, get_chat_formatting_function, get_resource_for_task

# Defaults for the evaluation sweep; the experiment presets further down in
# this cell override them.
create_symlinks = False       # route model paths through create_unique_symlinks
include_checkpoints = False   # also pick up `checkpoint-*` sub-directories
eval_rest = True              # only queue (task, model) pairs lacking metrics.json
subdir_path_list = []         # explicit model dirs; when empty, `exp_dir` is scanned
subdir_filter_fn = lambda _subdir: True
num_cpus = 24
cpu_mem = 64
use_slow_tokenizer = True     # adds `--use_slow_tokenizer` to every eval command

task_names = [
    'mmlu_s=0',
    'mmlu_s=5', 
    'gsm_s=8',
    'gsm_s=8_cot',
    'bbh_s=3',
    'bbh_s=3_cot', # max_datapoints_per_task=40 -> 40min.
    'humaneval',
    'tydiqa_s=1_cb', # 3min
    'tydiqa_s=1_gp',
    # 'toxigen', # ~1.5hr
#     'alpacafarm_ann=gpt35:turbo:1106',
    # 'alpacafarm_ann=chatgpt', # ~$1 per eval.
]
task_names_alpacafarm = ['alpacafarm_ann=chatgpt_chatfmt']
# Chat-format twin of every task above.
task_names_chatfmt = [f'{name}_chatfmt' for name in task_names]


# # ## baselines eval 
# subdir_path_list = [os.path.join('results/baselines', x) for x in [
# #     'gpt2',
# #     'gpt2-medium',
# #     'huggyllama/llama-7b', 
# #     'mistralai/Mistral-7B-v0.1',
# #     'mistralai/Mistral-7B-Instruct-v0.1',
# #     'NousResearch/Llama-2-7b-hf',
# #     'NousResearch/Llama-2-7b-chat-hf',
# #     'HuggingFaceH4/mistral-7b-sft-alpha',
# #     'HuggingFaceH4/mistral-7b-sft-beta',
# #     'HuggingFaceH4/zephyr-7b-alpha',
# #     'HuggingFaceH4/zephyr-7b-beta',
# #     'EleutherAI/pythia-1.4b',
# #     'EleutherAI/pythia-2.8b',
# #     'EleutherAI/pythia-6.9b',
# #     'databricks/dolly-v2-7b',
# #     'codellama/CodeLlama-7b-hf',
# #     'codellama/CodeLlama-7b-Python-hf',
# #     'codellama/CodeLlama-7b-Instruct-hf',
# ]]
# task_names = task_names+task_names_chatfmt

# # ## baseline re-eval after merge upstream/main
# subdir_path_list = [os.path.join('results/baselines', x) for x in [
#     'huggyllama/llama-7b',
# ]]
# subdir_path_list += ['results/ft1/llama-7b_humanmix']
# task_names = task_names+task_names_chatfmt

# ## ft1
# exp_dir = 'results/ft1'
# task_names = task_names+task_names_chatfmt

# ## ft1_ep=1
# # exp_dir = 'results/ft1_ep=1'
# # exp_dir = 'results/ft1_ep=2'
# exp_dir = 'results/oi2'
# subdir_filter_fn = lambda x: 'sharegptv2' in x
# task_names = task_names_chatfmt
# # task_names = task_names+task_names_chatfmt
# # task_names = task_names_alpacafarm


# ## ft2
# exp_dir = 'results/ft2/'
# create_symlinks = True
# subdir_filter_fn = lambda x: any(y in x for y in ['llama-7b'])
# task_names = task_names+task_names_chatfmt

# ## llama-7b time-series 400k, 600k
# exp_dir = 'results/oi3/'
# include_checkpoints = True
# subdir_filter_fn = lambda x: any(y in x for y in ['400k', '600k']) # , '600k'
# task_names = task_names+task_names_chatfmt

# oi4 include checkpoints!
# exp_dir = 'results/oi4_perf_cross_time/'
# exp_dir = 'results/oi4_tulu_v1_human_mix/'
# exp_dir = 'results/oi4_flanv2_prune_with_hmv1_model/'
# exp_dir = 'results/oi4_flan2022_1m/'
# exp_dir = 'results/oi4_tulu_v1_mix/'
# exp_dir = 'results/oi4_tulu_v1_mix_ep=3/'
# include_checkpoints = False
# subdir_filter_fn = lambda x: any(y in x for y in ['random']) #$ ['log_prob_decr', 'el2n_agg=mean_incr', 'logit_margin_decr', 'grad_loraB']
# task_names = task_names_chatfmt
# task_names = task_names_alpacafarm

# # oi4 without checkpoint 
# # exp_dir = 'results/oi4/'
# exp_dir = 'results/oi4_flan_v2_vary_subsetsize/'
# task_names = task_names_chatfmt

# # oi5
# Active experiment directory. The commented lines below are alternative
# presets; whichever `exp_dir` assignment is uncommented last takes effect.
exp_dir = 'results/oi2'
# exp_dir = 'results/oi5_tulu_v1_mix:llama-7b/'
# exp_dir = 'results/oi5_ultrachat:mistral-7b'
# exp_dir = 'results/oi5_ultrachat200k:mistral-7b'
# exp_dir = 'results/oi5_ultrachat15:mistral-7b'
# exp_dir = 'results/oi5_ultrachat200kv2:llama-7b'
# exp_dir = 'results/oi5_wizardlm:llama-7b'
# exp_dir = 'results/oi5_wizardlmv2:llama-7b'
# exp_dir = 'results/oi5_sharegptv2:llama-7b'
# exp_dir = 'results/oi5_tulu_v2:llama-7b'
# exp_dir = 'results/oi5_open_orca_slim:llama-7b'
# exp_dir = 'results/oi5_stanford_alpaca:llama-7b'
# exp_dir = 'results/oi5_flan_v2:llama-7b'
# exp_dir = 'results/oi5_dolly:llama-7b'
# exp_dir = 'results/oi5_oasst1:llama-7b'
# exp_dir = 'results/oi6_starcoder_ep=5'
# exp_dir = 'results/oi5_starcoder_commentinstr:codellama-7b'
# exp_dir = 'results/oi5_starcoder_commentinstrv2:codellama-7b'
# exp_dir = 'results/oi5_starcoder_commentinstrv4:codellama-7b'
# exp_dir = 'results/oi5_starcoder_commentinstrv5:codellama-7b'
# Keep only sub-directories of `exp_dir` whose name contains 'flan_v2'.
subdir_filter_fn = lambda x: 'flan_v2' in x
# task_names = task_names + task_names_chatfmt
# Restrict this run to the AlpacaFarm chat-format task list.
task_names = task_names_alpacafarm
# task_names = ['humaneval', 'humaneval_chatfmt']
# task_names = ['alpacafarm_ann=gpt35:turbo:1106_chatfmt']


# ### code
# exp_dir = 'results/oi2'
# subdir_filter_fn = lambda x: 'starcoder' in x
# task_names = ['humaneval', 'humaneval_chatfmt']
# ### 

# Quick 0/1 toggle, normalized to a bool before it is passed to submit_job.
test_run = bool(1)


# No explicit model list given: discover model directories under `exp_dir`.
if not subdir_path_list:
    if create_symlinks:
        remove_all_symlinks(exp_dir)
    subdir_path_list = []
    subdirs = filter(subdir_filter_fn, os.listdir(exp_dir))
    for subdir in subdirs:
        subdir_path = os.path.join(exp_dir, subdir)
        if include_checkpoints:
            subdir_path_list.extend(glob.glob(os.path.join(subdir_path, 'checkpoint-*')))
        # skip runs not yet finished (no config.json written)
        if os.path.isfile(os.path.join(subdir_path, 'config.json')):
            subdir_path_list.append(subdir_path)

if eval_rest:
    # Queue only the (task, model) pairs that have no metrics.json yet;
    # symlinked model directories are never queued.
    task_name_and_model = []
    for subdir_path in subdir_path_list:
        if os.path.islink(subdir_path):
            continue
        for task_name in task_names:
            metrics_path = os.path.join(subdir_path, 'eval', task_name, 'metrics.json')
            if os.path.isfile(metrics_path):
                continue
            pending = (task_name, subdir_path)
            task_name_and_model.append(pending)
            print(pending)
else:
    # Queue every task for every model.
    task_name_and_model = list(itertools.product(task_names, subdir_path_list))
    

print('#cmds: ', len(task_name_and_model), '\n')

if create_symlinks:
    # create symlink for each directory, then point every job at its symlink.
    symlink_path_dict = create_unique_symlinks(
        [model_path for _, model_path in task_name_and_model])
    options_list = [(task, symlink_path_dict[model_path])
                    for task, model_path in task_name_and_model]
else:
    options_list = task_name_and_model
    
    

def _parse_n_shot(task_name):
    """Return the few-shot count encoded as `s=<int>` in `task_name`.

    Raises:
        ValueError: if `task_name` contains no `s=<int>` component.
    """
    match = re.search(r's=(\d+)', task_name)
    if match is None:
        raise ValueError(f'{task_name} does not encode the number of shots as `s=<int>`.')
    return int(match.group(1))


# Build one evaluation command per (task, model) pair in `options_list`, wrap it
# in the cluster shell template and hand it to `submit_job`.
# NOTE(review): `info` rebinds the notebook-level name set from get_host_info()
# in the first cell; nothing in this loop reads it.
info = {}  
cmds = []
for task_name, model_name_or_path in options_list:
    
    use_chat_format = 'chatfmt' in task_name
    chat_formatting_function = get_chat_formatting_function(model_name_or_path)
    
    # Recover the name of the fine-tuned base model from ft_args.json when it is
    # available; fall back to the path itself when the file is missing,
    # unreadable, malformed, or lacks the expected keys.
    try:
        with open(os.path.join(model_name_or_path, 'ft_args.json'), 'r') as f:
            ft_args = json.load(f)
        # note `model_name_or_path` could be anything, e.g., soft links with arbitrary names.
        # but `ft_args_model_name_or_path` indicates the finetuned model name.
        ft_args_model_name_or_path = ft_args['model_args']['model_name_or_path']
    except (OSError, ValueError, KeyError, TypeError):
        ft_args_model_name_or_path = model_name_or_path

    if 'gpt2' in ft_args_model_name_or_path:
        tydiqa_max_context_length = 400 # max ctx len without exceeding max_seq_len
    else:
        tydiqa_max_context_length = 512
    batch_size, job_duration = get_resource_for_task(
        task_name, ft_args_model_name_or_path)
    
    job_name = f'eval.{task_name}'
    run_id = model_name_or_path
    save_dir = f'{model_name_or_path}/eval/{task_name}'
    
    # Dispatch on the task-name prefix; optional flags render as empty strings.
    if task_name.startswith('mmlu'):
        n_shot = _parse_n_shot(task_name)
        assert(n_shot <= 5)
        cmd = f"""
        python -m eval.mmlu.run_eval \
            --data_dir data/eval/mmlu \
            --model_name_or_path "{model_name_or_path}" \
            --save_dir "{save_dir}" \
            --eval_batch_size {batch_size} \
            --ntrain {n_shot} \
            {'--use_chat_format' if use_chat_format else ''} \
            --chat_formatting_function {chat_formatting_function} \
            {'--use_slow_tokenizer' if use_slow_tokenizer else ''} \
        """
    elif task_name.startswith('gsm'):
        n_shot = _parse_n_shot(task_name)
        assert(n_shot <= 8)
        # open-instruct used 200 examples. use higher amount to get a more accurate number
        cmd = f"""
        python -m eval.gsm.run_eval \
            --data_dir data/eval/gsm/ \
            --model_name_or_path "{model_name_or_path}" \
            --save_dir "{save_dir}" \
            --eval_batch_size {batch_size} \
            --max_num_examples 500 \
            --n_shot {n_shot} \
            --max_new_tokens 256 \
            {'--no_cot' if 'cot' not in task_name else ''} \
            {'--use_chat_format' if use_chat_format else ''} \
            --chat_formatting_function {chat_formatting_function} \
            {'--use_slow_tokenizer' if use_slow_tokenizer else ''} \
        """
    elif task_name.startswith('bbh'):
        max_num_examples_per_task = 40
        n_shot = _parse_n_shot(task_name)
        assert(n_shot <= 3)
        cmd = f"""
        python -m eval.bbh.run_eval \
            --data_dir data/eval/bbh/ \
            --model_name_or_path "{model_name_or_path}" \
            --save_dir "{save_dir}" \
            --eval_batch_size {batch_size} \
            --max_new_tokens 256 \
            --n_shot {n_shot} \
            {'--no_cot' if 'cot' not in task_name else ''} \
            {'--use_chat_format' if use_chat_format else ''} \
            --chat_formatting_function {chat_formatting_function} \
            {'--max_num_examples_per_task '+str(max_num_examples_per_task) if max_num_examples_per_task else ''} \
            {'--use_slow_tokenizer' if use_slow_tokenizer else ''} \
        """
    elif task_name.startswith('humaneval'):
        cmd = f"""
        python -m eval.codex_humaneval.run_eval \
            --data_file data/eval/codex_humaneval/HumanEval.jsonl.gz \
            --model_name_or_path "{model_name_or_path}" \
            --save_dir "{save_dir}" \
            --eval_batch_size {batch_size} \
            --max_new_tokens 512 \
            --eval_pass_at_ks 1 \
            --unbiased_sampling_size_n 3 \
            --temperature 0.1 \
            {'--use_chat_format' if use_chat_format else ''} \
            --chat_formatting_function {chat_formatting_function} \
            {'--use_slow_tokenizer' if use_slow_tokenizer else ''} \
        """
    elif task_name.startswith('tydiqa'):
        no_context = 'cb' in task_name
        n_shot = _parse_n_shot(task_name)
        assert(n_shot in [0,1])
        cmd = f"""
        python -m eval.tydiqa.run_eval \
            --data_dir data/eval/tydiqa \
            --n_shot {n_shot} \
            --max_num_examples_per_lang 100 \
            --max_context_length {tydiqa_max_context_length} \
            --model_name_or_path "{model_name_or_path}" \
            --save_dir "{save_dir}" \
            --eval_batch_size {batch_size} \
            {'--no_context' if no_context else ''} \
            {'--use_chat_format' if use_chat_format else ''} \
            --chat_formatting_function {chat_formatting_function} \
            {'--use_slow_tokenizer' if use_slow_tokenizer else ''} \
        """
    elif task_name.startswith('toxigen'):
        # max_prompts_per_group=500 (out of 1000) is open-instruct default.
        # eval batch size=1 much faster (llama-7b) not sure why.
        cmd = f"""
        python -m eval.toxigen.run_eval \
            --data_dir data/eval/toxigen \
            --model_name_or_path "{model_name_or_path}" \
            --save_dir "{save_dir}" \
            --eval_batch_size 1 \
            --max_prompts_per_group 200 \
            {'--use_chat_format' if use_chat_format else ''} \
            --chat_formatting_function {chat_formatting_function} \
            {'--use_slow_tokenizer' if use_slow_tokenizer else ''} \
        """
    elif task_name.startswith('alpacafarm'):
        # The annotator is encoded as `ann=<name>`, with ':' standing in for '_'.
        match = re.search(r'ann=([^_]+)', task_name)
        if match is None:
            raise ValueError(f'{task_name} does not encode the annotator as `ann=<name>`.')
        annotators_config = match.group(1)
        annotators_config = annotators_config.replace(':', '_')
        supported_annotators_configs = ['chatgpt', 'alpaca_eval_gpt4_0314', 'gpt35_turbo_1106']
        if annotators_config not in supported_annotators_configs:
            raise ValueError(
                f'annotators_config={annotators_config!r} not supported; '
                f'expected one of {supported_annotators_configs}.')
        cmd = f"""
        python -m eval.alpaca_farm.run_eval \
            --reference_path alpaca_eval_data \
            --model_name_or_path "{model_name_or_path}" \
            --max_new_tokens 2048 \
            --save_dir "{save_dir}" \
            --eval_batch_size {batch_size} \
            --annotators_config {annotators_config} \
            {'--use_chat_format' if use_chat_format else ''} \
            --chat_formatting_function {chat_formatting_function} \
            {'--use_slow_tokenizer' if use_slow_tokenizer else ''} \
        """
    else:
        raise ValueError(f'{task_name} not supported.')
        
        
    cmd = multiline_to_singleline(cmd)
    cmds.append(cmd)
    print(cmd)
    

    shell_scripts = shell_scripts_template.format(
        conda_env='open-instruct',
        cwd=os.path.dirname(os.getcwd()),
        cmd=cmd,
        log_dir=os.getcwd(),
        save_dir=save_dir,
    )
    out = submit_job(
        shell_scripts, 
        job_name=job_name,
        num_cpus=num_cpus,
        cpu_mem=cpu_mem,
        num_gpus=1,
        test_run=test_run,
        job_duration=job_duration,
    )
    
    

('alpacafarm_ann=chatgpt_chatfmt', 'results/oi2/llama-7b_flan_v2_ep=2')
#cmds:  1 

python -m eval.alpaca_farm.run_eval --reference_path alpaca_eval_data --model_name_or_path "results/oi2/llama-7b_flan_v2_ep=2" --max_new_tokens 2048 --save_dir "results/oi2/llama-7b_flan_v2_ep=2/eval/alpacafarm_ann=chatgpt_chatfmt" --eval_batch_size 5 --annotators_config chatgpt --use_chat_format --chat_formatting_function eval.templates.create_prompt_with_tulu_chat_format --use_slow_tokenizer

Submiting job with:
{
    "job_name": "eval.alpacafarm_ann=chatgpt_chatfmt",
    "num_cpus": 24,
    "cpu_mem": 64,
    "num_gpus": 1,
    "test_run": false,
    "queue": "el8",
    "num_jobs": 1
}


# Visualize Eval Results

In [45]:
from rosemary import pd_sort_rows_by_avg_ranking
from llm.evaluate import EvalResults, get_eval_results



# Defaults for the eval-results visualization cell. The experiment presets
# below (most of them commented out) override these; `exp_dir` and `chat_fmt`
# are both reassigned later in this cell.
exp_dir = ''
chat_fmt = None
sort_rows = True
use_normalized_preferred_metric = False


# ## investigate code change / package update effect on eval baselines.
# exp_dir = '../results/ft1_ep=2'
# use_normalized_preferred_metric = False
# sort_rows = False
# save_dirs = [
#     # llama
#     ('llama-7b_12.13update_before', '../results/baselines/huggyllama/llama-7b_12.13update_before/'),
#     ('llama-7b', '../results/baselines/huggyllama/llama-7b/'),
# #     ('llama-7b_10.30update', '../results/baselines/huggyllama/llama-7b_10.30update/'),
# #     ('llama-7b_09.23update', '../results/baselines/huggyllama/llama-7b_09.23update/'),
# #     ('llama-7b_09.23update_before', '../results/baselines/huggyllama/llama-7b_09.23update_before/'),
# #     # llama2
#     ('llama2-7b_12.13update_before', '../results/baselines/NousResearch/Llama-2-7b-hf_12.13update_before/'),
#     ('llama2-7b', '../results/baselines/NousResearch/Llama-2-7b-hf/'),
#     ('llama2-7b-chat', '../results/baselines/NousResearch/Llama-2-7b-chat-hf/'),
# #     ('llama2-7b_10.30update', '../results/baselines/NousResearch/Llama-2-7b-hf_10.30update/'),
# #     ('llama2-7b_original', '../results/baselines/NousResearch/Llama-2-7b-hf_original/'),
# #     # mistral
# #     ('mistral-7b_10.16update', '../results/baselines/mistralai/Mistral-7B-v0.1_10.16update/'),
#     ('mistral-7b-Instruct-v0.1_12.13update_before', '../results/baselines/mistralai/Mistral-7B-Instruct-v0.1_12.13update_before'),
#     ('mistral-7b-Instruct-v0.1', '../results/baselines/mistralai/Mistral-7B-Instruct-v0.1'),
#     # zephyr
#     ('zephyr-7b-beta_12.13update_before', '../results/baselines/HuggingFaceH4/zephyr-7b-beta_12.13update_before'),
#     ('zephyr-7b-beta', '../results/baselines/HuggingFaceH4/zephyr-7b-beta'),
# ]

# # baselines
# save_dirs = []
# save_dirs += [
# #     ('gpt2', '../results/baselines/gpt2'),
# #     ('gpt2m', '../results/baselines/gpt2-medium'),
# #     ('llama-7b_humanmix', '../results/ft1/llama-7b_humanmix'),
#     ('llama-7b', '../results/baselines/huggyllama/llama-7b/'),
#     ('llama2-7b', '../results/baselines/NousResearch/Llama-2-7b-hf/'),
# #     ('llama2-7b+humanmix', '../results/llama2-7b_humanmix'),
# #     ('pythia-1.4b', '../results/baselines/EleutherAI/pythia-1.4b'),
# #     ('pythia-2.8b', '../results/baselines/EleutherAI/pythia-2.8b'),
# #     ('pythia-6.9b', '../results/baselines/EleutherAI/pythia-6.9b'),
# #     ('dolly-v2-7b', '../results/baselines/databricks/dolly-v2-7b'),
#     ('mistral-7b-v0.1', '../results/baselines/mistralai/Mistral-7B-v0.1'),
# ]


# save_dirs = [
#     ('llama-7b', '../results/baselines/huggyllama/llama-7b/'),
#     ('llama-7b+lima_ep=2', '../results/ft1_ep=2/llama-7b_lima/'),
# #     ('mistral-7b+lima_ep=2', '../results/ft1_ep=2/mistral-7b_lima/'), 
# ]
# exp_dir = '../results/oi2/'
# save_dirs += [(os.path.basename(x), x) for x in 
#               [os.path.join(exp_dir, x) for x in os.listdir(exp_dir)]]



# exp_dir = '/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/doremi/results/drm2'
# save_dirs += [(os.path.basename(x), x) for x in 
#               [os.path.join(exp_dir, x) for x in os.listdir(exp_dir)]]

# # exp_dir = '../results/ft2'
# # exp_dir = '../results/ft1'
# exp_dir = '../results/ft1_ep=2'
# # save_dirs = [
# #     ('llama-7b', '../results/baselines/huggyllama/llama-7b/'),
# #     ('mistral-7b', '../results/baselines/mistralai/Mistral-7B-v0.1/'),
# #     ('mistral-7b-Instruct', '../results/baselines/mistralai/Mistral-7B-Instruct-v0.1/'),
# # ]
# save_dirs += [(os.path.basename(x), x) for x in 
#               [os.path.join(exp_dir, x) for x in os.listdir(exp_dir)] if 'tuluv1m' in x]

# exp_dir = '../results/oi3'
# save_dirs = [('llama-7b', '../results/baselines/huggyllama/llama-7b/')]
# save_dirs += [(os.path.basename(x), x) for x in 
#              glob.glob(os.path.join(exp_dir, 'llama-7b_all:600k_humanmix', 'checkpoint-*'))]

# # exp_dir = '../results/oi4'
# # exp_dir = '../results/oi4_perf_cross_time'
# # exp_dir = '../results/oi4_flanv2_prune_with_hmv1_model'
# exp_dir = '../results/oi4_flan_v2_vary_subsetsize'
# save_dirs = [('llama-7b', '../results/baselines/huggyllama/llama-7b/'),
#              ('llama-7b_flan_v2_ep=1', '../results/ft1_ep=1/llama-7b_flan_v2'),
#              ('llama-7b_humanmix_ep=1', '../results/ft1_ep=1/llama-7b_hmv1'),
#             ]
# save_dirs += [(os.path.basename(x), x) for x in [os.path.join(exp_dir, x) for x in os.listdir(exp_dir)]]

# exp_dir = '../results/oi4_flan2022_1m'
# save_dirs = [('llama-7b', '../results/baselines/huggyllama/llama-7b/'),
# #              ('llama-7b_flan_v2_ep=2', '../results/ft1/llama-7b_flan_v2'),
# #              ('llama-7b_humanmix_ep=2', '../results/ft1/llama-7b_humanmix'),
#              ('llama-7b_flan_v2_ep=1', '../results/ft1_ep=1/llama-7b_flan_v2'),
#              ('llama-7b_humanmix_ep=1', '../results/ft1_ep=1/llama-7b_hmv1'),
# #              ('llama-7b_cot:flan_v2_ep=1', '../results/ft1_ep=1/llama-7b_cot:flanv2'),
#             ]
# save_dirs += [(os.path.basename(x), x) for x in [os.path.join(exp_dir, x) for x in os.listdir(exp_dir)]]


# # exp_dir = '../results/oi4_tulu_v1_mix'
# exp_dir = '../results/oi4_tulu_v1_mix_ep=3'
# use_normalized_preferred_metric = False
# save_dirs = [('llama-7b', '../results/baselines/huggyllama/llama-7b/'),
#              ('llama-7b_tuluv1_mix_ep=2', '../results/ft1_ep=2/llama-7b_tuluv1m'),
#             ]
# save_dirs += [(os.path.basename(x), x) for x in [os.path.join(exp_dir, x) for x in os.listdir(exp_dir)]]

# ###### ultrachat
# save_dirs = [
#     # baselines 
#     ('mistral-7b', '../results/baselines/mistralai/Mistral-7B-v0.1/'),
#     ('mistral-7b_ultrachat200k_aftersplitlongconv_ep=2', '../results/ft1_ep=2/mistral-7b_ultrachat200k'),
#     ('mistral-7b_ultrachat200k_beforesplitlongconv_ep=2', '../results/ft1_ep=2/mistral-7b_ultrachat200k_beforesplitlongconv'),
    
#     ('mistral-7b-Instruct', '../results/baselines/mistralai/Mistral-7B-Instruct-v0.1'),
#     ('mistral-7b_sft-alpha', '../results/baselines/HuggingFaceH4/mistral-7b-sft-alpha'),
#     ('mistral-7b-sft-beta', '../results/baselines/HuggingFaceH4/mistral-7b-sft-beta'),
#     ('mistral-7b-sft-alpha+dpo', '../results/baselines/HuggingFaceH4/zephyr-7b-alpha'),
#     ('mistral-7b-sft-beta+dpo', '../results/baselines/HuggingFaceH4/zephyr-7b-beta'),
# ]
# # exp_dir = '../results/oi5_ultrachat:mistral-7b'
# # save_dirs += [(os.path.basename(x), x) for x in [os.path.join(exp_dir, x) for x in os.listdir(exp_dir)]]
# # exp_dir = '../results/oi5_ultrachat200k:mistral-7b'
# # save_dirs += [(os.path.basename(x), x) for x in [os.path.join(exp_dir, x) for x in os.listdir(exp_dir)]]
# exp_dir = '../results/oi5_ultrachat15:mistral-7b'
# save_dirs += [(os.path.basename(x), x) for x in [os.path.join(exp_dir, x) for x in os.listdir(exp_dir)]]
# #####


#####
# dataset = 'stanford_alpaca'
# dataset = 'open_orca_slim'
# dataset = 'sharegptv2'
# dataset = 'ultrachat200kv2'
# dataset = 'wizardlm'
# dataset = 'wizardlmv2'
# dataset = 'tulu_v2'
# dataset = 'flan_v2'
# dataset = 'oasst1'
# dataset = 'dolly'
# Datasets to include; comment entries in or out to change the comparison.
dataset_list = [
    'stanford_alpaca', 
    'dolly',
    'oasst1', 
    'flan_v2', 
#     'tulu_v2', 
    'wizardlmv2', 
    'sharegptv2', 
    'ultrachat200kv2',
]


## older
# dataset = 'tulu_v1_mix'

# (label, path) pairs, starting with the llama-7b baseline.
save_dirs = [
    ('llama-7b', '../results/baselines/huggyllama/llama-7b'),
#              ('llama-7b_lima_ep=5', '../results/oi2/llama-7b_lima_ep=5/'),
#              ('llama-7b_lima_ep=10', '../results/oi2/llama-7b_lima_ep=10/'),
]

# Datasets whose (label, path) entries deviate from the default
# `llama-7b_{dataset}_ep=2` pattern under ../results/oi2.
reference_runs_by_dataset = {
    'tulu_v2': [
        ('llama-7b_tulu_v2:100k_ep=2', '../results/oi2/llama-7b_tulu_v2:100k_ep=2'),
    ],
    'open_orca_slim': [
        ('llama-7b_openorcaslim:100k_ep=2', '../results/oi2/llama-7b_openorcaslim:100k_ep=2'),
    ],
    'sharegptv2': [
        ('llama-7b_sharegptv2_ep=2', '../results/oi2/llama-7b_sharegptv2_ep=2'),
        ('llama-7b_sharegpt_ep=2', '../results/ft1_ep=2/llama-7b_sharegpt'),
    ],
    'tulu_v1_mix': [
        ('llama-7b_tuluv1_mix_ep=2', '../results/ft1_ep=2/llama-7b_tuluv1m'),
        # oi4_tulu_v1_mix_ep=3 models before transformers update.
        # ('llama-7b_tuluv1m:50k_log_prob_decr_<10.16update', '../results/oi4_tulu_v1_mix_ep=3/llama-7b_tuluv1m:50k_log_prob_decr'),
    ],
}

for dataset in dataset_list:
    save_dirs += reference_runs_by_dataset.get(
        dataset,
        [(f'llama-7b_{dataset}_ep=2', f'../results/oi2/llama-7b_{dataset}_ep=2')],
    )

    # Add every run under the dataset's oi5 directory, except paths that
    # contain 'dppmapbd' or 'semdedup'.
    exp_dir = f'../results/oi5_{dataset}:llama-7b'
    run_dirs = [os.path.join(exp_dir, entry) for entry in os.listdir(exp_dir)]
    save_dirs += [
        (os.path.basename(run_dir), run_dir)
        for run_dir in run_dirs
        if 'dppmapbd' not in run_dir and 'semdedup' not in run_dir
    ]
    
# ## just compare dppmap grad vs. text
# save_dirs = [x for x in save_dirs if 'prune:size=10000:ep=10' in x[1] and (
#         'random' in x[1] or 
#         'dppmap' in x[1]
#     )
# ]
#####


# ##### code instructions
# save_dirs = [
#     ('llama2-7b', '../results/baselines/NousResearch/Llama-2-7b-hf/'),
#     ('codellama-7b', '../results/baselines/codellama/CodeLlama-7b-hf/'),
#     ('codellama-7b-instruct', '../results/baselines/codellama/CodeLlama-7b-Python-hf/'),
#     ('codellama-7b-python', '../results/baselines/codellama/CodeLlama-7b-Instruct-hf/'),
# ]

# exp_dir = '../results/oi2'
# save_dirs += [(os.path.basename(x), x) for x in [os.path.join(exp_dir, x) for x in os.listdir(exp_dir)] if 'starcoder' in x]
# exp_dir = '../results/oi6_starcoder_ep=5'
# # save_dirs += [(os.path.basename(x), x) for x in [os.path.join(exp_dir, x) for x in os.listdir(exp_dir)]]
# # exp_dir = '../results/oi5_starcoder_commentinstr:codellama-7b'
# # exp_dir = '../results/oi5_starcoder_commentinstrv2:codellama-7b'
# exp_dir = '../results/oi5_starcoder_commentinstrv5:codellama-7b'
# save_dirs += [(os.path.basename(x), x) for x in [os.path.join(exp_dir, x) for x in os.listdir(exp_dir)]]
# #####


###### 

from llm.evaluate import detect_oom_evals
# Gather every `eval/<task>/<job>.out` log under each save directory (the second
# element of each `save_dirs` entry) and hand them to `detect_oom_evals`; the
# returned paths are printed only when the result is non-empty.
oom_eval_paths = detect_oom_evals(list(itertools.chain.from_iterable(
    glob.glob(os.path.join(save_dir_entry[1], 'eval/*/*.out'))
    for save_dir_entry in save_dirs)))
if oom_eval_paths:
    print(oom_eval_paths)
    
# Metric columns that are displayed but left out of the per-row 'Average'.
cols_avg_blacklist = ['AlpacaFarm/Len']

# Chat-format mode passed to `get_eval_results`. Other values used in this notebook:
# chat_fmt = False
# chat_fmt = 'both'
# chat_fmt = 'auto' # base model no chatfmt, tuned model with chatfmt
# chat_fmt = 'mix'  # non-alpacaeval no chatfmt, alpacaeval chatfmt
chat_fmt = True

# Fine-tuning argument fields requested from `get_eval_results` and kept as the
# leading columns of the results frame.
ft_args_fields = [
    'run_name',
    'model_args.model_name_or_path',
    'data_args.subsample_mixture',
    'data_args.max_train_samples',
    'data_args.train_file',
]
#     cols = ['MMLU/0-shot', 'GSM/CoT', 'BBH/CoT', 'TydiQA/GP', 'Codex-Eval/Pass@1']
#     cols = ['MMLU/0-shot', 'GSM/Direct', 'BBH/Direct', 'TydiQA/CB', 'Codex-Eval/Pass@1']

#     cols = ['MMLU/0-shot', 'GSM/CoT', 'BBH/CoT', 'Codex-Eval/Pass@1', 'AlpacaFarm/WR']
#     cols = ['MMLU/0-shot', 'MMLU/5-shot', 'GSM/Direct', 'GSM/CoT', 'BBH/Direct', 'BBH/CoT']

#     cols = ['MMLU/0-shot', 'GSM/CoT', 'BBH/Direct', 'TydiQA/GP', 'Codex-Eval/Pass@1', 'AlpacaFarm/WR']
#     cols = ['MMLU/0-shot', 'GSM/CoT', 'BBH/CoT', 'Codex-Eval/Pass@1', 'AlpacaFarm/WR']
# cols = ['MMLU/0-shot', 'MMLU/5-shot', 'GSM/Direct', 'GSM/CoT', 'BBH/Direct', 'BBH/CoT', 'TydiQA/CB', 'TydiQA/GP', 'Codex-Eval/Pass@1'] #  'ToxiGen/Acc'
# Default metric columns to display (filtered later to those present in the results).
cols = [
    'MMLU/0-shot', 'MMLU/5-shot',
    'GSM/Direct', 'GSM/CoT',
    'BBH/Direct', 'BBH/CoT',
    'TydiQA/CB', 'TydiQA/GP',
    'Codex-Eval/Pass@1',
    'AlpacaFarm/WR*', 'AlpacaFarm/Len',
]
# cols = ['AlpacaFarm/WR', 'AlpacaFarm/ΔWR', 'AlpacaFarm/Len']
# cols = ['MMLU/0-shot', 'MMLU/5-shot', 'GSM/Direct', 'GSM/CoT', 'BBH/Direct', 'BBH/CoT', 'TydiQA/CB', 'TydiQA/GP', 'Codex-Eval/Pass@1', 'AlpacaFarm/WR*'] 
# cols = ['MMLU/0-shot', 'MMLU/5-shot', 'GSM/Direct', 'GSM/CoT', 'BBH/Direct', 'BBH/CoT', 'TydiQA/CB', 'TydiQA/GP', 'Codex-Eval/Pass@1', 'AlpacaFarm/WR', 'AlpacaFarm/ΔWR', 'AlpacaFarm/Len'] 


# cols = [f'BBH {x}' for x in ['reasoning', 'nlu', 'knowledge', 'multilingual']]; cols = [x+'/Direct' for x in cols] + [x+'/CoT' for x in cols]
# cols = [f'MMLU {x}' for x in ['STEM', 'humanities', 'social sciences', 'other']]; cols = [x+'/0-shot' for x in cols] + [x+'/5-shot' for x in cols]

#     cols = ['MMLU/0-shot', 'MMLU/5-shot', 'GSM/Direct', 'GSM/CoT', 'BBH/Direct', 'BBH/CoT', 'TydiQA/CB', 'TydiQA/GP', 'Codex-Eval/Pass@1', 'AlpacaFarm/WR', 'AlpacaFarm/Rep', 'AlpacaFarm/WR*'] #  'ToxiGen/Acc'
#     cols = ['AlpacaFarm/WR', 'AlpacaFarm/Rep', 'AlpacaFarm/WR*']
#     cols = ['MMLU/0-shot', 'MMLU/5-shot', 'GSM/Direct', 'GSM/CoT', 'BBH/Direct', 'BBH/CoT', 'Codex-Eval/Pass@1', 'AlpacaFarm/WR'] #  entire, without tydiqa, which has high variance
#     cols = ['MMLU/0-shot', 'MMLU/5-shot', 'GSM/Direct', 'GSM/CoT', 'BBH/Direct', 'BBH/CoT', 'Codex-Eval/Pass@1', ] #  entire, without tydiqa, which has high variance
# Experiment-specific overrides, keyed on the current value of `exp_dir`.
if 'open_orca_slim' in exp_dir:
    # Wider alternative: ['MMLU/0-shot', 'MMLU/5-shot', 'BBH/Direct', 'BBH/CoT']
    cols = ['MMLU/0-shot', 'BBH/Direct']
if 'starcoder' in exp_dir:
    cols = ['Codex-Eval/Pass@1']
    chat_fmt = 'both'
    # Only add the field when it is missing: a duplicated entry would produce a
    # duplicated column in `df[ft_args_fields + cols]` further down.
    if 'data_args.train_file' not in ft_args_fields:
        ft_args_fields = ft_args_fields + ['data_args.train_file']
print(f'chat_fmt={chat_fmt}')
# Load the evaluation results for every (name, path) entry in `save_dirs`.
df = get_eval_results(
    save_dirs,
    chat_fmt=chat_fmt,
    ft_args_fields=ft_args_fields,
    use_normalized_preferred_metric=use_normalized_preferred_metric,
)

# Keep only the requested metric columns that exist in the results, placed
# after the fine-tuning-argument fields.
cols = [metric for metric in cols if metric in df.columns]
df = df[ft_args_fields + cols]

# Per-row 'Average' over the metric columns, excluding the blacklisted ones.
cols_to_average = list(set(cols)-set(cols_avg_blacklist))
if chat_fmt != 'both':
    df['Average'] = df[cols_to_average].mean(axis=1)
else:
    # Two-level columns: one average per second-level label.
    for col_lvl2 in ['', 'chatfmt']:
        wanted = set([(metric, col_lvl2) for metric in cols_to_average])
        df[('Average', col_lvl2)] = df[list(set(df.columns) & wanted)].mean(axis=1)

if sort_rows:
    df = pd_sort_rows_by_avg_ranking(df)
    # Flip the sign of the 'ranking' column produced by the helper above.
    df['ranking'] = -df['ranking']
    if chat_fmt == 'both':
        sort_value_col = ('Average', 'chatfmt')
    else:
        sort_value_col = 'Average'
    sort_value_col_ascending = False
#     sort_value_col, sort_value_col_ascending = 'ranking', False
    df = df.sort_values(by=sort_value_col, ascending=sort_value_col_ascending)
df = df.reset_index(drop=True)


def compute_total_train_samples(x):
    """Return the digits following ``size=`` in a row's run name.

    `x` is a row (anything indexable by the run-name column label). The label
    is ``('run_name', '')`` when the global `chat_fmt` is ``'both'`` and
    ``'run_name'`` otherwise. The digits are returned as a string; `None` is
    returned when the run name has no ``size=<digits>`` part.
    """
    run_name_key = ('run_name', '') if chat_fmt == 'both' else 'run_name'
    size_match = re.search(r'size=(\d+)', x[run_name_key])
    if not size_match:
        return None
    return size_match.group(1)
# Insert the size parsed from each run name at column position 1; the label is
# a two-level tuple when the frame has ('name', sub-label) columns.
total_train_samples_label = ('total_train_samples', '') if chat_fmt == 'both' else 'total_train_samples'
df.insert(1, total_train_samples_label, df.apply(compute_total_train_samples, axis=1))
def extract_dataset_from_train_file(x):
    """Derive a short dataset name from a training-file path.

    Takes the last ``/``-separated component, cuts it at the first ``.jsonl``,
    then strips a trailing ``_data`` and, after that, a trailing ``_train``.
    """
    stem = x.rsplit('/', 1)[-1].split('.jsonl', 1)[0]
    # Order matters: '_data' is removed before '_train' is checked.
    for suffix in ('_data', '_train'):
        if stem.endswith(suffix):
            stem = stem[:-len(suffix)]
    return stem
# Insert the dataset name derived from each row's training file at column
# position 1, then drop the raw training-file column.
dataset_label = ('dataset', '') if chat_fmt == 'both' else 'dataset'
dataset_names = df['data_args.train_file'].apply(extract_dataset_from_train_file)
df.insert(1, dataset_label, dataset_names)
df = df.drop(columns='data_args.train_file')

# Render the results as styled tables.
#
# * `ft2` experiments: one table per (model substring, target sample count),
#   where the sample count is recomputed from `data_args.subsample_mixture`.
# * everything else: one table per (model substring, N, dataset), where N is the
#   `size=` value parsed from the run name (None selects the rows without one).
#   Runs matching the random-baseline patterns are averaged into a single row
#   by `pd_average_col_contains_substr`.
if any(exp_dir.endswith(x) for x in ['ft2']):
#     for model_name_contain in ['gpt2', 'llama', 'pythia-1.4b']:
    for model_name_contain in ['llama']:
        for total_train_samples in [10000, 50000, 100000, 200000]:
#         for total_train_samples in [200000, 400000, 600000]:
            # `df` already carries a 'total_train_samples' column parsed from
            # the run name; drop it first, otherwise `insert` raises ValueError
            # because the column already exists.
            dfc = df.drop(columns=['total_train_samples'], errors='ignore')
            dfc.insert(0, 'total_train_samples', dfc['data_args.subsample_mixture'].apply(
                lambda d: sum(d.values()) if d else 200000))
            # Keep runs within +/-20000 samples of the target size.
            dfc = dfc[dfc['total_train_samples'].apply(
                lambda x: total_train_samples-20000<x<total_train_samples+20000)]
            dfc = dfc[dfc['model_args.model_name_or_path'].apply(
                lambda x: model_name_contain in x)]
            # `assign` returns a new frame, so the filtered slice is not mutated.
            dfc = dfc.assign(total_train_samples=dfc['total_train_samples'].astype(str))
            dfc = dfc.drop(columns=['model_args.model_name_or_path', 'data_args.subsample_mixture'])
            dfc = dfc.reset_index(drop=True)
            if len(dfc):
                display(dfc
                        .style
                        .set_properties(**{'text-align': 'left'})
                        .background_gradient(cmap ='coolwarm')
                        .format(precision=2))
else:
    from rosemary import pd_average_col_contains_substr

    # Column labels are two-level tuples when chat_fmt == 'both'.
    col_runname = 'run_name' if chat_fmt != 'both' else ('run_name', '')
    # Column used to order the rows of each table.
    # Alternatives: 'AlpacaFarm/WR', 'MMLU/0-shot', 'GSM/CoT', 'BBH/Direct', 'TydiQA/GP'
    col = ('Average', 'chatfmt') if chat_fmt == 'both' else 'Average'
    substitute = True
    # Run-name patterns whose matching rows are averaged into one row each.
    # Raw strings keep `\d` as a regex token without an invalid-escape warning.
    random_run_patterns = [
        '_random_',
        r'score=random:s=\d_pace=prune:size=10000:ep=10',
        r'score=random:s=\d_pace=prune:size=50000:ep=5',
        r'score=random:s=\d_pace=prune:size=150000:ep=3',
        r'score=random:s=\d_pace=prune:size=150000:ep=1',
        r'score=random:s=\d_pace=prune:size=100000',
        r'score=random:s=\d_pace=prune:size=200000',
        r'score=random:s=\d_pace=prune:size=400000',
    ]

    for model_name_contain in ['llama', 'pythia-1.4b', 'mistral', 'zephyr']:
        # Rows whose base-model path contains the substring (case-insensitive).
        # The tables below are built from this filtered frame; restarting from
        # the unfiltered `df` would discard the filter and repeat identical
        # tables for every matching model name.
        dfm = df[df['model_args.model_name_or_path'].apply(
            lambda x: model_name_contain in x.lower())]
        if not len(dfm): continue
        Ns = sorted(np.unique([int(x) for x in dfm['total_train_samples'].to_numpy() if x]).tolist())
        datasets = sorted(np.unique(dfm['dataset']).tolist())
        for N in Ns+[None]:
            for dataset in datasets:
                dfc = dfm.copy()
                # Rows with a falsy size / dataset value pass both filters.
                dfc = dfc[dfc['total_train_samples'].apply(lambda x: int(x) == N if x else True)]
                dfc = dfc[dfc['dataset'].apply(lambda x: x == dataset if x else True)]
                if not len(dfc): continue
                for run_pattern in random_run_patterns:
                    dfc = pd_average_col_contains_substr(dfc, col_runname, run_pattern, substitute=substitute)
                #     dfc = dfc.sort_values(['ranking'], ascending=False)
                dfc = dfc.sort_values(by=[col], ascending=False)
                dfc = dfc.drop(columns=['model_args.model_name_or_path', 'data_args.subsample_mixture', 'data_args.max_train_samples', 'dataset'], 
                               axis=1, level=0 if chat_fmt=='both' else None)
                dfc = dfc.reset_index(drop=True)
                # Values eligible for underlining. Only used when
                # chat_fmt == 'both'; computed once per table rather than once
                # per rendered cell.
                if chat_fmt == 'both':
                    underline_values = dfc[
                        list(set(dfc.columns) & set([(c, '') for c in cols])) + [col]
                    ].values.flatten()
                else:
                    underline_values = ()
                display(dfc
                        .style
                        .applymap(lambda x: 'max-width: 60ch;', subset=['run_name'])
                        .set_table_styles([{'selector': 'td', 'props': [('white-space', 'pre-wrap'), ('word-wrap', 'break-word')]}])
                        .set_properties(**{'text-align': 'left'})
                        .background_gradient(cmap ='coolwarm')
                        .applymap(lambda x: 'text-decoration: underline;'
                                  if chat_fmt=='both' and x in underline_values else '')
                        .format(precision=1))

# llama-7b_tulu_v1_mix(paper)
# MMLU/0-shot, MMLU/5-shot, GSM/Direct, GSM/CoT, BBH/Direct, BBH/CoT, TydiQA/GP, TydiQA/CB, CodexEval/Pass@1, AlpacaEval(vs.Davinci-003)
# 44.8       , 47.1       , 7.0       , 25.0   , 38.5      , 38.5   , 43.5     , 8.0      , 18.6            , 48.6

['../results/oi5_dolly:llama-7b/llama-7b_dolly_score=dppmap:k=vmf:gamma=auto1000:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10/eval/alpacafarm_ann=chatgpt_chatfmt/1299941.out']
chat_fmt=True


  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b,,32.8,33.5,5.4,10.8,32.1,27.4,9.7,35.4,0.0,,,20.8,


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_dolly_score=dppmap:k=vmf:gamma=1:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,36.0,32.6,3.8,10.6,30.2,26.0,6.1,28.7,10.0,36.1,347.3,22.0,-37.7
1,llama-7b_dolly_score=dppmap:k=vmf:gamma=auto1000:kmd=llama7b:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,34.8,34.0,5.2,9.0,28.9,24.6,7.5,30.7,8.9,34.6,191.7,21.8,-38.0
2,llama-7b_dolly_score=dppmap:k=rbf:gamma=auto1000:kmd=llama7b:kemb=text+embedding_pace=prune:size=10000:ep=10,10000.0,35.6,33.3,3.8,6.2,31.9,22.1,7.9,33.8,6.7,28.0,200.8,20.9,-51.6
3,llama-7b_dolly_score=dppmap:k=vmf:gamma=auto1000:kmd=mpnet_pace=prune:size=10000:ep=10,10000.0,35.7,33.8,4.8,6.4,28.7,23.0,7.5,33.2,8.5,27.1,187.7,20.9,-47.0
4,llama-7b_dolly_ep=2,,35.7,34.3,5.0,8.6,27.4,21.7,8.1,36.1,10.6,20.5,312.5,20.8,-36.9
5,score=random:s=\d_pace=prune:size=10000:ep=10_avg (N=2),10000.0,34.3,32.7,3.9,6.2,29.4,21.6,7.0,35.5,8.8,26.5,116.8,20.6,-56.0
6,llama-7b_dolly_score=dppmap:k=vmf:gamma=auto10000:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,36.0,30.0,4.8,9.0,24.2,24.9,5.6,20.6,8.5,38.7,287.0,20.2,-54.9
7,llama-7b_dolly_score=dppmap:k=vmf:gamma=auto1000:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,33.7,33.1,6.4,8.0,22.2,21.9,6.0,30.1,7.1,33.0,233.5,20.1,-54.9


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_flan_v2_ep=2,,36.4,40.3,4.0,11.0,35.6,27.5,6.6,40.8,0.0,,,22.5,
1,llama-7b_flan_v2_score=dppmap:k=vmf:gamma=1:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,29.7,33.1,4.4,10.2,30.9,26.4,7.1,34.9,8.3,20.4,368.6,20.5,-41.4
2,llama-7b_flan_v2_score=dppmap:k=vmf:gamma=auto1000:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,29.0,31.5,4.2,7.8,32.6,28.0,7.7,33.9,6.5,22.0,286.6,20.3,-49.4
3,llama-7b_flan_v2_score=dppmap:k=vmf:gamma=auto1000:kmd=llama7b:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,34.4,35.9,6.2,7.4,32.0,27.7,6.5,27.3,7.5,17.0,184.6,20.2,-42.5
4,llama-7b_flan_v2_score=dppmap:k=vmf:gamma=auto10000:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,30.6,34.4,5.4,6.6,31.6,26.6,6.4,28.6,8.9,21.0,277.9,20.0,-48.0
5,llama-7b_flan_v2_score=dppmap:k=rbf:gamma=auto1000:kmd=llama7b:kemb=text+embedding_pace=prune:size=10000:ep=10,10000.0,36.5,35.0,5.6,6.8,30.1,23.1,8.0,34.4,10.0,10.6,118.1,20.0,-43.3
6,score=random:s=\d_pace=prune:size=10000:ep=10_avg (N=2),10000.0,30.5,26.6,4.7,5.5,30.5,26.1,7.7,34.3,3.7,9.6,41.7,17.9,-66.7


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_oasst1_ep=2,,33.5,32.2,4.6,7.6,32.7,29.3,7.0,24.6,8.1,44.9,312.8,22.5,-35.5
1,score=random:s=\d_pace=prune:size=10000:ep=10_avg (N=2),10000.0,34.0,31.8,5.5,8.7,25.0,22.0,5.2,18.7,6.3,44.1,272.6,20.1,-60.9
2,llama-7b_oasst1_score=dppmap:k=rbf:gamma=auto1000:kmd=llama7b:kemb=text+embedding_pace=prune:size=10000:ep=10,10000.0,33.3,31.8,4.2,7.2,17.1,25.4,5.1,22.1,5.3,44.5,338.4,19.6,-65.6
3,llama-7b_oasst1_score=dppmap:k=vmf:gamma=auto10000:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,34.2,32.4,5.2,6.2,12.6,20.9,5.1,19.2,8.9,42.6,342.4,18.7,-62.5
4,llama-7b_oasst1_score=dppmap:k=vmf:gamma=auto1000:kmd=llama7b:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,32.9,31.9,5.4,8.4,14.2,22.0,4.9,16.3,5.1,42.4,245.3,18.3,-70.7
5,llama-7b_oasst1_score=dppmap:k=vmf:gamma=auto1000:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,35.1,33.5,3.6,7.2,12.7,21.4,5.1,15.2,6.3,40.7,331.0,18.1,-68.6
6,llama-7b_oasst1_score=dppmap:k=vmf:gamma=1:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,33.3,33.2,3.8,6.4,7.3,14.3,4.8,18.2,8.5,43.5,312.3,17.3,-70.5


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_sharegptv2_ep=2,,43.0,39.6,4.6,10.2,32.4,26.4,6.2,22.3,5.5,52.4,339.1,24.3,-28.8
1,llama-7b_sharegptv2_score=dppmap:k=vmf:gamma=auto10000:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,33.9,31.5,5.2,9.8,29.4,22.9,6.4,21.1,5.1,48.1,281.2,21.3,-51.4
2,llama-7b_sharegptv2_score=log:prob:neg_pace=prune:size=10000:ep=10,10000.0,36.6,34.6,6.6,8.0,33.3,27.2,7.6,29.0,8.5,,,21.3,
3,llama-7b_sharegptv2_score=dppmap:k=vmf:gamma=auto1000:kmd=llama7b:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,36.9,33.0,5.4,9.0,23.0,27.1,5.4,15.2,6.5,48.2,367.7,21.0,-44.2
4,score=random:s=\d_pace=prune:size=10000:ep=10_avg (N=2),10000.0,34.4,31.7,4.9,7.3,24.0,25.2,6.0,20.4,5.2,50.0,286.0,20.9,-54.9
5,llama-7b_sharegptv2_score=dppmap:k=vmf:gamma=auto1000:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,35.4,31.6,4.0,8.8,22.5,17.6,6.1,22.1,4.1,48.5,327.5,20.1,-61.1
6,llama-7b_sharegptv2_score=dppmap:k=vmf:gamma=1:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,31.8,30.8,4.8,7.2,21.7,22.3,5.6,20.0,6.1,45.8,359.1,19.6,-65.0
7,llama-7b_sharegptv2_score=dppmap:k=rbf:gamma=auto1000:kmd=llama7b:kemb=text+embedding_pace=prune:size=10000:ep=10,10000.0,32.0,31.1,5.2,6.4,14.2,26.4,3.7,17.7,7.9,48.6,284.7,19.3,-64.2
8,llama-7b_sharegptv2_score=el2n:agg=mean_pace=prune:size=10000:ep=10,10000.0,28.7,30.0,6.0,7.8,33.1,26.9,7.0,26.2,6.3,,,19.1,
9,llama-7b_sharegptv2_score=grad:loraB:l2n_pace=prune:size=10000:ep=10,10000.0,33.5,33.3,4.8,7.4,27.9,26.1,6.6,23.1,7.3,,,18.9,


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_stanford_alpaca_score=dppmap:k=vmf:gamma=auto10000:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,33.2,33.8,5.8,8.2,32.9,27.5,8.2,33.4,5.9,40.7,255.4,23.0,-31.8
1,llama-7b_stanford_alpaca_score=grad:loraB:l2n_pace=prune:size=10000:ep=10,10000.0,30.3,31.9,7.0,8.4,31.9,25.8,8.2,33.7,9.1,39.0,393.6,22.5,-30.8
2,llama-7b_stanford_alpaca_score=dppmap:k=vmf:gamma=auto1000:kmd=llama7br256p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,34.9,33.8,5.2,7.2,33.9,25.2,6.5,31.6,6.1,40.1,263.8,22.4,-39.5
3,llama-7b_stanford_alpaca_score=ifd:neg_pace=prune:size=10000:ep=10,10000.0,35.3,33.3,4.2,9.0,33.1,26.2,7.2,34.3,6.3,35.2,199.6,22.4,-38.0
4,llama-7b_stanford_alpaca_score=dppmap:k=vmf:gamma=auto1000:kmd=llama7b:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,31.0,32.6,4.6,8.0,32.0,27.1,7.0,30.9,7.3,39.5,164.5,22.0,-44.1
5,llama-7b_stanford_alpaca_score=dppmap:k=vmf:gamma=auto1000:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,31.0,31.2,4.6,7.6,30.6,26.6,6.7,33.0,6.9,39.0,279.1,21.7,-48.1
6,llama-7b_stanford_alpaca_score=dppmap:k=vmf:gamma=1:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,29.4,30.0,4.8,7.0,31.0,26.1,7.2,32.9,5.9,39.5,161.8,21.4,-53.8
7,llama-7b_stanford_alpaca_ep=2,,36.7,36.7,3.6,4.8,33.0,25.8,5.1,27.7,5.9,33.2,173.7,21.3,-54.7
8,llama-7b_stanford_alpaca_score=dppmap:k=rbf:gamma=auto1000:kmd=llama7b:kemb=text+embedding_pace=prune:size=10000:ep=10,10000.0,32.9,32.3,6.0,7.8,31.7,28.5,5.6,22.1,8.9,35.3,211.0,21.1,-46.7
9,llama-7b_stanford_alpaca_score=dppmap:k=vmf:gamma=auto1000:kmd=pythia1br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,31.4,30.0,4.4,8.6,32.9,24.9,5.4,25.4,8.5,36.6,185.3,20.8,-56.2


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,score=random:s=\d_pace=prune:size=10000:ep=10_avg (N=1),10000,38.3,34.8,6.6,8.6,19.4,22.2,6.5,22.9,10.2,49.6,277.9,21.9,-35.1
1,llama-7b_ultrachat200kv2_score=dppmap:k=vmf:gamma=auto1000:kmd=mpnet_pace=prune:size=10000:ep=10,10000,37.7,33.4,6.2,7.4,18.3,21.2,7.7,25.4,6.7,49.1,285.0,21.3,-41.4
2,llama-7b_ultrachat200kv2_score=dppmap:k=rbf:gamma=auto1000:kmd=llama7br512p4096:kemb=text+embedding_pace=prune:size=10000:ep=10,10000,34.7,33.1,4.6,5.4,26.4,23.7,7.4,23.7,6.1,46.9,260.4,21.2,-52.0
3,llama-7b_ultrachat200kv2_score=dppmap:k=vmf:gamma=auto1000:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000,38.1,34.1,4.8,10.6,21.0,18.8,6.5,24.0,7.5,44.3,233.4,21.0,-46.2
4,llama-7b_ultrachat200kv2_score=dppmap:k=vmf:gamma=1:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000,37.7,32.5,5.2,6.8,20.3,21.1,6.1,23.1,5.3,44.3,215.0,20.2,-60.4


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_wizardlmv2_ep=2,,42.7,38.8,4.0,11.0,30.1,27.4,6.0,21.5,16.3,47.9,271.9,24.6,-30.7
1,llama-7b_wizardlmv2_score=dppmap:k=vmf:gamma=auto1000:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,35.0,30.0,4.4,11.0,25.3,27.5,6.5,24.5,8.3,46.5,286.3,21.9,-41.1
2,llama-7b_wizardlmv2_score=dppmap:k=vmf:gamma=1:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,35.5,33.6,4.2,9.4,25.9,27.8,5.9,20.3,6.1,46.0,315.5,21.5,-44.1
3,score=random:s=\d_pace=prune:size=10000:ep=10_avg (N=1),10000.0,30.4,27.3,6.0,8.0,29.2,31.1,6.4,23.2,7.1,44.9,307.3,21.4,-45.2
4,llama-7b_wizardlmv2_score=dppmap:k=rbf:gamma=auto1000:kmd=llama7br512p4096:kemb=text+embedding_pace=prune:size=10000:ep=10,10000.0,33.9,32.8,4.4,9.4,26.2,27.7,5.5,21.1,7.3,45.1,338.7,21.3,-45.1
5,llama-7b_wizardlmv2_score=dppmap:k=vmf:gamma=auto10000:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=10000:ep=10,10000.0,34.8,31.1,3.4,11.4,19.4,27.3,6.7,24.2,8.9,45.5,274.7,21.3,-46.7


  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b,,32.8,33.5,5.4,10.8,32.1,27.4,9.7,35.4,0.0,,,20.8,


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_dolly_score=dppmap:k=rbf:gamma=1e-3:kmd=llama7br512p4096:kemb=text+embedding_pace=prune:size=20000:ep=4,20000.0,35.3,31.7,4.4,7.8,30.0,24.7,7.1,38.5,7.5,32.5,176.0,21.9,-46.5
1,llama-7b_dolly_score=random:s=0_pace=prune:size=20000:ep=4,20000.0,35.5,34.2,5.8,8.0,27.3,22.7,7.0,34.9,6.3,29.6,232.5,21.1,-44.0
2,llama-7b_dolly_ep=2,,35.7,34.3,5.0,8.6,27.4,21.7,8.1,36.1,10.6,20.5,312.5,20.8,-36.9
3,llama-7b_dolly_score=dppmap:k=vmf:gamma=1:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=20000:ep=4,20000.0,33.5,32.2,5.8,8.8,21.6,25.1,6.0,25.0,7.1,36.4,270.9,20.1,-53.8


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_flan_v2_ep=2,,36.4,40.3,4.0,11.0,35.6,27.5,6.6,40.8,0.0,,,22.5,
1,llama-7b_flan_v2_score=dppmap:k=vmf:gamma=1:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=20000:ep=4,20000.0,32.5,35.7,5.0,12.6,34.3,26.0,6.9,35.1,9.3,20.9,369.7,21.8,-27.4
2,llama-7b_flan_v2_score=dppmap:k=rbf:gamma=1e-3:kmd=llama7br512p4096:kemb=text+embedding_pace=prune:size=20000:ep=4,20000.0,38.9,37.1,3.8,7.2,33.5,26.4,6.2,35.5,7.3,8.7,153.8,20.5,-44.9
3,llama-7b_flan_v2_score=random:s=0_pace=prune:size=20000:ep=4,20000.0,27.9,31.4,4.8,8.0,33.0,25.9,7.3,35.6,8.1,8.9,68.0,19.1,-53.0


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_oasst1_ep=2,,33.5,32.2,4.6,7.6,32.7,29.3,7.0,24.6,8.1,44.9,312.8,22.5,-35.5


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_sharegptv2_ep=2,,43.0,39.6,4.6,10.2,32.4,26.4,6.2,22.3,5.5,52.4,339.1,24.3,-28.8


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_stanford_alpaca_score=dppmap:k=rbf:gamma=1e-3:kmd=llama7br512p4096:kemb=text+embedding_pace=prune:size=20000:ep=4,20000.0,32.8,32.3,3.8,7.0,32.6,28.8,7.2,32.1,6.9,39.5,137.0,22.3,-46.1
1,llama-7b_stanford_alpaca_score=dppmap:k=vmf:gamma=1:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=20000:ep=4,20000.0,36.2,34.3,4.0,4.8,32.6,25.6,6.0,30.7,5.5,40.8,188.6,22.0,-49.7
2,llama-7b_stanford_alpaca_ep=2,,36.7,36.7,3.6,4.8,33.0,25.8,5.1,27.7,5.9,33.2,173.7,21.3,-54.7
3,llama-7b_stanford_alpaca_score=random:s=0_pace=prune:size=20000:ep=4,20000.0,34.0,32.4,4.4,5.8,30.8,23.6,7.0,26.6,6.7,35.6,160.3,20.7,-57.5


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_ultrachat200kv2_score=dppmap:k=rbf:gamma=1e-3:kmd=llama7br512p4096:kemb=text+embedding_pace=prune:size=20000:ep=4,20000,39.0,36.2,3.6,10.4,30.6,25.0,7.7,27.2,7.3,47.5,234.0,23.4,-32.5
1,llama-7b_ultrachat200kv2_score=dppmap:k=vmf:gamma=1:kmd=llama7br512p4096:kemb=grad+rp+loraB_pace=prune:size=20000:ep=4,20000,38.7,35.5,5.0,8.4,27.6,23.2,6.0,21.3,6.9,49.8,284.8,22.2,-39.8
2,llama-7b_ultrachat200kv2_score=random:s=0_pace=prune:size=20000:ep=4,20000,36.4,32.3,5.4,8.6,18.0,20.6,6.4,22.3,9.3,48.7,271.3,20.8,-48.1


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_wizardlmv2_ep=2,,42.7,38.8,4.0,11.0,30.1,27.4,6.0,21.5,16.3,47.9,271.9,24.6,-30.7


  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b,,32.8,33.5,5.4,10.8,32.1,27.4,9.7,35.4,0.0,,,20.8,


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_dolly_ep=2,,35.7,34.3,5.0,8.6,27.4,21.7,8.1,36.1,10.6,20.5,312.5,20.8,-36.9


  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_flan_v2_ep=2,,36.4,40.3,4.0,11.0,35.6,27.5,6.6,40.8,0.0,,,22.5,


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_oasst1_ep=2,,33.5,32.2,4.6,7.6,32.7,29.3,7.0,24.6,8.1,44.9,312.8,22.5,-35.5


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_sharegptv2_ep=2,,43.0,39.6,4.6,10.2,32.4,26.4,6.2,22.3,5.5,52.4,339.1,24.3,-28.8
1,llama-7b_sharegptv2_score=el2n:agg=mean_pace=prune:size=50000:ep=5,50000.0,39.8,38.9,4.6,11.6,25.3,24.4,6.7,23.8,5.1,51.0,279.1,23.1,-35.2
2,llama-7b_sharegptv2_score=log:prob:neg_pace=prune:size=50000:ep=5,50000.0,42.7,41.3,4.4,8.4,24.2,21.5,6.7,21.9,4.9,50.4,311.9,22.6,-41.7
3,score=random:s=\d_pace=prune:size=50000:ep=5_avg (N=2),50000.0,37.9,33.8,5.2,8.5,24.5,24.0,5.2,19.0,5.1,53.0,309.4,21.6,-46.1
4,llama-7b_sharegptv2_score=dppmap:k=vmf:gamma=auto10000:kmd=mpnet_pace=prune:size=50000:ep=5,50000.0,35.7,34.8,5.4,10.4,22.9,20.0,5.3,18.7,4.9,53.8,299.5,21.2,-49.8
5,llama-7b_sharegptv2_score=dppmap:k=vmf:gamma=auto10000:kmd=llama7b:kemb=grad+rp+loraB_pace=prune:size=50000:ep=5,50000.0,40.1,35.7,5.8,10.4,15.6,20.8,4.9,21.4,4.5,52.5,326.8,21.2,-46.6
6,llama-7b_sharegptv2_score=dppmap:k=vmf:gamma=auto10000:kmd=llama7b:kemb=text+embedding_pace=prune:size=50000:ep=5,50000.0,36.4,32.3,4.4,5.8,22.3,23.4,6.0,22.5,7.1,51.2,313.8,21.1,-52.5
7,llama-7b_sharegptv2_score=log:pmi:neg_pace=prune:size=50000:ep=5,50000.0,33.8,30.9,4.2,7.0,31.7,26.1,5.5,16.7,4.7,46.0,264.5,20.7,-62.3
8,llama-7b_sharegptv2_score=dppmap:k=rbf:gamma=auto10000:kmd=llama7b:kemb=text+embedding_pace=prune:size=50000:ep=5,50000.0,38.5,33.3,5.4,6.2,25.6,20.3,5.1,19.0,4.1,49.0,321.5,20.6,-57.3
9,llama-7b_sharegptv2_score=grad:loraB:l2n_pace=prune:size=50000:ep=5,50000.0,37.6,34.5,4.2,6.6,21.1,21.6,5.0,19.4,2.8,53.3,326.1,20.6,-59.0


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_stanford_alpaca_ep=2,,36.7,36.7,3.6,4.8,33.0,25.8,5.1,27.7,5.9,33.2,173.7,21.3,-54.7


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_wizardlmv2_ep=2,,42.7,38.8,4.0,11.0,30.1,27.4,6.0,21.5,16.3,47.9,271.9,24.6,-30.7


  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b,,32.8,33.5,5.4,10.8,32.1,27.4,9.7,35.4,0.0,,,20.8,


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_dolly_ep=2,,35.7,34.3,5.0,8.6,27.4,21.7,8.1,36.1,10.6,20.5,312.5,20.8,-36.9


  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_flan_v2_ep=2,,36.4,40.3,4.0,11.0,35.6,27.5,6.6,40.8,0.0,,,22.5,


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_oasst1_ep=2,,33.5,32.2,4.6,7.6,32.7,29.3,7.0,24.6,8.1,44.9,312.8,22.5,-35.5
1,llama-7b_oasst1_score=dppmap:k=vmf:gamma=auto20000:kmd=llama7b:kemb=grad+rp+loraB_pace=prune:size=60000:ep=3,60000.0,33.1,33.6,4.2,7.4,26.3,27.6,6.6,25.0,6.9,47.0,296.9,21.8,-43.9
2,llama-7b_oasst1_score=dppmap:k=vmf:gamma=auto20000:kmd=mpnet_pace=prune:size=60000:ep=3,60000.0,32.0,32.0,5.2,11.2,24.4,25.3,6.0,21.8,7.3,48.3,280.9,21.4,-48.2
3,llama-7b_oasst1_score=dppmap:k=rbf:gamma=auto20000:kmd=llama7b:kemb=text+embedding_pace=prune:size=60000:ep=3,60000.0,34.5,33.5,3.8,8.4,23.9,25.1,6.0,26.2,4.9,45.2,302.7,21.1,-53.2
4,llama-7b_oasst1_score=random:s=0_pace=prune:size=60000:ep=3,60000.0,33.8,32.5,5.6,9.6,24.7,25.2,5.6,22.9,6.9,43.4,306.2,21.0,-47.7


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_sharegptv2_ep=2,,43.0,39.6,4.6,10.2,32.4,26.4,6.2,22.3,5.5,52.4,339.1,24.3,-28.8


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_stanford_alpaca_score=random:s=0_pace=prune:size=60000:ep=3,60000.0,39.1,37.1,4.8,6.0,33.0,25.2,6.3,32.4,8.1,38.9,119.1,23.1,-38.6
1,llama-7b_stanford_alpaca_score=dppmap:k=vmf:gamma=auto20000:kmd=llama7b:kemb=grad+rp+loraB_pace=prune:size=60000:ep=3,60000.0,36.3,34.3,5.0,6.4,29.4,27.8,5.6,29.1,9.1,41.0,198.5,22.4,-39.1
2,llama-7b_stanford_alpaca_score=dppmap:k=rbf:gamma=auto20000:kmd=llama7b:kemb=text+embedding_pace=prune:size=60000:ep=3,60000.0,37.8,33.8,3.4,4.8,31.2,26.8,5.9,29.5,8.3,40.5,202.9,22.2,-45.7
3,llama-7b_stanford_alpaca_ep=2,,36.7,36.7,3.6,4.8,33.0,25.8,5.1,27.7,5.9,33.2,173.7,21.3,-54.7
4,llama-7b_stanford_alpaca_score=dppmap:k=vmf:gamma=auto20000:kmd=mpnet_pace=prune:size=60000:ep=3,60000.0,32.7,32.2,4.8,4.4,30.3,23.0,6.8,33.8,6.1,35.4,136.5,21.0,-60.1


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_wizardlmv2_ep=2,,42.7,38.8,4.0,11.0,30.1,27.4,6.0,21.5,16.3,47.9,271.9,24.6,-30.7


  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b,,32.8,33.5,5.4,10.8,32.1,27.4,9.7,35.4,0.0,,,20.8,


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_dolly_ep=2,,35.7,34.3,5.0,8.6,27.4,21.7,8.1,36.1,10.6,20.5,312.5,20.8,-36.9


  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_flan_v2_ep=2,,36.4,40.3,4.0,11.0,35.6,27.5,6.6,40.8,0.0,,,22.5,


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_oasst1_ep=2,,33.5,32.2,4.6,7.6,32.7,29.3,7.0,24.6,8.1,44.9,312.8,22.5,-35.5


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_sharegptv2_ep=2,,43.0,39.6,4.6,10.2,32.4,26.4,6.2,22.3,5.5,52.4,339.1,24.3,-28.8


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_stanford_alpaca_ep=2,,36.7,36.7,3.6,4.8,33.0,25.8,5.1,27.7,5.9,33.2,173.7,21.3,-54.7


Unnamed: 0,run_name,total_train_samples,MMLU/0-shot,MMLU/5-shot,GSM/Direct,GSM/CoT,BBH/Direct,BBH/CoT,TydiQA/CB,TydiQA/GP,Codex-Eval/Pass@1,AlpacaFarm/WR*,AlpacaFarm/Len,Average,ranking
0,llama-7b_wizardlmv2_ep=2,,42.7,38.8,4.0,11.0,30.1,27.4,6.0,21.5,16.3,47.9,271.9,24.6,-30.7


1