In [26]:
!nvidia-smi

Thu Jul 13 11:57:16 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.60.13    Driver Version: 525.60.13    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:8D:00.0 Off |                    0 |
| N/A   25C    P0    56W / 400W |   8930MiB / 40960MiB |      0%   E. Process |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM...  On   | 00000000:C7:00.0 Off |                    0 |
| N/A   25C    P0    58W / 400W |   8932MiB / 40960MiB |      0%   E. Proces

In [27]:
import os
from rosemary import jpt_setup; jpt_setup()

from llm.submit import multiline_to_singleline, submit_job_ccc, get_run_statistics
import pandas as pd
import json

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:

cols = ['model', 'size', 'mixed-precision', 'deepspeed', 
        'gpu mem usage (GB)', 'per-epoch time (hr)', 'per-iter time (s)']

# 55k data points, batch_size=128
data_oasst1 = [
    ('gpt2', 0.124, 'bf16', 'no', 10, None, None),
    ('gpt2-Large', 0.774, 'bf16', 'no', 36, 2.5, 11),
    # incorporate deep speed is costly!
    ('gpt2-Large', 0.774, 'bf16', 'stage 3 no offloading', 40, 6, 25),
    # 1 a100_40g: without offloading OOM on `.backward()`, runs fine with offloading.
    ('gpt2-xl', 1.5, 'bf16', 'stage 3 with offloading', 40, 13, 55),
    # 4 v100_32g: without offloading.
]

df_oasst1 = pd.DataFrame(data_oasst1, columns=cols)

cols = ['model', 'size', 'mixed-precision', 'deepspeed', 
        'gpu mem (GB)', 'cpu mem (GB)', 'per-epoch time (hr)', 'per-iter time (s)']
data = [
    ('gpt2-Large', 0.774, 'bf16', 'no', 36, 7, 9.5, 9),
]

print('instruction tune human-mix on 1 a100_40g:')
df = pd.DataFrame(data, columns=cols)
df

instruction tune human-mix on 1 a100_40g:


Unnamed: 0,model,size,mixed-precision,deepspeed,gpu mem (GB),cpu mem (GB),per-epoch time (hr),per-iter time (s)
0,gpt2-Large,0.774,bf16,no,36,7,9.5,9


# Finetuning with openinstruct/finetune.py


In [4]:
shell_scripts_template = """
export OPENAI_API_KEY=$(cat ~/.openai_api_key)
export HF_HOME="/dccstor/mit_fm/wpq/hf_cache/"

source /dccstor/mit_fm/miniconda/bin/activate open-instruct
cd /dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/

echo "Running on $(hostname)"
echo "======"
echo "{cmd}"
echo "======"

{cmd}

[ ! -f "{log_dir}/${{LSB_JOBID}}.out" ] || mv "{log_dir}/${{LSB_JOBID}}.out" "{save_dir}"
"""



In [13]:
job_name = 'ft'
test_run = True

queue = 'x86_12h' # 'x86_12h'
num_cpus = 20
num_gpus = 1
cpu_mem = 32
require = 'a100_80gb'

# model_name_or_path = 'mosaicml/mpt-7b'; max_seq_length = 2048
# model_name_or_path = 'gpt2'; max_seq_length = 1024
# model_name_or_path = 'gpt2-Large'; max_seq_length = 1024
# model_name_or_path = 'gpt2-xl'; max_seq_length = 1024
model_name_or_path = 'huggyllama/llama-7b'; max_seq_length = 2048


train_file = 'data/processed/oasst1/oasst1_data.jsonl'; train_file_short = 'oasst1'
train_file = 'data/processed/flanv2_cot_oasst1_dolly.jsonl'; train_file_short = 'human_mix'
# train_file = 'data/processed/flanv2_cot_oasst1_dolly_shuffled.jsonl'; train_file_short = 'human_mix_shuffled'

output_dir = f"results/{model_name_or_path.replace('/', ':')}_{train_file_short}"
if test_run:
    output_dir = 'jpt_' + output_dir

use_deepspeed = False
# deepspeed_config_file = 'ds_configs/stage3_no_offloading_accelerate_setauto.conf'
# deepspeed_config_file = 'ds_configs/stage3_offloading_accelerate.conf'
deepspeed_config_file = 'ds_configs/stage3_offloading_accelerate_setauto.conf'

use_lora = True
lora_rank = 4
lora_alpha = lora_rank
lora_dropout = 0.05

batch_size_per_gpu = 1
total_batch_size = 128
mixed_precision = 'bf16' # 'bf16', 'fp16'
checkpointing_steps = None # every n steps, where n='1' or every 'epoch'

gradient_acc_steps = int(total_batch_size/num_gpus/batch_size_per_gpu)

print(f"Training {model_name_or_path} "
      f"using {num_gpus} GPUs, "
      f"{batch_size_per_gpu} batch size per GPU, "
      f"{gradient_acc_steps} gradient accumulation steps.")

# do use fast tokenizer since mpt-7b does not have a fast tokenizer counter-part
#     --use_slow_tokenizer \
# do not use flash attention, since having problem installing flash-attn with cuda 12.1
#     --use_flash_attn \

cmd = f"""
{'!cd .. && ' if test_run else ''}accelerate launch \
    --mixed_precision {mixed_precision} \
    --num_machines 1 \
    --num_processes {num_gpus} \
    {'--use_deepspeed' if use_deepspeed else ''}
    {'--deepspeed_config_file '+deepspeed_config_file if use_deepspeed else ''}
    open_instruct/finetune.py \
    --model_name_or_path {model_name_or_path} \
    --tokenizer_name {model_name_or_path} \
    --train_file {train_file} \
    --max_seq_length {max_seq_length} \
    {'--use_lora' if use_lora else ''}
    --lora_rank {lora_rank} \
    --lora_alpha {lora_alpha} \
    --lora_dropout {lora_dropout} \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size {batch_size_per_gpu} \
    --gradient_accumulation_steps {gradient_acc_steps} \
    --learning_rate 2e-5 \
    --lr_scheduler_type linear \
    --warmup_ratio 0.03 \
    --weight_decay 0. \
    --num_train_epochs 2 \
    --output_dir {output_dir} \
    --with_tracking \
    --report_to tensorboard \
    {'--checkpointing_steps '+str(checkpointing_steps) if checkpointing_steps else ''}
    --logging_steps 1
"""

# things to test to see its effects on (1) eval perf (2) runtime.
#
# - int8
# - mixed_precision bf16 or no
# - with/without LoRA
# - LoRA's rank/alpha (alpha typically set to 2*rank)
# - batch size
# - micro-batch size (largest without running out of memory)


cmd = multiline_to_singleline(cmd)
if test_run:
    print()
    print(cmd)

shell_scripts = shell_scripts_template.format(
    cmd=cmd,
    log_dir=os.getcwd(),
    save_dir=output_dir)
out = submit_job_ccc(
    shell_scripts, 
    job_name=job_name, 
    queue=queue,
    num_cpus=num_cpus,
    cpu_mem=cpu_mem,
    require=require,
    num_gpus=num_gpus,
    test_run=test_run,
)
if not test_run:
    print(out)

Training huggyllama/llama-7b using 1 GPUs, 1 batch size per GPU, 128 gradient accumulation steps.

!cd .. && accelerate launch --mixed_precision bf16 --num_machines 1 --num_processes 1 open_instruct/finetune.py --model_name_or_path huggyllama/llama-7b --tokenizer_name huggyllama/llama-7b --train_file data/processed/flanv2_cot_oasst1_dolly.jsonl --max_seq_length 2048 --use_lora --lora_rank 4 --lora_alpha 4 --lora_dropout 0.05 --preprocessing_num_workers 16 --per_device_train_batch_size 1 --gradient_accumulation_steps 128 --learning_rate 2e-5 --lr_scheduler_type linear --warmup_ratio 0.03 --weight_decay 0. --num_train_epochs 2 --output_dir results/huggyllama:llama-7b_human_mixjpt_results/huggyllama:llama-7b_human_mix --with_tracking --report_to tensorboard --logging_steps 1


In [15]:
# llama7b+lora, micro-bsz=1, bsz=128, 
#     66gb gpu mem, in it 28gb for torch tensor. so fit on 1 a100_80gb

#   1%|▏         | 57/4222 [22:24<25:53:47, 22.38s/it]07/11/2023 23:13:55 - INFO - 
# __main__ -   Step: 57, LR: 9.047619047619049e-06, Loss: 2.2924644947052
# 


!cd .. && accelerate launch --mixed_precision bf16 --num_machines 1 --num_processes 1 open_instruct/finetune.py --model_name_or_path huggyllama/llama-7b --tokenizer_name huggyllama/llama-7b --train_file data/processed/flanv2_cot_oasst1_dolly.jsonl --max_seq_length 2048 --use_lora --lora_rank 4 --lora_alpha 4 --lora_dropout 0.05 --preprocessing_num_workers 16 --per_device_train_batch_size 1 --gradient_accumulation_steps 128 --learning_rate 2e-5 --lr_scheduler_type linear --warmup_ratio 0.03 --weight_decay 0. --num_train_epochs 2 --output_dir results/huggyllama:llama-7b_human_mixjpt_results/huggyllama:llama-7b_human_mix --with_tracking --report_to tensorboard --logging_steps 1





The following values were not passed to `accelerate launch` and had defaults used instead:
	`--dynamo_backend` was set to a value of `'no'`

Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 120
CUDA SETUP: Loading binary /dccstor/mit_fm/miniconda/envs/open-instruct/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda120.so...
07/12/2023 12:59:09 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda

Mixed precision type: bf16

100%|████████████████████████████████████████████| 1/1

loading file tokenizer.model from cache at /dccstor/mit_fm/wpq/hf_cache/hub/models--huggyllama--llama-7b/snapshots/8416d3fefb0cb3ff5775a7b13c1692d10ff1aa16/tokenizer.model
loading file tokenizer.json from cache at /dccstor/mit_fm/wpq/hf_cache/hub/models--huggyllama--llama-7b/snapshots/8416d3fefb0cb3ff5775a7b13c1692d10ff1aa16/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /dccstor/mit_fm/wpq/hf_cache/hub/models--huggyllama--llama-7b/snapshots/8416d3fefb0cb3ff5775a7b13c1692d10ff1aa16/special_tokens_map.json
loading file tokenizer_config.json from cache at /dccstor/mit_fm/wpq/hf_cache/hub/models--huggyllama--llama-7b/snapshots/8416d3fefb0cb3ff5775a7b13c1692d10ff1aa16/tokenizer_config.json
loading weights file model.safetensors from cache at /dccstor/mit_fm/wpq/hf_cache/hub/models--huggyllama--llama-7b/snapshots/8416d3fefb0cb3ff5775a7b13c1692d10ff1aa16/model.safetensors.index.json
Instantiating LlamaForCausalLM model unde

07/12/2023 13:01:07 - INFO - __main__ - ***** Running training *****
07/12/2023 13:01:07 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) =  Num examples = 270152
07/12/2023 13:01:07 - INFO - __main__ -   Num Epochs = 2
07/12/2023 13:01:07 - INFO - __main__ -   Instantaneous batch size per device = 1
07/12/2023 13:01:07 - INFO - __main__ -  128
07/12/2023 13:01:07 - INFO - __main__ -   Gradient Accumulation steps = 128
07/12/2023 13:01:07 - INFO - __main__ -   Total optimization steps = 4222
  0%|                                                  | 0/4222 [00:00<?, ?it/s]before train loop:
GPU memory occupied: 14747 MB.
torch.cuda.memory_allocated():  13552361472
model.device: cuda:0
model.dtype: torch.bfloat16
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|          

  1%|▎                                     | 29/4222 [11:00<25:30:06, 21.90s/it]07/12/2023 13:12:08 - INFO - __main__ -   Step: 29, LR: 4.603174603174604e-06, Loss: 2.1846425533294678
GPU memory occupied: 65457 MB.
torch.cuda.memory_allocated():  13609034752
  1%|▎                                     | 30/4222 [11:24<26:26:01, 22.70s/it]07/12/2023 13:12:32 - INFO - __main__ -   Step: 30, LR: 4.761904761904762e-06, Loss: 2.024883270263672
GPU memory occupied: 65457 MB.
torch.cuda.memory_allocated():  13641553920
  1%|▎                                     | 31/4222 [11:49<27:12:57, 23.38s/it]07/12/2023 13:12:57 - INFO - __main__ -   Step: 31, LR: 4.920634920634921e-06, Loss: 2.1247711181640625
GPU memory occupied: 65457 MB.
torch.cuda.memory_allocated():  13728229888
  1%|▎                                     | 32/4222 [12:10<26:22:46, 22.67s/it]07/12/2023 13:13:18 - INFO - __main__ -   Step: 32, LR: 5.07936507936508e-06, Loss: 2.2556421756744385
GPU memory occupied: 65457 MB.
torch.cuda

  1%|▌                                     | 61/4222 [23:21<26:16:36, 22.73s/it]07/12/2023 13:24:29 - INFO - __main__ -   Step: 61, LR: 9.682539682539683e-06, Loss: 1.951716661453247
GPU memory occupied: 65969 MB.
torch.cuda.memory_allocated():  13657301504



KeyboardInterrupt



# Finetuning with openinstruct/finetune_trainer.py


In [None]:
# llama7b-lora, micro-bsz=1, lora_rank=4: 66gb gpu mem
# - 17hr to run 1 epoch 
# llama7b-lora-int8, micro-bsz=1, lora_rank=4: 13gb gpu mem
#     then why is training without int8 require so much more memory!


In [12]:
job_name = 'ft-trainer'
test_run = True

queue = 'x86_12h' # 'x86_12h'
num_cpus = 32
num_gpus = 1
cpu_mem = 32
require = 'a100_80gb'

save_strategy = 'steps'
save_steps = 200

model_name_or_path = 'huggyllama/llama-7b'; max_seq_length = 2048
train_file = 'data/processed/flanv2_cot_oasst1_dolly.jsonl'; train_file_short = 'human_mix'
output_dir = f"results/{model_name_or_path.replace('/', ':')}_{train_file_short}-trainer"
if num_gpus != 1:
    output_dir += f'_ngpus={num_gpus}'
if test_run:
    output_dir = 'jpt_' + output_dir

num_train_epochs = 1
batch_size_per_gpu = 1
total_batch_size = 128
mixed_precision = 'bf16' # 'bf16', 'fp16'

load_in_8bit = False

use_lora = True
lora_rank = 4
lora_alpha = lora_rank
lora_dropout = 0.05

gradient_acc_steps = int(total_batch_size/num_gpus/batch_size_per_gpu)

print(f"Training {model_name_or_path} "
      f"using {num_gpus} GPUs, "
      f"{batch_size_per_gpu} batch size per GPU, "
      f"{gradient_acc_steps} gradient accumulation steps.")

cmd = f"""
{'!cd .. && ' if test_run else ''}accelerate launch \
    --mixed_precision {mixed_precision} \
    {'--multi_gpu' if num_gpus>1 else ''} 
    --num_machines 1 \
    --num_processes {num_gpus} \
    open_instruct/finetune_trainer.py \
    --model_name_or_path {model_name_or_path} \
    --tokenizer_name {model_name_or_path} \
    --use_fast_tokenizer True \
    --train_file {train_file} \
    --max_seq_length {max_seq_length} \
    {'--use_lora' if use_lora else ''}
    --lora_rank {lora_rank} \
    --lora_alpha {lora_alpha} \
    --lora_dropout {lora_dropout} \
    {'--load_in_8bit' if load_in_8bit else ''} \
    --do_train \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size {batch_size_per_gpu} \
    --gradient_accumulation_steps {gradient_acc_steps} \
    --learning_rate 2e-5 \
    --lr_scheduler_type linear \
    --warmup_ratio 0.03 \
    --weight_decay 0. \
    --evaluation_strategy "no" \
    --logging_steps 1 \
    --save_strategy {save_strategy} \
    --save_steps {save_steps} \
    --save_total_limit 11 \
    --num_train_epochs {num_train_epochs} \
    --output_dir {output_dir} \
    --bf16 \
    --tf32 True \
    --overwrite_output_dir \
    --report_to tensorboard \
    --torch_dtype bfloat16 \
    --dataloader_num_workers 8
"""


cmd = multiline_to_singleline(cmd)
if test_run:
    print()
    print(cmd)

shell_scripts = shell_scripts_template.format(
    cmd=cmd,
    log_dir=os.getcwd(),
    save_dir=output_dir)
out = submit_job_ccc(
    shell_scripts, 
    job_name=job_name, 
    queue=queue,
    num_cpus=num_cpus,
    cpu_mem=cpu_mem,
    require=require,
    num_gpus=num_gpus,
    test_run=test_run,
)
if not test_run:
    print(out)


Training huggyllama/llama-7b using 1 GPUs, 1 batch size per GPU, 128 gradient accumulation steps.


In [9]:
!cd .. && accelerate launch --mixed_precision bf16 --num_machines 1 --num_processes 1 open_instruct/finetune_trainer.py --model_name_or_path huggyllama/llama-7b --tokenizer_name huggyllama/llama-7b --use_fast_tokenizer True --train_file data/processed/flanv2_cot_oasst1_dolly.jsonl --max_seq_length 2048 --use_lora --lora_rank 4 --lora_alpha 4 --lora_dropout 0.05 --do_train --preprocessing_num_workers 16 --per_device_train_batch_size 1 --gradient_accumulation_steps 128 --learning_rate 2e-5 --lr_scheduler_type linear --warmup_ratio 0.03 --weight_decay 0. --evaluation_strategy "no" --logging_steps 1 --save_strategy steps --save_steps 1 --save_total_limit 1 --num_train_epochs 1 --output_dir results/huggyllama:llama-7b_human_mix-trainer_savebystep_jpt --bf16 --tf32 True --overwrite_output_dir --report_to tensorboard --torch_dtype bfloat16 --dataloader_num_workers 8

# 


The following values were not passed to `accelerate launch` and had defaults used instead:
	`--dynamo_backend` was set to a value of `'no'`

Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 120
CUDA SETUP: Loading binary /dccstor/mit_fm/miniconda/envs/open-instruct/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda120.so...
07/12/2023 11:37:39 - INFO - __main__ - Training parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=True,
bf16_full_eval=False,


07/12/2023 11:37:39 - INFO - datasets.builder - Using custom data configuration default-247ebf1b4910b0d3
07/12/2023 11:37:39 - INFO - datasets.info - Loading Dataset Infos from /dccstor/mit_fm/miniconda/envs/open-instruct/lib/python3.10/site-packages/datasets/packaged_modules/json
07/12/2023 11:37:39 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.
07/12/2023 11:37:39 - INFO - datasets.info - Loading Dataset info from /dccstor/mit_fm/wpq/hf_cache/datasets/json/default-247ebf1b4910b0d3/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
07/12/2023 11:37:39 - INFO - datasets.info - Loading Dataset info from /dccstor/mit_fm/wpq/hf_cache/datasets/json/default-247ebf1b4910b0d3/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
100%|█████████████████████████████████████████████| 1/1 [00:07<00:00,  7.30s/it]
[INFO|configuration_utils.py:669] 2023-07-12 11:37:47,066 >> loading configuration file config.json from cac

07/12/2023 11:39:57 - INFO - datasets.arrow_dataset - Concatenating 16 shards
[INFO|trainer.py:776] 2023-07-12 11:40:19,961 >> The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: messages, id, dataset. If messages, id, dataset are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
[INFO|trainer.py:1786] 2023-07-12 11:40:20,019 >> ***** Running training *****
[INFO|trainer.py:1787] 2023-07-12 11:40:20,019 >>   Num examples = 270,679
[INFO|trainer.py:1788] 2023-07-12 11:40:20,019 >>   Num Epochs = 1
[INFO|trainer.py:1789] 2023-07-12 11:40:20,019 >>   Instantaneous batch size per device = 1
[INFO|trainer.py:1790] 2023-07-12 11:40:20,019 >>   Total train batch size (w. parallel, distributed & accumulation) = 128
[INFO|trainer.py:1791] 2023-07-12 11:40:20,019 >>   Gradient Accumulation steps = 128
[INFO|trainer.py:1792] 2023-07-12 11:40:20,019 >>   Total optimization steps = 

{'loss': 2.2841, 'learning_rate': 1.8750000000000003e-06, 'epoch': 0.0}         
  0%|                                       | 6/2114 [02:33<13:16:59, 22.68s/it][INFO|trainer.py:2926] 2023-07-12 11:42:53,163 >> Saving model checkpoint to results/huggyllama:llama-7b_human_mix-trainer_savebystep_jpt/checkpoint-6
[INFO|tokenization_utils_base.py:2194] 2023-07-12 11:42:53,211 >> tokenizer config file saved in results/huggyllama:llama-7b_human_mix-trainer_savebystep_jpt/checkpoint-6/tokenizer_config.json
[INFO|tokenization_utils_base.py:2201] 2023-07-12 11:42:53,211 >> Special tokens file saved in results/huggyllama:llama-7b_human_mix-trainer_savebystep_jpt/checkpoint-6/special_tokens_map.json
[INFO|trainer.py:3013] 2023-07-12 11:42:53,339 >> Deleting older checkpoint [results/huggyllama:llama-7b_human_mix-trainer_savebystep_jpt/checkpoint-5] due to args.save_total_limit
{'loss': 2.1065, 'learning_rate': 2.1875000000000002e-06, 'epoch': 0.0}         
  0%|▏                                  


KeyboardInterrupt



# eval



In [4]:
shell_scripts_template = """
export OPENAI_API_KEY=$(cat ~/.openai_api_key)
export HF_HOME="/dccstor/mit_fm/wpq/hf_cache/"

source /dccstor/mit_fm/miniconda/bin/activate open-instruct
cd /dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/

echo "Running on $(hostname)"
echo "======"
echo "{cmd}"
echo "======"

{cmd}

[ ! -f "{log_dir}/${{LSB_JOBID}}.out" ] || mv "{log_dir}/${{LSB_JOBID}}.out" "{save_dir}"
"""

In [5]:
if any(x in model_name_or_path for x in ['small', 'base', 'medium', 'large']):
    cpu_mem = 2
elif any(x in model_name_or_path for x in ['3b']):
    cpu_mem = 15
elif any(x in model_name_or_path for x in ['7b', '11b', 'xl', 'xxl']):
    cpu_mem = 64

NameError: name 'model_name_or_path' is not defined

In [5]:
models = ['results/huggyllama:llama-7b_human_mix-trainer_savebystep/']
models = [glob.glob(os.path.join(x, 'checkpoint*')) for x in models]
models = [x for l in models for x in l]
models

NameError: name 'glob' is not defined

In [24]:
# mmlu, 0-shot
# gsm, 8-shot
# bbh, 0-shot
# bbh, 3-shot
# 

False

In [29]:
# task_name = 'bbh_s=0'
# task_name = 'bbh_s=3'
# task_name = 'gsm'
task_name = 'mmlu'
job_name = f'eval.{task_name}'

test_run = 0
test_run = bool(test_run)

batch_size = 10
if task_name == 'gsm':
    queue = 'x86_6h'
if task_name == 'bbh_s=0':
    queue = 'x86_1h'
if task_name == 'bbh_s=3':
    queue = 'x86_6h'
    batch_size = 5 # for longer prompts.
if task_name == 'mmlu':
    queue = 'x86_1h'
    batch_size = 10
    
num_cpus = 10
cpu_mem = 32 # mem usage quite small for llama7b+lora on bbh

use_chat_format = False

models = []
# models += ['t5-small', 't5-base', 't5-large', 't5-3b', 't5-11b']
# models += ['t5-11b']
# models += ['google/flan-t5-small', 'google/flan-t5-base', 'google/flan-t5-large', 'google/flan-t5-xl', 'google/flan-t5-xxl']
# models += ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']
models += ['huggyllama/llama-7b'] # , 'mosaicml/mpt-7b'
models = [os.path.join('results/baselines', x) for x in models]


# models += ['results/huggyllama:llama-7b_human_mix-trainer_savebystep/']
# models = [glob.glob(os.path.join(x, 'checkpoint*')) for x in models]
# models = [x for l in models for x in l]
models += ['results/huggyllama:llama-7b_human_mix-trainer_savebystep/checkpoint-1400']

info = {}  
cmds = []
for model_name_or_path in models:
    run_id = model_name_or_path
    save_dir = f'{model_name_or_path}/eval/{task_name}'
    if use_chat_format:
        save_dir += '_chatfmt'
    
    if task_name == 'mmlu':
        cmd = f"""
        python -m eval.mmlu.run_eval \
            --data_dir data/eval/mmlu \
            --save_dir {model_name_or_path}/eval/mmlu/ \
            --model_name_or_path {model_name_or_path} \
            --eval_batch_size {batch_size} \
            --ntrain 0 \
            {'--use_chat_format' if use_chat_format else ''}
        """
    if task_name == 'gsm':
        cmd = f"""
        python -m eval.gsm.run_eval \
            --data_dir data/eval/gsm/ \
            --model_name_or_path {model_name_or_path} \
            --save_dir {save_dir} \
            --eval_batch_size {batch_size} \
            --n_shot 8 \
            {'--use_chat_format' if use_chat_format else ''}
        """
    if task_name.startswith('bbh'):
        cmd = f"""
        python -m eval.bbh.run_eval \
            --data_dir data/eval/bbh/ \
            --model_name_or_path {model_name_or_path} \
            --save_dir {save_dir} \
            --eval_batch_size {batch_size} \
            {'--no_cot' if 's=0' in task_name else ''}
            {'--use_chat_format' if use_chat_format else ''}
        """
        
    cmd = multiline_to_singleline(cmd)
    cmds.append(cmd)
    print(cmd)
    
    # submit
    shell_scripts = shell_scripts_template.format(
        cmd=cmd,
        log_dir=os.getcwd(),
        save_dir=save_dir)
    out = submit_job_ccc(
        shell_scripts, 
        job_name=job_name, 
        queue=queue,
        num_cpus=num_cpus,
        cpu_mem=cpu_mem,
        num_gpus=1,
        test_run=test_run,
    )
#     if test_run: print(out['jbsub_cmd'])
    
    

python -m eval.mmlu.run_eval --data_dir data/eval/mmlu --save_dir results/baselines/huggyllama/llama-7b/eval/mmlu/ --model_name_or_path results/baselines/huggyllama/llama-7b --eval_batch_size 10 --ntrain 0
python -m eval.mmlu.run_eval --data_dir data/eval/mmlu --save_dir results/huggyllama:llama-7b_human_mix-trainer_savebystep/checkpoint-1400/eval/mmlu/ --model_name_or_path results/huggyllama:llama-7b_human_mix-trainer_savebystep/checkpoint-1400 --eval_batch_size 10 --ntrain 0


In [120]:
class EvalResults:
    
    def __init__(self, save_dir):
        self.save_dir = save_dir
        self.eval_dir = os.path.join(self.save_dir, 'eval')
        
    def get_result_df(self):

        task_names = os.listdir(self.eval_dir)

        dfs = []
        for task_name in task_names:
            task_save_dir = os.path.join(self.eval_dir, task_name)
            metrics_file = os.path.join(task_save_dir, 'metrics.json')
            if not os.path.exists(metrics_file):
                continue
            with open(metrics_file, 'r') as f:
                metrics = json.load(f)

            if 'mmlu' in task_name:
                for k, v in metrics['subcat_acc'].items():
                    metrics[k] = v
                del metrics['subcat_acc']
                for k, v in metrics['cat_acc'].items():
                    metrics[k] = v
                del metrics['cat_acc']

            index = [(task_name, k) for k in metrics.keys()]
            multi_columns = pd.MultiIndex.from_tuples(index, names=['task_name', 'metrics'])
            data = list(metrics.values())

            df = pd.DataFrame([data], columns=multi_columns)
            dfs.append(df)

        df = pd.concat(dfs, axis=1)
        cols = [
            ('gsm', 'exact_match'),
            ('bbh_s=0', 'average_exact_match'),
            ('mmlu', 'average_acc'),
        ]
        df = df[cols]
        df.insert(0, ('model', ''), [self.save_dir])
        return df


save_dirs = [
    '../results/baselines/huggyllama/llama-7b/',
    '../results/huggyllama:llama-7b_human_mix-trainer_savebystep/checkpoint-1400',
]

dfs = []
for save_dir in save_dirs:
    r = EvalResults(save_dir)
    df = r.get_result_df()
    dfs.append(df)

df = pd.concat(dfs, axis=0)
df


task_name,model,gsm,bbh_s=0,mmlu
metrics,Unnamed: 1_level_1,exact_match,average_exact_match,average_acc
0,../results/baselines/huggyllama/llama-7b/,0.109932,0.316671,0.318687
0,../results/huggyllama:llama-7b_human_mix-trainer_savebystep/checkpoint-1400,0.100834,0.332591,0.333001


In [119]:
pd.set_option('display.max_colwidth', None)


In [35]:
import glob

models = []
models += ['t5-small', 't5-base', 't5-large', 't5-3b', 't5-11b']
models += ['huggyllama/llama-7b']
save_dirs = [f'../results/baselines/{x}/eval/gsm/' for x in models]

data = []
for model, save_dir in zip(models, save_dirs):
    logfile_path = glob.glob(os.path.join(save_dir, '*.out'))[0]
    out = get_run_statistics(logfile_path)
    with open(os.path.join(save_dir, 'metrics.json'), 'r') as f:
        metrics = json.load(f)
    data.append((model, out['cpu_time']/60/60, out['avg_mem'], out['max_mem'], metrics['exact_match']))
    

import pandas as pd
columns = ['name', 'cpu_time (hr)', 'avg_mem', 'max_mem', 'exact_match']
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,name,cpu_time (hr),avg_mem,max_mem,exact_match
0,t5-small,0.038869,0.491738,0.597656,0.014405
1,t5-base,0.071833,0.729512,0.787109,0.018196
2,t5-large,0.021314,0.957783,1.317383,0.0
3,t5-3b,0.208883,6.68915,11.693359,0.013647
4,t5-11b,0.458394,0.872705,33.018555,0.009098
5,huggyllama/llama-7b,0.7904,0.639141,0.710938,0.109932


{'exact_match': 0.009097801364670205}