In [1]:
!nvidia-smi

Mon Jul 10 16:31:56 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.60.13    Driver Version: 525.60.13    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  On   | 00000000:47:00.0 Off |                    0 |
| N/A   23C    P0    49W / 400W |      0MiB / 40960MiB |      0%   E. Process |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [2]:
from rosemary import jpt_setup; jpt_setup()

  warn(f'Install `torch` for functionalities dependent on torch')


In [3]:
import pandas as pd

cols = ['model', 'size', 'mixed-precision', 'deepspeed', 
        'gpu mem usage (GB)', 'per-epoch time (hr)', 'per-iter time (s)']

# 55k data points, batch_size=128
data_oasst1 = [
    ('gpt2', 0.124, 'bf16', 'no', 10, None, None),
    ('gpt2-Large', 0.774, 'bf16', 'no', 36, 2.5, 11),
    # incorporate deep speed is costly!
    ('gpt2-Large', 0.774, 'bf16', 'stage 3 no offloading', 40, 6, 25),
    # 1 a100_40g: without offloading OOM on `.backward()`, runs fine with offloading.
    ('gpt2-xl', 1.5, 'bf16', 'stage 3 with offloading', 40, 13, 55),
    # 4 v100_32g: without offloading.
]

df_oasst1 = pd.DataFrame(data_oasst1, columns=cols)

cols = ['model', 'size', 'mixed-precision', 'deepspeed', 
        'gpu mem (GB)', 'cpu mem (GB)', 'per-epoch time (hr)', 'per-iter time (s)']
data = [
    ('gpt2-Large', 0.774, 'bf16', 'no', 36, 7, 9.5, 9),
]

print('instruction tune human-mix on 1 a100_40g:')
df = pd.DataFrame(data, columns=cols)
df

instruction tune human-mix on 1 a100_40g:


Unnamed: 0,model,size,mixed-precision,deepspeed,gpu mem (GB),cpu mem (GB),per-epoch time (hr),per-iter time (s)
0,gpt2-Large,0.774,bf16,no,36,7,9.5,9


# Finetuning with openinstruct/finetune.py


In [None]:
# model_name_or_path = 'mosaicml/mpt-7b'; max_seq_length = 2048
model_name_or_path = 'gpt2'; max_seq_length = 1024
model_name_or_path = 'gpt2-Large'; max_seq_length = 1024
# model_name_or_path = 'gpt2-xl'; max_seq_length = 1024


train_file = 'data/processed/oasst1/oasst1_data.jsonl'; train_file_short = 'oasst1'
train_file = 'data/processed/flanv2_cot_oasst1_dolly.jsonl'; train_file_short = 'human_mix'
train_file = 'data/processed/flanv2_cot_oasst1_dolly_shuffled.jsonl'; train_file_short = 'human_mix_shuffled'

# output_dir = 'results/mpt-7b_oasst1'
output_dir = f"results/{model_name_or_path.split('/')[-1]}_{train_file_short}"

# deepspeed_config_file = 'ds_configs/stage3_no_offloading_accelerate_setauto.conf'
# deepspeed_config_file = 'ds_configs/stage3_offloading_accelerate.conf'
deepspeed_config_file = 'ds_configs/stage3_offloading_accelerate_setauto.conf'

num_gpus = 1
batch_size_per_gpu = 2
total_batch_size = 128
gradient_acc_steps = int(total_batch_size/num_gpus/batch_size_per_gpu)

print(f"Training {model_name_or_path} "
      f"using {num_gpus} GPUs, "
      f"{batch_size_per_gpu} batch size per GPU, "
      f"{gradient_acc_steps} gradient accumulation steps.")

# do use fast tokenizer since mpt-7b does not have a fast tokenizer counter-part
#     --use_slow_tokenizer \
# do not use flash attention, since having problem installing flash-attn with cuda 12.1
#     --use_flash_attn \

#     --use_deepspeed \
#     --deepspeed_config_file {deepspeed_config_file} \


cmd = f"""
!cd .. && \
accelerate launch \
    --mixed_precision bf16 \
    --num_machines 1 \
    --num_processes {num_gpus} \
    open_instruct/finetune.py \
    --model_name_or_path {model_name_or_path} \
    --tokenizer_name {model_name_or_path} \
    --train_file {train_file} \
    --max_seq_length {max_seq_length} \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size {batch_size_per_gpu} \
    --gradient_accumulation_steps {gradient_acc_steps} \
    --learning_rate 2e-5 \
    --lr_scheduler_type linear \
    --warmup_ratio 0.03 \
    --weight_decay 0. \
    --num_train_epochs 2 \
    --output_dir {output_dir} \
    --with_tracking \
    --report_to tensorboard \
    --logging_steps 1
"""
print(cmd)


In [None]:
!cd .. && accelerate launch     --mixed_precision bf16     --num_machines 1     --num_processes 1     open_instruct/finetune.py     --model_name_or_path gpt2-Large     --tokenizer_name gpt2-Large     --train_file data/processed/flanv2_cot_oasst1_dolly.jsonl     --max_seq_length 1024     --preprocessing_num_workers 16     --per_device_train_batch_size 2     --gradient_accumulation_steps 64     --learning_rate 2e-5     --lr_scheduler_type linear     --warmup_ratio 0.03     --weight_decay 0.     --num_train_epochs 2     --output_dir results/gpt2-Large_oasst1     --with_tracking     --report_to tensorboard     --logging_steps 1


# eval



In [56]:
from llm.submit import multiline_to_singleline
model_name_or_path = 'results/baselines/mosaicml/mpt-7b'
model_name_or_path = 'results/baselines/gpt2'
prefix = '!cd .. && '
prefix = ''

cmd = f"""
{prefix}python -m eval.gsm.run_eval \
    --data_dir data/eval/gsm/ \
    --max_num_examples 50 \
    --model_name_or_path {model_name_or_path} \
    --save_dir {model_name_or_path}/eval/gsm/ \
    --eval_batch_size 1 \
    --n_shot 8
"""
cmd = multiline_to_singleline(cmd)
cmd

'python -m eval.gsm.run_eval --data_dir data/eval/gsm/ --max_num_examples 50 --model_name_or_path results/baselines/gpt2 --save_dir results/baselines/gpt2/eval/gsm/ --eval_batch_size 1 --n_shot 8'

In [None]:
!cd .. && python -m eval.gsm.run_eval     --data_dir data/eval/gsm/     --max_num_examples 50     --model_name_or_path results/baselines/mosaicml/mpt-7b     --save_dir results/baselines/mosaicml/mpt-7b/eval/gsm/     --eval_batch_size 1     --n_shot 8


In [None]:
# bsz=20 for 20
# (4*60+36)/20 = 13.8
# bsz=10 for 50
# (10*60+49)/50 = 12.98
# bsz=1 for 50
# (12*60+27)/50 = 14.94

# 1300 * 14.94 / 

In [66]:

import subprocess

shell_scripts_template = """
export OPENAI_API_KEY=$(cat ~/.openai_api_key)
export HF_HOME="/dccstor/mit_fm/wpq/hf_cache/"

source /dccstor/mit_fm/miniconda/bin/activate open-instruct
cd /dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/

echo "Running on $(hostname)"
echo "======"
echo "{cmd}"
echo "======"

{cmd}
"""

submit_kwargs = {
    'job_name': 'eval.gsm',
    'queue': 'x86_6h',
    'num_cpus': 10,
    'cpu_mem': 10,
    'num_gpus': 1,
    'log_dir': '/dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/scripts/',
}

models = [
    't5-small',
    't5-base',
    't5-large',
    't5-3b',
    't5-11b',
#     'google/flan-t5-small',
#     'google/flan-t5-base',
#     'google/flan-t5-large',
#     'google/flan-t5-xl',
#     'google/flan-t5-xxl',
#     'gpt2',
#     'gpt2-medium',
#     'gpt2-large',
#     'gpt2-xl',
#     'huggyllama/llama-7b',
#     'mosaicml/mpt-7b',
]
models = [os.path.join('results/baselines', x) for x in models]

cmds = []
for model_name_or_path in models:
    cmd = f"""
    {prefix}python -m eval.gsm.run_eval \
        --data_dir data/eval/gsm/ \
        --model_name_or_path {model_name_or_path} \
        --save_dir {model_name_or_path}/eval/gsm/ \
        --eval_batch_size 10 \
        --n_shot 8
    """
    cmd = multiline_to_singleline(cmd)
    cmds.append(cmd)
    

    
for cmd in cmds:
    cmd = shell_scripts_template.format(cmd=cmd)
    
    out = submit_job_ccc(cmd, test=False, **submit_kwargs)
    print(out)
    



In [54]:
from llm.submit import submit_job_ccc

gpu_types = ['v100', 'a100_40gb', 'a100_80gb']
for gpu_type in gpu_types:
    
    out = submit_job_ccc('python -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.is_bf16_supported())"',
                  num_gpus=1, require=gpu_type)
    print(out['jbsub_cmd'])

['jbsub', '-queue', 'x86_1h', '-name', 'wpq-job', '-mem', '3g', '-cores', '1x1+1', '-require', 'v100', '-out', '/dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/scripts/%J.out', 'bash', '-c', 'python -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.is_bf16_supported())"']
['jbsub', '-queue', 'x86_1h', '-name', 'wpq-job', '-mem', '3g', '-cores', '1x1+1', '-require', 'a100_40gb', '-out', '/dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/scripts/%J.out', 'bash', '-c', 'python -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.is_bf16_supported())"']
['jbsub', '-queue', 'x86_1h', '-name', 'wpq-job', '-mem', '3g', '-cores', '1x1+1', '-require', 'a100_80gb', '-out', '/dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/scripts/%J.out', 'bash', '-c', 'python -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.is_bf16_supported())"']


In [53]:
' '.join(out['jbsub_cmd'])

'jbsub -queue x86_1h -name wpq-job -mem 3g -cores 1x1+1 -require a100_40g -out /dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/scripts/%J.out bash -c python -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.is_bf16_supported())"'

In [49]:
shell_scripts = "source ~/.profile\ncd /dccstor/mit_fm/wpq/github\nconda activate wpq-llm\njupyter notebook --no-browser --port=8777 --ip=$(hostname -f)"
submit_job_ccc(shell_scripts, job_name='jpt', queue='x86_6h', num_cpus=20, num_gpus=1, require='a100_40g', log_dir='/dccstor/mit_fm/wpq/github/mitibm2023/scripts/', test=True)

{'jbsub_cmd': ['jbsub',
  '-queue',
  'x86_6h',
  '-name',
  'jpt',
  '-mem',
  '3g',
  '-cores',
  '1x20+1',
  '-require',
  'a100_40g',
  '-out',
  '/dccstor/mit_fm/wpq/github/mitibm2023/scripts/%J.out',
  'bash',
  '-c',
  'source ~/.profile; cd /dccstor/mit_fm/wpq/github; conda activate wpq-llm; jupyter notebook --no-browser --port=8777 --ip=$(hostname -f)']}

In [31]:
shell_scripts_template = """
export OPENAI_API_KEY=$(cat ~/.openai_api_key)
export HF_HOME="/dccstor/mit_fm/wpq/hf_cache/"

source /dccstor/mit_fm/miniconda/bin/activate open-instruct
cd /dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/

echo "Running on $(hostname)"
echo "======"
echo "{cmd}"
echo "======"

{cmd}
"""
shell_scripts = bash_file_template.format(cmd='ls').strip()
shell_scripts

