In [1]:
# Show the GPU(s) visible to this session, with their current memory use and utilization.
!nvidia-smi

Tue Jul 11 21:17:27 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB           On | 00000000:44:00.0 Off |                    0 |
| N/A   24C    P0               60W / 400W|      0MiB / 81920MiB |      0%   E. Process |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                         

In [3]:
import os
from rosemary import jpt_setup; jpt_setup()

from llm.submit import multiline_to_singleline, submit_job_ccc, get_run_statistics
import pandas as pd

  warn(f'Install `torch` for functionalities dependent on torch')


In [4]:

# Hard-coded resource measurements from earlier fine-tuning runs.

# --- oasst1: 55k data points, batch_size=128 ---
cols = [
    'model',
    'size',
    'mixed-precision',
    'deepspeed',
    'gpu mem usage (GB)',
    'per-epoch time (hr)',
    'per-iter time (s)',
]
data_oasst1 = [
    ('gpt2', 0.124, 'bf16', 'no', 10, None, None),
    ('gpt2-Large', 0.774, 'bf16', 'no', 36, 2.5, 11),
    # adding DeepSpeed is costly!
    ('gpt2-Large', 0.774, 'bf16', 'stage 3 no offloading', 40, 6, 25),
    # on 1 a100_40g: OOM in `.backward()` without offloading; runs fine with offloading.
    ('gpt2-xl', 1.5, 'bf16', 'stage 3 with offloading', 40, 13, 55),
    # 4 v100_32g: without offloading (no measurements recorded here).
]
df_oasst1 = pd.DataFrame(data=data_oasst1, columns=cols)

# --- human-mix on 1 a100_40g ---
cols = [
    'model',
    'size',
    'mixed-precision',
    'deepspeed',
    'gpu mem (GB)',
    'cpu mem (GB)',
    'per-epoch time (hr)',
    'per-iter time (s)',
]
data = [
    ('gpt2-Large', 0.774, 'bf16', 'no', 36, 7, 9.5, 9),
]
df = pd.DataFrame(data=data, columns=cols)

# Only the human-mix table is displayed; `df_oasst1` is built but not shown.
print('instruction tune human-mix on 1 a100_40g:')
df

instruction tune human-mix on 1 a100_40g:


Unnamed: 0,model,size,mixed-precision,deepspeed,gpu mem (GB),cpu mem (GB),per-epoch time (hr),per-iter time (s)
0,gpt2-Large,0.774,bf16,no,36,7,9.5,9


# Finetuning with open_instruct/finetune.py


In [17]:
# Shell script submitted for each fine-tuning job. `{cmd}` is filled in via
# `str.format` by the cell that builds the `accelerate launch` command; the
# script echoes the command and then runs it.
# That `format` call also passes `log_dir` and `save_dir`, which this template
# does not use (the only line that needs them is the disabled one below).
shell_scripts_template = """
export OPENAI_API_KEY=$(cat ~/.openai_api_key)
export HF_HOME="/dccstor/mit_fm/wpq/hf_cache/"

source /dccstor/mit_fm/miniconda/bin/activate open-instruct
cd /dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/

echo "Running on $(hostname)"
echo "======"
echo "{cmd}"
echo "======"

{cmd}
"""

# Disabled template line: if `{log_dir}/${LSB_JOBID}.out` exists, move it into `{save_dir}`.
# [ ! -f "{log_dir}/${{LSB_JOBID}}.out" ] || mv "{log_dir}/${{LSB_JOBID}}.out" "{save_dir}"

In [16]:
# Build the `accelerate launch ... open_instruct/finetune.py` command from the
# settings below and submit it as a cluster job through `submit_job_ccc`.
# With `test_run = True` the flattened command is also printed, prefixed with
# `!cd .. && ` so it can be pasted into a notebook cell.
job_name = 'finetune'
test_run = False

# --- job resources ---
queue = 'x86_1h' # 'x86_12h'
num_cpus = 20
cpu_mem = 32
require = 'a100_80gb'

# --- model (keep exactly one line active) ---
# model_name_or_path = 'mosaicml/mpt-7b'; max_seq_length = 2048
# model_name_or_path = 'gpt2'; max_seq_length = 1024
# model_name_or_path = 'gpt2-Large'; max_seq_length = 1024
# model_name_or_path = 'gpt2-xl'; max_seq_length = 1024
model_name_or_path = 'huggyllama/llama-7b'; max_seq_length = 2048

# --- training data (keep exactly one line active) ---
# The oasst1 line used to be active as well, but was immediately overwritten
# by the human_mix line below it; it is commented out to make the choice explicit.
# train_file = 'data/processed/oasst1/oasst1_data.jsonl'; train_file_short = 'oasst1'
train_file = 'data/processed/flanv2_cot_oasst1_dolly.jsonl'; train_file_short = 'human_mix'
# train_file = 'data/processed/flanv2_cot_oasst1_dolly_shuffled.jsonl'; train_file_short = 'human_mix_shuffled'

# '/' in the model id is replaced by ':' so the output is a single directory level.
output_dir = f"results/{model_name_or_path.replace('/', ':')}_{train_file_short}"

# --- DeepSpeed (config file is only used when use_deepspeed is True) ---
use_deepspeed = False
# deepspeed_config_file = 'ds_configs/stage3_no_offloading_accelerate_setauto.conf'
# deepspeed_config_file = 'ds_configs/stage3_offloading_accelerate.conf'
deepspeed_config_file = 'ds_configs/stage3_offloading_accelerate_setauto.conf'

# --- LoRA ---
use_lora = True
lora_rank = 4
lora_alpha = lora_rank
lora_dropout = 0.05

# --- batch size ---
num_gpus = 1
batch_size_per_gpu = 1
total_batch_size = 128

# total_batch_size = num_gpus * batch_size_per_gpu * gradient_acc_steps.
# Fail loudly instead of silently truncating (the previous `int(a/b/c)` would
# quietly train with a smaller effective batch size than requested).
samples_per_step = num_gpus * batch_size_per_gpu
if samples_per_step <= 0 or total_batch_size % samples_per_step != 0:
    raise ValueError(
        f"total_batch_size={total_batch_size} must be a positive multiple of "
        f"num_gpus*batch_size_per_gpu={samples_per_step}")
gradient_acc_steps = total_batch_size // samples_per_step

print(f"Training {model_name_or_path} "
      f"using {num_gpus} GPUs, "
      f"{batch_size_per_gpu} batch size per GPU, "
      f"{gradient_acc_steps} gradient accumulation steps.")

# Flags intentionally NOT passed in the command below:
#     --use_slow_tokenizer   keep the fast tokenizer. NOTE(review): the original note said
#                            mpt-7b "does not have a fast tokenizer counter-part"; presumably
#                            it meant there is no *slow* counterpart -- confirm.
#     --use_flash_attn       flash-attn could not be installed with cuda 12.1.

# Lines ending in a backslash are joined by Python itself; the lines holding
# optional flags have no backslash and leave real newlines in the string.
# `multiline_to_singleline` (below) is presumably what flattens those -- verify
# against its implementation before changing the layout.
cmd = f"""
{'!cd .. && ' if test_run else ''}accelerate launch \
    --mixed_precision bf16 \
    --num_machines 1 \
    --num_processes {num_gpus} \
    {'--use_deepspeed' if use_deepspeed else ''}
    {'--deepspeed_config_file '+deepspeed_config_file if use_deepspeed else ''}
    open_instruct/finetune.py \
    --model_name_or_path {model_name_or_path} \
    --tokenizer_name {model_name_or_path} \
    --train_file {train_file} \
    --max_seq_length {max_seq_length} \
    {'--use_lora' if use_lora else ''}
    --lora_rank {lora_rank} \
    --lora_alpha {lora_alpha} \
    --lora_dropout {lora_dropout} \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size {batch_size_per_gpu} \
    --gradient_accumulation_steps {gradient_acc_steps} \
    --learning_rate 2e-5 \
    --lr_scheduler_type linear \
    --warmup_ratio 0.03 \
    --weight_decay 0. \
    --num_train_epochs 2 \
    --output_dir {output_dir} \
    --with_tracking \
    --report_to tensorboard \
    --logging_steps 1
"""

# things to test to see its effects on (1) eval perf (2) runtime.
#
# - --mixed_precision bf16. 
# - with/without LoRA
# - LoRA's rank/alpha (alpha typically set to 2*rank)
# - batch size
# - micro-batch size (largest without running out of memory)


cmd = multiline_to_singleline(cmd)
if test_run:
    print()
    print(cmd)

shell_scripts = shell_scripts_template.format(
    cmd=cmd,
    log_dir=os.getcwd(),
    save_dir=output_dir)
out = submit_job_ccc(
    shell_scripts, 
    job_name=job_name, 
    queue=queue,
    num_cpus=num_cpus,
    cpu_mem=cpu_mem,
    require=require,
    # Request as many GPUs as `accelerate launch --num_processes` will use
    # (this was hard-coded to 1, which only matched while num_gpus == 1).
    num_gpus=num_gpus,
    test_run=test_run,
)
if not test_run:
    print(out)

Training huggyllama/llama-7b using 1 GPUs, 1 batch size per GPU, 128 gradient accumulation steps.


In [12]:
# llama-7b + LoRA: ~66 GB of GPU memory in use (about 28 GB of it for torch tensors),
# so the run fits on a single a100_80gb.
# NOTE(review): the captured output of this cell reports "Mixed precision type: no" even
# though `--mixed_precision bf16` is on the command line -- re-run to confirm the flag takes effect.


!cd .. && accelerate launch --mixed_precision bf16 --num_machines 1 --num_processes 1 open_instruct/finetune.py --model_name_or_path huggyllama/llama-7b --tokenizer_name huggyllama/llama-7b --train_file data/processed/flanv2_cot_oasst1_dolly.jsonl --max_seq_length 2048 --use_lora --lora_rank 4 --lora_alpha 4 --lora_dropout 0.05 --preprocessing_num_workers 16 --per_device_train_batch_size 1 --gradient_accumulation_steps 128 --learning_rate 2e-5 --lr_scheduler_type linear --warmup_ratio 0.03 --weight_decay 0. --num_train_epochs 2 --output_dir results/huggyllama:llama-7b_human_mix --with_tracking --report_to tensorboard --logging_steps 1


The following values were not passed to `accelerate launch` and had defaults used instead:
	`--mixed_precision` was set to a value of `'no'`
	`--dynamo_backend` was set to a value of `'no'`

Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 121
CUDA SETUP: Loading binary /dccstor/mit_fm/miniconda/envs/open-instruct/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...
07/11/2023 22:28:51 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda

Mixed precision type: no

100%|█

loading file tokenizer.model from cache at /dccstor/mit_fm/wpq/hf_cache/hub/models--huggyllama--llama-7b/snapshots/8416d3fefb0cb3ff5775a7b13c1692d10ff1aa16/tokenizer.model
loading file tokenizer.json from cache at /dccstor/mit_fm/wpq/hf_cache/hub/models--huggyllama--llama-7b/snapshots/8416d3fefb0cb3ff5775a7b13c1692d10ff1aa16/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /dccstor/mit_fm/wpq/hf_cache/hub/models--huggyllama--llama-7b/snapshots/8416d3fefb0cb3ff5775a7b13c1692d10ff1aa16/special_tokens_map.json
loading file tokenizer_config.json from cache at /dccstor/mit_fm/wpq/hf_cache/hub/models--huggyllama--llama-7b/snapshots/8416d3fefb0cb3ff5775a7b13c1692d10ff1aa16/tokenizer_config.json
loading weights file model.safetensors from cache at /dccstor/mit_fm/wpq/hf_cache/hub/models--huggyllama--llama-7b/snapshots/8416d3fefb0cb3ff5775a7b13c1692d10ff1aa16/model.safetensors.index.json
Instantiating LlamaForCausalLM model unde

07/11/2023 22:30:59 - INFO - __main__ - ***** Running training *****
07/11/2023 22:30:59 - INFO - __main__ -   Total train batch size (w. parallel, distributed & accumulation) =  Num examples = 270152
07/11/2023 22:30:59 - INFO - __main__ -   Num Epochs = 2
07/11/2023 22:30:59 - INFO - __main__ -   Instantaneous batch size per device = 1
07/11/2023 22:30:59 - INFO - __main__ -  128
07/11/2023 22:30:59 - INFO - __main__ -   Gradient Accumulation steps = 128
07/11/2023 22:30:59 - INFO - __main__ -   Total optimization steps = 4222
  0%|                                                  | 0/4222 [00:00<?, ?it/s]before train loop:
GPU memory occupied: 14736 MB.
model.device: cuda:0
model.dtype: torch.bfloat16
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
0  batch.input_ids:  torch.Size([1, 182])
GPU memory occupi

44  batch.input_ids:  torch.Size([1, 103])
GPU memory occupied: 66158 MB.
torch.cuda.memory_allocated():  13624003072
model.device: cuda:0
model.dtype: torch.bfloat16
45  batch.input_ids:  torch.Size([1, 539])
GPU memory occupied: 66158 MB.
torch.cuda.memory_allocated():  13584744448
model.device: cuda:0
model.dtype: torch.bfloat16
46  batch.input_ids:  torch.Size([1, 513])
GPU memory occupied: 66158 MB.
torch.cuda.memory_allocated():  13612301824
model.device: cuda:0
model.dtype: torch.bfloat16
47  batch.input_ids:  torch.Size([1, 385])
GPU memory occupied: 66158 MB.
torch.cuda.memory_allocated():  13610634752
model.device: cuda:0
model.dtype: torch.bfloat16
48  batch.input_ids:  torch.Size([1, 259])
GPU memory occupied: 66158 MB.
torch.cuda.memory_allocated():  13602439168
model.device: cuda:0
model.dtype: torch.bfloat16
49  batch.input_ids:  torch.Size([1, 261])
GPU memory occupied: 66158 MB.
torch.cuda.memory_allocated():  13594375168
model.device: cuda:0
model.dtype: torch.bfloat1

GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13636419584
model.device: cuda:0
model.dtype: torch.bfloat16
94  batch.input_ids:  torch.Size([1, 106])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13583490048
model.device: cuda:0
model.dtype: torch.bfloat16
95  batch.input_ids:  torch.Size([1, 305])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13584582656
model.device: cuda:0
model.dtype: torch.bfloat16
96  batch.input_ids:  torch.Size([1, 205])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13597317632
model.device: cuda:0
model.dtype: torch.bfloat16
97  batch.input_ids:  torch.Size([1, 176])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13590915584
model.device: cuda:0
model.dtype: torch.bfloat16
98  batch.input_ids:  torch.Size([1, 157])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13589059584
model.device: cuda:0
model.dtype: torch.bfloat16
99  batch.input_ids:  torch.Size([1, 143]

141  batch.input_ids:  torch.Size([1, 88])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13613131776
model.device: cuda:0
model.dtype: torch.bfloat16
142  batch.input_ids:  torch.Size([1, 122])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13600203264
model.device: cuda:0
model.dtype: torch.bfloat16
143  batch.input_ids:  torch.Size([1, 120])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13602379264
model.device: cuda:0
model.dtype: torch.bfloat16
144  batch.input_ids:  torch.Size([1, 538])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13602262016
model.device: cuda:0
model.dtype: torch.bfloat16
145  batch.input_ids:  torch.Size([1, 398])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13629011968
model.device: cuda:0
model.dtype: torch.bfloat16
146  batch.input_ids:  torch.Size([1, 383])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13620049920
model.device: cuda:0
model.dtype: torch.bf

190  batch.input_ids:  torch.Size([1, 544])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13600854016
model.device: cuda:0
model.dtype: torch.bfloat16
191  batch.input_ids:  torch.Size([1, 31])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13630089728
model.device: cuda:0
model.dtype: torch.bfloat16
192  batch.input_ids:  torch.Size([1, 139])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13596661248
model.device: cuda:0
model.dtype: torch.bfloat16
193  batch.input_ids:  torch.Size([1, 493])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13603476480
model.device: cuda:0
model.dtype: torch.bfloat16
194  batch.input_ids:  torch.Size([1, 316])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13626128384
model.device: cuda:0
model.dtype: torch.bfloat16
195  batch.input_ids:  torch.Size([1, 95])
GPU memory occupied: 66218 MB.
torch.cuda.memory_allocated():  13615542272
model.device: cuda:0
model.dtype: torch.bfl

239  batch.input_ids:  torch.Size([1, 1041])
GPU memory occupied: 66226 MB.
torch.cuda.memory_allocated():  13647587840
model.device: cuda:0
model.dtype: torch.bfloat16
240  batch.input_ids:  torch.Size([1, 181])
GPU memory occupied: 66226 MB.
torch.cuda.memory_allocated():  13661198848
model.device: cuda:0
model.dtype: torch.bfloat16
241  batch.input_ids:  torch.Size([1, 145])
GPU memory occupied: 66226 MB.
torch.cuda.memory_allocated():  13606156800
model.device: cuda:0
model.dtype: torch.bfloat16
242  batch.input_ids:  torch.Size([1, 284])
GPU memory occupied: 66226 MB.
torch.cuda.memory_allocated():  13604504064
model.device: cuda:0
model.dtype: torch.bfloat16
243  batch.input_ids:  torch.Size([1, 1439])
GPU memory occupied: 66226 MB.
torch.cuda.memory_allocated():  13612780032
model.device: cuda:0
model.dtype: torch.bfloat16
244  batch.input_ids:  torch.Size([1, 104])
GPU memory occupied: 66226 MB.
torch.cuda.memory_allocated():  13686709248
model.device: cuda:0
model.dtype: torch

287  batch.input_ids:  torch.Size([1, 126])
GPU memory occupied: 66226 MB.
torch.cuda.memory_allocated():  13671407616
model.device: cuda:0
model.dtype: torch.bfloat16
288  batch.input_ids:  torch.Size([1, 158])
GPU memory occupied: 66226 MB.
torch.cuda.memory_allocated():  13602636800
model.device: cuda:0
model.dtype: torch.bfloat16
289  batch.input_ids:  torch.Size([1, 153])
GPU memory occupied: 66226 MB.
torch.cuda.memory_allocated():  13604684800
model.device: cuda:0
model.dtype: torch.bfloat16
290  batch.input_ids:  torch.Size([1, 1698])
GPU memory occupied: 66226 MB.
torch.cuda.memory_allocated():  13604401664
model.device: cuda:0
model.dtype: torch.bfloat16
291  batch.input_ids:  torch.Size([1, 1000])
GPU memory occupied: 66226 MB.
torch.cuda.memory_allocated():  13703267840
model.device: cuda:0
model.dtype: torch.bfloat16
292  batch.input_ids:  torch.Size([1, 87])
GPU memory occupied: 66226 MB.
torch.cuda.memory_allocated():  13658572800
model.device: cuda:0
model.dtype: torch.

336  batch.input_ids:  torch.Size([1, 101])
GPU memory occupied: 66226 MB.
torch.cuda.memory_allocated():  13603851264
model.device: cuda:0
model.dtype: torch.bfloat16
^C
Traceback (most recent call last):
  File "/dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/open_instruct/finetune.py", line 694, in <module>
    main()
  File "/dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/open_instruct/finetune.py", line 628, in main
    accelerator.backward(loss)
  File "/dccstor/mit_fm/miniconda/envs/open-instruct/lib/python3.10/site-packages/accelerate/accelerator.py", line 1821, in backward
    loss.backward(**kwargs)
  File "/dccstor/mit_fm/miniconda/envs/open-instruct/lib/python3.10/site-packages/torch/_tensor.py", line 487, in backward
    torch.autograd.backward(
  File "/dccstor/mit_fm/miniconda/envs/open-instruct/lib/python3.10/site-packages/torch/autograd/__init__.py", line 200, in backward
    Variable._execution_engine.run_backward(  # Calls in

# eval



In [106]:
# Shell script submitted for each eval job. Placeholders are filled in with
# `str.format`:
#   {cmd}      the command to run (echoed first, then executed)
#   {log_dir}  directory holding the job's `${LSB_JOBID}.out` log file
#   {save_dir} directory the log file is moved into after the command ran
# `${{LSB_JOBID}}` is brace-escaped so `format` leaves a literal `${LSB_JOBID}`
# for the shell (presumably set by the job scheduler -- confirm).
#
# The last line creates `{save_dir}` before moving the log. Without the
# `mkdir -p`, a command that failed before creating `{save_dir}` would make
# `mv` rename the log to a *file* called `{save_dir}` (or fail outright when
# the parent directory is missing). It is kept as a one-liner because it is
# not visible here how `submit_job_ccc` handles multi-line shell constructs.
shell_scripts_template = """
export OPENAI_API_KEY=$(cat ~/.openai_api_key)
export HF_HOME="/dccstor/mit_fm/wpq/hf_cache/"

source /dccstor/mit_fm/miniconda/bin/activate open-instruct
cd /dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/

echo "Running on $(hostname)"
echo "======"
echo "{cmd}"
echo "======"

{cmd}

[ ! -f "{log_dir}/${{LSB_JOBID}}.out" ] || (mkdir -p "{save_dir}" && mv "{log_dir}/${{LSB_JOBID}}.out" "{save_dir}")
"""

In [116]:
def cpu_mem_for_model(model_name_or_path, default=64):
    """Return the `cpu_mem` value to request for a model, based on size tags in its name.

    Only the last path component is inspected, lower-cased. Matching on the
    full path is wrong for the eval models in this notebook: they live under
    'results/baselines/', and 'baselines' contains the tag 'base', so every
    one of them (including 7b/11b models) would be given the smallest tier.
    Lower-casing makes names such as 'gpt2-Large' match 'large'.

    Args:
        model_name_or_path: model id or path, e.g. 'huggyllama/llama-7b' or
            'results/baselines/t5-3b'.
        default: value returned when no size tag matches (e.g. plain 'gpt2').
            Defaults to the largest tier so an unknown model is over- rather
            than under-provisioned.

    Returns:
        2, 15 or 64 for the recognised tiers, otherwise `default`.
    """
    name = model_name_or_path.rstrip('/').split('/')[-1].lower()
    if any(tag in name for tag in ('small', 'base', 'medium', 'large')):
        return 2
    if '3b' in name:
        return 15
    # 'xl' also covers 'xxl'; both are listed to mirror the model names used here.
    if any(tag in name for tag in ('7b', '11b', 'xl', 'xxl')):
        return 64
    return default


# Always assigns `cpu_mem`, so a stale value from an earlier cell cannot leak through.
cpu_mem = cpu_mem_for_model(model_name_or_path)
    

True

In [117]:
# Submit one `eval.gsm.run_eval` job per entry in `models` and record the
# returned job ids in `info` (keyed by model path).

# --- job settings ---
job_name = 'eval.gsm'
test_run = False
queue = 'x86_1h'
num_cpus = 10
cpu_mem = 64

# --- models to evaluate (uncomment a line to include that family) ---
models = []
# models += ['t5-small', 't5-base', 't5-large', 't5-3b', 't5-11b']
# models += ['t5-11b']
# models += ['google/flan-t5-small', 'google/flan-t5-base', 'google/flan-t5-large', 'google/flan-t5-xl', 'google/flan-t5-xxl']
# models += ['gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl']
models += ['huggyllama/llama-7b'] # , 'mosaicml/mpt-7b'

# Every model is looked up under results/baselines/.
models = [os.path.join('results/baselines', name) for name in models]

info, cmds = {}, []
for model_name_or_path in models:
    run_id = model_name_or_path
    save_dir = f'{model_name_or_path}/eval/gsm'

    cmd = f"""
    python -m eval.gsm.run_eval \
        --data_dir data/eval/gsm/ \
        --model_name_or_path {model_name_or_path} \
        --save_dir {save_dir} \
        --eval_batch_size 5 \
        --n_shot 8
    """
    cmd = multiline_to_singleline(cmd)
    cmds.append(cmd)
    print(cmd)

    # Render the job script, then hand it to the scheduler wrapper.
    shell_scripts = shell_scripts_template.format(
        cmd=cmd, log_dir=os.getcwd(), save_dir=save_dir)
    out = submit_job_ccc(
        shell_scripts,
        job_name=job_name,
        queue=queue,
        num_cpus=num_cpus,
        cpu_mem=cpu_mem,
        num_gpus=1,
        test_run=test_run,
    )
    # Debugging aid for test runs: print(out['jbsub_cmd'])
    if test_run:
        continue
    info[model_name_or_path] = out['job_id']
    

python -m eval.gsm.run_eval --data_dir data/eval/gsm/ --model_name_or_path results/baselines/huggyllama/llama-7b --save_dir results/baselines/huggyllama/llama-7b/eval/gsm --eval_batch_size 5 --n_shot 8


In [110]:
# Job ids of previously submitted baseline eval runs, keyed by model path.
info = {
    'results/baselines/t5-small': 1763441,
    'results/baselines/t5-base': 1763442,
    'results/baselines/t5-large': 1763443,
    'results/baselines/t5-3b': 1764783,
    'results/baselines/t5-11b': 1763445,
}

In [111]:


import pandas as pd

# One row per job in `info`: statistics are read from the job's `<job_id>.out`
# log file (resolved relative to the current working directory).
stat_fields = ('cpu_time', 'avg_mem', 'max_mem')

data = []
for run_path, job_id in info.items():
    stats = get_run_statistics(f'{job_id}.out')
    short_name = run_path.split('/')[-1]
    data.append((short_name, job_id, *(stats[field] for field in stat_fields)))

df = pd.DataFrame(data, columns=['name', 'job_id', *stat_fields])
df


Unnamed: 0,name,job_id,cpu_time,avg_mem,max_mem
0,t5-small,1763441,139.93,0.491738,0.597656
1,t5-base,1763442,258.6,0.729512,0.787109
2,t5-large,1763443,76.73,0.957783,1.317383
3,t5-3b,1764783,751.98,6.68915,11.693359
4,t5-11b,1763445,8.53,5.601807,10.0
