In [32]:
import pandas as pd

# Column headers of the benchmark summary table, in the order used by each row of `data`.
cols = [
    'model',
    'size',
    'mixed-precision',
    'deepspeed',
    'gpu mem usage (GB)',
    'per-epoch time (hr)',
    'per-iter time (s)',
]

# Hand-entered measurements, one tuple per run, laid out in `cols` order.
# Setup for every row: 55k data points, batch_size=128.
# `None` means no value was entered for that run.
# NOTE(review): 'size' is presumably the parameter count in billions -- confirm.
# NOTE(review): the training log saved further down shows 868 steps for
# `--num_train_epochs 2`; 868 steps x 25 s/step is about 6 hr, so the
# 'per-epoch time (hr)' values look like whole-run (2-epoch) times -- confirm.
data = [
    ('gpt2', 0.124, 'bf16', 'no', 10, None, None),
    ('gpt2-Large', 0.774, 'bf16', 'no', 36, 2.5, 11),
    # Turning DeepSpeed on is costly: same model, but per-iter time goes from 11 s to 25 s.
    ('gpt2-Large', 0.774, 'bf16', 'stage 3 no offloading', 40, 6, 25),
    # On 1 a100_40g: without offloading this OOMs in `.backward()`; with offloading it runs fine.
    ('gpt2-xl', 1.5, 'bf16', 'stage 3 with offloading', 40, 13, 55),
]

# Header lines describing the hardware and dataset, printed above the table.
print(
    'a100_40g:',
    '55k data points, <1k sequence lengths / data point.',
    sep='\n',
)

# Build the table; the bare `df` on the last line makes it the cell's rich output.
df = pd.DataFrame(data=data, columns=cols)
df


55k data points, <1k sequence lengths / data point.


Unnamed: 0,model,size,mixed-precision,deepspeed,gpu mem usage (GB),per-epoch time (hr),per-iter time (s)
0,gpt2,0.124,bf16,no,10,,
1,gpt2-Large,0.774,bf16,no,36,2.5,11.0
2,gpt2-Large,0.774,bf16,stage 3 no offloading,40,6.0,25.0
3,gpt2-xl,1.5,bf16,stage 3 with offloading,40,13.0,55.0


In [25]:
# Model to fine-tune and its maximum sequence length.
# Exactly one line is active; the others are ready-to-uncomment alternatives.
# model_name_or_path = 'mosaicml/mpt-7b'; max_seq_length = 2048
# model_name_or_path = 'gpt2'; max_seq_length = 1024
model_name_or_path = 'gpt2-Large'; max_seq_length = 1024
# model_name_or_path = 'gpt2-xl'; max_seq_length = 1024


# Training data, and an output directory named after the last path component
# of the model id (e.g. 'mosaicml/mpt-7b' -> 'results/mpt-7b_oasst1').
train_file = 'data/processed/oasst1/oasst1_data.jsonl'
# output_dir = 'results/mpt-7b_oasst1'
output_dir = f"results/{model_name_or_path.split('/')[-1]}_oasst1"

# DeepSpeed config passed to `accelerate launch`; exactly one line is active.
# deepspeed_config_file = 'ds_configs/stage3_no_offloading_accelerate_setauto.conf'
# deepspeed_config_file = 'ds_configs/stage3_offloading_accelerate.conf'
deepspeed_config_file = 'ds_configs/stage3_offloading_accelerate_setauto.conf'

# Batch geometry: the total batch size is reached by accumulating gradients
# over `gradient_acc_steps` micro-batches of `num_gpus * batch_size_per_gpu` samples.
num_gpus = 1
batch_size_per_gpu = 2
total_batch_size = 128
# Fail loudly instead of truncating: a non-divisible setting would otherwise
# train with a smaller effective batch size than `total_batch_size`.
if total_batch_size % (num_gpus * batch_size_per_gpu) != 0:
    raise ValueError(
        f"total_batch_size ({total_batch_size}) must be a multiple of "
        f"num_gpus * batch_size_per_gpu ({num_gpus} * {batch_size_per_gpu})."
    )
gradient_acc_steps = total_batch_size // (num_gpus * batch_size_per_gpu)

print(f"Training {model_name_or_path} "
      f"using {num_gpus} GPUs, "
      f"{batch_size_per_gpu} batch size per GPU, "
      f"{gradient_acc_steps} gradient accumulation steps.")

# Flags intentionally left OFF the command below:
#     --use_slow_tokenizer \
#         keep the fast tokenizer.
#         NOTE(review): the earlier note said mpt-7b "does not have a fast
#         tokenizer counter-part", which contradicts using the fast one;
#         presumably it has no *slow* counterpart -- confirm.
#     --use_flash_attn \
#         flash-attn could not be installed with cuda 12.1.

# Build the launch command as a single line: each trailing backslash inside the
# (non-raw) f-string joins the next source line, so only the leading and
# trailing newlines survive. The text is printed so it can be pasted into a
# `!` shell cell; it is not executed here.
cmd = f"""
!cd .. && \
accelerate launch \
    --mixed_precision bf16 \
    --num_machines 1 \
    --num_processes {num_gpus} \
    --use_deepspeed \
    --deepspeed_config_file {deepspeed_config_file} \
    open_instruct/finetune.py \
    --model_name_or_path {model_name_or_path} \
    --tokenizer_name {model_name_or_path} \
    --train_file {train_file} \
    --max_seq_length {max_seq_length} \
    --preprocessing_num_workers 16 \
    --per_device_train_batch_size {batch_size_per_gpu} \
    --gradient_accumulation_steps {gradient_acc_steps} \
    --learning_rate 2e-5 \
    --lr_scheduler_type linear \
    --warmup_ratio 0.03 \
    --weight_decay 0. \
    --num_train_epochs 2 \
    --output_dir {output_dir} \
    --with_tracking \
    --report_to tensorboard \
    --logging_steps 1
"""
print(cmd)


Training gpt2-Large using 1 GPUs, 2 batch size per GPU, 64 gradient accumulation steps.

!cd .. && accelerate launch     --mixed_precision bf16     --num_machines 1     --num_processes 1     --use_deepspeed     --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate_setauto.conf     open_instruct/finetune.py     --model_name_or_path gpt2-Large     --tokenizer_name gpt2-Large     --train_file data/processed/oasst1/oasst1_data.jsonl     --max_seq_length 1024     --preprocessing_num_workers 16     --per_device_train_batch_size 2     --gradient_accumulation_steps 64     --learning_rate 2e-5     --lr_scheduler_type linear     --warmup_ratio 0.03     --weight_decay 0.     --num_train_epochs 2     --output_dir results/gpt2-Large_oasst1     --with_tracking     --report_to tensorboard     --logging_steps 1



In [7]:
from transformers import AutoTokenizer

# Load the fast tokenizer for the 'gpt2' checkpoint. The bare `tokenizer` on the
# last line shows its repr (vocab size, max length, special tokens) as the cell output.
tokenizer_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name,
    use_fast=True,
)
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [26]:
# Run the fine-tuning job from the parent directory via `accelerate launch` with DeepSpeed.
# The command text is identical to the one shown in the saved output of the
# command-building cell (gpt2-Large, 1 process, micro-batch 2 x 64 accumulation steps).
# NOTE(review): this pasted command uses
# ds_configs/stage3_no_offloading_accelerate_setauto.conf, while the
# command-building cell currently assigns stage3_offloading_accelerate_setauto.conf;
# re-print and re-paste the command if the offloading config is the intended one.
# The saved output of this cell ends with KeyboardInterrupt after the first few steps.
!cd .. && accelerate launch     --mixed_precision bf16     --num_machines 1     --num_processes 1     --use_deepspeed     --deepspeed_config_file ds_configs/stage3_no_offloading_accelerate_setauto.conf     open_instruct/finetune.py     --model_name_or_path gpt2-Large     --tokenizer_name gpt2-Large     --train_file data/processed/oasst1/oasst1_data.jsonl     --max_seq_length 1024     --preprocessing_num_workers 16     --per_device_train_batch_size 2     --gradient_accumulation_steps 64     --learning_rate 2e-5     --lr_scheduler_type linear     --warmup_ratio 0.03     --weight_decay 0.     --num_train_epochs 2     --output_dir results/gpt2-Large_oasst1     --with_tracking     --report_to tensorboard     --logging_steps 1


The following values were not passed to `accelerate launch` and had defaults used instead:
	`--dynamo_backend` was set to a value of `'no'`

Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 120
CUDA SETUP: Loading binary /dccstor/mit_fm/miniconda/envs/open-instruct/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda120.so...
[2023-07-06 16:31:42,143] [INFO] [comm.py:586:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
07/06/2023 16:31:42 - INFO - __main__ - Distributed environment: DEEPSPEED  Backend: ncc

100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.43s/it]
loading configuration file config.json from cache at /dccstor/mit_fm/wpq/hf_cache/hub/models--gpt2-Large/snapshots/97935fc1a406f447320c3db70fe9e9875dca2595/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-Large",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1280,
  "n_head": 20,
  "n_inner": null,
  "n_layer": 36,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "t

07/06/2023 16:32:22 - INFO - __main__ - Sample 37597 of the training set: {'input_ids': tensor([   27,    91,  7220,    91,    29,   198,   126,   123, 46141,   415,
          418,  9195,  4951,   384,   289,   272,  1171,  4533,  1619,  1960,
          273, 19068,  1081, 44273,    30,   198,    27,    91,   562, 10167,
           91,    29,   198,  4834,  3350,   695,  5733,   285, 40138,   390,
         1679, 50256]), 'labels': tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  4834,  3350,   695,  5733,   285, 40138,   390,
         1679, 50256]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}.
07/06/2023 16:32:22 - INFO - accelerate.accelerator - Updating Deep

  0%|                                                   | 0/868 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|                                        | 1/868 [00:46<11:08:38, 46.27s/it]07/06/2023 16:33:11 - INFO - __main__ -   Step: 1, LR: 7.692307692307694e-07, Loss: 2.390869140625
  0%|                                         | 2/868 [01:12<8:19:26, 34.60s/it]07/06/2023 16:33:38 - INFO - __main__ -   Step: 2, LR: 1.5384615384615387e-06, Loss: 2.469482421875
  0%|▏                                        | 3/868 [01:38<7:20:57, 30.59s/it]07/06/2023 16:34:03 - INFO - __main__ -   Step: 3, LR: 2.307692307692308e-06, Loss: 2.6964111328125
  0%|▏                                        | 4/868 [02:04<6:53:40, 28.73s/it]07/06/2023 16:34:29 - INFO - __main__ -   Step: 4, LR: 3.0769230769230774e-

KeyboardInterrupt: 