In [1]:
import os
from rosemary import jpt_setup; jpt_setup()

from transformers.trainer_utils import get_last_checkpoint

import re
from llm.submit import (
    multiline_to_singleline,
    submit_job_ccc,
    submit_job_aimos,
    submit_job,
    get_run_statistics)
import pandas as pd
import json
import platform
import tempfile
import subprocess
import shlex
import datetime
import itertools
import socket

arch = platform.uname().processor
hostname = socket.gethostname()
cluster = 'ccc' if hostname.startswith('ccc') else ('dcs' if hostname.startswith('dcs') else 'npl')
arch, cluster

  warn(f'Install `torch` for functionalities dependent on torch')


('ppc64le', 'dcs')

In [2]:

shell_scripts_template_slurm = """
echo "Running on $SLURM_JOB_NODELIST"
echo "======"

master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
master_port=10002
RDZV_ENDPOINT=$master_addr:$master_port

source ~/.profile
conda activate open-instruct
cd /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/open-instruct/

set -e
set -x
echo "======"
srun {cmd}

[ ! -f "{log_dir}/$SLURM_JOB_ID*.out" ] && mv {log_dir}/$SLURM_JOB_ID*.out {save_dir}
"""

shell_scripts_template_lsf = """
echo "Running on $LSB_DJOB_HOSTFILE"
echo "======"

master_addr=$(head -n 1 "$LSB_DJOB_HOSTFILE")
master_port=10002
RDZV_ENDPOINT=$master_addr:$master_port

source ~/.profile
conda activate open-instruct
cd /dccstor/mit_fm/wpq/github/mitibm2023/external/open-instruct/

set -e
set -x
echo "======"
srun {cmd}

[ ! -f "{log_dir}/$LSB_JOBID*.out" ] && mv {log_dir}/$LSB_JOBID*.out {save_dir}
"""

shell_scripts_template = shell_scripts_template_slurm \
    if arch == 'ppc64le' else shell_scripts_template_lsf

print(shell_scripts_template)


echo "Running on $SLURM_JOB_NODELIST"

master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
master_port=10002
RDZV_ENDPOINT=$master_addr:$master_port

source ~/.profile
conda activate open-instruct
cd /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/open-instruct/

set -e
set -x
srun {cmd}

[ ! -f "{log_dir}/$SLURM_JOB_ID*.out" ] && mv {log_dir}/$SLURM_JOB_ID*.out {save_dir}



In [9]:
package_dir = "/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi"
cache_dir = '/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache'
preprocessed_data = "/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/processed"

envs = {
    "CACHE": cache_dir,
    "DOREMI_DIR": package_dir,
    "PILE_DIR": os.path.join(package_dir, "data", 'raw'),
    "PREPROCESSED_PILE_DIR": preprocessed_data,
    "MODEL_OUTPUT_DIR": os.path.join(package_dir, 'results'),
    "PARTITION": "el8",
    "HF_HOME": cache_dir,
    "TRANSFORMERS_CACHE": cache_dir,
    "HF_DATASETS_CACHE": cache_dir,
    "HF_DATASETS_IN_MEMORY_MAX_SIZE": "0",
    "TORCH_EXTENSIONS_DIR": cache_dir,
    "TMPDIR": cache_dir,
    "WANDB_DIR": os.path.join(cache_dir, "wandb"),
    "WANDB_MODE": 'offline',
    "PREPROCESSED_DATA": preprocessed_data,
    'PREPROCESSED_CACHE': os.path.join(cache_dir, 'preprocessed_cache', 'perdomain_pile_preprocessed'),

}

for k, v in envs.items():
    os.environ[k] = v
    
os.makedirs(cache_dir, exist_ok=True)
print(';'.join([f'export {k}={v}' for k, v in envs.items()]))

export CACHE=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache;export DOREMI_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi;export PILE_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/raw;export PREPROCESSED_PILE_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/processed;export MODEL_OUTPUT_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results;export PARTITION=el8;export HF_HOME=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache;export TRANSFORMERS_CACHE=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache;export HF_DATASETS_CACHE=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache;export HF_DATASETS_IN_MEMORY_MAX_SIZE=0;export TORCH_EXTENSIONS_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache;export TMPDIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache;export WANDB_DIR=/gpfs/

In [4]:
# populate domain weight config
import json
domain_config_path = os.path.abspath('../configs/humanmix_baseline_50kvocab.json')
domain_weights = {"cot": .25, "flan_v2": .25, "dolly": .25, "oasst1": .25}
domain_weights = {'cot': 0.5, 'flan_v2': 0.25, 'dolly': 0.12, 'oasst1': 0.13}
abbr_train_file = 'humanmix'

domain_config = {"train_domain_weights": domain_weights, "eval_domain_weights": domain_weights}
with open(domain_config_path, 'w') as f:
    json.dump(domain_config, f)


In [22]:
# train gpt2 base model
job_name = 'ft1'

test_run = 1
test_run = bool(test_run)
job_duration = 6

nodes = 1 # 128/(5*6*2)~=2.1
num_cpus, cpu_mem = (144, 512) if arch == 'ppc64le' else (32, 64)
num_gpus = 6; gpu_type = 'v100'
# num_gpus = 1; gpu_type = 'v100'

use_doremi = True
overwrite_output_dir = True if test_run else False

model_name_or_path = ('/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/results/baselines/'
                      'gpt2-medium')
model_type = 'gpt2'; abbr_model_name = 'gpt2-medium'

cache_dir = envs['CACHE']
domain_config_path = os.path.abspath('../configs/humanmix_baseline_50kvocab.json')
dataset_dir = preprocessed_data


total_batch_size = 128 # # 64*8=512
per_device_train_batch_size = 1
gradient_accumulation_steps = 1
gradient_accumulation_steps = int(total_batch_size/(num_gpus*nodes)/per_device_train_batch_size)
max_steps = int(200000/total_batch_size); save_steps = 10 # 200k steps.
gradient_checkpointing = False

print(f"Training {model_name_or_path} "
      f"using {num_gpus} GPUs, "
      f"{per_device_train_batch_size} batch size per GPU, "
      f"{gradient_accumulation_steps} gradient accumulation steps,"
      f"for {max_steps} max steps.")


if nodes == 1:
    exe = 'python' if num_gpus==1 else \
        f"""torchrun \
        --nproc_per_node={num_gpus} \
        --master_port=10002"""
else:
    exe = f"""torchrun \
              --nnodes={nodes} \
              --nproc_per_node={num_gpus} \
              --rdzv-id=$SLURM_JOB_ID \
              --rdzv-backend=c10d \
              --rdzv-endpoint=$RDZV_ENDPOINT"""
if test_run:
    exe = f"CUDA_VISIBLE_DEVICES={','.join(map(str, range(num_gpus)))} {exe}"
    
# use `dataset_dir` instead of `dataset_name` to specify `preprocessed_dir`
# --dataset_name=pile \

## learning rate for pretraining, substituted with finetuning hyperparameters
# --learning_rate 1e-3 \
# --lr_end 1e-4 \
# --adam_epsilon 1e-8 \

## don't need cosine scheduling for finetuning
# --weight_decay 0.01 \
# --lr_scheduler_name linear_warmup_cosine \
# --warmup_ratio 0.06 \

## avoids grad scaling error
# --fp16 \
## for training model from scratch
# --config_overrides="n_positions=1024,n_embd=1024,n_layer=18,n_head=16" \

## added the following
# add_domain_id: for non-pile preprocessed dataset
# do_padding: true for variable size sequences, as in instruction tuning datasets.
# --max_train_samples 1000 \



output_dirname = f'{abbr_model_name}_{abbr_train_file}'
if use_doremi:
    output_dirname += '_doremi'
else:
    output_dirname += '_baseline'
if test_run:
    output_dirname = 'jpt_'+output_dirname
run_name = os.path.join(job_name, output_dirname)
output_dir = os.path.join(envs['MODEL_OUTPUT_DIR'], run_name)


if use_doremi:
    reference_model_name_or_path = (
        '/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/doremi/results/'
        'ft1/jpt_gpt2-medium_humanmix_baseline/checkpoint-10')
    doremi_options = f"""
        --doremi_optimizer=doremiv1 \
        --reweight_eta=1 \
        --reweight_eps=1e-4 \
        --train_domain_weights_tmp_file={os.path.join(output_dir, 'domain_weights')} \
        --reweight_domains \
        --remove_unused_columns=False \
        --reference_model_name_or_path={reference_model_name_or_path} \
    """
else:
    doremi_options = ''

cmd = f"""
    {'!cd .. && ' if test_run else ''}{exe}
    doremi/train.py \
    --model_name_or_path={model_name_or_path} \
    --model_type={model_type} \
    --do_train \
    --cache_dir={cache_dir} \
    --dataset_dir={dataset_dir} \
    --domain_config_path={domain_config_path} \
    --max_token_length=1024 \
    --per_device_train_batch_size={per_device_train_batch_size} \
    --gradient_accumulation_steps={gradient_accumulation_steps} \
    --dataloader_num_workers=1 \
    --learning_rate=2e-5 \
    --lr_scheduler_type=linear \
    --warmup_ratio=0.03 \
    --weight_decay=0. \
    --max_grad_norm=1.0 \
    --max_steps={max_steps} \
    --evaluation_strategy=no \
    --save_strategy=steps \
    --save_steps={save_steps} \
    --save_total_limit=1 \
    --run_name={run_name} \
    --seed=1111 \
    --logging_strategy=steps \
    --logging_steps=10 \
    --logging_first_step \
    --report_to=all \
    --optim=adamw_hf \
    --adam_beta1=0.9 \
    --adam_beta2=0.99 \
    {'--gradient_checkpointing' if gradient_checkpointing  else ''} \
    {'--overwrite_output_dir' if overwrite_output_dir else ''} \
    --torch_dtype=float32 \
    --add_domain_id=True \
    --do_padding=True \
    --fp16 \
    {doremi_options if doremi_options else ''} \
    --output_dir={output_dir} \
"""

if test_run:
    cmds = [x.strip() for x in cmd.split('    ') if x.strip()]
    cmds = ['    '+x if x.startswith('--') else x for x in cmds]
    print()
    print(' \\\n'.join(cmds))
cmd = multiline_to_singleline(cmd)

shell_scripts = shell_scripts_template.format(
    cmd=cmd,
    log_dir=os.getcwd(),
    save_dir=output_dir)
out = submit_job(
    shell_scripts, 
    job_name=job_name, 
    nodes=nodes,
    num_cpus=num_cpus,
    cpu_mem=cpu_mem,
    num_gpus=num_gpus,
    gpu_type=gpu_type,
    test_run=test_run,
    job_duration=job_duration,
)
if not test_run:
    print(out)

Training /gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/results/baselines/gpt2-medium using 6 GPUs, 1 batch size per GPU, 21 gradient accumulation steps,for 1562 max steps.

!cd .. && CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 torchrun \
    --nproc_per_node=6 \
    --master_port=10002 \
doremi/train.py \
    --model_name_or_path=/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/results/baselines/gpt2-medium \
    --model_type=gpt2 \
    --do_train \
    --cache_dir=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache \
    --dataset_dir=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/processed \
    --domain_config_path=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/configs/humanmix_baseline_50kvocab.json \
    --max_token_length=1024 \
    --per_device_train_batch_size=1 \
    --gradient_accumulation_steps=21 \
    --dataloader_num_workers=1 \
    --learning_rate=2e-5 \
    --lr_schedu

In [21]:
!cd .. && CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 torchrun \
    --nproc_per_node=6 \
    --master_port=10002 \
doremi/train.py \
    --model_name_or_path=/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/results/baselines/gpt2-medium \
    --model_type=gpt2 \
    --do_train \
    --cache_dir=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache \
    --dataset_dir=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/processed \
    --domain_config_path=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/configs/humanmix_baseline_50kvocab.json \
    --max_token_length=1024 \
    --per_device_train_batch_size=1 \
    --gradient_accumulation_steps=21 \
    --dataloader_num_workers=1 \
    --learning_rate=2e-5 \
    --lr_scheduler_type=linear \
    --warmup_ratio=0.03 \
    --weight_decay=0. \
    --max_grad_norm=1.0 \
    --max_steps=1562 \
    --evaluation_strategy=no \
    --save_strategy=steps \
    --save_steps=10 \
    --save_total_limit=1 \
    --run_name=ft1/jpt_gpt2-medium_humanmix_doremi \
    --seed=1111 \
    --logging_strategy=steps \
    --logging_steps=10 \
    --logging_first_step \
    --report_to=all \
    --optim=adamw_hf \
    --adam_beta1=0.9 \
    --adam_beta2=0.99 \
    --overwrite_output_dir \
    --torch_dtype=float32 \
    --add_domain_id=True \
    --do_padding=True \
    --fp16 \
    --doremi_optimizer=doremiv1 \
    --reweight_eta=1 \
    --reweight_eps=1e-4 \
    --train_domain_weights_tmp_file=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results/ft1/jpt_gpt2-medium_humanmix_doremi/domain_weights \
    --reweight_domains \
    --remove_unused_columns=False \
    --reference_model_name_or_path=/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/doremi/results/ft1/jpt_gpt2-medium_humanmix_baseline/checkpoint-10 \
    --output_dir=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results/ft1/jpt_gpt2-medium_humanmix_doremi

*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
*****************************************
[2023-08-02 17:02:34,552] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-08-02 17:02:34,600] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-08-02 17:02:34,618] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-08-02 17:02:34,628] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-08-02 17:02:34,634] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2023-08-02 17:02:34,636] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda 

[INFO|modeling_utils.py:2604] 2023-08-02 17:02:42,465 >> loading weights file /gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/results/baselines/gpt2-medium/model.safetensors
[INFO|modeling_utils.py:1176] 2023-08-02 17:02:42,501 >> Instantiating GPT2LMHeadModelDoReMi model under default dtype torch.float32.
[INFO|configuration_utils.py:603] 2023-08-02 17:02:42,501 >> Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.32.0.dev0"
}

100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 258.13it/s]
[INFO|modeling_utils.py:3333] 2023-08-02 17:02:53,965 >> All model checkpoint weights were used when initializing GPT2LMHeadModelDoReMi.

[INFO|modeling_utils.py:3341] 2023-08-02 17:02:53,965 >> All the weights of GPT2LMHeadModelDoReMi were initialized from the model checkpoint at /gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/result

08/02/2023 17:02:56 - INFO - datasets.builder - Using custom data configuration default-68fd04897f8e942c
08/02/2023 17:02:56 - INFO - datasets.info - Loading Dataset Infos from /gpfs/u/scratch/PTFM/PTFMqngp/miniconda3/envs/open-instruct/lib/python3.10/site-packages/datasets/packaged_modules/json
08/02/2023 17:02:56 - INFO - datasets.builder - Overwrite dataset info from restored data version if exists.
08/02/2023 17:02:56 - INFO - datasets.info - Loading Dataset info from /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-68fd04897f8e942c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
08/02/2023 17:02:56 - INFO - datasets.info - Loading Dataset info from /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-68fd04897f8e942c/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 408.17it/s]
08/02/2023 17:02:56 - INFO - 

08/02/2023 17:02:56 - INFO - datasets.arrow_dataset - Process #0 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-a01381664fd2589b/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-7ef14d143e0d41c8_00000_of_00008.arrow
08/02/2023 17:02:56 - INFO - datasets.arrow_dataset - Process #1 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-a01381664fd2589b/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-7ef14d143e0d41c8_00001_of_00008.arrow
08/02/2023 17:02:56 - INFO - datasets.arrow_dataset - Process #2 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache/json/default-a01381664fd2589b/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-7ef14d143e0d41c8_00002_of_00008.arrow
08/02/2023 17:02:56 - INFO - datasets.arrow_dataset - Process #3 will write at /gpfs/u/scratch/PTFM/PTFMqngp/github/mi

100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 421.07it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 417.51it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 419.39it/s]
08/02/2023 17:02:57 - INFO - datasets.builder - Using custom data configuration default-01077b84e742c354
08/02/2023 17:02:57 - INFO - datasets.info - Loading Dataset Infos from /gpfs/u/scratch/PTFM/PTFMqngp/miniconda3/envs/open-instruct/lib/python3.10/site-packages/datasets/packaged_modules/generator
[ERROR|tokenization_utils_base.py:1056] 2023-08-02 17:02:57,032 >> Using pad_token, but it is not set yet.
08/02/2023 17:02:57 - INFO - datasets.builder - Using custom data configuration default-37913fb2903e52c8
08/02/2023 17:02:57 - INFO - datasets.info - Loading Dataset Infos from /gpfs/u/scratch/PTFM/PTFMqngp/miniconda3/envs/open-instruct/lib/python3.10/site-packages/datasets/packaged_modules/generator
08/02/2023 17:02:57 - INFO - datasets.builder 

[INFO|trainer.py:1682] 2023-08-02 17:03:09,113 >> ***** Running training *****
[INFO|trainer.py:1683] 2023-08-02 17:03:09,113 >>   Num examples = 196,812
[INFO|trainer.py:1684] 2023-08-02 17:03:09,113 >>   Num Epochs = 9,223,372,036,854,775,807
[INFO|trainer.py:1685] 2023-08-02 17:03:09,113 >>   Instantaneous batch size per device = 1
[INFO|trainer.py:1688] 2023-08-02 17:03:09,113 >>   Total train batch size (w. parallel, distributed & accumulation) = 126
[INFO|trainer.py:1689] 2023-08-02 17:03:09,113 >>   Gradient Accumulation steps = 21
[INFO|trainer.py:1690] 2023-08-02 17:03:09,113 >>   Total optimization steps = 1,562
[INFO|trainer.py:1691] 2023-08-02 17:03:09,117 >>   Number of trainable parameters = 354,823,168
[INFO|integrations.py:716] 2023-08-02 17:03:09,126 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Tracking run with wandb version 0.15.5
[34m[1mwandb[0m: W&B syncing is set to [1m`offline`[0m in 


KeyboardInterrupt



In [None]:

# shared: v100_32gb gpu 
#
# train baseline on 200k data
# gpt2, nodes=1, num_gpus=6, micro-bsz=8, no-grad-ckpt, fp32, 1hr
# gpt2-medium, nodes=1, num_gpus=6, micro-bsz=4, no-grad-ckpt, fp32, oom
# gpt2-medium, nodes=1, num_gpus=6, micro-gsz=2, no-grad-ckpt, fp32, 2.5hrs
# gpt2-medium, nodes=1, num_gpus=6, micro-gsz=2, no-grad-ckpt, fp16, 1.5hrs
#
# train doremi on 200k data
# gpt2-medium, nodes=1, num_gpus=6, micro-gsz=2, no-grad-ckpt, fp32, oom
# gpt2-medium, nodes=1, num_gpus=6, micro-gsz=1, grad-ckpt, fp32, 4hrs
# gpt2-medium, nodes=1, num_gpus=6, micro-gsz=1, grad-ckpt, fp16, 2.5hrs

