In [1]:
import os
from rosemary import jpt_setup; jpt_setup()

from transformers.trainer_utils import get_last_checkpoint

import re
from llm.submit import (
    multiline_to_singleline,
    submit_job_ccc,
    submit_job_aimos,
    submit_job,
    get_run_statistics)
import pandas as pd
import json
import platform
import tempfile
import subprocess
import shlex
import datetime
import itertools
import socket

arch = platform.uname().processor
hostname = socket.gethostname()
cluster = 'ccc' if hostname.startswith('ccc') else ('dcs' if hostname.startswith('dcs') else 'npl')
arch, cluster

  warn(f'Install `torch` for functionalities dependent on torch')


('ppc64le', 'dcs')

In [2]:

shell_scripts_template_slurm = """
echo "Running on $SLURM_JOB_NODELIST"
echo "======"

master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
master_port=10002
RDZV_ENDPOINT=$master_addr:$master_port

source ~/.profile
conda activate open-instruct
cd /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/

set -e
set -x
echo "======"
srun {cmd}

[ ! -f "{log_dir}/$SLURM_JOB_ID*.out" ] && mv {log_dir}/$SLURM_JOB_ID*.out {save_dir}
"""

shell_scripts_template_lsf = """
echo "Running on $LSB_DJOB_HOSTFILE"
echo "======"

master_addr=$(head -n 1 "$LSB_DJOB_HOSTFILE")
master_port=10002
RDZV_ENDPOINT=$master_addr:$master_port

source ~/.profile
conda activate open-instruct
cd /dccstor/mit_fm/wpq/github/mitibm2023/external/doremi/

set -e
set -x
echo "======"
srun {cmd}

[ ! -f "{log_dir}/$LSB_JOBID*.out" ] && mv {log_dir}/$LSB_JOBID*.out {save_dir}
"""

shell_scripts_template = shell_scripts_template_slurm \
    if arch == 'ppc64le' else shell_scripts_template_lsf

print(shell_scripts_template)


echo "Running on $SLURM_JOB_NODELIST"

master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
master_port=10002
RDZV_ENDPOINT=$master_addr:$master_port

source ~/.profile
conda activate open-instruct
cd /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/

set -e
set -x
srun {cmd}

[ ! -f "{log_dir}/$SLURM_JOB_ID*.out" ] && mv {log_dir}/$SLURM_JOB_ID*.out {save_dir}



In [3]:
## note setting the env in this notebook, then launch jobs 
# will inherit those env variables.

package_dir = "/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi"
cache_dir = '/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache'
preprocessed_data = "/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/processed"

envs = {
    "CACHE": cache_dir,
    "DOREMI_DIR": package_dir,
    "PILE_DIR": os.path.join(package_dir, "data", 'raw'),
    "PREPROCESSED_PILE_DIR": preprocessed_data,
    "MODEL_OUTPUT_DIR": os.path.join(package_dir, 'results'),
    "PARTITION": "el8",
    "HF_HOME": cache_dir,
    "TRANSFORMERS_CACHE": cache_dir,
    "HF_DATASETS_CACHE": cache_dir,
    "HF_DATASETS_IN_MEMORY_MAX_SIZE": "0",
    "TORCH_EXTENSIONS_DIR": cache_dir,
    "TMPDIR": cache_dir,
    "WANDB_DIR": os.path.join(cache_dir, "wandb"),
    "WANDB_MODE": 'offline',
    'WANDB_PROJECT': 'huggingface',
    "PREPROCESSED_DATA": preprocessed_data,
    'PREPROCESSED_CACHE': os.path.join(cache_dir, 'preprocessed_cache', 'perdomain_pile_preprocessed'),
}

for k, v in envs.items():
    os.environ[k] = v
    
os.makedirs(cache_dir, exist_ok=True)
print('\n'.join([f'export {k}={v}' for k, v in envs.items()]))

export CACHE=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache
export DOREMI_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi
export PILE_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/raw
export PREPROCESSED_PILE_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/processed
export MODEL_OUTPUT_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results
export PARTITION=el8
export HF_HOME=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache
export TRANSFORMERS_CACHE=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache
export HF_DATASETS_CACHE=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache
export HF_DATASETS_IN_MEMORY_MAX_SIZE=0
export TORCH_EXTENSIONS_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache
export TMPDIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache
export WANDB_DIR=/gpfs/

In [4]:
with open(os.path.join(package_dir, 'constants.sh'), 'w') as f:
    f.writelines('#!/bin/bash\n')
    f.writelines('\n'.join([f'{k}={v}' for k, v in envs.items()]))

In [5]:
import numpy as np

# cot, flanv2, dolly, oasst1
x = np.array([1000000, 1000000, 15011, 34795])
x/x.sum()

array([0.48785105, 0.48785105, 0.00732313, 0.01697478])

In [6]:
# populate domain weight config
import json


for abbr_train_file_mix, domain_weights in [
    ('humanmix_scaled',  {'cot': 0.48785105, 'flan_v2': 0.48785105, 'dolly': 0.00732313, 'oasst1': 0.01697478}),
    ('humanmix_uniform', {"cot": .25, "flan_v2": .25, "dolly": .25, "oasst1": .25})
]:
    domain_config_path = os.path.abspath(f'../configs/{abbr_train_file_mix}_baseline_50kvocab.json')
    domain_config = {"train_domain_weights": domain_weights, "eval_domain_weights": domain_weights}
    with open(domain_config_path, 'w') as f:
        json.dump(domain_config, f)


In [23]:
# train gpt2 base&doremi model
job_name = 'ft1'

# train gpt2 base&doremi model on correct humanmix data
# try to fix error with domain weights -> NaN
job_name = 'drm2'

test_run = 1
test_run = bool(test_run)
job_duration = 6

test_oom = False
report_to = 'all'
num_cpus, cpu_mem = (144, 512) if arch == 'ppc64le' else (32, 64)
nodes = 1; num_gpus = 6; gpu_type = 'v100'
nodes = 1; num_gpus = 1; gpu_type = 'v100'
# nodes = 1; num_gpus = 4; gpu_type = 'v100'


use_doremi = True
doremi_optimizer = 'doremiv1'
doremi_optimizer = 'doremiv2'
overwrite_output_dir = True if test_run else False

hf_models_dir = '/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/results/baselines/'
model_name_or_path = os.path.join(hf_models_dir, 'gpt2-medium'); model_type = 'gpt2'; abbr_model_name = 'gpt2-medium'
model_name_or_path = os.path.join(hf_models_dir, 'gpt2-xl'); model_type = 'gpt2'; abbr_model_name = 'gpt2-xl'
model_name_or_path = os.path.join(hf_models_dir, 'EleutherAI/pythia-160m'); model_type = 'pythia'; abbr_model_name = 'pythia-160m'


abbr_train_file = 'humanmix_scaled'
abbr_train_file = 'humanmix_uniform'

cache_dir = envs['CACHE']
domain_config_path = os.path.abspath(f'../configs/{abbr_train_file}_baseline_50kvocab.json')
dataset_dir = preprocessed_data

if not use_doremi:
    per_device_train_batch_size = 2
    gradient_checkpointing = False
else:
    per_device_train_batch_size = 1
    gradient_checkpointing = True

per_device_train_batch_size = 1
gradient_checkpointing = True

total_data_points = 200000
total_batch_size = 128
gradient_accumulation_steps = int(total_batch_size/(num_gpus*nodes)/per_device_train_batch_size)
max_steps = int(total_data_points/total_batch_size); save_steps = 100
save_steps = 10

lr = 2e-5

print(f"Training {model_name_or_path} "
      f"using {num_gpus} GPUs, "
      f"{per_device_train_batch_size} batch size per GPU, "
      f"{gradient_accumulation_steps} gradient accumulation steps, "
      f"for {max_steps} max steps. "
      f"Effective batch size {per_device_train_batch_size*nodes*num_gpus*gradient_accumulation_steps}")

fsdp = "full_shard auto_wrap"
# fsdp = False
if 'gpt2' in abbr_model_name: fsdp_transformer_layer_cls_to_wrap = 'GPT2Block'
elif 'llama' in abbr_model_name: fsdp_transformer_layer_cls_to_wrap = 'LlamaDecoderLayer'
elif 'mpt' in abbr_model_name: fsdp_transformer_layer_cls_to_wrap = 'MPTBlock'
elif 'pythia' in abbr_model_name: fsdp_transformer_layer_cls_to_wrap = 'GPTNeoXLayer'
else: raise ValueError('Not sure how to set `fsdp_transformer_layer_cls_to_wrap`')
    
    
# if nodes == 1:
if False:
    exe = 'python' if num_gpus==1 else \
        f"""torchrun \
        --nproc_per_node={num_gpus} \
        --master_port=10002"""
else:
    exe = f"""torchrun \
              --nnodes={nodes} \
              --nproc_per_node={num_gpus} \
              --rdzv-id=$SLURM_JOB_ID \
              --rdzv-backend=c10d \
              --rdzv-endpoint=$RDZV_ENDPOINT"""
if test_run:
    exe = f"CUDA_VISIBLE_DEVICES={','.join(map(str, range(num_gpus)))} {exe}"

    
output_dirname = f'{abbr_model_name}_{abbr_train_file}:{int(total_data_points/1000)}k'
if use_doremi:
    output_dirname += f'_{doremi_optimizer}'
else:
    output_dirname += '_baseline'

if test_run:
    run_name = 'jpt_'+output_dirname
else:
    run_name = os.path.join(job_name, output_dirname)
if test_oom:
    run_name += \
        ('_fsdp='+fsdp.split(' ')[0] if fsdp else '')+\
        ('_gradckpt='+str(gradient_checkpointing) if gradient_checkpointing else '')+\
        '_mbsz='+str(per_device_train_batch_size)+\
        '_seqlen='+str(1024)+\
        '_nodes='+str(nodes)
output_dir = os.path.join(envs['MODEL_OUTPUT_DIR'], run_name)

if test_run or test_oom:
    report_to = 'none'


if use_doremi:
    reference_model_name_or_path = \
        '/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/doremi/results/'
    if abbr_model_name == 'gpt2-medium':
        if abbr_train_file == 'humanmix_uniform':
            reference_model_name_or_path += 'drm2/gpt2-medium_humanmix_uniform:200k_baseline/'
        elif abbr_train_file == 'humanmix_scaled':
            reference_model_name_or_path += 'drm2/gpt2-medium_humanmix_scaled:200k_baseline/'
    elif abbr_model_name == 'pythia-160m':
        reference_model_name_or_path += 'jpt_pythia-160m_humanmix_uniform:200k_baseline/checkpoint-40'
    else:
        raise ValueError
    doremi_options = f"""
        --doremi_optimizer={doremi_optimizer} \
        --reweight_eta=1 \
        --reweight_eps=1e-4 \
        --train_domain_weights_tmp_file={os.path.join(output_dir, 'domain_weights')} \
        --reweight_domains \
        --remove_unused_columns=False \
        --reference_model_name_or_path={reference_model_name_or_path} \
    """
else:
    doremi_options = ''

cmd = f"""
    {'!cd .. && ' if test_run else ''}{exe}
    doremi/train.py \
    --model_name_or_path={model_name_or_path} \
    --model_type={model_type} \
    --do_train \
    --cache_dir={cache_dir} \
    --dataset_dir={dataset_dir} \
    --domain_config_path={domain_config_path} \
    --max_token_length=1024 \
    --per_device_train_batch_size={per_device_train_batch_size} \
    --gradient_accumulation_steps={gradient_accumulation_steps} \
    --dataloader_num_workers=1 \
    --learning_rate={lr} \
    --lr_scheduler_type=linear \
    --warmup_ratio=0.03 \
    --weight_decay=0. \
    --max_grad_norm=1.0 \
    --max_steps={max_steps} \
    --evaluation_strategy=no \
    --save_strategy=steps \
    --save_steps={save_steps} \
    --save_total_limit=1 \
    --run_name={run_name} \
    --seed=1111 \
    --logging_strategy=steps \
    --logging_steps=10 \
    --logging_first_step \
    --report_to={report_to} \
    --optim=adamw_hf \
    --adam_beta1=0.9 \
    --adam_beta2=0.99 \
    {'--fsdp="'+fsdp+'"' if fsdp else ''} \
    {'--fsdp_transformer_layer_cls_to_wrap="'+fsdp_transformer_layer_cls_to_wrap+'"' 
        if fsdp else ''} \
    {'--gradient_checkpointing' if gradient_checkpointing  else ''} \
    --torch_dtype=float32 \
    --add_domain_id=True \
    --do_padding=True \
    --fp16 \
    {doremi_options if doremi_options else ''} \
    {'--overwrite_output_dir' if overwrite_output_dir else ''} \
    --output_dir={output_dir} \
"""

if test_run:
    cmds = [x.strip() for x in cmd.split('    ') if x.strip()]
    cmds = ['    '+x if x.startswith('--') else x for x in cmds]
    print()
    print(' \\\n'.join(cmds))
cmd = multiline_to_singleline(cmd)

shell_scripts = shell_scripts_template.format(
    cmd=cmd,
    log_dir=os.getcwd(),
    save_dir=output_dir)
out = submit_job(
    shell_scripts, 
    job_name=job_name, 
    nodes=nodes,
    num_cpus=num_cpus,
    cpu_mem=cpu_mem,
    num_gpus=num_gpus,
    gpu_type=gpu_type,
    test_run=test_run,
    job_duration=job_duration,
)
if not test_run:
    print(out)

Training /gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/results/baselines/EleutherAI/pythia-160m using 1 GPUs, 1 batch size per GPU, 128 gradient accumulation steps, for 1562 max steps. Effective batch size 128

!cd .. && CUDA_VISIBLE_DEVICES=0 torchrun \
    --nnodes=1 \
    --nproc_per_node=1 \
    --rdzv-id=$SLURM_JOB_ID \
    --rdzv-backend=c10d \
    --rdzv-endpoint=$RDZV_ENDPOINT \
doremi/train.py \
    --model_name_or_path=/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/results/baselines/EleutherAI/pythia-160m \
    --model_type=pythia \
    --do_train \
    --cache_dir=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache \
    --dataset_dir=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/processed \
    --domain_config_path=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/configs/humanmix_uniform_baseline_50kvocab.json \
    --max_token_length=1024 \
    --per_device_tr

In [None]:

!cd .. && CUDA_VISIBLE_DEVICES=0 torchrun \
    --nnodes=1 \
    --nproc_per_node=1 \
    --rdzv-id=$SLURM_JOB_ID \
    --rdzv-backend=c10d \
    --rdzv-endpoint=$RDZV_ENDPOINT \
doremi/train.py \
    --model_name_or_path=/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/results/baselines/EleutherAI/pythia-160m \
    --model_type=pythia \
    --do_train \
    --cache_dir=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache \
    --dataset_dir=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/processed \
    --domain_config_path=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/configs/humanmix_uniform_baseline_50kvocab.json \
    --max_token_length=1024 \
    --per_device_train_batch_size=1 \
    --gradient_accumulation_steps=128 \
    --dataloader_num_workers=1 \
    --learning_rate=2e-05 \
    --lr_scheduler_type=linear \
    --warmup_ratio=0.03 \
    --weight_decay=0. \
    --max_grad_norm=1.0 \
    --max_steps=1562 \
    --evaluation_strategy=no \
    --save_strategy=steps \
    --save_steps=10 \
    --save_total_limit=1 \
    --run_name=jpt_pythia-160m_humanmix_uniform:200k_doremiv2 \
    --seed=1111 \
    --logging_strategy=steps \
    --logging_steps=10 \
    --logging_first_step \
    --report_to=none \
    --optim=adamw_hf \
    --adam_beta1=0.9 \
    --adam_beta2=0.99 \
    --fsdp="full_shard auto_wrap" \
    --fsdp_transformer_layer_cls_to_wrap="GPTNeoXLayer" \
    --gradient_checkpointing \
    --torch_dtype=float32 \
    --add_domain_id=True \
    --do_padding=True \
    --fp16 \
    --doremi_optimizer=doremiv2 \
    --reweight_eta=1 \
    --reweight_eps=1e-4 \
    --train_domain_weights_tmp_file=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results/jpt_pythia-160m_humanmix_uniform:200k_doremiv2/domain_weights \
    --reweight_domains \
    --remove_unused_columns=False \
    --reference_model_name_or_path=/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/doremi/results/jpt_pythia-160m_humanmix_uniform:200k_baseline/checkpoint-40 \
    --overwrite_output_dir \
    --output_dir=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results/jpt_pythia-160m_humanmix_uniform:200k_doremiv2


master_addr is only used for static rdzv_backend and when rdzv_endpoint is not specified.
[2023-08-08 22:56:22,667] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
08/08/2023 22:56:30 - INFO - __main__ - Training/evaluation parameters FullTrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.99,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=1,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=True,
domain_config_path=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/configs/humanmix_uniform_baseline_50kvocab.json,
doremi_optimizer=doremiv2,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=n


Submiting job with:
{
    "queue": "el8",
    "num_jobs": 1
}


[{'args': 'sbatch --job-name=wpq-job --partition=el8 --nodes=1 --ntasks-per-node=1 --cpus-per-task=1 --mem=3GB --gres=gpu:1 --output=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/scripts/%J.out --time=6:00:00 /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/scripts/tmp1zicrw5p',
  'job_id': 717682}]