In [1]:
import os
from rosemary import jpt_setup; jpt_setup()

from transformers.trainer_utils import get_last_checkpoint

import re
from llm.submit import (
    multiline_to_singleline,
    submit_job_ccc,
    submit_job_aimos,
    submit_job,
    get_run_statistics)
import pandas as pd
import json
import platform
import tempfile
import subprocess
import shlex
import datetime
import itertools
import socket

arch = platform.uname().processor
hostname = socket.gethostname()
cluster = 'ccc' if hostname.startswith('ccc') else ('dcs' if hostname.startswith('dcs') else 'npl')
arch, cluster

  warn(f'Install `torch` for functionalities dependent on torch')


('ppc64le', 'dcs')

In [28]:

shell_scripts_template_slurm = """
echo "Running on $SLURM_JOB_NODELIST"
echo "======"

master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
master_port=10002
RDZV_ENDPOINT=$master_addr:$master_port

source ~/.profile
conda activate open-instruct
cd /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/

set -e
set -x
echo "======"
srun {cmd}

[ ! -f "{log_dir}/$SLURM_JOB_ID*.out" ] && mv {log_dir}/$SLURM_JOB_ID*.out {save_dir}
"""

shell_scripts_template_lsf = """
echo "Running on $LSB_DJOB_HOSTFILE"
echo "======"

master_addr=$(head -n 1 "$LSB_DJOB_HOSTFILE")
master_port=10002
RDZV_ENDPOINT=$master_addr:$master_port

source ~/.profile
conda activate open-instruct
cd /dccstor/mit_fm/wpq/github/mitibm2023/external/doremi/

set -e
set -x
echo "======"
srun {cmd}

[ ! -f "{log_dir}/$LSB_JOBID*.out" ] && mv {log_dir}/$LSB_JOBID*.out {save_dir}
"""

shell_scripts_template = shell_scripts_template_slurm \
    if arch == 'ppc64le' else shell_scripts_template_lsf

print(shell_scripts_template)


echo "Running on $SLURM_JOB_NODELIST"

master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
master_port=10002
RDZV_ENDPOINT=$master_addr:$master_port

source ~/.profile
conda activate open-instruct
cd /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/

set -e
set -x
srun {cmd}

[ ! -f "{log_dir}/$SLURM_JOB_ID*.out" ] && mv {log_dir}/$SLURM_JOB_ID*.out {save_dir}



In [44]:
## note setting the env in this notebook, then launch jobs 
# will inherit those env variables.

package_dir = "/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi"
cache_dir = '/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache'
preprocessed_data = "/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/processed"

envs = {
    "CACHE": cache_dir,
    "DOREMI_DIR": package_dir,
    "PILE_DIR": os.path.join(package_dir, "data", 'raw'),
    "PREPROCESSED_PILE_DIR": preprocessed_data,
    "MODEL_OUTPUT_DIR": os.path.join(package_dir, 'results'),
    "PARTITION": "el8",
    "HF_HOME": cache_dir,
    "TRANSFORMERS_CACHE": cache_dir,
    "HF_DATASETS_CACHE": cache_dir,
    "HF_DATASETS_IN_MEMORY_MAX_SIZE": "0",
    "TORCH_EXTENSIONS_DIR": cache_dir,
    "TMPDIR": cache_dir,
    "WANDB_DIR": os.path.join(cache_dir, "wandb"),
    "WANDB_MODE": 'offline',
    "PREPROCESSED_DATA": preprocessed_data,
    'PREPROCESSED_CACHE': os.path.join(cache_dir, 'preprocessed_cache', 'perdomain_pile_preprocessed'),
}

for k, v in envs.items():
    os.environ[k] = v
    
os.makedirs(cache_dir, exist_ok=True)
print('\n'.join([f'export {k}={v}' for k, v in envs.items()]))

export CACHE=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache
export DOREMI_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi
export PILE_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/raw
export PREPROCESSED_PILE_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/data/processed
export MODEL_OUTPUT_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/results
export PARTITION=el8
export HF_HOME=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache
export TRANSFORMERS_CACHE=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache
export HF_DATASETS_CACHE=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache
export HF_DATASETS_IN_MEMORY_MAX_SIZE=0
export TORCH_EXTENSIONS_DIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache
export TMPDIR=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/cache
export WANDB_DIR=/gpfs/

In [47]:
with open(os.path.join(package_dir, 'constants.sh'), 'w') as f:
    f.writelines('#!/bin/bash\n')
    f.writelines('\n'.join([f'{k}={v}' for k, v in envs.items()]))

In [39]:
import numpy as np

# cot, flanv2, dolly, oasst1
x = np.array([1000000, 1000000, 15011, 34795])
x/np.sqrt((x*x).sum())

array([0.70685306, 0.70685306, 0.01061057, 0.02459495])

In [41]:
# populate domain weight config
import json
domain_config_path = os.path.abspath('../configs/humanmix_baseline_50kvocab.json')
domain_weights = {"cot": .25, "flan_v2": .25, "dolly": .25, "oasst1": .25}
domain_weights = {'cot': 0.5, 'flan_v2': 0.25, 'dolly': 0.12, 'oasst1': 0.13}
domain_weights = {'cot': 0.70685306, 'flan_v2': 0.70685306, 'dolly': 0.01061057, 'oasst1': 0.02459495}
abbr_train_file = 'humanmix'

domain_config = {"train_domain_weights": domain_weights, "eval_domain_weights": domain_weights}
with open(domain_config_path, 'w') as f:
    json.dump(domain_config, f)


In [50]:
# train gpt2 base model
job_name = 'ft1'

test_run = 1
test_run = bool(test_run)
job_duration = 6

nodes = 1 # 128/(5*6*2)~=2.1
num_cpus, cpu_mem = (144, 512) if arch == 'ppc64le' else (32, 64)
num_gpus = 6; gpu_type = 'v100'
# num_gpus = 1; gpu_type = 'v100'

use_doremi = False
overwrite_output_dir = True if test_run else False

model_name_or_path = ('/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/results/baselines/'
                      'gpt2-medium')
model_type = 'gpt2'; abbr_model_name = 'gpt2-medium'

cache_dir = envs['CACHE']
domain_config_path = os.path.abspath('../configs/humanmix_baseline_50kvocab.json')
dataset_dir = preprocessed_data

if not use_doremi:
    per_device_train_batch_size = 2
    gradient_checkpointing = False
else:
    per_device_train_batch_size = 1
    gradient_checkpointing = True

total_batch_size = 128 # # 64*8=512
gradient_accumulation_steps = 1
gradient_accumulation_steps = int(total_batch_size/(num_gpus*nodes)/per_device_train_batch_size)
max_steps = int(200000/total_batch_size); save_steps = 100 # 200k steps.

print(f"Training {model_name_or_path} "
      f"using {num_gpus} GPUs, "
      f"{per_device_train_batch_size} batch size per GPU, "
      f"{gradient_accumulation_steps} gradient accumulation steps,"
      f"for {max_steps} max steps.")


if nodes == 1:
    exe = 'python' if num_gpus==1 else \
        f"""torchrun \
        --nproc_per_node={num_gpus} \
        --master_port=10002"""
else:
    exe = f"""torchrun \
              --nnodes={nodes} \
              --nproc_per_node={num_gpus} \
              --rdzv-id=$SLURM_JOB_ID \
              --rdzv-backend=c10d \
              --rdzv-endpoint=$RDZV_ENDPOINT"""
if test_run:
    exe = f"CUDA_VISIBLE_DEVICES={','.join(map(str, range(num_gpus)))} {exe}"

    
output_dirname = f'{abbr_model_name}_{abbr_train_file}'
if use_doremi:
    output_dirname += '_doremi'
else:
    output_dirname += '_baseline'
if test_run:
    output_dirname = 'jpt_'+output_dirname
run_name = os.path.join(job_name, output_dirname)
output_dir = os.path.join(envs['MODEL_OUTPUT_DIR'], run_name)


if use_doremi:
    reference_model_name_or_path = (
        '/gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/doremi/results/'
        'ft1/jpt_gpt2-medium_humanmix_baseline/checkpoint-10')
    doremi_options = f"""
        --doremi_optimizer=doremiv1 \
        --reweight_eta=1 \
        --reweight_eps=1e-4 \
        --train_domain_weights_tmp_file={os.path.join(output_dir, 'domain_weights')} \
        --reweight_domains \
        --remove_unused_columns=False \
        --reference_model_name_or_path={reference_model_name_or_path} \
    """
else:
    doremi_options = ''

cmd = f"""
    {'!cd .. && ' if test_run else ''}{exe}
    doremi/train.py \
    --model_name_or_path={model_name_or_path} \
    --model_type={model_type} \
    --do_train \
    --cache_dir={cache_dir} \
    --dataset_dir={dataset_dir} \
    --domain_config_path={domain_config_path} \
    --max_token_length=1024 \
    --per_device_train_batch_size={per_device_train_batch_size} \
    --gradient_accumulation_steps={gradient_accumulation_steps} \
    --dataloader_num_workers=1 \
    --learning_rate=2e-5 \
    --lr_scheduler_type=linear \
    --warmup_ratio=0.03 \
    --weight_decay=0. \
    --max_grad_norm=1.0 \
    --max_steps={max_steps} \
    --evaluation_strategy=no \
    --save_strategy=steps \
    --save_steps={save_steps} \
    --save_total_limit=1 \
    --run_name={run_name} \
    --seed=1111 \
    --logging_strategy=steps \
    --logging_steps=10 \
    --logging_first_step \
    --report_to=all \
    --optim=adamw_hf \
    --adam_beta1=0.9 \
    --adam_beta2=0.99 \
    {'--gradient_checkpointing' if gradient_checkpointing  else ''} \
    {'--overwrite_output_dir' if overwrite_output_dir else ''} \
    --torch_dtype=float32 \
    --add_domain_id=True \
    --do_padding=True \
    --fp16 \
    {doremi_options if doremi_options else ''} \
    --output_dir={output_dir} \
"""

if test_run:
    cmds = [x.strip() for x in cmd.split('    ') if x.strip()]
    cmds = ['    '+x if x.startswith('--') else x for x in cmds]
    print()
    print(' \\\n'.join(cmds))
cmd = multiline_to_singleline(cmd)

shell_scripts = shell_scripts_template.format(
    cmd=cmd,
    log_dir=os.getcwd(),
    save_dir=output_dir)
out = submit_job(
    shell_scripts, 
    job_name=job_name, 
    nodes=nodes,
    num_cpus=num_cpus,
    cpu_mem=cpu_mem,
    num_gpus=num_gpus,
    gpu_type=gpu_type,
    test_run=test_run,
    job_duration=job_duration,
)
if not test_run:
    print(out)

Training /gpfs/u/home/PTFM/PTFMqngp/scratch/github/mitibm2023/external/open-instruct/results/baselines/gpt2-medium using 6 GPUs, 2 batch size per GPU, 10 gradient accumulation steps,for 1562 max steps.

Submiting job with:
{
    "job_name": "ft1",
    "nodes": 1,
    "num_cpus": 144,
    "cpu_mem": 512,
    "num_gpus": 6,
    "gpu_type": "v100",
    "test_run": false,
    "queue": "el8",
    "num_jobs": 1
}
[{'args': 'sbatch --job-name=ft1 --partition=el8 --nodes=1 --ntasks-per-node=1 --cpus-per-task=144 --mem=512GB --gres=gpu:6 --output=/gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/scripts/%J.out --time=6:00:00 /gpfs/u/scratch/PTFM/PTFMqngp/github/mitibm2023/external/doremi/scripts/tmpgsl3_eur', 'job_id': 709549}]


In [None]:

# shared: v100_32gb gpu 
#
# train baseline on 200k data
# gpt2, nodes=1, num_gpus=6, micro-bsz=8, no-grad-ckpt, fp32, 1hr
# gpt2-medium, nodes=1, num_gpus=6, micro-bsz=4, no-grad-ckpt, fp32, oom
# gpt2-medium, nodes=1, num_gpus=6, micro-gsz=2, no-grad-ckpt, fp32, 2.5hrs
# gpt2-medium, nodes=1, num_gpus=6, micro-gsz=2, no-grad-ckpt, fp16, 1.5hrs
#
# train doremi on 200k data
# gpt2-medium, nodes=1, num_gpus=6, micro-gsz=2, no-grad-ckpt, fp32, oom
# gpt2-medium, nodes=1, num_gpus=6, micro-gsz=1, grad-ckpt, fp16, oom
# gpt2-medium, nodes=1, num_gpus=6, micro-gsz=1, grad-ckpt, fp32, 4hrs
# gpt2-medium, nodes=1, num_gpus=6, micro-gsz=1, grad-ckpt, fp16, 2.5hrs

