In [2]:
import os
from math import ceil
from time import sleep
from itertools import product
import re
import shutil
import subprocess
from threading import Event
import signal
from IPython.display import clear_output
from time import sleep


In [3]:
# Root directory for benchmark runs: write_slurm_file creates one
# sub-directory per configuration underneath it.
ORIGIN_DIR = "/gpfsdswork/projects/idris/sos/ssos039/bench_pie/benches"

In [14]:
def write_slurm_string(parameters, gpus_per_node=8):
    """Render the Slurm batch script for one benchmark configuration.

    Parameters
    ----------
    parameters : dict
        One configuration with the keys ``model_name``, ``training_dist``,
        ``nbr_gpu``, ``batch_size``, ``max_length`` and ``is_debug``.
        ``training_dist`` is a backend name optionally followed by a numeric
        stage: ``"deepspeed3"`` is passed to the CLI as backend ``deepspeed``
        with stage ``3``. A name without trailing digits (such as ``"fsdp"``)
        is passed through unchanged with stage ``1``.
        NOTE(review): the values accepted by ``pie train`` are not visible
        from this notebook; confirm stage 1 is a sensible default for
        backends other than fsdp.
    gpus_per_node : int, optional
        Largest number of GPUs requested on a single node (default 8, the
        value this notebook already used when splitting 16 GPUs into two
        nodes). Requests above it are spread over several full nodes.

    Returns
    -------
    str
        The text of the batch script. The ``srun`` command is emitted on a
        single line and the text does not end with a newline.

    Raises
    ------
    KeyError
        If ``parameters`` lacks one of the required keys.
    ValueError
        If ``gpus_per_node`` is not positive, or if ``nbr_gpu`` exceeds
        ``gpus_per_node`` without being a multiple of it.
    """
    model_name = parameters["model_name"]
    training_dist = parameters["training_dist"]
    nbr_gpu = int(parameters["nbr_gpu"])
    batch_size = parameters["batch_size"]
    max_length = parameters["max_length"]
    debug = "--debug" if parameters["is_debug"] else ""

    if gpus_per_node < 1:
        raise ValueError(f"gpus_per_node must be positive, got {gpus_per_node}")

    # Anything larger than one node is split into full nodes; the script then
    # requests `gpus_per_node` tasks and GPUs on each of them.
    if nbr_gpu > gpus_per_node:
        nbr_node, leftover = divmod(nbr_gpu, gpus_per_node)
        if leftover:
            raise ValueError(
                f"nbr_gpu={nbr_gpu} cannot be split into full nodes of "
                f"{gpus_per_node} GPUs"
            )
        nbr_gpu = gpus_per_node
    else:
        nbr_node = 1

    # A job that uses every GPU of a node asks for the node exclusively.
    exclu = "#SBATCH --exclusive" if nbr_gpu == gpus_per_node else ""

    # Split "<backend><stage digits>"; without digits the stage defaults to 1.
    ddp = training_dist.rstrip("0123456789")
    stage = training_dist[len(ddp):] or 1

    # In this (non-raw) literal every backslash-newline pair is a line
    # continuation that Python drops, so the srun command below is written
    # to the script as one line, ending right after the debug flag.
    slurm_string = f"""#!/bin/sh

#SBATCH --job-name=BenchPieVictor
#SBATCH --account=sos@a100
#SBATCH -C a100

#SBATCH --nodes={nbr_node}
#SBATCH --cpus-per-task=8
#SBATCH --ntasks-per-node={nbr_gpu}
#SBATCH --gres=gpu:{nbr_gpu}

#SBATCH --output=output.out
#SBATCH --error=error.err

#SBATCH --hint=nomultithread
#SBATCH --time=00:30:00
#SBATCH --qos=qos_gpu-t3

{exclu}


##conda deactivate
module purge

## load Pytorch module
module load cpuarch/amd
module load pytorch-gpu/py3/2.0.1

export PATH=/gpfswork/idris/sos/ssos039/.local/bin:$PATH

## launch script on every node
set -x

# code execution
srun pie train \
-c ../../config.json \
--model_name {model_name} \
--training_dist {ddp} \
--stage {stage} \
--batch_size {batch_size} \
--seq_length {max_length} \
{debug} \
"""
    return slurm_string

In [5]:
def write_slurm_file(config):
    """Create the run directory of ``config`` and write its Slurm script there.

    The directory name is ``bench_pie`` followed by one ``--<key>-<value>``
    segment per item of ``config`` (in iteration order), where every run of
    characters other than ASCII letters and digits is replaced by a single
    underscore. The directory is created under ``ORIGIN_DIR`` and receives a
    file named ``slurm_file.slurm`` holding the output of
    ``write_slurm_string(config)``.

    Side effect: the process working directory is changed to the run
    directory and left there. ``launch_bench`` depends on this, because it
    submits ``slurm_file.slurm`` by its relative name.

    Parameters
    ----------
    config : dict
        One benchmark configuration, as accepted by ``write_slurm_string``.

    Returns
    -------
    str
        The name of the run directory (not its full path).
    """
    def _slug(item):
        # Collapse anything that is not an ASCII letter or digit to "_".
        return re.sub('[^0-9a-zA-Z]+', '_', str(item))

    # Render first: a config that cannot be rendered must not leave behind a
    # directory, a changed working directory or an empty script file.
    script = write_slurm_string(config)

    name = "bench_pie" + "".join(
        f"--{_slug(key)}-{_slug(value)}" for key, value in config.items()
    )

    path = os.path.join(ORIGIN_DIR, name)
    # exist_ok removes the gap between checking for the directory and
    # creating it, and makes re-running the same configuration harmless.
    os.makedirs(path, exist_ok=True)
    # shutil.copy("/gpfsdswork/projects/idris/sos/ssos039/benchmarks_throughput_OptimDDP_PEFT/benchmarks_optimDDP_peft_bf16/benchmarks_optimDDP_peft_bf16.py",path)
    os.chdir(path)

    with open("slurm_file.slurm", "w") as f:
        f.write(script)

    return name

In [6]:
def all_combinations(configs):
    """Expand a parameter grid into the list of every concrete configuration.

    Parameters
    ----------
    configs : dict
        Maps each parameter name to an iterable of the values to try.

    Returns
    -------
    list of dict
        One dict per element of the Cartesian product of the value lists.
        Each dict has the keys of ``configs`` in their original order, and
        the combinations follow ``itertools.product`` order (the last
        parameter varies fastest). An empty grid yields a single empty
        configuration; a parameter with no values yields an empty list.
    """
    # Work from the argument rather than the notebook-level `parameters`
    # global, so the function expands whatever grid it is given.
    keys = list(configs)
    return [dict(zip(keys, combo)) for combo in product(*configs.values())]

In [7]:
def launch_bench(configs):
    """Write and submit one Slurm job per configuration.

    For each configuration the run directory and script are produced by
    ``write_slurm_file``, the script is submitted with ``sbatch``, and one
    status line is printed. A failed submission is reported and the loop
    carries on with the next configuration.

    Parameters
    ----------
    configs : iterable of dict
        Configurations as accepted by ``write_slurm_file``.

    Returns
    -------
    None
    """
    for config in configs:
        # write_slurm_file leaves the process inside the run directory, which
        # is why the script can be submitted by its bare file name.
        name = write_slurm_file(config)
        status = os.system("sbatch slurm_file.slurm")
        if status != 0:
            # Do not announce a run as launched when the submission failed.
            print('sbatch failed (status ' + str(status) + ') for: ' + name)
        else:
            print('Last training: ' + name)
        # Pause between submissions (presumably to go easy on the scheduler).
        sleep(1)

In [15]:
# Full sweep, kept for reference (disabled):
# parameters = {
#     "model_name": ["meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf"],
#     "training_dist": ["deepspeed2", "deepspeed3", "fsdp"],
#     "nbr_gpu": [4, 8, 16],
#     "batch_size": [1, 2, 4, 8, 16, 32, 64],
#     "max_length": [1024],
#     "is_debug": [True, False],
# }

# Active grid: each key maps to the list of values to try; every combination
# becomes one job. Key order is also the order of the fields in the run
# directory names.
parameters = {
    "model_name": ["meta-llama/Llama-2-13b-hf"],
    "training_dist": ["fsdp", "deepspeed3"],
    "nbr_gpu": [16],
    "batch_size": [1, 2, 3],
    "max_length": [1024],
    "is_debug": [True],
}

In [16]:
# Expand the grid into every parameter combination, then write and submit one
# Slurm job per combination. Running this cell submits jobs immediately.
all_configs = all_combinations(parameters)
launch_bench(all_configs)

Submitted batch job 1480754
Last training: bench_pie--model_name-meta_llama_Llama_2_13b_hf--training_dist-fsdp--nbr_gpu-16--batch_size-1--max_length-1024--is_debug-True
Submitted batch job 1480757
Last training: bench_pie--model_name-meta_llama_Llama_2_13b_hf--training_dist-fsdp--nbr_gpu-16--batch_size-2--max_length-1024--is_debug-True
Submitted batch job 1480758
Last training: bench_pie--model_name-meta_llama_Llama_2_13b_hf--training_dist-fsdp--nbr_gpu-16--batch_size-3--max_length-1024--is_debug-True
Submitted batch job 1480759
Last training: bench_pie--model_name-meta_llama_Llama_2_13b_hf--training_dist-deepspeed3--nbr_gpu-16--batch_size-1--max_length-1024--is_debug-True
Submitted batch job 1480762
Last training: bench_pie--model_name-meta_llama_Llama_2_13b_hf--training_dist-deepspeed3--nbr_gpu-16--batch_size-2--max_length-1024--is_debug-True
Submitted batch job 1480763
Last training: bench_pie--model_name-meta_llama_Llama_2_13b_hf--training_dist-deepspeed3--nbr_gpu-16--batch_size-3-