# Launch LBANN traing jobs
Code to create a batch script for launching jobs on cori GPU

Sep 1, 2020

In [1]:
import os
import glob,time
import subprocess as sp
import numpy as np

import yaml


In [2]:
start_dir=os.getcwd()
print(start_dir)

/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/repositories/cosmogan_pytorch/code/1_basic_GAN/new_run_scripts


## Define machine and code 

In [3]:
facility='cori'
img_size=128
run_type='2dgan'
assert facility in ['cori','summit'], "Error%s"%(facility)
assert img_size in [128,512]
assert run_type in ['2dgan','2dcgan','3dgan','3dcgan']

## Read ref_launch.yaml and define dictionary

In [4]:
launch_cfile=start_dir+'/ref_launch.yaml'

with open(launch_cfile) as f:
    config_dict= yaml.load(f, Loader=yaml.SafeLoader)

In [28]:
## Read ref_launch.yaml
dict_pars=config_dict[machine][img_size]
dict_pars.update({'nodes':1,'gpus_per_node':8,'time':'4:00:00','job_name':'ddp_1node_2_basic_loss'})

print(dict_pars)


{'install_loc': '/global/cfs/cdirs/m3363/lbann/tom_lbann_install_20210223', 'spack_loc': '/global/cfs/cdirs/m3363/lbann/tom_spack', 'code_dir': '/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/repositories/cosmogan_pytorch/code/1_basic_GAN/1_main_code/DDP_new_loss/', 'config': 'config_2dgan.yaml', 'staging_loc': '/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_from_other_code/pytorch/results/', 'nodes': 1, 'gpus_per_node': 8, 'time': '4:00:00', 'job_name': 'ddp_1node_2_basic_loss', 'cpus_per_node': 80}


In [None]:
cori_strg='''
#!/bin/bash
#################
#SBATCH --nodes={nodes}
#SBATCH --qos=regular
#SBATCH --output=slurm-%x-%j.out
#SBATCH --constraint=gpu
#SBATCH --account=m3363
#SBATCH --ntasks-per-node={gpus_per_node}
#SBATCH --cpus-per-task=10
#SBATCH --gpus-per-task=1
#SBATCH --time={time}
#SBATCH --job-name={job_name}

echo "--start date" `date` `date +%s`
echo '--hostname ' $HOSTNAME

#nGPU_per_node=$SLURM_GPUS_PER_TASK 

### Initial setup
module load cgpu
module load pytorch/v1.6.0-gpu
#conda activate v3

N=${SLURM_NNODES}
nprocspn=${SLURM_NTASKS_PER_NODE}
G=$[ $N * $nprocspn ]
echo S:`hostname` G=$G  N=$N
export MASTER_ADDR=`hostname --ip-address`
echo S:use MASTER_ADDR=$MASTER_ADDR

### Run the main code
code_dir={code_dir}

export CMD=" python -u $code_dir/main.py --config $code_dir/config_2dgan.yaml --facility cori --ddp"
srun -n $G bash -c ' ${CMD} '

echo "--end date" `date` `date +%s`
'''


In [None]:
summit_strg='''
#!/bin/bash
# Begin LSF directives
#BSUB -P AST153
#BSUB -J {job_name} 
#BSUB -o lsf-2dgan.%J
#BSUB -W {time}
#BSUB -nnodes {nodes}
#BSUB -alloc_flags "nvme smt4"
# End LSF directives and begin shell commands

NODES=$(cat ${LSB_DJOB_HOSTFILE} | sort | uniq | grep -v login | grep -v batch | wc -l)

module load open-ce
# add this 2 lines to run inside conda with my privately compiled Apex
module load gcc/8
conda activate cloned_env

export OMP_NUM_THREADS=1
export NCCL_DEBUG=INFO

config=bs1024-smt
ver=S  # train.py version
jobId=${LSB_JOBID}

code_dir={code_dir}
config_file=$code_dir/'config_2dgan_summit.yaml'

CMD=" python -u $code_dir/main.py --config config_file --facility summit --ddp "
echo "S:CMD=$CMD"

SMPIARGS="--smpiargs "off""
jsrun -n${NODES} -a6 -c42 -g6 -r1 $SMPIARGS --bind=proportional-packed:7 --launch_distribution=packed ./launchSummit.sh "$CMD"
'''

In [30]:
# Build bash string
if machine=='cori':
    bash_strg=cori_strg.format(**dict_pars)
elif machine=='summit':
    bash_strg=summit_strg.format(**dict_pars)


In [31]:
fname='batch_train_ddptest.sh'
filename=dict_pars['staging_loc']+fname
with open (filename,'w') as f:
    f.write(bash_strg)
print(filename)

/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_from_other_code/pytorch/results/batch_train_ddptest.sh


In [32]:
### Move to staging locations in project space:
# os.chdir(dict_pars['staging_loc'])

In [33]:
%%bash -s "$filename" ## Use python variable in bash
cat $1
chmod +x $1

#!/bin/bash
#################
#SBATCH --nodes=1
#SBATCH --qos=regular
#SBATCH --output=slurm-%x-%j.out
#SBATCH --constraint=gpu
#SBATCH --account=m3363
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=80
#SBATCH --gpus-per-task=8
#SBATCH --time=4:00:00
#SBATCH --job-name=ddp_1node_2_basic_loss

echo "--start date" `date` `date +%s`
echo '--hostname ' $HOSTNAME

### Initial setup
module load cgpu
module load pytorch/v1.6.0-gpu  

### Run the main code
code_dir=/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/repositories/cosmogan_pytorch/code/1_basic_GAN/1_main_code/DDP_new_loss/

srun -n1 python -m torch.distributed.launch --nproc_per_node=8 $code_dir/main.py --config $code_dir/config_2dgan.yaml
echo "--end date" `date` `date +%s`
    

## Submit job to cori GPU

In [34]:
# %%bash -s "$filename" ## Use python variable in bash
# module load cgpu
# sbatch $1

In [35]:
200000*0.25/(16)

3125.0