# Launch LBANN traing jobs
Code to create a batch script for launching jobs on cori GPU

Sep 1, 2020

In [1]:
import os
import glob,time
import subprocess as sp
import numpy as np

import yaml


In [2]:
curr_dir=os.getcwd()
print(curr_dir)

/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/repositories/lbann_cosmogan/1_train/run_scripts


## Define machine and code 

In [3]:
machine='cori'
img_size=128
assert machine in ['cori','summit'], "Error%s"%(machine)

## Read ref_launch.yaml and define dictionary

In [4]:
config_file='ref_launch.yaml'

with open(config_file) as f:
    config_dict= yaml.load(f, Loader=yaml.SafeLoader)
    
## Read ref_launch.yaml
dict_pars=config_dict[machine][img_size]

In [5]:
! ls /global/cscratch1/sd/vpa/proj/cosmogan/results_dir/128square/


20200911_083711_bsize64_spec_test_128_nospec
20200911_084729_bsize64_spec_test_128_withspec
20200914_220904_bsize64_spec_test_128_withspec
20201202_200004_bsize256_scale0.5
20201203_084645_bsize256_scale0.1
20201208_165658_bsize256_scale0.1_deterministic_on
20201209_055955_bsize256_scale0.1_deterministic_on
20210202_061803_bsize256_scale0.1_deterministic_off
20210226_175209_bsize256_scale0.1_deterministic_off
20210407_110817_bsize128_train
20210407_131151_bsize128_train_layer_norm_100kdata
20210408_100355_bsize128_layer-norm_200kdata
20210409_083631_bsize128_with_bnorm_200kdata
20210422_132856_bsize128_bnorm_new_decay
20210507_084712_bsize128_bnorm_new_decay
norm_1_train_val.npy
scaling_runs
test


In [7]:
# 13,18850
# 17,24250
# 18,25800
lst=[(9, 13050), (10, 15450), (11, 15800), (11, 16350), (11, 16500), 
 (12, 18200), (16, 22950), (16, 23200), (17, 24150), (19, 27650)]

In [11]:
epoch,step=lst[0]
epoch,step

(9, 13050)

In [48]:
epoch,step=lst[8]

In [49]:
fldr='20210507_084712_bsize128_bnorm_new_decay'
data_dir='/global/cscratch1/sd/vpa/proj/cosmogan/results_dir/{3}square/{0}/chkpt/trainer0/sgd.shared.training_batch_begin.epoch.{1}.step.{2}/model0/'.format(fldr,epoch,step,img_size)

dict_pars.update({'nodes':1,'time':'10','job_name':'gen_img',
                 'data_dir':data_dir})
print(dict_pars)

{'install_loc': '/global/cfs/cdirs/m3363/lbann/tom_lbann_install_20210223', 'spack_loc': '/global/cfs/cdirs/m3363/lbann/tom_spack', 'code_dir': '/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/repositories/lbann_cosmogan/1_train/main_code/', 'config': 'config_cori_128.yaml', 'staging_loc': '/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/', 'nodes': 1, 'time': '10', 'job_name': 'gen_img', 'data_dir': '/global/cscratch1/sd/vpa/proj/cosmogan/results_dir/128square/20210507_084712_bsize128_bnorm_new_decay/chkpt/trainer0/sgd.shared.training_batch_begin.epoch.17.step.24150/model0/'}


In [50]:
cori_strg='''#!/bin/bash
#################
#SBATCH --nodes={nodes}
#SBATCH --qos=regular
#SBATCH --output=slurm-%x-%j.out
#SBATCH --constraint=gpu
#SBATCH --account=m3363
#SBATCH --ntasks-per-node=8 
#SBATCH --cpus-per-task=8 
#SBATCH --gpus-per-task=1
#SBATCH --time={time}
#SBATCH --job-name={job_name}

echo "--start date" `date` `date +%s`
echo '--hostname ' $HOSTNAME

### Initial setup
module purge
module load cgpu
module load modules/3.2.11.4 gcc/8.3.0 cuda/10.2.89 mvapich2/2.3.2 cmake/3.14.4 python/3.7-anaconda-2019.10

module use {install_loc}/etc/modulefiles
module load lbann

export MKL_THREADING_LAYER=GNU

export SPACK_ROOT={spack_loc}
$SPACK_ROOT/share/spack/setup-env.sh
export MV2_ENABLE_AFFINITY=0
export MV2_USE_CUDA=1
export IBV_FORK_SAFE=1

### Run the main code
code_dir={code_dir}
data_dir={data_dir}
export config_file=$code_dir'/{config}'
python $code_dir/test_exagan.py -dr $data_dir

echo "--end date" `date` `date +%s`
'''


In [51]:
# Build bash string
if machine=='cori':
    bash_strg=cori_strg.format(**dict_pars)
elif machine=='summit':
    bash_strg=summit_strg.format(**dict_pars)


In [52]:
fname='batch_inference.sh'
filename=dict_pars['staging_loc']+fname
with open (filename,'w') as f:
    f.write(bash_strg)
print(filename)

/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/batch_inference.sh


In [53]:
### Move to staging locations in project space:
os.chdir(dict_pars['staging_loc'])

In [54]:
%%bash -s "$filename" ## Use python variable in bash
cat $1
chmod +x $1

#!/bin/bash
#################
#SBATCH --nodes=1
#SBATCH --qos=regular
#SBATCH --output=slurm-%x-%j.out
#SBATCH --constraint=gpu
#SBATCH --account=m3363
#SBATCH --ntasks-per-node=8 
#SBATCH --cpus-per-task=8 
#SBATCH --gpus-per-task=1
#SBATCH --time=10
#SBATCH --job-name=gen_img

echo "--start date" `date` `date +%s`
echo '--hostname ' $HOSTNAME

### Initial setup
module purge
module load cgpu
module load modules/3.2.11.4 gcc/8.3.0 cuda/10.2.89 mvapich2/2.3.2 cmake/3.14.4 python/3.7-anaconda-2019.10

module use /global/cfs/cdirs/m3363/lbann/tom_lbann_install_20210223/etc/modulefiles
module load lbann

export MKL_THREADING_LAYER=GNU

export SPACK_ROOT=/global/cfs/cdirs/m3363/lbann/tom_spack
$SPACK_ROOT/share/spack/setup-env.sh
export MV2_ENABLE_AFFINITY=0
export MV2_USE_CUDA=1
export IBV_FORK_SAFE=1

### Run the main code
code_dir=/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/repositories/lbann_cosmogan/1_train/main_code/
data_dir=/global/cscratch1/sd/vpa/proj/cosmogan/resul

## Submit job to cori GPU

In [55]:
%%bash -s "$filename" ## Use python variable in bash
module load cgpu
sbatch $1

Submitted batch job 1894190


In [56]:
ls

[0m[01;34m20200529_111342_seed3273_80epochs[0m/  slurm-analysis-41827872.out
[01;34m20200626_075510_batchsize_256[0m/      slurm-analysis-42443214.out
[01;32mbatch_analysis.sh[0m*                  slurm-train-1793075.out
[01;32mbatch_inference.sh[0m*                 slurm-train-1798852.out
[01;32mbatch_inference_test.sh[0m*            slurm-train-1835698.out
[01;32mbatch_train.sh[0m*                     slurm-train-1893231.out
[01;32mbatch_train_test.sh[0m*                [01;32mstartup.sh[0m*
[01;34mold_run_files[0m/                      [01;32mtom_batch.sh[0m*
