# Launch LBANN traing jobs
Code to create a batch script for launching jobs on cori GPU

Sep 1, 2020

In [1]:
import os
import glob,time
import subprocess as sp
import numpy as np

import yaml


In [2]:
curr_dir=os.getcwd()
print(curr_dir)

/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/repositories/lbann_cosmogan/1_train/run_scripts


## Define machine and code 

In [3]:
machine='cori'
img_size=128
assert machine in ['cori','summit'], "Error%s"%(machine)

## Read ref_launch.yaml and define dictionary

In [5]:
config_file='ref_launch.yaml'

with open(config_file) as f:
    config_dict= yaml.load(f, Loader=yaml.SafeLoader)
    
    
## Read ref_launch.yaml
dict_pars=config_dict[machine][img_size]
dict_pars.update({'nodes':1,'time':'3:00:00','job_name':'train'})
print(dict_pars)

{'install_loc': '/global/cfs/cdirs/m3363/lbann/tom_lbann_install_20210223', 'spack_loc': '/global/cfs/cdirs/m3363/lbann/tom_spack', 'code_dir': '/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/repositories/lbann_cosmogan/1_train/main_code/', 'config': 'config_cori_128.yaml', 'staging_loc': '/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/', 'nodes': 1, 'time': '3:00:00', 'job_name': 'train'}


In [6]:
cori_strg='''#!/bin/bash
#################
#SBATCH --nodes={nodes}
#SBATCH --qos=regular
#SBATCH --output=slurm-%x-%j.out
#SBATCH --constraint=gpu
#SBATCH --account=m3363
#SBATCH --ntasks-per-node=8 
#SBATCH --cpus-per-task=8 
#SBATCH --gpus-per-task=1
#SBATCH --time={time}
#SBATCH --job-name={job_name}

echo "--start date" `date` `date +%s`
echo '--hostname ' $HOSTNAME

### Initial setup
module purge
module load cgpu
module load modules/3.2.11.4 gcc/8.3.0 cuda/10.2.89 mvapich2/2.3.2 cmake/3.14.4 python/3.7-anaconda-2019.10

module use {install_loc}/etc/modulefiles
module load lbann

export MKL_THREADING_LAYER=GNU

export SPACK_ROOT={spack_loc}
$SPACK_ROOT/share/spack/setup-env.sh
export MV2_ENABLE_AFFINITY=0
export MV2_USE_CUDA=1
export IBV_FORK_SAFE=1

### Run the main code
code_dir={code_dir}
export config_file=$code_dir'/{config}'
python $code_dir/train_exagan.py

echo "--end date" `date` `date +%s`
'''


### String for summit
summit_strg='''


echo "--start date" `date` `date +%s`
echo '--hostname ' $HOSTNAME


### Run the main code
code_dir={code_dir}
export config_file=$code_dir'/{config}'
python $code_dir/train_exagan.py

echo "--end date" `date` `date +%s`
'''

In [7]:
# Build bash string
if machine=='cori':
    bash_strg=cori_strg.format(**dict_pars)
elif machine=='summit':
    bash_strg=summit_strg.format(**dict_pars)


In [8]:
fname='batch_train_test.sh'
filename=dict_pars['staging_loc']+fname
with open (filename,'w') as f:
    f.write(bash_strg)
print(filename)

/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_data/batch_train_test.sh


In [9]:
### Move to staging locations in project space:
os.chdir(dict_pars['staging_loc'])

In [10]:
%%bash -s "$filename" ## Use python variable in bash
cat $1
chmod +x $1

#!/bin/bash
#################
#SBATCH --nodes=1
#SBATCH --qos=regular
#SBATCH --output=slurm-%x-%j.out
#SBATCH --constraint=gpu
#SBATCH --account=m3363
#SBATCH --ntasks-per-node=8 
#SBATCH --cpus-per-task=8 
#SBATCH --gpus-per-task=1
#SBATCH --time=3:00:00
#SBATCH --job-name=train

echo "--start date" `date` `date +%s`
echo '--hostname ' $HOSTNAME

### Initial setup
module purge
module load cgpu
module load modules/3.2.11.4 gcc/8.3.0 cuda/10.2.89 mvapich2/2.3.2 cmake/3.14.4 python/3.7-anaconda-2019.10

module use /global/cfs/cdirs/m3363/lbann/tom_lbann_install_20210223/etc/modulefiles
module load lbann

export MKL_THREADING_LAYER=GNU

export SPACK_ROOT=/global/cfs/cdirs/m3363/lbann/tom_spack
$SPACK_ROOT/share/spack/setup-env.sh
export MV2_ENABLE_AFFINITY=0
export MV2_USE_CUDA=1
export IBV_FORK_SAFE=1

### Run the main code
code_dir=/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/repositories/lbann_cosmogan/1_train/main_code/
export config_file=$code_dir'/config_cori_128.yaml

## Submit job to cori GPU

In [11]:
# %%bash -s "$filename" ## Use python variable in bash
# module load cgpu
# sbatch $1