# Launch LBANN training jobs
Code to create a batch script for launching training jobs on Cori GPU (the generated script runs the code through `torch.distributed.launch`).

Sep 1, 2020

In [1]:
import os
import glob,time
import subprocess as sp
import numpy as np

import yaml


In [2]:
# Remember the directory the kernel was started in; the launch configuration
# file is looked up relative to it.
start_dir = os.getcwd()
print(start_dir)

/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/repositories/cosmogan_pytorch/code/1_basic_GAN/new_run_scripts


## Define machine and code 

In [3]:
# Target system and image size; together they pick one entry of the launch
# configuration (config_dict[machine][img_size]).
machine='cori'
img_size=128
# Fail early with a message that names the offending value and the accepted ones.
supported_machines=['cori','summit']
assert machine in supported_machines, "Unsupported machine %r; expected one of %s"%(machine,supported_machines)

## Read ref_launch.yaml and define dictionary

In [4]:
# Reference launch configuration, expected in the starting directory.
launch_cfile = f'{start_dir}/ref_launch.yaml'

# Parse with PyYAML's safe loader (same loader as yaml.SafeLoader).
with open(launch_cfile) as cfg_file:
    config_dict = yaml.safe_load(cfg_file)

In [28]:
## Select the parameters for this machine / image size and apply job overrides
# Work on a shallow copy so the loaded config_dict is left untouched and
# re-running this cell always starts from the values read from the YAML file.
dict_pars=dict(config_dict[machine][img_size])
dict_pars.update({'nodes':1,'gpus_per_node':8,'time':'4:00:00','job_name':'ddp_1node_2_basic_loss'})

# CPU request is derived from the GPU count: 10 CPUs per GPU.
dict_pars['cpus_per_node']=dict_pars['gpus_per_node']*10
print(dict_pars)


{'install_loc': '/global/cfs/cdirs/m3363/lbann/tom_lbann_install_20210223', 'spack_loc': '/global/cfs/cdirs/m3363/lbann/tom_spack', 'code_dir': '/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/repositories/cosmogan_pytorch/code/1_basic_GAN/1_main_code/DDP_new_loss/', 'config': 'config_2dgan.yaml', 'staging_loc': '/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_from_other_code/pytorch/results/', 'nodes': 1, 'gpus_per_node': 8, 'time': '4:00:00', 'job_name': 'ddp_1node_2_basic_loss', 'cpus_per_node': 80}


In [29]:
# Batch-script templates. Placeholders such as {nodes} or {code_dir} are filled
# in with str.format(**dict_pars); literal bash braces in the multi-node
# template are therefore doubled ({{ }}).
# NOTE(review): the single-node template loads pytorch/v1.6.0-gpu while the
# multi-node template loads pytorch/1.7.0-gpu -- confirm this is intentional.
if dict_pars['nodes']==1:
    # Single node: one srun task that spawns one worker process per GPU.
    cori_strg='''#!/bin/bash
#################
#SBATCH --nodes={nodes}
#SBATCH --qos=regular
#SBATCH --output=slurm-%x-%j.out
#SBATCH --constraint=gpu
#SBATCH --account=m3363
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task={cpus_per_node}
#SBATCH --gpus-per-task={gpus_per_node}
#SBATCH --time={time}
#SBATCH --job-name={job_name}

echo "--start date" `date` `date +%s`
echo '--hostname ' $HOSTNAME

### Initial setup
module load cgpu
module load pytorch/v1.6.0-gpu  

### Run the main code
code_dir={code_dir}

srun -n1 python -m torch.distributed.launch --nproc_per_node={gpus_per_node} $code_dir/main.py --config $code_dir/{config}
echo "--end date" `date` `date +%s`
    '''
else:
    # Multiple nodes: one srun task per node, each running the torch
    # distributed launcher with its own node rank.
    # Fixes relative to the previous template:
    #   * echo the defined variable $worker_num (was the undefined $worker)
    #   * join code_dir and the config name with '/', as the single-node
    #     template does, so it also works when code_dir has no trailing slash
    cori_strg='''#!/bin/bash -l
#################
#SBATCH --nodes={nodes}
#SBATCH --qos=regular
#SBATCH --output=slurm-%x-%j.out
#SBATCH --constraint=gpu
#SBATCH --account=m3363
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task={cpus_per_node}
#SBATCH --gpus-per-task={gpus_per_node}
#SBATCH --time={time}
#SBATCH --job-name={job_name}

echo "--start date" `date` `date +%s`
echo '--hostname ' $HOSTNAME

### Set up
# Configuration
nproc_per_node={gpus_per_node}
code_dir={code_dir}
# Load software
module load cgpu
module load pytorch/1.7.0-gpu

# Setup node list
nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) # Getting the node names
nodes_array=( $nodes )
master_node=${{nodes_array[0]}}
master_addr=$(srun --nodes=1 --ntasks=1 -w $master_node hostname --ip-address)
worker_num=$(($SLURM_JOB_NUM_NODES))

echo $nodes
echo $master_addr
echo $worker_num
# Loop over nodes and submit training tasks
for ((  node_rank=0; node_rank<$worker_num; node_rank++ ))
do
  node=${{nodes_array[$node_rank]}}
  echo "Submitting node # $node_rank, $node"

  # Launch one SLURM task per node, and use torch distributed launch utility
  # to spawn training worker processes; one per GPU
  srun -N 1 -n 1 -w $node python -m torch.distributed.launch \
    --nproc_per_node=$nproc_per_node --nnodes=$SLURM_JOB_NUM_NODES \
    --node_rank=$node_rank --master_addr=$master_addr \
    ${{code_dir}}/main.py --config ${{code_dir}}/{config} &

  pids[${{node_rank}}]=$!
done

# Wait for completion
for pid in ${{pids[*]}}
do
    wait $pid
done

echo "--end date" `date` `date +%s`
    '''

### String for summit
# Summit template: exports the config path as $config_file and runs
# train_exagan.py from code_dir.
summit_strg='''

echo "--start date" `date` `date +%s`
echo '--hostname ' $HOSTNAME


### Run the main code
code_dir={code_dir}
export config_file=$code_dir'/{config}'
python $code_dir/train_exagan.py

echo "--end date" `date` `date +%s`
'''

In [30]:
# Build bash string
# Pick the template for the selected machine and substitute the job parameters.
# An unrecognised machine leaves bash_strg untouched, as before.
templates_by_machine = {'cori': cori_strg, 'summit': summit_strg}
if machine in templates_by_machine:
    bash_strg = templates_by_machine[machine].format(**dict_pars)


In [31]:
# Write the rendered batch script into the staging directory.
fname='batch_train_ddptest.sh'
# os.path.join gives the right path whether or not staging_loc ends with '/'.
filename=os.path.join(dict_pars['staging_loc'],fname)
with open(filename,'w') as f:
    f.write(bash_strg)
print(filename)

/global/cfs/cdirs/m3363/vayyar/cosmogan_data/results_from_other_code/pytorch/results/batch_train_ddptest.sh


In [32]:
### Move to staging locations in project space:
# os.chdir(dict_pars['staging_loc'])

In [33]:
%%bash -s "$filename" ## Use python variable in bash
# Show the generated batch script, then mark it executable.
# "$1" is quoted so a path containing spaces or glob characters stays one argument.
cat "$1"
chmod +x "$1"

#!/bin/bash
#################
#SBATCH --nodes=1
#SBATCH --qos=regular
#SBATCH --output=slurm-%x-%j.out
#SBATCH --constraint=gpu
#SBATCH --account=m3363
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=80
#SBATCH --gpus-per-task=8
#SBATCH --time=4:00:00
#SBATCH --job-name=ddp_1node_2_basic_loss

echo "--start date" `date` `date +%s`
echo '--hostname ' $HOSTNAME

### Initial setup
module load cgpu
module load pytorch/v1.6.0-gpu  

### Run the main code
code_dir=/global/u1/v/vpa/project/jpt_notebooks/Cosmology/Cosmo_GAN/repositories/cosmogan_pytorch/code/1_basic_GAN/1_main_code/DDP_new_loss/

srun -n1 python -m torch.distributed.launch --nproc_per_node=8 $code_dir/main.py --config $code_dir/config_2dgan.yaml
echo "--end date" `date` `date +%s`
    

## Submit job to cori GPU

In [34]:
# %%bash -s "$filename" ## Use python variable in bash
# module load cgpu
# sbatch $1

In [35]:
# NOTE(review): scratch arithmetic; the meaning of 200000, 0.25 and 16 is not
# stated anywhere in this notebook -- name the quantities or delete the cell.
200000 * 0.25 / 16

3125.0