# Code to generate a submit script for extending an HMC run
July 20, 2023

In [1]:
import numpy as np
import os
import argparse
import subprocess as sp
import shutil
import glob

import yaml

In [2]:
def f_write_config(input_dict,fname):
    """Dump a configuration dictionary to a YAML file.

    Args:
        input_dict: Dictionary of configuration values to serialize.
        fname: Path of the YAML file to write; opened in 'w' mode, so an
            existing file is overwritten.
    """
    # sort_keys=False keeps the keys in the dictionary's own order.
    with open(fname,'w') as yaml_file:
        yaml.dump(input_dict,stream=yaml_file,sort_keys=False)
        
        
def f_read_config(fname):
    """Read a YAML configuration file with the safe loader.

    Args:
        fname: Path of the YAML file to read.

    Returns:
        The parsed YAML content (the notebook uses it as a dictionary).
    """
    # safe_load is shorthand for yaml.load(..., Loader=yaml.SafeLoader).
    with open(fname,'r') as yaml_file:
        return yaml.safe_load(yaml_file)


In [3]:
def f_build_submit_script(dict_pars,fname):
    """Render the batch submit script template and write it to a file.

    Args:
        dict_pars: Dictionary supplying every placeholder of the template:
            N, nprocs, dwf_Ls, traj_l, md_steps, beta, mf, Lx, Lt, mpi,
            start_type, start_traj, total_traj and out_file.
        fname: Path of the script file to write (overwritten if present).

    Raises:
        KeyError: If dict_pars lacks one of the placeholders. The template
            is rendered before the file is opened, so nothing is written.
    """
    # Template lines start at column 0 on purpose: they are copied verbatim
    # into the shell script.
    template='''#!/bin/bash
################
# Begin LSF Directives
#SBATCH -A latticgc
#SBATCH -t 00:10:00
#SBATCH -J hsdm
#SBATCH -o hsdm.%J
#SBATCH -e hsdm.%J
#SBATCH -N {N}
#SBATCH -n {nprocs}
#SBATCH --exclusive
#SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4
#SBATCH -c 1
###SBATCH --threads-per-core=1

echo "--start " `date` `date +%s`

export BIND="--cpu-bind=verbose,map_ldom:3,3,1,1,2,2,0,0"

export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=16384
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=CMA
export OMP_NUM_THREADS=8
export MPICH_OFI_NIC_POLICY=GPU
export OPT="--comms-concurrent --comms-overlap "

source $GRID_DIR/setup_env.sh
export TSAN_OPTIONS='ignore_noninstrumented_modules=1'
export LD_LIBRARY_PATH

Ls={dwf_Ls}
traj_l={traj_l}
md_steps={md_steps}
BETA={beta}
M_F={mf}

APP="$RUN_DIR/build/dweofa_mobius_HSDM --grid {Lx}.{Lx}.{Lx}.{Lt} --mpi {mpi} --shm 2048 --shm-force-mpi 1 --device-mem 5000 --StartingType {start_type} --StartingTrajectory {start_traj} --Trajectories {total_traj} --Thermalizations 0 $OPT $Ls $traj_l $md_steps $BETA $M_F"
#srun --gpus-per-task 1 -n64 $BIND $APP > HSDM.out
#srun --gpus-per-task 1 -n64 $APP > HSDM.out
srun -n{nprocs} -o {out_file} $APP

echo "--end " `date` `date +%s`

    '''

    # Fill in the placeholders first, then write the finished text in one go.
    script_text=template.format(**dict_pars)

    with open(fname,'w') as script_file:
        script_file.write(script_text)


In [4]:
def f_get_last_checkpoint(run_dir):
    """Return the highest checkpoint number found in a run directory.

    Scans run_dir for files matching 'ckpoint_lat*' and parses the text
    after '_lat.' in each file name as an integer.

    Args:
        run_dir: Directory that holds the 'ckpoint_lat.<N>' files.

    Returns:
        The largest <N> as a plain Python int. A plain int (rather than a
        numpy scalar) keeps the value safe to pass through yaml.dump.

    Raises:
        IndexError: If no numbered checkpoint file exists in run_dir. The
            exception type matches what indexing the empty list used to
            raise, but the message now names the directory.
    """
    checkpt_list=[]
    for path in glob.glob(run_dir+'/ckpoint_lat*'):
        suffix=os.path.basename(path).split('_lat.')[-1]
        try:
            checkpt_list.append(int(suffix))
        except ValueError:
            # Matches the glob but has no integer suffix (e.g. a '.bak'
            # copy); it cannot be a checkpoint number, so ignore it.
            continue

    if not checkpt_list:
        raise IndexError("No 'ckpoint_lat.<N>' checkpoint files found in %s"%(run_dir))

    # max() compares numerically, so 1000 beats 200 regardless of name order.
    return max(checkpt_list)

def f_get_out_filename(run_dir,max_count=5):
    """Pick the first unused 'HSDM<count>.out' name in a run directory.

    Candidates are tried in order starting from HSDM2.out up to
    HSDM<max_count>.out.

    Args:
        run_dir: Directory in which existing output files are looked up.
        max_count: Highest suffix that may be handed out (default 5).

    Returns:
        The bare file name (no directory part) of the first candidate that
        does not exist yet.

    Raises:
        FileExistsError: If every candidate up to max_count already exists.
            Returning the last candidate in that case would name a file
            that exists, and the next run would overwrite it.
    """
    count=2
    fname=run_dir+'/HSDM%s.out'%(count)

    while os.path.exists(fname):
        print("File exists: ",fname)
        count+=1
        if count>max_count:
            raise FileExistsError(
                "Too many extend runs. Please check: HSDM2.out to HSDM%s.out already exist in %s"
                %(max_count,run_dir))

        fname=run_dir+'/HSDM%s.out'%(count)
        print("Modifying output file name to ",fname)

    return os.path.basename(fname)
    

In [5]:
# Run directory of the HMC job to extend. The cells below read config.yaml and
# the ckpoint_lat.* files from it and write the new submit script into it.
# Previously used run directory, kept for reference:
# fldr='/usr/WS2/lsd/ayyar1/projects/SU4_sdm/runs_Grid/2023_july12/runs/phase_diagram_4_cold_start/run_Lx-16_Lt-8_Ls-16_beta-12.5_mf-0.1'

fldr='/usr/workspace/lsd/ayyar1/projects/SU4_sdm/runs_Grid/2023_july12/runs/phase_diagram_5_hot_start/run_Lx-16_Lt-8_Ls-16_beta-13.0_mf-0.1'


In [6]:

# Look the checkpoint up once, so the number printed here is exactly the one
# that goes into the submit script (a second scan could pick up a newer file).
last_checkpoint=f_get_last_checkpoint(fldr)
print("Last checkpoint",last_checkpoint)

# Start from the parameters of the original run; copy them so the loaded
# configuration itself is left untouched.
input_dict=f_read_config(fldr+'/config.yaml')
dict_pars=dict(input_dict)

# Override what changes for the extension run.
dict_pars.update({
    'start_type': 'CheckpointStart',   # Valid [HotStart, ColdStart, TepidStart, CheckpointStart]
    'start_traj': last_checkpoint,     # resume from the latest checkpoint
    'out_file': f_get_out_filename(fldr),
    'total_traj': 400                  # value passed to --Trajectories
          })

print(dict_pars)

Last checkpoint 200
{'Lx': 16, 'Lt': 8, 'F_action': 'Mobius_dwf', 'traj_l': 2, 'md_steps': 15, 'dwf_Ls': 16, 'mpi': '2.2.2.1', 'nprocs': 8, 'N': 1, 'start_type': 'CheckpointStart', 'beta': 13.0, 'mf': 0.1, 'start_traj': 200, 'out_file': 'HSDM2.out', 'total_traj': 400}


In [7]:
# Stop here if the run directory is missing, before anything is written.
assert os.path.exists(fldr), "Top directory %s doesn't exist"%(fldr)

# Render the submit script for the extension run into the run directory.
fname='/'.join([fldr,'submit_3_dwf-hsdm.sh'])
f_build_submit_script(dict_pars,fname)
