# Code to generate folder for HMC run
May 8, 2023
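Options (value of `run_type`):
- `fresh`: start a fresh run
- `checkpoint_start`: start a new run from a saved configuration
- `extend`: extend an existing run from its last saved checkpoint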

In [1]:
import numpy as np
import os
import argparse
import subprocess as sp
import shutil
import glob

import yaml

In [2]:
def f_write_config(input_dict,fname):
    '''
    Dump input_dict as YAML into the file fname (overwriting it).
    Keys are written in the dictionary's own order, not sorted.
    '''
    with open(fname,'w') as yaml_file:
        yaml.dump(input_dict,stream=yaml_file,sort_keys=False)
        
def f_read_config(fname):
    '''
    Parse the YAML file fname with the safe loader and return the result
    '''
    with open(fname,'r') as yaml_file:
        return yaml.safe_load(yaml_file)


In [3]:

def f_get_last_checkpoint(run_dir):
    '''
    Return the largest configuration number among the ckpoint_lat* files in run_dir

    The number is taken from the text after '_lat.' in each file name.
    Fails with an AssertionError if run_dir doesn't exist and with an IndexError
    if no ckpoint_lat* file is found.
    '''
    assert os.path.exists(run_dir), "run dir doesn't exist %s"%(run_dir)

    traj_numbers=[]
    for ckpt_path in glob.glob(run_dir+'/ckpoint_lat*'):
        base_name=ckpt_path.split('/')[-1]
        traj_numbers.append(int(base_name.split('_lat.')[-1]))

    # Numeric sort, so that e.g. 100 ranks above 20
    return np.sort(traj_numbers)[-1]


def f_get_out_filename(run_dir):
    '''
    Get the right file name for output such as HSDM2.out, HSDM3.out, etc. based on
    the number of extension runs already done.

    Starts at HSDM2.out and returns the first HSDM<n>.out that does not yet exist
    in run_dir. Only the bare file name is returned, not the full path.

    The search stops once the counter passes 15: a warning is printed and the last
    name checked (which already exists) is returned.

    run_dir may be given with or without a trailing '/'.
    '''
    count=2
    fname=os.path.join(run_dir,'HSDM%s.out'%(count))

    while os.path.exists(fname):
        count+=1
        if count>15:
            print("Too many extend runs. Please check")
            break

        print("File exists: ",fname)
        # Join with a path separator so the next candidate is looked up inside run_dir
        fname=os.path.join(run_dir,'HSDM%s.out'%(count))
        print("Modifying output file name to ",fname)

    return os.path.basename(fname)

In [4]:
def f_build_submit_script(dict_pars,fname,run_type):
    '''
    Write the batch submit script (#SBATCH directives + srun launch) to fname

    dict_pars : dictionary supplying the values substituted into the script
                (Lx, Lt, mpi, start_type, total_traj, therm, N, nprocs, dwf_Ls,
                traj_l, md_steps, beta, mf, out_file and, for runs started from
                a checkpoint, start_traj)
    fname     : path of the script file to write
    run_type  : 'fresh', 'checkpoint_start' or 'extend'

    Prints a message and raises SystemError for any other run_type.
    '''
    # Only runs that start from a checkpoint pass --StartingTrajectory to the application
    if run_type=='fresh':
        start_traj_opt=''
    elif run_type in ('checkpoint_start','extend'):
        start_traj_opt='--StartingTrajectory {start_traj} '
    else:
        print("Invalid run_type",run_type)
        raise SystemError

    app_template=(
        '"$RUN_DIR/build/dweofa_mobius_HSDM --grid {Lx}.{Lx}.{Lx}.{Lt} --mpi {mpi}'
        ' --shm 2048 --shm-force-mpi 1 --device-mem 5000 --StartingType {start_type} '
        + start_traj_opt +
        '--Trajectories {total_traj} --Thermalizations {therm}'
        ' $OPT $Ls $traj_l $md_steps $BETA $M_F"'
    )
    app_strg=app_template.format(**dict_pars)

    script_template='''#!/bin/bash
################
# Begin LSF Directives
#SBATCH -A latticgc
#SBATCH -t 00:10:00
#SBATCH -J hsdm
#SBATCH -o hsdm.%J
#SBATCH -e hsdm.%J
#SBATCH -N {N}
#SBATCH -n {nprocs}
#SBATCH --exclusive
#SBATCH --gpu-bind=map_gpu:0,1,2,3,7,6,5,4
#SBATCH -c 1
###SBATCH --threads-per-core=1

echo "--start " `date` `date +%s`

echo "GRID_DIR= $GRID_DIR"
echo "RUN_DIR= $RUN_DIR"

export BIND="--cpu-bind=verbose,map_ldom:3,3,1,1,2,2,0,0"

export MPIR_CVAR_GPU_EAGER_DEVICE_MEM=16384
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_SMP_SINGLE_COPY_MODE=CMA
export OMP_NUM_THREADS=8
export MPICH_OFI_NIC_POLICY=GPU
export OPT="--comms-concurrent --comms-overlap "

source $GRID_DIR/setup_env.sh
export TSAN_OPTIONS='ignore_noninstrumented_modules=1'
export LD_LIBRARY_PATH

Ls={dwf_Ls}
traj_l={traj_l}
md_steps={md_steps}
BETA={beta}
M_F={mf}

APP={app}

#srun --gpus-per-task 1 -n64 $BIND $APP > HSDM.out
#srun --gpus-per-task 1 -n64 $APP > HSDM.out
srun -n{nprocs} -o {out_file} $APP

echo "--end " `date` `date +%s`

    '''
    script_text=script_template.format(app=app_strg,**dict_pars)

    with open(fname,'w') as script_file:
        script_file.write(script_text)


In [5]:
def f_create_dict_pars(run_type,run_dir):
    '''
    Return dictionary dict_pars with details of run.
    Does not modify any files.

    run_type : 'fresh', 'checkpoint_start' or 'extend' (AssertionError otherwise)
    run_dir  : for 'checkpoint_start', the folder holding the starting ckpoint_lat.* file;
               for 'extend', the existing run folder (must contain config.yaml and
               ckpoint_lat.* files); unused for 'fresh'.
               May be given with or without a trailing '/'.

    'beta' and 'mf' are not set here for 'fresh' and 'checkpoint_start' runs.
    '''

    assert run_type in ['fresh','checkpoint_start','extend'], "Invalid run_type %s"%(run_type)

    if run_type=="extend": # Extend run with same parameters, starting from the last saved configuration

        last=f_get_last_checkpoint(run_dir)
        print("Last checkpoint",last)
        config_file=os.path.join(run_dir,'ckpoint_lat.%s'%(last))

        # Start from a copy of the parameters saved when the run was created
        dict_pars=dict(f_read_config(os.path.join(run_dir,'config.yaml')))

        dict_pars.update({
            'start_type': 'CheckpointStart',   # Valid [HotStart, ColdStart, TepidStart, CheckpointStart]
            'start_traj': last,
            'out_file': f_get_out_filename(run_dir),
            'total_traj': 700,
            'therm': 0,
            'start_config': config_file,
                  })

        return dict_pars

    # Lattice size and job layout shared by 'fresh' and 'checkpoint_start'
    Lx=24; Lt=12
    N=1
    mx,my,mz,mt=2,2,2,1
    Ls=16
    dict_pars={
        'Lx':Lx, 'Lt':Lt, # Lattice size
        'F_action': 'Mobius_dwf',

        'traj_l':2, 'md_steps':15,
        'dwf_Ls':Ls,
        'mpi':".".join(str(i) for i in (mx,my,mz,mt)),
        'nprocs': mx*my*mz*mt,
        'N':N,  ## Number of nodes
              }

    if run_type=='fresh': ## Fresh run with parameters specified below. Pick start_type

        dict_pars.update({
            'total_traj': 700,
            'start_type': 'ColdStart',   # Valid [HotStart, ColdStart, TepidStart, CheckpointStart]
            'therm': 10,
            'out_file': 'HSDM1.out',
                  })

    else: # 'checkpoint_start': start new run with a starting config file

        last=f_get_last_checkpoint(run_dir)
        print("Last",last)
        config_file=os.path.join(run_dir,'ckpoint_lat.%s'%(last))
        assert os.path.isfile(config_file) ,"File doesn't exist %s"%(config_file)

        dict_pars.update({
            'total_traj': 200,
            'start_type': 'CheckpointStart',
            'start_traj': 0,
            'start_config': config_file,
            'therm':0, ## Thermalization ( non-zero for fresh run )
            'out_file': 'HSDM1.out',
                  })

    return dict_pars


In [6]:
def f_build_run_dir(run_type,run_dir):
    '''
    Build run directory and copy files

    Reads the module-level dictionary dict_pars (as returned by f_create_dict_pars).
    For 'fresh' and 'checkpoint_start' the 'beta' and 'mf' entries of dict_pars are
    overwritten for each coupling in turn.

    run_type : 'extend'            - write submit_3_dwf-hsdm.sh into the existing run_dir
               'fresh'             - create one sub-folder of run_dir per (beta, mf) pair,
                                     each with submit_2_dwf-hsdm.sh and config.yaml
               'checkpoint_start'  - same as 'fresh', and also copy the last ckpoint_lat /
                                     ckpoint_rng pair found in run_dir into each new folder
                                     as ckpoint_lat.0 / ckpoint_rng.0
    run_dir  : top-level folder, with or without a trailing '/'

    Raises SystemError for an invalid run_type or if a target sub-folder already exists.
    '''

    if run_type=='extend':
        assert os.path.exists(run_dir), "Top directory %s doesn't exist"%(run_dir)
        fname=os.path.join(run_dir,'submit_3_dwf-hsdm.sh')
        f_build_submit_script(dict_pars,fname,run_type)

    # Can create folders and files for a set of couplings
    elif run_type in ['fresh','checkpoint_start'] :
        # beta_list=[10.0+0.2*i for i in range(11)]
        beta_list=[13.0,13.5,14.0]
        m_list=[0.1]

        if run_type=='checkpoint_start':
            # Look up the starting configuration once, before any folder is created
            last=f_get_last_checkpoint(run_dir)

        for beta in beta_list:
            for mf in m_list:
                dict_pars['beta']=beta
                dict_pars['mf']=mf

                strg='run_Lx-%s_Lt-%s_Ls-%s_beta-%s_mf-%s'%(dict_pars['Lx'],dict_pars['Lt'],dict_pars['dwf_Ls'],beta,mf)
                # Trailing '' keeps a '/' at the end of the folder path
                fldr=os.path.join(run_dir,strg,'')

                print(fldr)

                if os.path.exists(fldr):
                    raise SystemError("Error: Directory %s exists"%(fldr))

                os.makedirs(fldr)
                f_build_submit_script(dict_pars,os.path.join(fldr,'submit_2_dwf-hsdm.sh'),run_type)
                f_write_config(dict_pars,os.path.join(fldr,'config.yaml'))

                if run_type=='checkpoint_start':
                    print("Copy start config from the last config %s in %s"%(last,run_dir))
                    shutil.copy(os.path.join(run_dir,'ckpoint_lat.%s'%(last)), os.path.join(fldr,'ckpoint_lat.0'))
                    shutil.copy(os.path.join(run_dir,'ckpoint_rng.%s'%(last)), os.path.join(fldr,'ckpoint_rng.0'))

    else:
        raise SystemError("Invalid run_type %s"%(run_type))

## Specify run folder and type

In [7]:
# run_dir='/usr/workspace/lsd/ayyar1/projects/SU4_sdm/runs_Grid/2023_july12/runs/phase_diagram_4_cold_start/'
# run_dir='/usr/workspace/lsd/ayyar1/projects/SU4_sdm/runs_Grid/2023_july12/runs/phase_diagram_5_hot_start/'
# run_dir='/usr/workspace/lsd/ayyar1/projects/SU4_sdm/runs_Grid/2023_july12/runs/scaling_tests/'

# run_type='fresh'

# run_type='checkpoint_start'


In [16]:
# run_dir='/usr/workspace/lsd/ayyar1/projects/SU4_sdm/runs_Grid/2023_july12/runs/phase_diagram_5_hot_start/run_Lx-16_Lt-8_Ls-16_beta-11.0_mf-0.1/'

# Folder passed to f_create_dict_pars and f_build_run_dir; here an existing run folder to extend
run_dir='/usr/workspace/lsd/ayyar1/projects/SU4_sdm/runs_Grid/2023_july12/runs/phase_diagram_4_cold_start/run_Lx-16_Lt-8_Ls-16_beta-11.4_mf-0.1/'

# One of 'fresh', 'checkpoint_start', 'extend'
run_type='extend'


In [17]:
### Options ['fresh','extend','checkpoint_start']
# Assemble the run parameters; f_build_run_dir reads dict_pars as a module-level name
dict_pars=f_create_dict_pars(run_type,run_dir)
print(dict_pars)

1475
Last checkpoint 1475
File exists:  /usr/workspace/lsd/ayyar1/projects/SU4_sdm/runs_Grid/2023_july12/runs/phase_diagram_4_cold_start/run_Lx-16_Lt-8_Ls-16_beta-11.4_mf-0.1//HSDM2.out
Modifying output file name to  /usr/workspace/lsd/ayyar1/projects/SU4_sdm/runs_Grid/2023_july12/runs/phase_diagram_4_cold_start/run_Lx-16_Lt-8_Ls-16_beta-11.4_mf-0.1/HSDM3.out
File exists:  /usr/workspace/lsd/ayyar1/projects/SU4_sdm/runs_Grid/2023_july12/runs/phase_diagram_4_cold_start/run_Lx-16_Lt-8_Ls-16_beta-11.4_mf-0.1/HSDM3.out
Modifying output file name to  /usr/workspace/lsd/ayyar1/projects/SU4_sdm/runs_Grid/2023_july12/runs/phase_diagram_4_cold_start/run_Lx-16_Lt-8_Ls-16_beta-11.4_mf-0.1/HSDM4.out
File exists:  /usr/workspace/lsd/ayyar1/projects/SU4_sdm/runs_Grid/2023_july12/runs/phase_diagram_4_cold_start/run_Lx-16_Lt-8_Ls-16_beta-11.4_mf-0.1/HSDM4.out
Modifying output file name to  /usr/workspace/lsd/ayyar1/projects/SU4_sdm/runs_Grid/2023_july12/runs/phase_diagram_4_cold_start/run_Lx-16_Lt-8_L

## Build directories and copy files

In [18]:
# Write the submit script (and, for new runs, the run folders) using the dict_pars built above
f_build_run_dir(run_type,run_dir)