In [1]:
from pathlib import Path
import pandas as pd

In [2]:
def make_sbc(config, sbc_name, cmds):
    """Write a Slurm batch script to ``sbc_name``.

    The file consists of a bash shebang, one ``#SBATCH`` directive per entry
    of ``config``, a blank line, and one ``srun --unbuffered`` line per
    command.

    Parameters
    ----------
    config : dict
        Batch options. Must contain ``'account'``, which is written first as
        ``--account <value>``; every other entry is written as
        ``--<key>=<value>`` in the dict's iteration order.
    sbc_name : str or path-like
        Destination file; it is overwritten if it already exists.
    cmds : iterable of str
        Commands to run; each one is prefixed with ``srun --unbuffered``.

    Raises
    ------
    KeyError
        If ``config`` has no ``'account'`` entry.
    """
    with open(sbc_name, 'w') as script:
        script.write('#!/bin/bash\n')

        # The account directive always comes first, in "--account NAME" form.
        script.write('#SBATCH --account {}\n'.format(config['account']))

        # All remaining options follow as "--key=value" directives.
        script.writelines(
            f'#SBATCH --{option}={value}\n'
            for option, value in config.items()
            if option != 'account'
        )

        # Blank line separating the directives from the commands.
        script.write('\n')

        script.writelines(f'srun --unbuffered {command}\n' for command in cmds)

In [3]:
# Number of epochs handed to the model-generation script, and the wall-time
# limit used for the 'time' entry of the batch configuration.
epoch = 25
time = '2:00:00'

# Root of the project on the cluster file system.
folder_base = '/hpcgpfs01/scratch/yhuang2/UChicago/transformer/'

# folder_base already ends with '/', so the paths are joined through Path
# instead of f'{folder_base}/...' to avoid a doubled slash in the script
# paths that end up in the generated commands.
base_path = Path(folder_base)

# Kept as plain strings because they are interpolated into shell commands.
script_gen = str(base_path / 'generate_model.py')
script_run = str(base_path / 'run_model.py')

checkpoint_folder = base_path / 'checkpoints'
result_folder = base_path / 'results'
runfile_folder = base_path / 'runfiles'

# Training and test files are collected separately and sorted by path.
data_folder = base_path / 'datasets'
data_train = sorted(data_folder.glob('*train.dat'))
data_test = sorted(data_folder.glob('*test.dat'))

In [4]:
# Batch options shared by every job. The job-specific entries
# ('job-name', 'output', 'error') are not listed here; they are added per job
# when the individual batch scripts are generated.
config = dict(
    account='mlg-core',
    partition='volta',
    nodes=1,
    ntasks=1,
    gres='gpu:1',
    time=time,
)

In [5]:
# Build one batch script per (train, test) dataset pair and record which
# dataset each job number refers to.

# zip() silently stops at the shorter list, which would drop datasets and can
# pair a training file with the wrong test file, so fail loudly instead.
if len(data_train) != len(data_test):
    raise ValueError(
        f'Found {len(data_train)} training files but {len(data_test)} test '
        'files; every dataset needs exactly one of each.'
    )

# This cell writes into both folders, so make sure they exist first.
runfile_folder.mkdir(parents=True, exist_ok=True)
result_folder.mkdir(parents=True, exist_ok=True)

record = []
sbc_files = []
for i, (train_file, test_file) in enumerate(zip(data_train, data_test)):
    # Dataset identifier: training file name without its extension and
    # without the '_train' marker.
    prefix = train_file.stem.replace('_train', '')
    record.append([i, prefix])

    model_name = checkpoint_folder/prefix
    output_name = result_folder/f'{prefix}_out.dat'

    # Each job first runs the generation script on the training file, then
    # the run script on the test file using the same model path.
    cmd_gen = f'python {script_gen} {model_name} {train_file} {epoch}'
    cmd_run = f'python {script_run} {model_name} {test_file} {output_name}'
    cmds = [cmd_gen, cmd_run]

    job = f'TransEQ_{str(i).zfill(2)}'

    # Per-job settings go into a copy so the shared `config` template is not
    # modified and re-running this cell always starts from the same state.
    job_config = {
        **config,
        'job-name': job,
        'output': f'{job}.out',
        'error': f'{job}.err',
    }

    sbc_file = runfile_folder/f'{job}.sbc'
    make_sbc(job_config, sbc_file, cmds)
    sbc_files.append(sbc_file)

# Index mapping job number -> dataset name, saved next to the results.
df = pd.DataFrame(record, columns=['no.', 'dataset'])
df.to_csv(result_folder/'dataset.csv', index=False)

In [10]:
# Write launcher shell scripts, each of which submits up to `batch_size`
# batch scripts with sbatch.
batch_size = 15

# Ceiling division over the scripts that were actually generated, so the
# number of launchers follows both `sbc_files` and `batch_size`.
num_batches = (len(sbc_files) + batch_size - 1) // batch_size

for i in range(num_batches):
    start = i * batch_size
    end = min(start + batch_size, len(sbc_files))  # exclusive; last batch may be short
    str_start = str(start).zfill(2)
    str_end = str(end - 1).zfill(2)  # file name shows the inclusive last index

    launch_file = f'{runfile_folder}/launch_{str_start}-{str_end}.sh'
    print(launch_file)
    with open(launch_file, 'w') as fh:
        fh.write('#!/bin/bash\n')
        for sbc_file in sbc_files[start:end]:
            fh.write(f'sbatch {sbc_file}\n')

/hpcgpfs01/scratch/yhuang2/UChicago/transformer/runfiles/launch_00-14.sh
/hpcgpfs01/scratch/yhuang2/UChicago/transformer/runfiles/launch_15-29.sh
/hpcgpfs01/scratch/yhuang2/UChicago/transformer/runfiles/launch_30-44.sh
/hpcgpfs01/scratch/yhuang2/UChicago/transformer/runfiles/launch_45-59.sh
/hpcgpfs01/scratch/yhuang2/UChicago/transformer/runfiles/launch_60-74.sh
