# Transformer job submissions

19th May 2022 - Notebook for submitting SLURM jobs that train transformers to predict spacegroup numbers / strings, and that translate and score the test set with the trained models.

In [3]:
from typing import List

def write_slurm_script(job_name: str,
                       run_time: str,
                       output_name: str,
                       script: str,
                       file_name: str,
                       args: List = None,
                       package_dir: str = None,
                       email: bool = False,
                       gpu: bool = False,
                       conda_env: str = 'ampere'):
    """Write a SLURM batch script to ``file_name``.

    The script consists of, in order: the ``#SBATCH`` header, the environment
    module / conda activation lines, an optional ``pip install`` of a local
    package, and finally the command(s) to run.

    Args:
        job_name: Value for ``#SBATCH -J``.
        run_time: Wall-time limit passed to ``#SBATCH --time``.
        output_name: Path passed to ``#SBATCH --output``.
        script: Command line(s) to execute; may contain newlines.
        file_name: Path of the batch script to create (overwritten if present).
        args: Optional extra arguments appended to ``script``, space-separated.
            Elements are converted with ``str`` so numbers are accepted.
        package_dir: If given, the job first ``cd``s into this directory and
            runs ``pip install .`` there. Note that the job stays in that
            directory when ``script`` runs.
        email: If True, request mail on all job events (``--mail-type=ALL``).
        gpu: If True, target the GPU account / ``ampere`` partition with one
            GPU; otherwise the CPU account / ``icelake-himem`` partition.
        conda_env: Name of the conda environment to activate.

    Returns:
        None. The only effect is writing ``file_name``.
    """
    # Only three header lines differ between the GPU and CPU variants.
    if gpu:
        account = 'LEE-WJM41-SL2-GPU'
        resource_line = '#SBATCH --gres=gpu:1'
        partition = 'ampere'
    else:
        account = 'LEE-WJM41-SL2-CPU'
        # Must start at column 0: sbatch treats an indented '#SBATCH' line as
        # an ordinary comment and silently ignores the directive.
        resource_line = '#SBATCH --cpus-per-task=1'
        partition = 'icelake-himem'

    slurm_options = [
        '#!/bin/bash',
        f'#SBATCH -J {job_name}',
        f'#SBATCH -A {account}',
        '#SBATCH --nodes=1',
        '#SBATCH --ntasks=1',
        resource_line,
        f'#SBATCH --time={run_time}',
        '#SBATCH --mail-user=wjm41@cam.ac.uk',
        f'#SBATCH --output={output_name}',
        f'#SBATCH -p {partition}',
    ]
    if email:
        slurm_options.append('#SBATCH --mail-type=ALL')

    # The same module set is loaded for GPU and CPU jobs.
    # NOTE(review): 'rhel8/default-amp' looks GPU-specific; confirm whether
    # CPU (icelake) jobs should load a different default module instead.
    module_options = [
        '. /etc/profile.d/modules.sh',
        'module purge',
        'module load rhel8/default-amp',
        'module load miniconda/3',
        f'source activate {conda_env}',
    ]

    if package_dir is not None:
        pre_empt = f'cd {package_dir}; pip install . --use-feature=in-tree-build'
    else:
        pre_empt = ''

    if args is not None:
        # str() each element so ints/paths work, and unpack so tuples work too.
        command_to_run = ' '.join([script, *(str(arg) for arg in args)])
    else:
        command_to_run = script

    sections = [
        '\n'.join(slurm_options),
        '\n'.join(module_options),
        pre_empt,
        command_to_run,
    ]
    # Drop empty sections (no stray blank line) and end with a newline so the
    # last command is a complete line.
    string_to_write = '\n'.join(section for section in sections if section) + '\n'

    with open(file_name, 'w') as f:
        f.write(string_to_write)

    return


## Training jobs

In [4]:
import os

def submit_training_job(dataset:str, run_time: str = '8:00:00'):
    """Write and submit a GPU SLURM job that trains a transformer on ``dataset``.

    The job runs ``onmt_build_vocab`` with ``preprocess.yaml`` and then
    ``onmt_train`` with ``train_single.yaml``, both read from the dataset's
    data directory; TensorBoard logs go to the dataset's run directory.
    The batch script ``subm_train_{dataset}`` is written to the current
    working directory and the job's output goes to ``subm_train_{dataset}.out``
    there.

    Args:
        dataset: Name of the dataset sub-directory (e.g. ``'smi2spgnum'``).
        run_time: Wall-time limit for the job.

    Raises:
        RuntimeError: If ``sbatch`` exits with a non-zero status.
        FileNotFoundError: If ``sbatch`` is not available on ``PATH``.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    import subprocess

    data_dir = f'/home/wjm41/ml_physics/smi2wyk/data/{dataset}'
    log_dir = f'/home/wjm41/ml_physics/smi2wyk/runs/{dataset}'

    preprocess_script = f'onmt_build_vocab -config {data_dir}/preprocess.yaml'
    train_script = f'onmt_train -config {data_dir}/train_single.yaml -tensorboard True -tensorboard_log_dir {log_dir}'
    script = f'{preprocess_script}\n{train_script}'

    file_name = f'subm_train_{dataset}'
    current_dir = os.getcwd()
    output_name = f'{current_dir}/{file_name}.out'

    write_slurm_script(job_name=file_name,
                       run_time=run_time,
                       output_name=output_name,
                       script=script,
                       file_name=file_name,
                       email=True,
                       conda_env='DebiasedMT',
                       gpu=True
                       )

    # Argument list (no shell) avoids quoting problems; output is captured
    # because a child process's stdout does not appear in the notebook cell.
    result = subprocess.run(['sbatch', file_name],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            universal_newlines=True)
    if result.returncode != 0:
        raise RuntimeError(
            f'sbatch failed for {file_name} '
            f'(exit status {result.returncode}): {result.stderr.strip()}'
        )

    # Report success only once sbatch has actually accepted the job.
    print(f"Submitted transformer training jobs on {dataset}")
    print(result.stdout.strip())
    return

In [5]:
# Submit one training job per dataset variant, in the same order as before.
for training_dataset in ('smi2spgnum', 'smi2spgstr', 'smi2spgstrtok'):
    submit_training_job(training_dataset)


Submitted transformer training jobs on smi2spgnum
Submitted batch job 61398301
Submitted transformer training jobs on smi2spgstr
Submitted batch job 61398302
Submitted transformer training jobs on smi2spgstrtok
Submitted batch job 61398303


## Translation jobs

In [1]:
import os

def submit_translation_job(dataset:str, step:int, beam_size:int = 10, run_time: str = '1:00:00'):
    """Write and submit a GPU SLURM job that translates and scores the test set.

    The job runs ``onmt_translate`` with the checkpoint
    ``model_step_{step}.pt`` on ``src-test.csv``, writing
    ``pred_step_{step}.txt`` into the dataset's data directory, and then runs
    ``score_predictions.py`` on those predictions against ``tgt-test.csv``.
    The batch script ``subm_test_{dataset}_{step}`` is written to the current
    working directory and the job's output goes to the matching ``.out`` file
    there.

    Args:
        dataset: Name of the dataset sub-directory (e.g. ``'smi2spgnum'``).
        step: Training step of the checkpoint to load.
        beam_size: Used both as the beam width and as the number of
            hypotheses kept per input (``-beam_size`` and ``-n_best``).
        run_time: Wall-time limit for the job.

    Raises:
        RuntimeError: If ``sbatch`` exits with a non-zero status.
        FileNotFoundError: If ``sbatch`` is not available on ``PATH``.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    import subprocess

    data_dir = f'/home/wjm41/ml_physics/smi2wyk/data/{dataset}'

    script_dir = '/home/wjm41/ml_physics/smi2wyk/smi2wyk'
    model_dir = '/rds-d2/user/wjm41/hpc-work/models/smi2wyk'
    model_path = f'{model_dir}/{dataset}/model_step_{step}.pt'
    pred_name = f'{data_dir}/pred_step_{step}.txt'

    translate_script = f'onmt_translate -model {model_path} -src {data_dir}/src-test.csv -output {pred_name} -n_best {beam_size} -beam_size {beam_size} -gpu 0'
    score_script = f'python {script_dir}/score_predictions.py -targets {data_dir}/tgt-test.csv -beam_size {beam_size} -predictions {pred_name}'
    script = f'{translate_script}\n{score_script}'

    file_name = f'subm_test_{dataset}_{step}'
    current_dir = os.getcwd()
    output_name = f'{current_dir}/{file_name}.out'

    write_slurm_script(job_name=file_name,
                       run_time=run_time,
                       output_name=output_name,
                       script=script,
                       file_name=file_name,
                       email=True,
                       conda_env='DebiasedMT',
                       gpu=True
                       )

    # Argument list (no shell) avoids quoting problems; output is captured
    # because a child process's stdout does not appear in the notebook cell.
    result = subprocess.run(['sbatch', file_name],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            universal_newlines=True)
    if result.returncode != 0:
        raise RuntimeError(
            f'sbatch failed for {file_name} '
            f'(exit status {result.returncode}): {result.stderr.strip()}'
        )

    # Report success only once sbatch has actually accepted the job.
    print(f"Submitted translation & scoring jobs on {dataset}")
    print(result.stdout.strip())
    return

In [4]:
# Checkpoint step to evaluate for every dataset variant.
step_num = 57500
for translation_dataset in ('smi2spgnum', 'smi2spgstr', 'smi2spgstrtok'):
    submit_translation_job(translation_dataset, step=step_num)


Submitted translation & scoring jobs on smi2spgnum
Submitted batch job 61403365
Submitted translation & scoring jobs on smi2spgstr
Submitted batch job 61403368
Submitted translation & scoring jobs on smi2spgstrtok
Submitted batch job 61403370
