# Tutorial for running many small independent jobs in parallel

Let's test it out. First, let's create some data:

In [1]:
# Create some dummy data

import os
import pickle as pkl

import numpy as np
import pandas as pd

# Fixed seed so the dummy data set is reproducible.
rng = np.random.default_rng(12345)

N = 100  # number of data points (one independent run per point)
d = 2    # dimension of each data point
data = list(rng.normal(size=(N, d)))

# Make sure the run directory exists before writing into it.
os.makedirs('hpc/run_1', exist_ok=True)

# Save the data twice: once as a pickle, once as a header-less CSV with one
# data point per line.
with open('hpc/run_1/data.pkl', 'wb') as f:
    pkl.dump(data, f)

pd.DataFrame(data).to_csv('hpc/run_1/data.csv', header=False, index=False)

# Read the CSV back so that `data` holds the values exactly as stored on disk
# (a list of N lists of d floats). The file is comma-separated, so split on ','.
with open('hpc/run_1/data.csv', 'r') as f:
    data = [list(map(float, line.strip().split(','))) for line in f]

ModuleNotFoundError: No module named 'pandas'

Now follow the steps in `README.md` to push the files to the cluster, run computations, and pull the results back to the local machine.

Once this is done, we can check that everything behaved correctly with the following block:

In [2]:
# Verify the results pulled back from the cluster

with open('hpc/run_1/results/results.pkl', 'rb') as results_file:
    results = pkl.load(results_file)

# The processing script currently just hands back its input, so the second
# field of every result entry must equal the corresponding input data point.
outputs_match = all(np.all(results[idx][1] == data[idx]) for idx in range(N))
assert outputs_match

print("All tests passed!")

FileNotFoundError: [Errno 2] No such file or directory: 'hpc/run_1/results/results.pkl'

### How large should my job array be? And how many tasks per job?

This depends on the cluster you are using and the resources available to you. Here is a function that can help estimate how long a job will take (assuming the entire job array gets started at the same time):

In [3]:
def estimate_total_time(num_runs, single_run_time, job_array_size, n_tasks_per_job, safety_factor=1.0):
    """Print an estimate of the time a batch of runs will take.

    The total work (``num_runs * single_run_time``) is divided evenly by
    ``job_array_size * n_tasks_per_job`` and scaled by ``safety_factor``.
    The result is rounded to the nearest second and printed as
    ``H hours, M minutes, S seconds``. Nothing is returned.

    Parameters
    ----------
    num_runs : int
        Number of independent runs.
    single_run_time : float
        Time (in seconds) for a single run.
    job_array_size : int
        Size of the job array.
    n_tasks_per_job : int
        Number of tasks per job.
    safety_factor : float
        Multiplier applied to the estimate (default 1.0, i.e. no margin).

    Raises
    ------
    ZeroDivisionError
        If ``job_array_size * n_tasks_per_job`` is zero.
    """
    total_time = num_runs*single_run_time/(job_array_size*n_tasks_per_job)*safety_factor
    # Round to whole seconds *before* splitting into h/m/s. Rounding only the
    # leftover seconds when formatting could print e.g. "60 seconds" for 59.6.
    total_seconds = int(round(total_time))
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(f'{hours} hours, {minutes} minutes, {seconds} seconds')

So, for instance, if I wanted to do 1,000,000 runs where each run takes 3 minutes, and I want to submit a job array of size 400 with 50 tasks per job, I would do the following:

In [4]:
# 1,000,000 runs of 3 minutes each, spread over 400 array jobs x 50 tasks
estimate_total_time(
    num_runs=1_000_000,
    single_run_time=180,  # seconds
    job_array_size=400,
    n_tasks_per_job=50,
)

2 hours, 30 minutes, 0 seconds
