In [0]:
# configuration
import os
# Job parameters are supplied through Databricks notebook widgets.
raw_data_path = dbutils.widgets.get("raw_data_path")
processed_features_path = dbutils.widgets.get("processed_features_path")

# int() raises ValueError if the widget value is not a valid integer.
BATCH_SIZE = int(dbutils.widgets.get("batch_size"))
if BATCH_SIZE < 0:
    # A negative size would make the later `[:BATCH_SIZE]` slice silently drop
    # the last N subjects instead of limiting the batch to N subjects.
    raise ValueError(f"batch_size must be non-negative, got {BATCH_SIZE}")

# Create the output directory up front so that listing it below cannot fail
# on a first run.
os.makedirs(processed_features_path, exist_ok=True)

# Identify which subjects still need to be processed.
#
# The subject id is taken to be the second "_"-separated token of each raw
# parquet file name.
# NOTE(review): for a name like "<prefix>_<id>.parquet" that token still
# contains the ".parquet" suffix -- confirm the raw file naming scheme.
raw_files = [f for f in os.listdir(raw_data_path) if f.endswith(".parquet")]

# A parquet file without any underscore has no second token; skip and report
# it instead of letting an IndexError abort the whole run.
unparseable_files = [f for f in raw_files if "_" not in f]
if unparseable_files:
    display(
        f"Skipping {len(unparseable_files)} raw files without a subject id: "
        f"{unparseable_files}"
    )
available_subject_ids = {f.split("_")[1] for f in raw_files if "_" in f}

# Every entry of the output directory is treated as the id of a subject that
# has already been processed.
processed_folders = os.listdir(processed_features_path)
processed_subject_ids = set(processed_folders)

# Sorted so that successive runs pick up subjects in a deterministic order.
subjects_to_process = sorted(available_subject_ids - processed_subject_ids)

display(f"Found {len(subjects_to_process)} subjects to process.")


# Take the next batch and run the single-subject notebook once per subject.
subjects_for_this_run = subjects_to_process[:BATCH_SIZE]
display(f"Processing batch of {len(subjects_for_this_run)} subjects: {subjects_for_this_run}")

# Subjects whose sub-job raised; reported together once the batch is done.
failed_subjects = []

for sub_id in subjects_for_this_run:
    display(f"--- Triggering job for subject {sub_id} ---")
    try:
        # Runs the other notebook as a sub-job and blocks until it finishes.
        dbutils.notebook.run(
            path="./02a_process_single_subject",
            timeout_seconds=0,  # 0 disables the timeout
            arguments={
                "subject_id": sub_id,
                "raw_data_path": raw_data_path,
                "processed_features_path": processed_features_path,
            },
        )
    except Exception as e:
        # Best effort: one failing subject must not stop the rest of the batch.
        failed_subjects.append(sub_id)
        display(f"--- Job for subject {sub_id} FAILED: {e} ---")
    else:
        display(f"--- Job for subject {sub_id} completed successfully ---")

# Surface the overall outcome so a batch with failures is not mistaken for a
# clean run when only the tail of the output is read.
if failed_subjects:
    display(
        f"{len(failed_subjects)} of {len(subjects_for_this_run)} subjects failed: "
        f"{failed_subjects}"
    )