In [1]:
import os

import h5py
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
def _write_split(path, X, Y):
    """Write one split to ``path`` as an HDF5 file holding datasets "X" and "Y"."""
    with h5py.File(path, "w") as f_out:
        f_out.create_dataset("X", data=X)
        f_out.create_dataset("Y", data=Y)


def split_h5_file(
    input_dir,
    input_file_name,
    output_dir,
    test_size=0.2,
    valid_size=0.2,
    random_state=42,
):
    """
    Splits a single HDF5 file into train, validation, and test sets.

    The input file must contain datasets named "X" and "Y"; both are read
    fully into memory before splitting. Three files are written to
    ``output_dir``: ``<input_file_name>_train.h5``, ``<input_file_name>_valid.h5``
    and ``<input_file_name>_test.h5``, each holding its own "X" and "Y".

    Args:
        input_dir (str): Directory containing the input HDF5 file.
        input_file_name (str): Base name of the input file, without the
            ".h5" extension. Also used as the prefix of the output files.
        output_dir (str): Directory to save the output HDF5 files. It is
            created if it does not exist.
        test_size (float): Proportion of the full dataset to use for the test set.
        valid_size (float): Proportion of the full dataset to use for the
            validation set (not of the data remaining after the test split).
        random_state (int): Random seed for reproducibility. The same seed is
            used for both splitting steps.

    Raises:
        ValueError: If ``test_size`` or ``valid_size`` is not positive, or if
            ``train_test_split`` rejects the resulting proportions.
    """
    # Reject unusable proportions before paying for the (potentially large) read.
    if test_size <= 0 or valid_size <= 0:
        raise ValueError(
            f"test_size and valid_size must be positive, got "
            f"test_size={test_size!r}, valid_size={valid_size!r}"
        )

    # os.path.join avoids the doubled separator that plain "/" concatenation
    # produces when the directory already ends with a slash.
    input_file = os.path.join(input_dir, f"{input_file_name}.h5")
    with h5py.File(input_file, "r") as f:
        X = f["X"][:]
        Y = f["Y"][:]

    # Split into train + temp (valid + test)
    X_train, X_temp, Y_train, Y_temp = train_test_split(
        X, Y, test_size=(test_size + valid_size), random_state=random_state
    )

    # Split temp into validation and test sets.
    # valid_size is a fraction of the full dataset, so rescale it to a
    # fraction of the held-out "temp" portion.
    valid_ratio = valid_size / (test_size + valid_size)
    X_valid, X_test, Y_valid, Y_test = train_test_split(
        X_temp, Y_temp, test_size=(1 - valid_ratio), random_state=random_state
    )

    # Make sure the destination exists so the writes below cannot fail on a
    # missing directory after all the work above. An empty output_dir means
    # "current directory" and needs no creation.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the splits into separate HDF5 files
    train_file = os.path.join(output_dir, f"{input_file_name}_train.h5")
    valid_file = os.path.join(output_dir, f"{input_file_name}_valid.h5")
    test_file = os.path.join(output_dir, f"{input_file_name}_test.h5")

    _write_split(train_file, X_train, Y_train)
    _write_split(valid_file, X_valid, Y_valid)
    _write_split(test_file, X_test, Y_test)

    print(
        f"Data successfully split and saved to:\n  Train: {train_file}\n  Validation: {valid_file}\n  Test: {test_file}"
    )

In [3]:
# Directory that holds the HDF5 file to be split.
input_dir = (
    "/ceph/cms/store/user/dprimosc/l1deepmet_data/25May15_140X_v0/140X_v0_train/"
)
# Base name of the file to split; split_h5_file appends the ".h5" extension.
input_file_name = "perfNano_TT_PU200"
output_dir = input_dir  # Splits are written next to the input; change this to write elsewhere
# Runs with the function defaults: test_size=0.2, valid_size=0.2, random_state=42.
split_h5_file(input_dir, input_file_name, output_dir)

Data successfully split and saved to:
  Train: /ceph/cms/store/user/dprimosc/l1deepmet_data/25May15_140X_v0/140X_v0_train//perfNano_TT_PU200_train.h5
  Validation: /ceph/cms/store/user/dprimosc/l1deepmet_data/25May15_140X_v0/140X_v0_train//perfNano_TT_PU200_valid.h5
  Test: /ceph/cms/store/user/dprimosc/l1deepmet_data/25May15_140X_v0/140X_v0_train//perfNano_TT_PU200_test.h5
