In [1]:
import h5py
import numpy as np
import os

In [2]:
def pad_dataset(data, target_shape, name):
    """Return a new array of ``target_shape`` with ``data`` in its leading columns.

    The result has the same dtype as ``data``. Entries not covered by
    ``data`` hold a fill value: -1 when ``name`` is one of ``'b1'``,
    ``'b2'``, ``'mask'`` or ``'bb'`` (a message is printed in that case),
    and 0 for every other name.

    Args:
        data: Array whose values are copied into columns ``[0, data.shape[1])``.
        target_shape: Shape of the array to allocate and return.
        name: Dataset name, used only to choose the fill value.

    Returns:
        The newly allocated, padded array.
    """
    uses_neg_one_fill = name in ('b1', 'b2', 'mask', 'bb')
    if not uses_neg_one_fill:
        result = np.zeros(target_shape, dtype=data.dtype)
    else:
        print("Padding with -1")
        result = np.full(target_shape, -1, dtype=data.dtype)

    # Copy the original values into the left-most columns; the rest keeps the fill.
    n_cols = data.shape[1]
    result[:, :n_cols] = data
    return result

In [3]:
# Recursively merge groups and datasets from a source group into a destination group
def copy_group(src_group, dest_group):
    """Recursively merge the contents of ``src_group`` into ``dest_group``.

    Sub-groups missing from the destination are created and then merged
    recursively. A dataset missing from the destination is copied over
    unchanged (and printed). A dataset that already exists is replaced by the
    concatenation, along axis 0, of the existing rows followed by the source
    rows; 2-D datasets with different column counts are first padded to the
    wider of the two via ``pad_dataset``. Attributes of the existing
    destination dataset are carried over to the replacement dataset.

    Note: every merge of an existing dataset reads both datasets fully into
    memory and rewrites the destination dataset from scratch.

    Args:
        src_group: Open ``h5py.Group`` (or ``h5py.File``) to read from.
        dest_group: Writable ``h5py.Group`` (or ``h5py.File``) to merge into.

    Raises:
        ValueError: If an existing dataset and the incoming dataset of the
            same name have a different number of dimensions.
    """
    for name, obj in src_group.items():
        if isinstance(obj, h5py.Group):
            # Reuse the destination group when present, otherwise create it,
            # then merge the children either way.
            if name in dest_group:
                child_group = dest_group[name]
            else:
                child_group = dest_group.create_group(name)
            copy_group(obj, child_group)
        elif isinstance(obj, h5py.Dataset):
            if name not in dest_group:
                # First occurrence of this dataset: plain HDF5 object copy.
                src_group.copy(obj, dest_group)
                print(obj)
                continue

            existing_dataset = dest_group[name]
            src_data = obj[...]
            existing_data = existing_dataset[...]

            # Fail early with a clear message instead of an IndexError on
            # shape[1] or an opaque concatenate error further down.
            if existing_data.ndim != src_data.ndim:
                raise ValueError(
                    f"Cannot merge dataset '{name}': existing data has "
                    f"{existing_data.ndim} dimension(s) but incoming data has "
                    f"{src_data.ndim}"
                )

            # Read the attributes before the dataset is deleted; the
            # replacement created below would otherwise start with none.
            saved_attrs = dict(existing_dataset.attrs.items())

            if existing_data.ndim == 2 and existing_data.shape[1] != src_data.shape[1]:
                # Column counts differ: pad both sides to the wider one.
                max_cols = max(existing_data.shape[1], src_data.shape[1])
                existing_data = pad_dataset(
                    existing_data, (existing_data.shape[0], max_cols), name
                )
                src_data = pad_dataset(src_data, (src_data.shape[0], max_cols), name)
            merged_data = np.concatenate((existing_data, src_data), axis=0)

            # The stored dataset is replaced rather than resized in place.
            del dest_group[name]
            merged_dataset = dest_group.create_dataset(name, data=merged_data)
            for attr_name, attr_value in saved_attrs.items():
                merged_dataset.attrs[attr_name] = attr_value

In [7]:
def merge_multiple_h5_files(h5_files, merged_file, exclude_substrings=("p5", "SM")):
    """Merge several HDF5 files into a newly created ``merged_file``.

    ``merged_file`` is opened in write mode, so any existing file at that
    path is overwritten. Each selected input file is opened read-only and its
    groups and datasets are merged into the output with ``copy_group``.

    Args:
        h5_files: Iterable of input file paths (strings).
        merged_file: Path of the output file to create.
        exclude_substrings: Input paths containing any of these substrings
            are skipped. The default reproduces the previous hard-coded
            filter (``"p5"`` and ``"SM"``).
    """
    # Resolve the output path so it can be recognised among the inputs.
    # A non-path ``merged_file`` (e.g. a file-like object) cannot be compared.
    try:
        output_path = os.path.abspath(os.fspath(merged_file))
    except TypeError:
        output_path = None

    selected_files = []
    for file in h5_files:
        if any(token in file for token in exclude_substrings):
            continue
        # Never read from the file that is about to be truncated for writing.
        if output_path is not None and os.path.abspath(file) == output_path:
            continue
        selected_files.append(file)

    with h5py.File(merged_file, 'w') as merged_f:
        for file in selected_files:
            with h5py.File(file, 'r') as f:
                print(f"Merging {file} into {merged_file}...")
                copy_group(f, merged_f)  # Recursively merge groups and datasets

In [8]:
def get_h5_files_from_directory(directory):
    """Return the paths of all ``.h5`` files directly inside ``directory``.

    The paths are ``directory`` joined with each matching entry name and are
    returned in sorted order. ``os.listdir`` yields entries in arbitrary
    order, so sorting keeps the merge order reproducible between runs.

    Args:
        directory: Directory to scan (not searched recursively).

    Returns:
        Sorted list of paths whose file name ends with ``.h5``.
    """
    return sorted(
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith('.h5')
    )

In [10]:
# Output path of the merged file and the directory holding the input files.
merged_file = 'h5_merged/hhh_eq_mass_points_test.h5'
directory = 'h5_merged/h5_files_test'

# Collect the .h5 files found in the directory and merge them into one file.
h5_files = get_h5_files_from_directory(directory)
merge_multiple_h5_files(h5_files, merged_file)

Merging h5_merged/h5_files_test/hhh_test_90.h5 into h5_merged/hhh_eq_mass_points_test.h5...
<HDF5 dataset "MASK": shape (246198, 3), type "|b1">
<HDF5 dataset "fj_charge": shape (246198, 3), type "<i8">
<HDF5 dataset "fj_chargedenergyfrac": shape (246198, 3), type "<f8">
<HDF5 dataset "fj_cosphi": shape (246198, 3), type "<f4">
<HDF5 dataset "fj_ehadovereem": shape (246198, 3), type "<f8">
<HDF5 dataset "fj_eta": shape (246198, 3), type "<f4">
<HDF5 dataset "fj_mass": shape (246198, 3), type "<f4">
<HDF5 dataset "fj_ncharged": shape (246198, 3), type "<i8">
<HDF5 dataset "fj_neutralenergyfrac": shape (246198, 3), type "<f8">
<HDF5 dataset "fj_nneutral": shape (246198, 3), type "<i8">
<HDF5 dataset "fj_phi": shape (246198, 3), type "<f4">
<HDF5 dataset "fj_pt": shape (246198, 3), type "<f4">
<HDF5 dataset "fj_sdmass": shape (246198, 3), type "<f4">
<HDF5 dataset "fj_sinphi": shape (246198, 3), type "<f4">
<HDF5 dataset "fj_tau21": shape (246198, 3), type "<f4">
<HDF5 dataset "fj_tau32":

In [7]:
def print_hdf5_structure(obj, indent=0):
    """
    Print one line for ``obj`` and then, for files and groups, recurse into
    every member.

    Each line is indented by two spaces per nesting level and has the form
    ``<name> (<kind>)``; dataset lines additionally show the shape and dtype.
    """
    prefix = '  ' * indent

    # h5py.File is tested before h5py.Group so files get the "(File)" label.
    if isinstance(obj, h5py.File):
        line = f"{prefix}{obj.name} (File)"
    elif isinstance(obj, h5py.Group):
        line = f"{prefix}{obj.name} (Group)"
    elif isinstance(obj, h5py.Dataset):
        line = f"{prefix}{obj.name} (Dataset) {obj.shape} {obj.dtype}"
    else:
        line = f"{prefix}{obj.name} (Unknown)"
    print(line)

    # Only files and groups have members to descend into.
    if not isinstance(obj, (h5py.File, h5py.Group)):
        return
    for key in obj:
        print_hdf5_structure(obj[key], indent + 1)

# Print the structure of 'h5_merged/h5_merged_all.h5' (opened read-only);
# change the path below to inspect a different HDF5 file.
with h5py.File('h5_merged/h5_merged_all.h5', 'r') as f:
    print_hdf5_structure(f)


/ (File)
  /INPUTS (Group)
    /INPUTS/BoostedJets (Group)
      /INPUTS/BoostedJets/MASK (Dataset) (24810635, 3) bool
      /INPUTS/BoostedJets/fj_charge (Dataset) (24810635, 3) int64
      /INPUTS/BoostedJets/fj_chargedenergyfrac (Dataset) (24810635, 3) float64
      /INPUTS/BoostedJets/fj_cosphi (Dataset) (24810635, 3) float32
      /INPUTS/BoostedJets/fj_ehadovereem (Dataset) (24810635, 3) float64
      /INPUTS/BoostedJets/fj_eta (Dataset) (24810635, 3) float32
      /INPUTS/BoostedJets/fj_mass (Dataset) (24810635, 3) float32
      /INPUTS/BoostedJets/fj_ncharged (Dataset) (24810635, 3) int64
      /INPUTS/BoostedJets/fj_neutralenergyfrac (Dataset) (24810635, 3) float64
      /INPUTS/BoostedJets/fj_nneutral (Dataset) (24810635, 3) int64
      /INPUTS/BoostedJets/fj_phi (Dataset) (24810635, 3) float32
      /INPUTS/BoostedJets/fj_pt (Dataset) (24810635, 3) float32
      /INPUTS/BoostedJets/fj_sdmass (Dataset) (24810635, 3) float32
      /INPUTS/BoostedJets/fj_sinphi (Dataset) (24810