In [20]:
import h5py
import numpy as np

def inspect_h5(h5_path, max_print=5):
    """
    Print a summary of every top-level dataset in an HDF5 file.

    For each dataset the name, shape, dtype and number of entries are printed,
    followed by at most `max_print` leading entries (taken along the first
    axis). Top-level members that are not datasets (e.g. groups) are reported
    and skipped. A final summary reports how many datasets were seen and their
    combined number of entries.

    Args:
        h5_path (str): Path to the HDF5 file.
        max_print (int): Maximum number of leading entries to show per
            dataset. Values <= 0 show no entries.

    Returns:
        None. Everything is written to stdout.
    """
    print(f"Inspecting file: {h5_path}\n")
    total_entries = 0
    total_datasets = 0
    n_requested = max(0, max_print)

    with h5py.File(h5_path, 'r') as f:
        for key in f.keys():
            member = f[key]
            if not isinstance(member, h5py.Dataset):
                # Groups have no shape/dtype; report them instead of crashing.
                print(f"Group: '{key}' (not a dataset, skipped)\n")
                continue

            total_datasets += 1
            shape = member.shape
            dtype = member.dtype

            # shape is None for a dataset without data (null dataspace);
            # shape () is a scalar dataset, which holds exactly one value.
            if shape is None:
                num_entries = 0
            else:
                num_entries = int(np.prod(shape, dtype=np.int64))
            total_entries += num_entries

            print(f"Dataset: '{key}'")
            print(f"  - shape: {shape}")
            print(f"  - dtype: {dtype}")
            print(f"  - number of entries: {num_entries}")

            # Read only the slice that is shown, never the whole dataset.
            print("  - sample values:")
            if shape is None:
                print("     (empty dataset)", "\n")
            elif len(shape) == 0:
                print("    ", member[()], "\n")
            else:
                n_to_show = min(n_requested, shape[0])
                print("    ", member[:n_to_show], "\n")

    print(f"Total datasets: {total_datasets}")
    print(f"Total entries: {total_entries}")

def count_entries(h5_path, key_to_inspect):
    """
    Return the length of the first axis of one dataset in an HDF5 file.

    The caller picks a single key; the result is only representative of the
    whole file under the assumption that every key holds the same number of
    entries.

    Args:
        h5_path (str): Path to the HDF5 file.
        key_to_inspect (str): Name of the dataset whose entries are counted.

    Returns:
        int: Size of the first dimension of the selected dataset.

    Raises:
        KeyError: If `key_to_inspect` is not present in the file.
    """
    with h5py.File(h5_path, 'r') as h5_file:
        # Guard clause: fail early with a descriptive message.
        if key_to_inspect not in h5_file.keys():
            raise KeyError(f"Key '{key_to_inspect}' not found in the HDF5 file.")
        leading_dim = h5_file[key_to_inspect].shape[0]
    return leading_dim

In [None]:
# HDF5 file to examine (path is relative to the notebook's working directory).
dataset_path = '../data/crystallm/full/serialized/train.h5'

# Report the row count taken from the 'cif_name' dataset, then print the
# per-dataset summary with a single sample entry each.
print(
    "Number of entries:",
    count_entries(h5_path=dataset_path, key_to_inspect='cif_name'),
)
inspect_h5(h5_path=dataset_path, max_print=1)

Number of entries: 2169177
Inspecting file: ../data/crystallm/full/serialized/train.h5

Dataset: 'cif_name'
  - shape: (2169177,)
  - dtype: object
  - number of entries: 2169177


In [None]:
# HDF5 file to examine (path is relative to the notebook's working directory).
dataset_path = '../data/combined/full/serialized/train.h5'

# Report the row count taken from the 'cif_name' dataset, then print the
# per-dataset summary with a single sample entry each.
print(
    "Number of entries:",
    count_entries(h5_path=dataset_path, key_to_inspect='cif_name'),
)
inspect_h5(h5_path=dataset_path, max_print=1)