In [1]:
from pathlib import Path

import h5py
import numpy as np

In [2]:
# Open the HDF5 file in read mode
hdf5_path = "/home/valentin/workspaces/histomil/data/processed/embeddings/superpixels_moco_org.h5"
hdf5_file = h5py.File(hdf5_path, "r")

# List the groups (train/test) and their contents
def print_h5_structure(group, indent=0):
    """Recursively prints the structure of an HDF5 file."""
    for key in group.keys():
        print("  " * indent + f"📂 {key}")
        if isinstance(group[key], h5py.Group):
            print_h5_structure(group[key], indent + 1)

print_h5_structure(hdf5_file)

📂 test
  📂 C3L-00001-21
    📂 embeddings
    📂 tile_ids
  📂 C3L-00001-26
    📂 embeddings
    📂 tile_ids
  📂 C3L-00081-21
    📂 embeddings
    📂 tile_ids
  📂 C3L-00081-26
    📂 embeddings
    📂 tile_ids
  📂 C3L-00093-21
    📂 embeddings
    📂 tile_ids
  📂 C3L-00095-22
    📂 embeddings
    📂 tile_ids
  📂 C3L-00140-21
    📂 embeddings
    📂 tile_ids
  📂 C3L-00368-21
    📂 embeddings
    📂 tile_ids
  📂 C3L-00412-26
    📂 embeddings
    📂 tile_ids
  📂 C3L-00445-22
    📂 embeddings
    📂 tile_ids
  📂 C3L-00445-25
    📂 embeddings
    📂 tile_ids
  📂 C3L-00445-26
    📂 embeddings
    📂 tile_ids
  📂 C3L-00446-22
    📂 embeddings
    📂 tile_ids
  📂 C3L-00446-23
    📂 embeddings
    📂 tile_ids
  📂 C3L-00446-24
    📂 embeddings
    📂 tile_ids
  📂 C3L-00446-26
    📂 embeddings
    📂 tile_ids
  📂 C3L-00568-26
    📂 embeddings
    📂 tile_ids
  📂 C3L-00603-22
    📂 embeddings
    📂 tile_ids
  📂 C3L-00604-22
    📂 embeddings
    📂 tile_ids
  📂 C3L-00604-23
    📂 embeddings
    📂 tile_ids
  📂 C3L-00893

In [3]:
total_tiles = 0
with h5py.File(hdf5_path, "r") as f:
    # Loop over split groups (e.g., "train" and "test")
    for split in f.keys():
        split_group = f[split]
        for wsi in split_group.keys():
            # Assumes that each WSI group has an "embeddings" dataset
            ds = split_group[wsi]["embeddings"]
            total_tiles += ds.shape[0]
print("Total number of tiles (computed):", total_tiles)

Total number of tiles (computed): 4063887


In [4]:
with h5py.File(hdf5_path, "r") as f:
    total_tiles = f.attrs["total_tiles"]
    print("Total number of tiles (from metadata):", total_tiles)

Total number of tiles (from metadata): 4063887


In [5]:
# Access embeddings of a specific WSI
wsi_id = "C3N-05915-29"
split = "train"

embeddings = hdf5_file[f"{split}/{wsi_id}/embeddings"][:]
tile_ids = hdf5_file[f"{split}/{wsi_id}/tile_ids"][:]
label = hdf5_file[f"{split}/{wsi_id}"].attrs["label"]

print(f"WSI ID: {wsi_id}")
print(f"Label: {label}")
print(f"Embeddings shape: {embeddings.shape}")
print(f"tile ids length: {tile_ids.shape}")

WSI ID: C3N-05915-29
Label: NORMAL
Embeddings shape: (159, 2048)
tile ids length: (159,)


In [6]:
tile_ids

array([b'C3N-05915-29__x34048_y7168', b'C3N-05915-29__x29568_y8512',
       b'C3N-05915-29__x3584_y12096', b'C3N-05915-29__x33600_y5824',
       b'C3N-05915-29__x6720_y11200', b'C3N-05915-29__x31808_y7168',
       b'C3N-05915-29__x3136_y12096', b'C3N-05915-29__x4928_y13888',
       b'C3N-05915-29__x34496_y7616', b'C3N-05915-29__x4480_y13440',
       b'C3N-05915-29__x4480_y14336', b'C3N-05915-29__x6720_y10752',
       b'C3N-05915-29__x32704_y8960', b'C3N-05915-29__x32256_y4032',
       b'C3N-05915-29__x7168_y10304', b'C3N-05915-29__x4928_y12992',
       b'C3N-05915-29__x32704_y7616', b'C3N-05915-29__x31808_y4928',
       b'C3N-05915-29__x4032_y10752', b'C3N-05915-29__x5824_y13888',
       b'C3N-05915-29__x32704_y4928', b'C3N-05915-29__x30912_y4480',
       b'C3N-05915-29__x4032_y12992', b'C3N-05915-29__x5376_y14336',
       b'C3N-05915-29__x7168_y9856', b'C3N-05915-29__x4032_y12544',
       b'C3N-05915-29__x2688_y11648', b'C3N-05915-29__x30016_y6720',
       b'C3N-05915-29__x31360_y8064

In [7]:
wsi_ids = list(hdf5_file["train"].keys())

In [8]:
wsi_ids

['C3L-00009-21',
 'C3L-00009-26',
 'C3L-00080-21',
 'C3L-00080-26',
 'C3L-00083-21',
 'C3L-00083-26',
 'C3L-00093-26',
 'C3L-00094-21',
 'C3L-00094-26',
 'C3L-00095-21',
 'C3L-00095-23',
 'C3L-00095-26',
 'C3L-00140-22',
 'C3L-00140-26',
 'C3L-00144-21',
 'C3L-00144-26',
 'C3L-00368-22',
 'C3L-00368-26',
 'C3L-00412-21',
 'C3L-00415-21',
 'C3L-00415-23',
 'C3L-00415-26',
 'C3L-00444-21',
 'C3L-00444-22',
 'C3L-00444-23',
 'C3L-00444-26',
 'C3L-00444-27',
 'C3L-00444-28',
 'C3L-00445-21',
 'C3L-00445-23',
 'C3L-00445-24',
 'C3L-00446-21',
 'C3L-00446-25',
 'C3L-00446-27',
 'C3L-00568-21',
 'C3L-00568-22',
 'C3L-00568-23',
 'C3L-00603-21',
 'C3L-00603-26',
 'C3L-00604-21',
 'C3L-00604-26',
 'C3L-00893-21',
 'C3L-00893-22',
 'C3L-00893-23',
 'C3L-00893-24',
 'C3L-00893-26',
 'C3L-00904-21',
 'C3L-00904-22',
 'C3L-00904-26',
 'C3L-00913-21',
 'C3L-00913-22',
 'C3L-00913-23',
 'C3L-00913-26',
 'C3L-00923-26',
 'C3L-00927-21',
 'C3L-00927-22',
 'C3L-00927-23',
 'C3L-00927-26',
 'C3L-00965-21

In [14]:
wsi_ids_few_tiles = [
    wsi_id
    for wsi_id in wsi_ids_few_tiles
    if hdf5_file[f"train/{wsi_id}/embeddings"][:].shape[0] < 9
]

In [15]:
print(wsi_ids_few_tiles)

['C3L-02665-27', 'C3L-04378-28', 'C3N-04169-29', 'C3N-04176-29', 'C3N-04673-24', 'C3N-04673-29']


In [11]:
len(wsi_ids_few_tiles)

56