# Extracting images from HDF5 file

This notebook extracts image tiles from HDF5 files (one file per dataset split) and writes them to disk, organized into subfolders as follows:

```
* split
    * slide
        * tile
```

For example:

```
* test
    * TCGA-05-4382-01Z-00-DX1
        * 2_7.jpeg
```

## Train 250k subset

In [8]:
import h5py
import numpy as np
import os
from PIL import Image
from tqdm import tqdm
from collections import defaultdict

# Input HDF5 archive (250k-tile training subset) and the directory that
# receives one sub-folder per slide.
file_path = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_250K_he_train.h5"
output_folder = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/train_250k"
os.makedirs(output_folder, exist_ok=True)

with h5py.File(file_path, 'r') as f:
    print("Keys in HDF5 file:", list(f.keys()))

    # Image tiles are indexed one at a time in the loop below; the tile
    # filenames and slide names are read in full, with bytes entries decoded.
    img_data = f['img']
    tiles = [name if not isinstance(name, bytes) else name.decode() for name in f['tiles'][:]]
    slides = [name if not isinstance(name, bytes) else name.decode() for name in f['slides'][:]]

    # Map each slide name to the positions of the tiles that belong to it.
    slide_to_indices = defaultdict(list)
    for position, slide_name in enumerate(slides):
        slide_to_indices[slide_name].append(position)

    print(f"Found {len(slide_to_indices)} unique slides.")

    # One directory per slide; every tile is saved under its own filename.
    for slide_id, indices in tqdm(slide_to_indices.items(), desc="Saving tiles by slide"):
        slide_folder = os.path.join(output_folder, slide_id)
        os.makedirs(slide_folder, exist_ok=True)

        for tile_idx in indices:
            # NOTE(review): the uint8 cast assumes pixel values are already
            # in the 0-255 range — TODO confirm against the dataset dtype.
            pixels = img_data[tile_idx].astype(np.uint8)
            destination = os.path.join(slide_folder, tiles[tile_idx])
            Image.fromarray(pixels).save(destination)


Keys in HDF5 file: ['img', 'labels', 'patterns', 'slides', 'tiles']
Found 678 unique slides.


Saving tiles by slide: 100%|██████████| 678/678 [12:32<00:00,  1.11s/it]


## Full test set

In [2]:
import h5py
import numpy as np
import os
from PIL import Image
from tqdm import tqdm
from collections import defaultdict

# Input HDF5 archive (full test split) and the directory that receives one
# sub-folder per slide.
file_path = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_he_test-001.h5"
output_folder = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/test"
os.makedirs(output_folder, exist_ok=True)

with h5py.File(file_path, 'r') as f:
    print("Keys in HDF5 file:", list(f.keys()))

    # Image tiles are indexed one at a time in the loop below; the tile
    # filenames and slide names are read in full, with bytes entries decoded.
    img_data = f['test_img']
    tiles = [name if not isinstance(name, bytes) else name.decode() for name in f['test_tiles'][:]]
    slides = [name if not isinstance(name, bytes) else name.decode() for name in f['test_slides'][:]]

    # Map each slide name to the positions of the tiles that belong to it.
    slide_to_indices = defaultdict(list)
    for position, slide_name in enumerate(slides):
        slide_to_indices[slide_name].append(position)

    print(f"Found {len(slide_to_indices)} unique slides.")

    # One directory per slide; every tile is saved under its own filename.
    for slide_id, indices in tqdm(slide_to_indices.items(), desc="Saving tiles by slide"):
        slide_folder = os.path.join(output_folder, slide_id)
        os.makedirs(slide_folder, exist_ok=True)

        for tile_idx in indices:
            # NOTE(review): the uint8 cast assumes pixel values are already
            # in the 0-255 range — TODO confirm against the dataset dtype.
            pixels = img_data[tile_idx].astype(np.uint8)
            destination = os.path.join(slide_folder, tiles[tile_idx])
            Image.fromarray(pixels).save(destination)


Keys in HDF5 file: ['test_img', 'test_labels', 'test_patterns', 'test_slides', 'test_tiles']
Found 186 unique slides.


Saving tiles by slide: 100%|██████████| 186/186 [05:35<00:00,  1.80s/it]


## Full validation set

In [3]:
import h5py
import numpy as np
import os
from PIL import Image
from tqdm import tqdm
from collections import defaultdict

# Input HDF5 archive (full validation split) and the directory that receives
# one sub-folder per slide.
file_path = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_he_validation-003.h5"
output_folder = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/val"
os.makedirs(output_folder, exist_ok=True)

with h5py.File(file_path, 'r') as f:
    print("Keys in HDF5 file:", list(f.keys()))

    # Image tiles are indexed one at a time in the loop below; the tile
    # filenames and slide names are read in full, with bytes entries decoded.
    img_data = f['valid_img']
    tiles = [name if not isinstance(name, bytes) else name.decode() for name in f['valid_tiles'][:]]
    slides = [name if not isinstance(name, bytes) else name.decode() for name in f['valid_slides'][:]]

    # Map each slide name to the positions of the tiles that belong to it.
    slide_to_indices = defaultdict(list)
    for position, slide_name in enumerate(slides):
        slide_to_indices[slide_name].append(position)

    print(f"Found {len(slide_to_indices)} unique slides.")

    # One directory per slide; every tile is saved under its own filename.
    for slide_id, indices in tqdm(slide_to_indices.items(), desc="Saving tiles by slide"):
        slide_folder = os.path.join(output_folder, slide_id)
        os.makedirs(slide_folder, exist_ok=True)

        for tile_idx in indices:
            # NOTE(review): the uint8 cast assumes pixel values are already
            # in the 0-255 range — TODO confirm against the dataset dtype.
            pixels = img_data[tile_idx].astype(np.uint8)
            destination = os.path.join(slide_folder, tiles[tile_idx])
            Image.fromarray(pixels).save(destination)


Keys in HDF5 file: ['valid_img', 'valid_labels', 'valid_patterns', 'valid_slides', 'valid_tiles']
Found 178 unique slides.


Saving tiles by slide: 100%|██████████| 178/178 [05:49<00:00,  1.96s/it]


## Full train set

In [4]:
import h5py
import numpy as np
import os
from PIL import Image
from tqdm import tqdm
from collections import defaultdict

# Input HDF5 archive (full training split) and the directory that receives
# one sub-folder per slide.
file_path = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_he_train-002.h5"
output_folder = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/train"
os.makedirs(output_folder, exist_ok=True)

with h5py.File(file_path, 'r') as f:
    print("Keys in HDF5 file:", list(f.keys()))

    # Image tiles are indexed one at a time in the loop below; the tile
    # filenames and slide names are read in full, with bytes entries decoded.
    img_data = f['train_img']
    tiles = [name if not isinstance(name, bytes) else name.decode() for name in f['train_tiles'][:]]
    slides = [name if not isinstance(name, bytes) else name.decode() for name in f['train_slides'][:]]

    # Map each slide name to the positions of the tiles that belong to it.
    slide_to_indices = defaultdict(list)
    for position, slide_name in enumerate(slides):
        slide_to_indices[slide_name].append(position)

    print(f"Found {len(slide_to_indices)} unique slides.")

    # One directory per slide; every tile is saved under its own filename.
    for slide_id, indices in tqdm(slide_to_indices.items(), desc="Saving tiles by slide"):
        slide_folder = os.path.join(output_folder, slide_id)
        os.makedirs(slide_folder, exist_ok=True)

        for tile_idx in indices:
            # NOTE(review): the uint8 cast assumes pixel values are already
            # in the 0-255 range — TODO confirm against the dataset dtype.
            pixels = img_data[tile_idx].astype(np.uint8)
            destination = os.path.join(slide_folder, tiles[tile_idx])
            Image.fromarray(pixels).save(destination)


Keys in HDF5 file: ['train_img', 'train_labels', 'train_patterns', 'train_slides', 'train_tiles']
Found 678 unique slides.


Saving tiles by slide: 100%|██████████| 678/678 [23:36<00:00,  2.09s/it]
