In [1]:
from glob import glob
import tifffile
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
import gc
import time
from itertools import product
import h5py

In [2]:
# Folder holding the clipped per-kidney volumes (<kidney>_images.npy / <kidney>_labels.npy).
dir_clipped = "/kaggle/working/dataset/stack_clipped"

# Crop window edge length and stride along the first two array axes.
size_xy, offset_xy = 256, 128
# Crop window depth and stride along the third array axis.
size_z, offset_z = 6, 6

# Output root; the folder name records the crop geometry chosen above.
save_dir = f"/kaggle/working/dataset/cropped_xy_{size_xy}_{offset_xy}_z_{size_z}_{offset_z}/"

In [3]:
def pad(shape, size, offset):
    """Return the padded axis length that sliding windows cover exactly.

    Windows of length ``size`` start at 0, ``offset``, 2 * ``offset``, ...
    The result is the smallest length that is at least ``shape``, at least
    ``size`` (so one full window always fits), and for which the last window
    ends exactly on the final element.

    Args:
        shape: Current length of the axis.
        size: Window length along the axis.
        offset: Stride between consecutive window starts; must be positive.

    Returns:
        The padded axis length as an int.

    Raises:
        ValueError: If ``offset`` is not positive.
    """
    if offset <= 0:
        raise ValueError(f"offset must be positive, got {offset}")

    # An axis shorter than one window must grow to a full window; stopping at
    # a smaller multiple of the stride would leave no valid crop on this axis.
    if shape <= size:
        return size

    # Closed form of "increment until (shape - size) is a multiple of offset".
    remainder = (shape - size) % offset
    if remainder == 0:
        return shape
    return shape + (offset - remainder)


def pad_npy(npy, size_xy, size_z, offset_xy, offset_z):
    """Pad a 3-D array at the end of each axis to the length given by ``pad``.

    The first two axes use the ``size_xy`` / ``offset_xy`` pair and the third
    axis uses ``size_z`` / ``offset_z``. Padding is appended after the existing
    data only, using ``np.pad`` in ``"constant"`` mode with its default fill.

    Args:
        npy: Array with exactly three dimensions.
        size_xy: Window length for the first two axes.
        size_z: Window length for the third axis.
        offset_xy: Window stride for the first two axes.
        offset_z: Window stride for the third axis.

    Returns:
        The padded array.
    """
    len_x, len_y, len_z = npy.shape

    # One (current length, window size, stride) triple per axis, in axis order.
    axis_specs = (
        (len_x, size_xy, offset_xy),
        (len_y, size_xy, offset_xy),
        (len_z, size_z, offset_z),
    )

    # (before, after) widths: nothing is prepended, only the shortfall is appended.
    pad_widths = [(0, pad(length, size, offset) - length) for length, size, offset in axis_specs]

    return np.pad(npy, pad_widths, mode="constant")


def save_fname(image_cropped, label_cropped, save_dir, kidney, coords):
    """Build the output paths for one image/label crop pair.

    The shared file name encodes the crop origin plus two summary values: the
    image standard deviation multiplied by 1000 and truncated to an integer
    (left-padded with zeros to at least four characters), and the integer sum
    of the label crop.

    Args:
        image_cropped: Image crop; only its ``std()`` is read.
        label_cropped: Label crop; only its ``sum()`` is read.
        save_dir: Output root directory.
        kidney: Name of the per-kidney subfolder.
        coords: ``(x, y, z)`` origin of the crop.

    Returns:
        ``(image_path, label_path)``; the two paths differ only in the
        ``image`` / ``label`` folder component.
    """
    origin_x, origin_y, origin_z = coords

    std_tag = f"{int(image_cropped.std() * 1000)}".zfill(4)
    sum_tag = f"{int(label_cropped.sum())}"
    file_name = f"x{origin_x}_y{origin_y}_z{origin_z}_std{std_tag}_sum{sum_tag}.npy"

    # Same template for both outputs; only the top-level folder changes.
    return tuple(f"{save_dir}/{subset}/{kidney}/{file_name}" for subset in ("image", "label"))


def crop_and_save(image, label, coords, size_xy, size_z, kidney, save_dir):
    """Cut the same window out of ``image`` and ``label`` and save both as .npy.

    The window starts at ``coords`` and spans ``size_xy`` elements along the
    first two axes and ``size_z`` along the third. Output paths come from
    ``save_fname``; the target directories must already exist.

    Args:
        image: 3-D image volume.
        label: 3-D label volume, indexed with the same window as ``image``.
        coords: ``(x, y, z)`` origin of the window.
        size_xy: Window length along the first two axes.
        size_z: Window length along the third axis.
        kidney: Name of the per-kidney subfolder.
        save_dir: Output root directory.
    """
    x0, y0, z0 = coords

    # One slice tuple keeps the image and label crops aligned.
    window = (
        slice(x0, x0 + size_xy),
        slice(y0, y0 + size_xy),
        slice(z0, z0 + size_z),
    )
    image_patch = image[window]
    label_patch = label[window]

    image_path, label_path = save_fname(image_patch, label_patch, save_dir, kidney, coords)
    for path, patch in ((image_path, image_patch), (label_path, label_patch)):
        np.save(path, patch)

In [4]:
# Crop every kidney volume into overlapping (size_xy, size_xy, size_z) windows
# and write each image/label crop pair to disk as .npy files.
kidneys = ["kidney_1_dense", "kidney_2", "kidney_3_sparse", "kidney_3_dense"]
for kidney in kidneys:
    image_save_dir = f"{save_dir}/image/{kidney}/"
    label_save_dir = f"{save_dir}/label/{kidney}/"

    # A kidney whose image folder already exists is treated as done.
    # NOTE: the folder is created before any crop is written, so a run that was
    # interrupted part-way is skipped as well; delete the folder to redo it.
    if os.path.exists(image_save_dir):
        continue

    # Pad in the stored dtype first, then convert, so the padded copy is made
    # before the float32 cast rather than after it.
    image_path = f"{dir_clipped}/{kidney}_images.npy"
    image = np.load(image_path)
    image = pad_npy(image, size_xy, size_z, offset_xy, offset_z).astype(np.float32)

    label_path = f"{dir_clipped}/{kidney}_labels.npy"
    label = np.load(label_path)
    label = pad_npy(label, size_xy, size_z, offset_xy, offset_z).astype(np.bool_)

    shape_x, shape_y, shape_z = image.shape
    print(kidney, image.shape)

    os.makedirs(image_save_dir, exist_ok=True)
    os.makedirs(label_save_dir, exist_ok=True)

    # Candidate window origins on the stride grid. Named ``crop_origins`` so the
    # builtin ``iter`` is not rebound for the rest of the kernel session.
    crop_origins = list(
        product(
            range(0, shape_x, offset_xy),
            range(0, shape_y, offset_xy),
            range(0, shape_z, offset_z),
        )
    )
    for coords in tqdm(crop_origins, total=len(crop_origins)):
        x_, y_, z_ = coords

        # Origins whose window would run past the padded volume are skipped.
        if (x_ + size_xy > shape_x) or (y_ + size_xy > shape_y) or (z_ + size_z > shape_z):
            continue

        crop_and_save(image, label, coords, size_xy, size_z, kidney, save_dir)

    # Drop the large volumes before loading the next kidney.
    del image, label
    gc.collect()
    # NOTE(review): fixed 60 s pause between kidneys; presumably lets memory
    # settle before the next load - confirm it is still needed.
    time.sleep(60)

kidney_1_dense (2304, 1408, 912)


100%|██████████| 30096/30096 [23:22<00:00, 21.47it/s] 


kidney_2 (2304, 1152, 1512)


100%|██████████| 40824/40824 [31:45<00:00, 21.43it/s]  


kidney_3_sparse (512, 1792, 1512)


100%|██████████| 14112/14112 [09:14<00:00, 25.45it/s]


kidney_3_dense (512, 1792, 1512)


100%|██████████| 14112/14112 [09:19<00:00, 25.24it/s]


In [None]:
# Every crop written so far: <save_dir>/<image|label>/<kidney>/<name>.npy.
# The HDF5 file sits directly in save_dir, so this pattern does not match it.
file_list = glob(f"{save_dir}/*/*/*.npy")

# Start from a clean container: a file left by an earlier run is deleted first.
hdf_path = f"{save_dir}/dataset.hdf5"
if os.path.exists(hdf_path):
    os.remove(hdf_path)

# The handle is deliberately left open here; the cell that fills the file
# is the one that calls f.close().
f = h5py.File(hdf_path, "w")
group = f.create_group("/data")

In [21]:
# Copy every crop into the open HDF5 file, one dataset per .npy file, in
# sorted path order.
try:
    for file in tqdm(sorted(file_list)):
        arr = np.load(file)
        # The dataset is named after the source file path.
        # NOTE(review): these paths are absolute, so h5py presumably resolves
        # the name from the file root rather than under the "/data" group -
        # confirm that readers look datasets up by the full path.
        dataset = group.create_dataset(name=file, shape=arr.shape, dtype=arr.dtype)
        dataset[...] = arr
finally:
    # Close the handle even if a load or write fails part-way, so the file is
    # flushed and not left locked by this kernel.
    f.close()

100%|██████████| 159536/159536 [28:19<00:00, 93.85it/s] 
