In [1]:
import tables
import os,sys
import glob
import PIL
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn import model_selection
from skimage import io as skio
from skimage.io import imread_collection

In [None]:
# Dataset tag: used as the prefix of every generated pytable file name
dataname = "sclerosis"
# Edge length, in pixels, of the square patches stored in the pytables
patch_size = 256

In [None]:
# Storage atoms shared by every pytable written in this notebook.
# Pixel data (images and masks) is saved as unsigned 8-bit integers, i.e. values in [0, 255].
img_dtype = tables.UInt8Atom()
# Fixed-width string atom (255 bytes per entry) used to store each patch's
# file name next to its pixels, in case it is needed later.
filenameAtom = tables.StringAtom(itemsize=255)

## Training/validation data setup

Collect the patches of all training slides, then split them into training and validation datasets stored in HDF5 format (PyTables). The split below is done at patch level; the dataset could also be divided at slide level instead.

In [None]:
# read all training slide ids (one sub-folder of ./masks_patch per slide)
# os.listdir returns entries in arbitrary order, so the ids are sorted to keep
# the patch order stable between runs; entries that are not folders
# (stray files) cannot be slides and are dropped.
mask_root = "./masks_patch"
IDs = sorted(
    entry for entry in os.listdir(mask_root)
    if os.path.isdir(os.path.join(mask_root, entry))
)
print(IDs)

['D16', 'D17', 'D18', 'D19']


In [None]:
# collect all patches
images = []
masks = []
names = []

for idi in IDs[:]:
    print(idi)
    # collect mask patches for slide idi; the collection object is kept so the
    # list of files it loaded (in load order) can be reused for the names
    mask_files = imread_collection(r"./masks_patch/{}/*.png".format(idi))
    if len(mask_files) == 0:  # skip empty folders
        continue
    # collect the slide patches of the same slide
    slide_files = imread_collection("./slides_patch/{}/*.png".format(idi))
    # masks and images are paired purely by position, so a missing or extra
    # file on either side would silently shift every following pair
    if len(slide_files) != len(mask_files):
        raise ValueError(
            "slide {}: {} mask patches but {} slide patches".format(
                idi, len(mask_files), len(slide_files)
            )
        )
    masks.append(np.array(mask_files))
    images.append(np.array(slide_files))
    # take the names from the collection itself instead of re-sorting
    # os.listdir(): this guarantees names[i] belongs to masks[i] even when
    # the folder holds non-png files or several files share a sort key
    names.append([os.path.basename(path) for path in mask_files.files])

if not masks:
    raise RuntimeError("no patches found under ./masks_patch for slides {}".format(IDs))

# concatenate patches from all slides
col_msk = np.concatenate(masks)
col_img = np.concatenate(images)
col_name = np.concatenate(names)
print("maskpatch_collection",col_msk.shape)
print("slidepatch_collection",col_img.shape)
print("namelist",col_name.shape)

D16
D17
D18
D19
maskpatch_collection (277, 256, 256)
slidepatch_collection (277, 256, 256, 3)
namelist (277,)


In [None]:
# random split training/validation patches at 8:2 ratio
# A fixed random_state makes the split -- and therefore the generated
# train/val files -- identical on every run of the notebook.
# NOTE(review): the split is per patch, so patches of one slide can land in
# both sets; switch to a slide-level split if that leakage matters.
split_seed = 42
splitter = model_selection.ShuffleSplit(n_splits=1, test_size=0.2, random_state=split_seed)
train, val = next(splitter.split(col_msk))
print("num of train patch:",len(train),"\nnum of val patch:",len(val))

num of train patch: 221 
num of val patch: 56


In [None]:
# prepare for store in hdf5 file: one dict per kind of data, keyed by phase
phase_indices = {"train": train, "val": val}
img = {key: col_img[rows] for key, rows in phase_indices.items()}
msk = {key: col_msk[rows] for key, rows in phase_indices.items()}
name = {key: col_name[rows] for key, rows in phase_indices.items()}

# sanity check: the three training arrays must have the same first dimension
for label, per_phase in (("img_train", img), ("msk_train", msk), ("name_train", name)):
    print(label, per_phase["train"].shape)

img_train (221, 256, 256, 3)
msk_train (221, 256, 256)
name_train (221,)


In [None]:
#setup hdf5 file
storage = {}  # will hold the pytable arrays once they are created
imgtypes = ["img", "msk"]
patch_size = 256
# shape of ONE sample in each pytable array:
# images are patch_size x patch_size x 3, masks are patch_size x patch_size
block_shape = {
    "img": np.array((patch_size, patch_size, 3)),
    "msk": np.array((patch_size, patch_size)),
}
# compression settings (zlib, level 6) handed to the arrays when they are created
filters = tables.Filters(complevel=6, complib='zlib')

In [None]:
# fill in hdf5 file of training/validation datasets
phases = ["train","val"]
sources = {"img": img, "msk": msk}  # per-phase data to write for each image type
os.makedirs("../data", exist_ok=True)  # open_file does not create missing folders

for phase in phases: #now for each of the phases, we'll write one pytable

    print("generate hdf5 file ",dataname+'_'+phase+'.pytable')

    # the with-block closes the file even if one of the appends below raises,
    # so a failed run never leaves a locked / half-open pytable behind
    with tables.open_file(f"../data/{dataname}_{phase}.pytable", mode='w') as hdf5_file: #here we choose to store in training data dir
        storage["filename"] = hdf5_file.create_earray(hdf5_file.root, 'filename', filenameAtom, (0,)) #create the array for storage
        storage["filename"].append(name[phase]) #add the filenames to the storage array

        for imgtype in imgtypes: #for each of the image types, in this case mask and image, we need to create the associated earray
            # first dimension 0 = extendable; chunks of one sample each
            storage[imgtype]= hdf5_file.create_earray(hdf5_file.root, imgtype, img_dtype,
                                                      shape=np.append([0],block_shape[imgtype]),
                                                      chunkshape=np.append([1],block_shape[imgtype]),
                                                      filters=filters)
            #save the whole stack of patches of this phase to the table
            storage[imgtype].append(sources[imgtype][phase])

generate hdf5 file  sclerosis_train.pytable
generate hdf5 file  sclerosis_val.pytable


## Testing data setup 

Same procedure as the training data setup, but using the testing slides as input; the output file is named `{dataname}_test.pytable`.

In [None]:
# Folders holding the TESTING patches (one sub-folder per slide).
# NOTE(review): these are the same folders the training cells read from, so
# unless their contents are swapped beforehand the "test" set duplicates the
# training data -- point them at the held-out testing slides.
test_mask_dir = "./masks_patch"
test_slide_dir = "./slides_patch"

images = []
masks = []
names = []

# get testing slides IDs (sorted sub-folders, so the patch order is stable between runs)
test_ids = sorted(
    entry for entry in os.listdir(test_mask_dir)
    if os.path.isdir(os.path.join(test_mask_dir, entry))
)
for idi in test_ids:
    print(idi)
    mask_files = imread_collection("{}/{}/*.png".format(test_mask_dir, idi))
    if len(mask_files) == 0:  # skip empty folders
        continue
    slide_files = imread_collection("{}/{}/*.png".format(test_slide_dir, idi))
    # masks and images are paired purely by position, so the counts must agree
    if len(slide_files) != len(mask_files):
        raise ValueError(
            "slide {}: {} mask patches but {} slide patches".format(
                idi, len(mask_files), len(slide_files)
            )
        )
    masks.append(np.array(mask_files))
    images.append(np.array(slide_files))
    # names come from the collection's own file list so they are guaranteed
    # to be in the same order as the loaded patches
    names.append([os.path.basename(path) for path in mask_files.files])

if not masks:
    raise RuntimeError("no testing patches found under {}".format(test_mask_dir))

col_msk = np.concatenate(masks)
col_img = np.concatenate(images)
col_name = np.concatenate(names)

In [None]:
# setup hdf5 file
img = {"test": col_img[:]}
msk = {"test": col_msk[:]}
name = {"test": col_name[:]}

storage = {}  # will hold the pytable arrays once they are created
imgtypes = ["img","msk"]
patch_size = 256
# shape of ONE sample: images are patch_size x patch_size x 3, masks are patch_size x patch_size
block_shape = {
    "img": np.array((patch_size, patch_size, 3)),
    "msk": np.array((patch_size, patch_size)),
}
filters = tables.Filters(complevel=6, complib='zlib')
sources = {"img": img, "msk": msk}  # per-phase data to write for each image type

# NOTE(review): machine-specific absolute path (the train/val files go to
# ../data); change this single variable to relocate the test file.
test_output_dir = "E:/PATH FINAL"

phases = ["test"] # phase change to testing

for phase in phases:
    print(phase)

    # the with-block closes the file even if one of the appends below raises
    with tables.open_file(f"{test_output_dir}/{dataname}_{phase}.pytable", mode='w') as hdf5_file:
        storage["filename"] = hdf5_file.create_earray(hdf5_file.root, 'filename', filenameAtom, (0,))
        storage["filename"].append(name[phase])
        print(len(storage["filename"]))
        for imgtype in imgtypes:
            # first dimension 0 = extendable; chunks of one sample each
            storage[imgtype]= hdf5_file.create_earray(hdf5_file.root, imgtype, img_dtype,
                                                      shape=np.append([0],block_shape[imgtype]),
                                                      chunkshape=np.append([1],block_shape[imgtype]),
                                                      filters=filters)
            storage[imgtype].append(sources[imgtype][phase])

References:

- https://github.com/choosehappy/PytorchDigitalPathology/blob/master/segmentation_epistroma_unet/make_hdf5.ipynb
- http://machinelearninguru.com/deep_learning/data_preparation/hdf5/hdf5.html