In [6]:
import os
import time

import h5py
import numpy as np
from PIL import Image


In [7]:

def store_many_hdf5(images, labels, filename):
    """Store an array of images and their labels in a single HDF5 file.

    The file is written to ``generated/<filename>`` in "w" mode, so an
    existing file with the same name is overwritten. Two datasets are
    created: ``images`` and ``meta``, both stored as unsigned 8-bit integers.

    Parameters:
    ---------------
    images       images array
    labels       labels array
    filename     name of the HDF5 file to create inside ``generated/``
    """
    path = f"generated/{filename}"

    # Make sure the output directory exists so the notebook runs from a
    # fresh checkout without manual setup.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    # The context manager closes the file even if a dataset write fails.
    with h5py.File(path, "w") as file:
        file.create_dataset(
            "images", np.shape(images), h5py.h5t.STD_U8BE, data=images
        )
        file.create_dataset(
            "meta", np.shape(labels), h5py.h5t.STD_U8BE, data=labels
        )

In [12]:
def generate_many(amount, filename, img_path='cars.jpeg', seed=None):
    """Copy one image N times, attach random labels, and save all to HDF5.

    Parameters:
    ---------------
    amount     number of times to copy the image
    filename   name of the HDF5 file, passed on to ``store_many_hdf5``
    img_path   path of the image to copy (defaults to 'cars.jpeg')
    seed       optional seed for the label generator; when None the global
               NumPy random state is used

    Returns:
    ----------
    labels     random integer labels in [0, 10), shape (amount,)
    """
    # Open inside a context manager so the image file handle is released
    # as soon as the pixel data has been copied into the array.
    with Image.open(img_path) as img:
        img_np = np.array(img)  # image as an N-dimensional array

    # Add a leading axis and repeat along it: result is (amount, *img_np.shape).
    images = np.repeat(img_np[np.newaxis, ...], repeats=amount, axis=0)

    # Labels can be any data; here they are small integers. Falling back to
    # the np.random module keeps the unseeded behaviour unchanged.
    rng = np.random if seed is None else np.random.RandomState(seed)
    labels = rng.randint(10, size=amount)

    store_many_hdf5(images, labels, filename)
    return labels
    

In [32]:
def read_all_hdf5(filename):
    """Read every image and label from an HDF5 file and print the time taken.

    Parameters:
    ---------------
    filename     name of the HDF5 file inside ``generated/``

    Returns:
    ----------
    images      images array (uint8), contents of the ``images`` dataset
    labels      associated meta data (uint8), contents of the ``meta`` dataset
    """
    start = time.perf_counter()

    # Open read-only: nothing is written here, so write access is not needed.
    # The context manager closes the file even if a read fails.
    with h5py.File(f"generated/{filename}", "r") as file:
        images = np.array(file["/images"]).astype("uint8")
        labels = np.array(file["/meta"]).astype("uint8")

    fin = time.perf_counter()
    print('process time', fin - start)
    return images, labels

In [30]:
import time
def read_hdf5_based_on_label(filename, ids):
    """Read only the images whose label equals ``ids`` and print the time taken.

    The whole ``meta`` dataset is loaded to find the matching positions;
    after that only the matching entries of ``images`` are read from the file.

    Parameters:
    ---------------
    filename     name of the HDF5 file inside ``generated/``
    ids          label value to select

    Returns:
    ----------
    images      uint8 array of the matching images, stacked along axis 0
    """
    start = time.perf_counter()

    # Open read-only: nothing is written here. The context manager closes
    # the file even if a read fails.
    with h5py.File(f"generated/{filename}", "r") as file:
        # Compare all labels at once, then keep the positions that matched.
        # Assumes one label per image (1-D or (N, 1) meta dataset).
        label_index = file["/meta"][()] == ids
        indexes = np.flatnonzero(label_index).tolist()

        # Index the dataset lazily so only the selected images are read.
        images = file["/images"]
        result = [images[index] for index in indexes]

    np_result = np.array(result).astype("uint8")
    fin = time.perf_counter()
    print('process time', fin - start)
    return np_result

# generate data

In [13]:
# Write 100 copies of the sample image, each with a random label, to
# generated/output.hdf5. The returned labels array is shown as the cell output.
generate_many(100, 'output.hdf5')

array([4, 3, 7, 8, 1, 9, 8, 6, 7, 9, 8, 2, 3, 1, 2, 7, 4, 1, 7, 6, 5, 1,
       6, 0, 1, 1, 8, 9, 9, 1, 8, 9, 8, 9, 4, 4, 2, 7, 3, 8, 8, 3, 7, 5,
       5, 0, 7, 5, 1, 8, 6, 8, 4, 2, 7, 3, 9, 9, 8, 2, 5, 0, 1, 8, 8, 6,
       3, 0, 4, 7, 3, 3, 4, 6, 7, 0, 4, 4, 1, 3, 5, 9, 9, 2, 3, 5, 7, 0,
       9, 6, 2, 6, 2, 6, 0, 2, 4, 0, 4, 2])

# read example

In [33]:
# Read both datasets back and show the total element count of the images
# array (index 0 of the returned tuple).
read_all_hdf5('output.hdf5')[0].size

process time 0.6953904628753662


15103200

In [29]:
# Fetch only the images whose label is 1.
data = read_hdf5_based_on_label('output.hdf5',1)
print('number of images' , len(data)) 

# data[:1] keeps the leading axis, so this prints a batch holding one image.
print('one image np array representation:', data[:1]) 

0.21500349044799805
number of images 10
one image np array representation: [[[[255 255 255]
   [255 255 255]
   [255 255 255]
   ...
   [255 255 255]
   [255 255 255]
   [255 255 255]]

  [[255 255 255]
   [255 255 255]
   [255 255 255]
   ...
   [255 255 255]
   [255 255 255]
   [255 255 255]]

  [[255 255 255]
   [255 255 255]
   [255 255 255]
   ...
   [255 255 255]
   [255 255 255]
   [255 255 255]]

  ...

  [[255 255 255]
   [255 255 255]
   [255 255 255]
   ...
   [255 255 255]
   [255 255 255]
   [255 255 255]]

  [[255 255 255]
   [255 255 255]
   [255 255 255]
   ...
   [255 255 255]
   [255 255 255]
   [255 255 255]]

  [[255 255 255]
   [255 255 255]
   [255 255 255]
   ...
   [255 255 255]
   [255 255 255]
   [255 255 255]]]]


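Clearly, the index-based read is faster here: fetching only the images with a matching label took about 0.22 s, versus about 0.70 s to read the entire file.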