In [1]:
import os

# h5py document: http://docs.h5py.org/en/stable/quick.html
import h5py
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm

In [2]:
def read_image(grayscale_path, csv_file_path, hdf5_file_path=None) -> tuple:
    """Load the grayscale images listed in a CSV index, grouped by label.

    Parameters
    ----------
    grayscale_path : str
        Directory containing the ``<PicName>.bmp`` image files.
    csv_file_path : str
        CSV file providing at least the columns ``PicName`` and ``Label``.
    hdf5_file_path : str, optional
        Not used by this function. It is kept (now optional) so that both
        two-argument and three-argument calls work.

    Returns
    -------
    tuple
        ``(image_list, label_list)``. ``label_list`` is the array of distinct
        ``Label`` values as returned by ``Series.unique()``; ``image_list[i]``
        is the list of images whose label is ``label_list[i]``, each one a
        ``(32, 32)`` array divided by 255.0.

    Raises
    ------
    ValueError
        If an image does not contain exactly 32 * 32 pixels (from ``reshape``).
    """
    name_label = pd.read_csv(csv_file_path, usecols=['PicName', 'Label'])

    group_names = name_label.loc[:, 'Label'].unique()  # 'neris', 'rbot', ...

    # Collect the image file names belonging to each label, in label order.
    filename_bygroup = []
    for group in group_names:
        one_group = name_label[name_label.loc[:, 'Label'].isin([group])]
        filename_bygroup.append((one_group.loc[:, 'PicName'] + '.bmp').tolist())

    total = sum(len(names) for names in filename_bygroup)
    print('Totaly {img_num} images of {type_num} types.'
          .format(img_num=total, type_num=len(group_names)))

    # Read the images label by label.
    image_list = []
    for group, name_list in zip(group_names, filename_bygroup):
        one_type_list = []
        for name in tqdm(name_list, desc='Reading \'{0}\' images'.format(group)):
            # os.path.join works whether or not grayscale_path ends with a
            # separator; the with-block releases the file handle right away,
            # which matters when iterating over very many files.
            with Image.open(os.path.join(grayscale_path, name)) as img_file:
                pixels = np.array(img_file.convert('L'))
            one_type_list.append(pixels.reshape(32, 32) / 255.0)
        image_list.append(one_type_list)
    label_list = group_names

    assert len(label_list) == len(image_list)

    return image_list, label_list

In [3]:
def botnet2hdf5(grayscale_path, csv_file, hdf5_file_path):
    """Convert the labelled grayscale images into one HDF5 file per label.

    Parameters
    ----------
    grayscale_path : str
        Directory containing the ``.bmp`` images, forwarded to ``read_image``.
    csv_file : str
        CSV index file, forwarded to ``read_image``.
    hdf5_file_path : str
        Output directory. It is created if missing; one ``<label>.h5`` file
        is written into it per label, each holding a single dataset named
        ``image_array`` with that label's images.
    """
    # makedirs(exist_ok=True) also creates missing parent directories and
    # avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(hdf5_file_path, exist_ok=True)

    # read_image declares three parameters; pass the output path through so
    # the call matches its signature.
    image_list, label_list = read_image(grayscale_path, csv_file, hdf5_file_path)

    for label, images in zip(label_list, image_list):
        # Join instead of string concatenation so the file lands inside the
        # output directory even without a trailing separator.
        out_path = os.path.join(hdf5_file_path, '{}.h5'.format(label))
        with h5py.File(out_path, 'w') as hdf:
            hdf.create_dataset('image_array', data=np.array(images))

In [None]:
# Placeholder inputs: all three are None here and must be replaced with real
# locations before running this cell.
grayscale_path = None  # location of the grayscale .bmp images to read
botnet_csv = None  # CSV file with the 'PicName' and 'Label' columns
hdf5_file_path = None  # output location for the per-label .h5 files

# Read every image listed in the CSV and write one HDF5 file per label.
botnet2hdf5(grayscale_path, botnet_csv, hdf5_file_path)

Totaly 986708 images of 1 types.


HBox(children=(FloatProgress(value=0.0, description="Reading 'normal' images", max=986708.0, style=ProgressSty…