### 1 Import relevant libraries

In [15]:
# import relevant libraries
import os # for file handling
import glob # for file handling
import csv # for reading the csv file
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
import random # for shuffling the data and seed setting
import cv2 # read and save images
import imageio # read and save images (for gif format)
from tqdm import tqdm # for progress bar

### 2 Define relevant variables and paths

In [16]:
# running ID counter: starts at 1 and goes up by one for each pair of image and mask
# NOTE(review): not referenced anywhere else in the visible notebook - presumably a leftover from the indexing step
index = 1

# empty list to store the following information of each dataset:
# dataset name, file name image, file name mask, unique ID
data_list = []

# common root folder of the thesis data (defined once so that the paths below stay in sync)
base_path = 'c:/Users/yileh/OneDrive/msc_data_science/master_thesis'

# define dataset path (raw source datasets)
dataset_path = f'{base_path}/datasets_raw'

# define target path for gray images
target_path_gray = f'{base_path}/dataset_gray'

# define target path for RGB images
target_path_rgb = f'{base_path}/dataset_rgb'

### 3 Create relevant folders (training, validation, and testing)

In [17]:
# function that creates a folder (including missing parent folders) if it does not exist
def create_folder(folder_path: str) -> None:
    '''
    Create a folder, including any missing parent folders, if it does not exist.
    
    Calling the function for an already existing folder is a no-op.
    
    Args: 
        folder_path (str): Path of the folder to be created
    
    Returns: 
        None
    
    Raises:
        FileExistsError: If folder_path already exists but is not a folder.
    '''
    # exist_ok=True avoids the race between a separate "exists" check and the creation,
    # and still raises if the path is occupied by a regular file
    os.makedirs(folder_path, exist_ok=True)

In [18]:
# create the training, validation, and testing folders for gray images,
# each with an images and a masks subfolder
for sub_folder in (
    'training/images',
    'training/masks',
    'validation/images',
    'validation/masks',
    'testing/images',
    'testing/masks',
):
    create_folder(os.path.join(target_path_gray, sub_folder))

In [19]:
# create the training, validation, and testing folders for RGB images,
# each with an images and a masks subfolder
for sub_folder in (
    'training/images',
    'training/masks',
    'validation/images',
    'validation/masks',
    'testing/images',
    'testing/masks',
):
    create_folder(os.path.join(target_path_rgb, sub_folder))

### 4 Fill the folders with images and masks

##### 4.1 Read in dataframe

In [20]:
# load the dataset index (dataset name, image file, mask file, ID) into a dataframe;
# the ID column is kept as string because the IDs are later used as file names
df = pd.read_csv(
    'c:/Users/yileh/OneDrive/msc_data_science/master_thesis/dataset_indexing.csv',
    index_col=None,
    dtype={'ID': str},
)

In [21]:
# inspect the first few rows of the dataframe to verify that the expected columns
# (dataset_name, file_name_image, file_name_mask, ID) were loaded
df.head()

Unnamed: 0,dataset_name,file_name_image,file_name_mask,ID
0,CHASEDB1,Image_01L.jpg,Image_01L_1stHO.png,1
1,CHASEDB1,Image_01R.jpg,Image_01R_1stHO.png,2
2,CHASEDB1,Image_02L.jpg,Image_02L_1stHO.png,3
3,CHASEDB1,Image_02R.jpg,Image_02R_1stHO.png,4
4,CHASEDB1,Image_03L.jpg,Image_03L_1stHO.png,5


##### 4.2 Get all IDs and shuffle them

In [22]:
# fix the seed so that the shuffled order (and therefore the split) is reproducible
random.seed(42)

# collect every ID of the dataframe in a list and shuffle that list in place
image_ids = df['ID'].tolist()
random.shuffle(image_ids)

In [23]:
# check how many images and masks are in the dataset
total_size = len(image_ids)
print(f'The amount of total images and masks is: {total_size}')

# validation and testing each get 10 % of the data (rounded down)
val_size = int(0.1 * total_size)
test_size = int(0.1 * total_size)

# training gets the remainder (roughly 80 %), so the three sizes always sum up to the
# total amount regardless of the dataset size (no hard-coded +1 correction needed)
train_size = total_size - val_size - test_size

# check how many images and masks are in the training dataset
print(f'The amount of training images and masks is: {train_size}')
# check how many images and masks are in the validation dataset
print(f'The amount of validation images and masks is: {val_size}')
# check how many images and masks are in the testing dataset
print(f'The amount of testing images and masks is: {test_size}')

The amount of total images and masks is: 781
The amount of training images and masks is: 625
The amount of validation images and masks is: 78
The amount of testing images and masks is: 78


In [24]:
# verify that the three split sizes add up to the total amount of images and masks
sizes_add_up = (train_size + val_size + test_size) == total_size
print(
    'The sum of training, validation, and testing set is equal to the amount of total images and masks!'
    if sizes_add_up
    else 'The sum of training, validation, and testing set is not equal to the amount of total images and masks!'
)

The sum of training, validation, and testing set is equal to the amount of total images and masks!


##### 4.3 Add images and masks to the relevant folders after applying preprocessing

In [25]:
# cut the shuffled ID list into consecutive training, validation, and testing parts
val_end = train_size + val_size
train_ids, val_ids, test_ids = (
    image_ids[:train_size],
    image_ids[train_size:val_end],
    image_ids[val_end:],
)

In [26]:
# parameters used by the preprocessing

# target image size (in pixels) for resizing
height = width = 1024

# CLAHE (contrast-limited adaptive histogram equalization) object
clahe_value = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))

# exponent of the gamma correction
gamma_value = 1.2

# some images of the FIVES dataset show a white Chinese sign in the top-left corner;
# it is blacked out so that the contrast increases after normalization
# the region to be blacked is given by its start and end coordinates
x_start = 0
y_start = 0
x_end = 150
y_end = 150

In [27]:
# helper functions and main function to copy images and masks to the target path
def _gamma_correct(channel):
    '''Apply gamma correction (global gamma_value) to a single-channel 8-bit image and return it as uint8.'''
    return np.array(255 * (channel / 255) ** gamma_value, dtype='uint8')


def _preprocess_gray(image, blacken_corner):
    '''Convert a BGR image to grayscale, then apply min-max normalization, CLAHE, and gamma correction.'''
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    
    # the white sign in the top-left corner (value=255) would dominate the min-max
    # normalization, so it is blacked out first to increase the contrast afterwards
    if blacken_corner:
        gray[y_start:y_end, x_start:x_end] = 0
    
    gray = cv2.normalize(gray, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)
    gray = clahe_value.apply(gray)
    return _gamma_correct(gray)


def _preprocess_rgb(image):
    '''Apply min-max normalization, CLAHE, and gamma correction to the L channel of a BGR image (via LAB).'''
    # only the lightness channel is modified so that the colors are kept
    l_channel, a_channel, b_channel = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2LAB))
    l_channel = cv2.normalize(l_channel, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)
    l_channel = clahe_value.apply(l_channel)
    l_channel = _gamma_correct(l_channel)
    
    # merge the processed L channel with the original A and B channels and convert back to BGR
    return cv2.cvtColor(cv2.merge((l_channel, a_channel, b_channel)), cv2.COLOR_LAB2BGR)


def _write_or_raise(file_path, array):
    '''Write array to file_path with cv2.imwrite and raise an OSError if the write is reported as failed.'''
    if not cv2.imwrite(file_path, array):
        raise OSError(f'Could not write file: {file_path}')


def copy_images_and_masks_to_target_path(image_ids:list[str], 
                                         source_dataset_path:str, 
                                         target_dataset_path:str,
                                         split:str, 
                                         resize:bool=False, 
                                         width:int=1024, 
                                         height:int=1024,
                                         target_channel:str='RGB', 
                                         preprocessing:bool=False):
    '''
    Copy images and masks from the source dataset path to the target dataset path with optional resizing and preprocessing.
    
    For every ID the image is read from '<source_dataset_path>/<dataset_name>/images' and the mask from
    '<source_dataset_path>/<dataset_name>/masks'. Both are written as '<ID>.jpg' into the 'images' and 'masks'
    subfolders of '<target_dataset_path>/<split>'. The file names and the dataset name are looked up in the
    global dataframe df (columns 'ID', 'file_name_image', 'file_name_mask', 'dataset_name').
    
    Resizing is optional and can be specified with the width and height parameters. The default size is 1024x1024.
    
    If preprocessing is used, the global parameters clahe_value, gamma_value, and (for the FIVES dataset)
    x_start, y_start, x_end, y_end have to be defined before using the function.
    For target_channel='GRAY' the preprocessing has the following steps:
        1. Image graying
        2. Blacking of the top-left region for images of the FIVES dataset
        3. Image normalization (min=0, max=255)
        4. CLAHE (Contrast-Limited Adaptive Histogram Equalization)
        5. Gamma correction
    For target_channel='RGB' the image is not converted to grayscale; normalization, CLAHE, and gamma correction
    are applied to the L channel of the LAB color space only.
    Note that the graying is part of the preprocessing: with target_channel='GRAY' and preprocessing=False
    the image is stored without conversion (apart from the optional resizing).
    
    Args:
        image_ids (list[str]): List of unique image IDs which are copied into target folders.
        source_dataset_path (str): Path to the source dataset (used for images and masks).
        target_dataset_path (str): Path to the target dataset.
        split (str): Indicates the dataset split (either 'training', 'validation', 'testing').
        resize (bool): Whether to resize images and masks.
        width (int): Target width for resizing.
        height (int): Target height for resizing.
        target_channel (str): Which preprocessing variant is used (either 'RGB' or 'GRAY').
        preprocessing (bool): Whether to apply preprocessing steps.
    
    Returns:
        None
    
    Raises:
        ValueError: If split or target_channel has an invalid value.
        IndexError: If an ID is not contained in the dataframe.
        FileNotFoundError: If an image or a (non-gif) mask cannot be read.
        OSError: If an image or a mask cannot be written.
    '''
    
    # check if split parameter is valid
    if split not in ['training', 'validation', 'testing']:
        raise ValueError("Invalid value for 'split' parameter. Allowed values are 'training', 'validation', or 'testing'.")
    
    # check if target_channel parameter is valid
    if target_channel not in ['RGB', 'GRAY']:
        raise ValueError("Invalid value for 'target_channel' parameter. Allowed values are 'RGB' or 'GRAY'.")
    
    # copy images and masks to the corresponding folders
    for unique_id in tqdm(image_ids):
        
        # look up the dataframe row of this ID once (instead of filtering once per column)
        row = df[df['ID'] == unique_id].iloc[0]
        image_file = row['file_name_image']
        mask_file = row['file_name_mask']
        dataset_name = row['dataset_name']
        
        # for image
        # read image (cv2.imread returns None instead of raising if the file cannot be read)
        image_path = os.path.join(source_dataset_path, dataset_name, 'images', image_file)
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(f'Could not read image: {image_path}')
        
        # resizing part
        if resize:
            image = cv2.resize(image, (width, height))
        
        # preprocessing part (variant depends on the target channel)
        if preprocessing:
            if target_channel == 'GRAY':
                image = _preprocess_gray(image, blacken_corner=(dataset_name == 'FIVES'))
            else:
                image = _preprocess_rgb(image)
        
        # save in target directory as jpg
        _write_or_raise(os.path.join(target_dataset_path, split, 'images', f'{unique_id}.jpg'), image)
        
        # for mask
        # read mask from the source dataset path passed to the function
        mask_path = os.path.join(source_dataset_path, dataset_name, 'masks', mask_file)
        if mask_file.endswith('.gif'): # if mask is in gif format
            mask = imageio.mimread(mask_path)[0]
        else: 
            mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
            if mask is None:
                raise FileNotFoundError(f'Could not read mask: {mask_path}')
        
        # resizing part
        if resize:
            mask = cv2.resize(mask, (width, height))
        
        # save in target directory as jpg
        # NOTE(review): jpg is a lossy format - consider a lossless format if exact mask values matter downstream
        _write_or_raise(os.path.join(target_dataset_path, split, 'masks', f'{unique_id}.jpg'), mask)

In [28]:
# resize, preprocess, and store the training split as RGB images
copy_images_and_masks_to_target_path(
    image_ids=train_ids,
    source_dataset_path=dataset_path,
    target_dataset_path=target_path_rgb,
    split='training',
    resize=True,
    width=width,
    height=height,
    target_channel='RGB',
    preprocessing=True,
)

100%|██████████| 625/625 [02:20<00:00,  4.44it/s]


In [29]:
# resize, preprocess, and store the validation split as RGB images
copy_images_and_masks_to_target_path(
    image_ids=val_ids,
    source_dataset_path=dataset_path,
    target_dataset_path=target_path_rgb,
    split='validation',
    resize=True,
    width=width,
    height=height,
    target_channel='RGB',
    preprocessing=True,
)

100%|██████████| 78/78 [00:25<00:00,  3.01it/s]


In [30]:
# resize, preprocess, and store the testing split as RGB images
copy_images_and_masks_to_target_path(
    image_ids=test_ids,
    source_dataset_path=dataset_path,
    target_dataset_path=target_path_rgb,
    split='testing',
    resize=True,
    width=width,
    height=height,
    target_channel='RGB',
    preprocessing=True,
)

100%|██████████| 78/78 [00:25<00:00,  3.00it/s]


In [31]:
# resize, preprocess, and store the training split as gray images
copy_images_and_masks_to_target_path(
    image_ids=train_ids,
    source_dataset_path=dataset_path,
    target_dataset_path=target_path_gray,
    split='training',
    resize=True,
    width=width,
    height=height,
    target_channel='GRAY',
    preprocessing=True,
)

100%|██████████| 625/625 [03:37<00:00,  2.87it/s]


In [32]:
# resize, preprocess, and store the validation split as gray images
copy_images_and_masks_to_target_path(
    image_ids=val_ids,
    source_dataset_path=dataset_path,
    target_dataset_path=target_path_gray,
    split='validation',
    resize=True,
    width=width,
    height=height,
    target_channel='GRAY',
    preprocessing=True,
)

100%|██████████| 78/78 [00:11<00:00,  6.54it/s]


In [33]:
# resize, preprocess, and store the testing split as gray images
copy_images_and_masks_to_target_path(
    image_ids=test_ids,
    source_dataset_path=dataset_path,
    target_dataset_path=target_path_gray,
    split='testing',
    resize=True,
    width=width,
    height=height,
    target_channel='GRAY',
    preprocessing=True,
)

100%|██████████| 78/78 [00:11<00:00,  6.59it/s]
