### 1 Import relevant libraries

In [1]:
# import relevant libraries
import os # for file handling
import glob # for file handling
import csv # for saving dataset information as csv
import pandas as pd # for data manipulation

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd # for data manipulation


### 2 Define relevant variables and paths

In [2]:
# running counter for the unique ID: starts at 1 and goes up by one for each
# pair of image and mask (it is formatted as a zero-padded string when stored)
index = 1

# empty list to store the following information of each image/mask pair:
# dataset name, file name image, file name mask, unique ID
data_list = []

# define project path
project_path = 'c:/Users/yileh/OneDrive/msc_data_science/master_thesis'

# define dataset path (derived from the project path so the location is defined only once;
# the forward slash keeps the resulting string identical to the previous hard-coded value)
dataset_path = f'{project_path}/datasets_raw'

### 3 Extract information of each dataset

##### 3.1 CHASEDB1 dataset

In [3]:
# collect the image and mask file paths of the first dataset: CHASEDB1
dataset_name = 'CHASEDB1'

# images (*.jpg), sorted by path
file_name_image = glob.glob(os.path.join(dataset_path, dataset_name, 'images', '*.jpg'))
file_name_image.sort()

# masks (*.png), sorted by path; the next cell pairs image i with mask i,
# which relies on both lists sorting into the same order
file_name_mask = glob.glob(os.path.join(dataset_path, dataset_name, 'masks', '*.png'))
file_name_mask.sort()

In [4]:
# add one record per image/mask pair to the list (converted to a dataframe later)

# images and masks are paired by their position in the two sorted lists, so a
# differing number of files would pair images with the wrong masks; fail early
if len(file_name_image) != len(file_name_mask):
    raise ValueError(f'{dataset_name}: found {len(file_name_image)} images '
                     f'but {len(file_name_mask)} masks')

for image_path, mask_path in zip(file_name_image, file_name_mask):
    # only the file names are stored, not the full paths
    data_list.append({'dataset_name': dataset_name,
                      'file_name_image': os.path.basename(image_path),
                      'file_name_mask': os.path.basename(mask_path),
                      'ID': f'{index:03d}'}) # unique ID as zero-padded string with at least 3 digits (e.g., 1 -> 001)
    # increase the index by 1 (unique ID)
    index += 1

##### 3.2 DRIVE dataset

In [5]:
# collect the image and mask file paths of the second dataset: DRIVE
dataset_name = 'DRIVE'

# images (*.tif), sorted by path
file_name_image = glob.glob(os.path.join(dataset_path, dataset_name, 'images', '*.tif'))
file_name_image.sort()

# masks (*.gif), sorted by path; the next cell pairs image i with mask i,
# which relies on both lists sorting into the same order
file_name_mask = glob.glob(os.path.join(dataset_path, dataset_name, 'masks', '*.gif'))
file_name_mask.sort()

In [6]:
# add one record per image/mask pair to the list (converted to a dataframe later)

# images and masks are paired by their position in the two sorted lists, so a
# differing number of files would pair images with the wrong masks; fail early
if len(file_name_image) != len(file_name_mask):
    raise ValueError(f'{dataset_name}: found {len(file_name_image)} images '
                     f'but {len(file_name_mask)} masks')

for image_path, mask_path in zip(file_name_image, file_name_mask):
    # only the file names are stored, not the full paths
    data_list.append({'dataset_name': dataset_name,
                      'file_name_image': os.path.basename(image_path),
                      'file_name_mask': os.path.basename(mask_path),
                      'ID': f'{index:03d}'}) # unique ID as zero-padded string with at least 3 digits (e.g., 1 -> 001)
    # increase the index by 1 (unique ID)
    index += 1

##### 3.3 FIVES dataset

In [7]:
# collect the image and mask file paths of the third dataset: FIVES
dataset_name = 'FIVES'

# images (*.png), sorted by path
file_name_image = glob.glob(os.path.join(dataset_path, dataset_name, 'images', '*.png'))
file_name_image.sort()

# masks (*.png), sorted by path; a later cell pairs image i with mask i,
# which relies on both lists sorting into the same order
file_name_mask = glob.glob(os.path.join(dataset_path, dataset_name, 'masks', '*.png'))
file_name_mask.sort()

Remove the images and masks that were identified as problematic in the notebook ***exploratory_data_analysis.ipynb***.

In [8]:
# load the flagged file names from the text file (one name per line, whitespace stripped)
with open(os.path.join(project_path, 'images_to_remove.txt'), 'r') as f:
    images_to_remove = [line.strip() for line in f]

# drop every flagged image together with its mask from the two lists
for flagged_name in images_to_remove:
    flagged_image = os.path.join(dataset_path, dataset_name, 'images', flagged_name)
    # names that are not part of the current image list are skipped
    if flagged_image not in file_name_image:
        continue
    file_name_image.remove(flagged_image)
    # the mask is looked up under the same file name as the image
    file_name_mask.remove(os.path.join(dataset_path, dataset_name, 'masks', flagged_name))

In [9]:
# check that image and mask file names were removed: the message reports a single
# count for both lists, so first make sure the two lists really have the same length
# (images and masks are paired by position in the next cell)
assert len(file_name_image) == len(file_name_mask), (
    f'{len(file_name_image)} images but {len(file_name_mask)} masks remain after removal')
print(f'Number of images and masks after removing images and masks with issues: {len(file_name_image)}')

Number of images and masks after removing images and masks with issues: 668


In [10]:
# add one record per image/mask pair to the list (converted to a dataframe later)

# images and masks are paired by their position in the two sorted lists, so a
# differing number of files would pair images with the wrong masks; fail early
if len(file_name_image) != len(file_name_mask):
    raise ValueError(f'{dataset_name}: found {len(file_name_image)} images '
                     f'but {len(file_name_mask)} masks')

for image_path, mask_path in zip(file_name_image, file_name_mask):
    # only the file names are stored, not the full paths
    data_list.append({'dataset_name': dataset_name,
                      'file_name_image': os.path.basename(image_path),
                      'file_name_mask': os.path.basename(mask_path),
                      'ID': f'{index:03d}'}) # unique ID as zero-padded string with at least 3 digits (e.g., 1 -> 001)
    # increase the index by 1 (unique ID)
    index += 1

##### 3.4 HRF dataset

In [11]:
# collect the image and mask file paths of the fourth dataset: HRF
dataset_name = 'HRF'

# images (*.jpg), sorted by path
file_name_image = glob.glob(os.path.join(dataset_path, dataset_name, 'images', '*.jpg'))
file_name_image.sort()

# masks (*.tif), sorted by path; the next cell pairs image i with mask i,
# which relies on both lists sorting into the same order
file_name_mask = glob.glob(os.path.join(dataset_path, dataset_name, 'masks', '*.tif'))
file_name_mask.sort()

In [12]:
# add one record per image/mask pair to the list (converted to a dataframe later)

# images and masks are paired by their position in the two sorted lists, so a
# differing number of files would pair images with the wrong masks; fail early
if len(file_name_image) != len(file_name_mask):
    raise ValueError(f'{dataset_name}: found {len(file_name_image)} images '
                     f'but {len(file_name_mask)} masks')

for image_path, mask_path in zip(file_name_image, file_name_mask):
    # only the file names are stored, not the full paths
    data_list.append({'dataset_name': dataset_name,
                      'file_name_image': os.path.basename(image_path),
                      'file_name_mask': os.path.basename(mask_path),
                      'ID': f'{index:03d}'}) # unique ID as zero-padded string with at least 3 digits (e.g., 1 -> 001)
    # increase the index by 1 (unique ID)
    index += 1

### 4 Convert to pandas dataframe and save as csv

In [13]:
# convert the collected records to a pandas dataframe; the columns are listed
# explicitly so the frame (and the csv written from it) keeps the expected header
# and column order even when no records were collected
df = pd.DataFrame(data_list, columns=['dataset_name', 'file_name_image', 'file_name_mask', 'ID'])

In [14]:
# print the first five rows as a quick sanity check of the collected records
print(df.head(n=5))

  dataset_name file_name_image       file_name_mask   ID
0     CHASEDB1   Image_01L.jpg  Image_01L_1stHO.png  001
1     CHASEDB1   Image_01R.jpg  Image_01R_1stHO.png  002
2     CHASEDB1   Image_02L.jpg  Image_02L_1stHO.png  003
3     CHASEDB1   Image_02R.jpg  Image_02R_1stHO.png  004
4     CHASEDB1   Image_03L.jpg  Image_03L_1stHO.png  005


In [16]:
# save the dataframe to a csv file in the project folder (the row index is not written);
# the location is built from project_path instead of repeating the absolute path
df.to_csv(os.path.join(project_path, 'dataset_indexing.csv'), index=False)