In [1]:
import os
import shutil
import sys

import numpy as np
import pandas as pd

import PIL
from PIL import Image
import cv2

# import pytorch
import torch

from tqdm import tqdm
# enable tqdm for pandas
tqdm.pandas()

In [215]:
# Read the label table of the VietAI retinal-disease training set and preview
# its first rows.
vietai_labels_path = '../data/vietai-advance-retinal-disease-detection-2020/train.csv'
train_csv = pd.read_csv(vietai_labels_path)
train_csv.head()

Unnamed: 0,filename,opacity,diabetic retinopathy,glaucoma,macular edema,macular degeneration,retinal vascular occlusion,normal
0,c24a1b14d253.jpg,0,0,0,0,0,1,0
1,9ee905a41651.jpg,0,0,0,0,0,1,0
2,3f58d128caf6.jpg,0,0,1,0,0,0,0
3,4ce6599e7b20.jpg,1,0,0,0,1,0,0
4,0def470360e4.jpg,1,0,0,0,1,0,0


In [216]:
# Keep only the rows that are 0 for both excluded conditions, then drop those
# two label columns from the table.
excluded_conditions = ['macular edema', 'retinal vascular occlusion']
is_negative_for_all = train_csv[excluded_conditions].eq(0).all(axis=1)
train_csv = train_csv.loc[is_negative_for_all].drop(columns=excluded_conditions)
train_csv.head()

Unnamed: 0,filename,opacity,diabetic retinopathy,glaucoma,macular degeneration,normal
2,3f58d128caf6.jpg,0,0,1,0,0
3,4ce6599e7b20.jpg,1,0,0,1,0
4,0def470360e4.jpg,1,0,0,1,0
12,802b4bfabd52.jpg,0,0,0,1,0
13,7bcfab1fa2da.jpg,1,0,0,0,0


In [217]:
# Drop the opacity label and make the image filename the row index.
train_csv = train_csv.drop(columns='opacity').set_index('filename')

In [218]:
# Keep only the rows whose label columns sum to exactly 1. Rows with more than
# one positive label are removed, and so are rows with no positive label.
positives_per_row = train_csv.sum(axis=1)
train_csv = train_csv[positives_per_row == 1]
# (rows, columns) of the filtered table
train_csv.shape

(1901, 4)

In [219]:
# Preview the first rows of the filtered label table.
train_csv.head()

Unnamed: 0_level_0,diabetic retinopathy,glaucoma,macular degeneration,normal
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3f58d128caf6.jpg,0,1,0,0
4ce6599e7b20.jpg,0,0,1,0
0def470360e4.jpg,0,0,1,0
802b4bfabd52.jpg,0,0,1,0
597b3e48c937.jpg,0,0,1,0


In [220]:
# Shorten the label column names to one-letter codes:
# diabetic retinopathy -> D, glaucoma -> G, macular degeneration -> A, normal -> N.
short_label_names = {
    'diabetic retinopathy': 'D',
    'glaucoma': 'G',
    'macular degeneration': 'A',
    'normal': 'N',
}
train_csv = train_csv.rename(columns=short_label_names)

In [221]:
# Copy every image named in train_csv's index from the VietAI training folder
# into ../data/final/, with a tqdm progress bar.
#
# shutil.copy replaces the shell `copy` command: that command only exists on
# Windows, the filename was interpolated unquoted into a shell string, and a
# failed copy was ignored without any trace.
vietai_image_dir = os.path.join('..', 'data', 'vietai-advance-retinal-disease-detection-2020', 'train', 'train')
final_image_dir = os.path.join('..', 'data', 'final')
os.makedirs(final_image_dir, exist_ok=True)  # the destination folder may not exist yet

missing_files = []
for filename in tqdm(train_csv.index):
    # copy the file
    try:
        shutil.copy(os.path.join(vietai_image_dir, filename), os.path.join(final_image_dir, filename))
    except FileNotFoundError:
        # Keep going, as the shell-based loop did, but remember what was skipped.
        missing_files.append(filename)

if missing_files:
    print(f'{len(missing_files)} listed file(s) were not found in {vietai_image_dir}')

100%|██████████| 1901/1901 [00:50<00:00, 37.95it/s]


In [222]:
# Read the train and test csv tables stored in the FIVES dataset folder.
fives_root = '../data/FIVES A Fundus Image Dataset for AI-based Vessel Segmentation'
fives_train_csv = pd.read_csv(f'{fives_root}/train.csv')
fives_test_csv = pd.read_csv(f'{fives_root}/test.csv')

In [223]:
# (rows, columns) of the FIVES train table.
fives_train_csv.shape

(600, 6)

In [224]:
# Keep only the rows whose IC, Blur and LC columns all equal 1, then drop
# those three columns and show the remaining shape.
quality_flags = ['IC', 'Blur', 'LC']
passes_all_flags = fives_train_csv[quality_flags].eq(1).all(axis=1)
fives_train_csv = fives_train_csv.loc[passes_all_flags].drop(columns=quality_flags)
fives_train_csv.shape

(459, 3)

In [225]:
# (rows, columns) of the FIVES test table.
fives_test_csv.shape

(200, 5)

In [226]:
# Apply the same filter to the test table: keep the rows whose IC, Blur and LC
# columns all equal 1, then drop those three columns.
quality_flags = ['IC', 'Blur', 'LC']
passes_all_flags = fives_test_csv[quality_flags].eq(1).all(axis=1)
fives_test_csv = fives_test_csv.loc[passes_all_flags].drop(columns=quality_flags)
fives_test_csv.shape

(133, 2)

In [227]:
# Discard the 'Unnamed: 5' column from the train table and preview the result.
fives_train_csv = fives_train_csv.drop('Unnamed: 5', axis=1)
fives_train_csv.head()

Unnamed: 0,Disease,Number
0,A,1
1,A,2
2,A,3
3,A,4
4,A,5


In [228]:
# Preview the first rows of the FIVES test table.
fives_test_csv.head()

Unnamed: 0,Disease,Number
0,A,1
1,A,2
2,A,3
3,A,4
4,A,5


In [229]:
# Add a 'filename' column to each table, built from the row's Number and
# Disease values: 'train_{Number}_{Disease}.jpg' for the train table and
# 'test_{Number}_{Disease}.jpg' for the test table.
fives_train_csv['filename'] = [
    f'train_{number}_{disease}.jpg'
    for number, disease in zip(fives_train_csv['Number'], fives_train_csv['Disease'])
]
fives_test_csv['filename'] = [
    f'test_{number}_{disease}.jpg'
    for number, disease in zip(fives_test_csv['Number'], fives_test_csv['Disease'])
]

In [230]:
# Reshape both tables to wide format: one row per filename and one column per
# Disease value. A cell holds the row's Disease value in its own column and 0
# (the fill value) in every other column.
wide_layout = dict(index='filename', columns='Disease', values='Disease')
fives_train_csv = fives_train_csv.pivot(**wide_layout).fillna(0)
fives_test_csv = fives_test_csv.pivot(**wide_layout).fillna(0)


In [231]:
# Binarise the wide tables: every cell that is not 0 becomes 1 and every 0
# stays 0.
# A vectorised comparison is used instead of DataFrame.applymap, which is
# deprecated in recent pandas releases and emits a FutureWarning.
fives_train_csv = fives_train_csv.ne(0).astype('int64')
fives_test_csv = fives_test_csv.ne(0).astype('int64')

  fives_train_csv = fives_train_csv.applymap(lambda x: 1 if x != 0 else 0)
  fives_test_csv = fives_test_csv.applymap(lambda x: 1 if x != 0 else 0)


In [232]:
# Append the FIVES test table below the train table and preview the result.
fives_csv = pd.concat((fives_train_csv, fives_test_csv))
fives_csv.head()

Disease,A,D,G,N
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
train_100_A.jpg,1,0,0,0
train_101_A.jpg,1,0,0,0
train_103_A.jpg,1,0,0,0
train_104_A.jpg,1,0,0,0
train_105_A.jpg,1,0,0,0


In [233]:
# # print fives_csv index
# fives_csv.index

In [234]:
# Function to calculate black pixel proportion in an image
def calculate_black_proportion(image_path, threshold=5):
    """Return the fraction of an image's pixels that are darker than ``threshold``.

    The image is loaded as a single-channel grayscale array, and a pixel counts
    as black when its value is strictly below ``threshold``.

    Args:
        image_path: Path of the image file to read.
        threshold: Grayscale value below which a pixel counts as black.

    Returns:
        Number of black pixels divided by the total number of pixels.

    Raises:
        ValueError: If the file cannot be read as an image.
    """
    # Read the image
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # cv2.imread signals failure by returning None rather than raising; fail
    # here with the offending path instead of an AttributeError further down.
    if img is None:
        raise ValueError(f'Could not read image: {image_path!r}')

    # Count black pixels (below the threshold) and divide by the pixel count
    return np.count_nonzero(img < threshold) / img.size

# reference image path
reference_image_path = '../data/final/f21977036e67.jpg'

# Calculate the black pixel proportion for the reference image
reference_black_proportion = calculate_black_proportion(reference_image_path)

# tolerance level: an image may have up to 10% more black than the reference
tolerance = 0.10
max_black_proportion = reference_black_proportion * (1 + tolerance)

# Folder containing the images
image_dir = '../data/final/'

# Collect one flag per image (0.0 = more black than allowed, 1.0 = otherwise)
# and attach them as a column in a single assignment.
# Files that are not in train_csv's index are skipped: writing them with
# `.loc[filename, 'Complete'] = ...` would append a new row whose label columns
# are all NaN.
complete_flags = {}
for filename in os.listdir(image_dir):
    if filename not in train_csv.index:
        continue

    # Calculate black proportion for the current image
    black_proportion = calculate_black_proportion(os.path.join(image_dir, filename))

    # Compare with reference image
    complete_flags[filename] = 0.0 if black_proportion > max_black_proportion else 1.0

# Aligned on the filename index; rows without a file in image_dir get NaN.
train_csv['Complete'] = pd.Series(complete_flags, dtype='float64')

In [235]:
# Count how many rows carry each value of the 'Complete' column.
train_csv['Complete'].value_counts()

Complete
1.0    1596
0.0     305
Name: count, dtype: int64

In [236]:
# Preview the first rows whose 'Complete' value is 1.
train_csv[train_csv['Complete'] == 1].head()

Unnamed: 0_level_0,D,G,A,N,Complete
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3f58d128caf6.jpg,0,1,0,0,1.0
4ce6599e7b20.jpg,0,0,1,0,1.0
0def470360e4.jpg,0,0,1,0,1.0
802b4bfabd52.jpg,0,0,1,0,1.0
597b3e48c937.jpg,0,0,1,0,1.0


In [237]:
# Keep the rows whose 'Complete' value is 1, then discard that helper column.
train_csv = train_csv.loc[train_csv['Complete'].eq(1)].drop(columns='Complete')

# Delete every file in image_dir whose name is not a row label of the table.
for filename in os.listdir(image_dir):
    if filename in train_csv.index:
        continue
    os.remove(os.path.join(image_dir, filename))

In [238]:
# Show the table's (rows, columns) next to the number of files in image_dir.
train_csv.shape, len(os.listdir(image_dir))

((1596, 4), 1596)

In [239]:
# For every image in the FIVES train\Original\ folder, add 'train_' to the
# beginning of the filename; for every image in test\Original\, add 'test_'.
#
# The rename used to have identical source and destination paths, so no prefix
# was ever added. Files that already carry the prefix are skipped, which makes
# the cell safe to re-run.
fives_original_dirs = (
    ('train_', '../data/FIVES A Fundus Image Dataset for AI-based Vessel Segmentation/train/Original/'),
    ('test_', '../data/FIVES A Fundus Image Dataset for AI-based Vessel Segmentation/test/Original/'),
)

for prefix, folder in fives_original_dirs:
    for filename in os.listdir(folder):
        if filename.startswith(prefix):
            continue  # already renamed by an earlier run
        os.rename(os.path.join(folder, filename), os.path.join(folder, prefix + filename))

In [240]:
# Load the reference image (in grayscale) to create the mask
reference_image_path = '../data/final/f21977036e67.jpg'
reference_img = cv2.imread(reference_image_path, cv2.IMREAD_GRAYSCALE)

# cv2.imread returns None instead of raising when the file cannot be read.
if reference_img is None:
    raise ValueError(f'Could not read reference image: {reference_image_path!r}')

# Binary mask from the reference image: pixels with value > 5 become 255,
# all other pixels become 0.
_, reference_mask = cv2.threshold(reference_img, 5, 255, cv2.THRESH_BINARY)

# Function to apply the reference mask on color images
def apply_reference_mask_color(image_path, output_path, mask):
    """Mask a colour image with ``mask`` and write the result to ``output_path``.

    The image is resized to the mask's height and width when the two differ
    (no aspect-ratio handling is done), then AND-ed bitwise with a 3-channel
    copy of the mask.

    Args:
        image_path: Path of the colour image to read.
        output_path: Path the masked image is written to.
        mask: Single-channel mask array; it is stacked three times to match the
            colour image. Presumably uint8 with values 0/255 — confirm against
            the caller.

    Raises:
        ValueError: If ``image_path`` cannot be read as an image.
        OSError: If the masked image cannot be written to ``output_path``.
    """
    # Load the new image in color
    img = cv2.imread(image_path)

    # cv2.imread returns None instead of raising when the file cannot be read.
    if img is None:
        raise ValueError(f'Could not read image: {image_path!r}')

    # Ensure the new image has the same size as the reference image/mask
    mask_height, mask_width = mask.shape[:2]
    if img.shape[:2] != (mask_height, mask_width):
        # cv2.resize takes the target size as (width, height)
        img = cv2.resize(img, (mask_width, mask_height))

    # 3-channel version of the mask for the color image
    mask_3channel = cv2.merge([mask, mask, mask])

    # Apply the mask
    masked_img = cv2.bitwise_and(img, mask_3channel)

    # cv2.imwrite reports failure through its return value, not an exception.
    if not cv2.imwrite(output_path, masked_img):
        raise OSError(f'Could not write masked image: {output_path!r}')

In [241]:
# Folders used when applying the mask to the colour images in the dataset:
# the FIVES train and test source folders, and the folder the masked copies
# are written to.
fives_image_root = '../data/FIVES A Fundus Image Dataset for AI-based Vessel Segmentation'
image_dir_train = f'{fives_image_root}/train/Original/'
image_dir_test = f'{fives_image_root}/test/Original/'
output_dir = '../data/final/'

In [242]:
# Mask every file in the FIVES train folder with the reference mask and write
# the result to output_dir, replacing '.png' with '.jpg' in the filename.
for source_name in os.listdir(image_dir_train):
    target_name = source_name.replace('.png', '.jpg')
    apply_reference_mask_color(
        os.path.join(image_dir_train, source_name),
        os.path.join(output_dir, target_name),
        reference_mask,
    )

In [243]:
# Mask every file in the FIVES test folder with the reference mask and write
# the result to output_dir, replacing '.png' with '.jpg' in the filename.
for source_name in os.listdir(image_dir_test):
    target_name = source_name.replace('.png', '.jpg')
    apply_reference_mask_color(
        os.path.join(image_dir_test, source_name),
        os.path.join(output_dir, target_name),
        reference_mask,
    )

In [244]:
# Preview the first rows of the combined FIVES label table.
fives_csv.head()

Disease,A,D,G,N
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
train_100_A.jpg,1,0,0,0
train_101_A.jpg,1,0,0,0
train_103_A.jpg,1,0,0,0
train_104_A.jpg,1,0,0,0
train_105_A.jpg,1,0,0,0


In [245]:
# Put train_csv's columns in the same order as the columns of fives_csv.
column_order = list(fives_csv.columns)
train_csv = train_csv[column_order]

In [246]:
# Add a 'label' column whose value is the list [A, D, G, N] for each row.
# The lists are built from the column values in one step, and .tolist() turns
# the values into plain Python numbers. A row-wise apply stored NumPy scalars
# instead, whose repr under NumPy 2 (e.g. `np.int64(1)`) ends up in the saved
# CSV.
label_columns = ['A', 'D', 'G', 'N']
train_csv['label'] = pd.Series(
    train_csv[label_columns].to_numpy().tolist(), index=train_csv.index, dtype=object
)
fives_csv['label'] = pd.Series(
    fives_csv[label_columns].to_numpy().tolist(), index=fives_csv.index, dtype=object
)

In [247]:
# Preview the first rows of train_csv.
train_csv.head()

Unnamed: 0_level_0,A,D,G,N,label
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3f58d128caf6.jpg,0,0,1,0,"[0, 0, 1, 0]"
4ce6599e7b20.jpg,1,0,0,0,"[1, 0, 0, 0]"
0def470360e4.jpg,1,0,0,0,"[1, 0, 0, 0]"
802b4bfabd52.jpg,1,0,0,0,"[1, 0, 0, 0]"
597b3e48c937.jpg,1,0,0,0,"[1, 0, 0, 0]"


In [248]:
# For each table: drop the A, D, G, N columns, copy the index into a
# 'filename' column, and replace the index with a fresh 0..n-1 range.
one_hot_columns = ['A', 'D', 'G', 'N']
train_csv = (
    train_csv
    .drop(columns=one_hot_columns)
    .assign(filename=lambda frame: frame.index)
    .reset_index(drop=True)
)
fives_csv = (
    fives_csv
    .drop(columns=one_hot_columns)
    .assign(filename=lambda frame: frame.index)
    .reset_index(drop=True)
)
# Stack the two tables row-wise, renumbering the rows from 0.
final_csv = pd.concat([train_csv, fives_csv], ignore_index=True)

In [249]:
# Arrange the columns as filename first, label second.
final_csv = final_csv.loc[:, ['filename', 'label']]

In [250]:
# Preview the first rows of final_csv.
final_csv.head()

Unnamed: 0,filename,label
0,3f58d128caf6.jpg,"[0, 0, 1, 0]"
1,4ce6599e7b20.jpg,"[1, 0, 0, 0]"
2,0def470360e4.jpg,"[1, 0, 0, 0]"
3,802b4bfabd52.jpg,"[1, 0, 0, 0]"
4,597b3e48c937.jpg,"[1, 0, 0, 0]"


In [256]:
# Number of files in output_dir before the clean-up
print(len(os.listdir(output_dir)))

# Build the lookup once: testing membership against the column's array would
# rescan every filename for each file in the folder.
labelled_filenames = set(final_csv['filename'])

counter = 0
# Among the files in output_dir whose names start with 'train_' or 'test_',
# delete those that are not listed in final_csv['filename'].
for filename in os.listdir(output_dir):
    if filename.startswith(('train_', 'test_')) and filename not in labelled_filenames:
        os.remove(os.path.join(output_dir, filename))
        counter += 1

# Number of files deleted
print(counter)
# Number of files left in output_dir
print(len(os.listdir(output_dir)))

2396
208
2188


In [257]:
# Keep only the rows whose file exists in ../data/final/.
# One existence check per row and a single filter, instead of re-filtering the
# whole table once for every missing file.
# .astype(bool) keeps the mask boolean even when the table is empty.
file_exists = final_csv['filename'].map(
    lambda name: os.path.exists(f'../data/final/{name}')
).astype(bool)
final_csv = final_csv[file_exists]

In [258]:
# Convert the table into a list of per-row dicts ({'filename': ..., 'label': ...}).
final_csv_dict = final_csv.to_dict('records')

In [259]:
# Map each filename to its label list.
# The mapping is built straight from final_csv's columns rather than from the
# previous value of final_csv_dict, so re-running this cell gives the same
# result instead of failing on an already-converted dict.
final_csv_dict = dict(zip(final_csv['filename'], final_csv['label']))
final_csv_dict

{'3f58d128caf6.jpg': [0, 0, 1, 0],
 '4ce6599e7b20.jpg': [1, 0, 0, 0],
 '0def470360e4.jpg': [1, 0, 0, 0],
 '802b4bfabd52.jpg': [1, 0, 0, 0],
 '597b3e48c937.jpg': [1, 0, 0, 0],
 '4bf3e70ef40b.jpg': [1, 0, 0, 0],
 'b2dfb1372f52.jpg': [0, 1, 0, 0],
 'b1f2d17c07f0.jpg': [0, 1, 0, 0],
 '0ea8bcd5d303.jpg': [1, 0, 0, 0],
 'cd012eeaa6d6.jpg': [0, 1, 0, 0],
 '84889f768508.jpg': [0, 1, 0, 0],
 '1decb26d1151.jpg': [0, 1, 0, 0],
 '5e23d25c2097.jpg': [0, 1, 0, 0],
 '793ddb427eec.jpg': [0, 1, 0, 0],
 '68e7462b870f.jpg': [0, 1, 0, 0],
 'f296dbe7fd0a.jpg': [0, 1, 0, 0],
 'cb347296b274.jpg': [0, 1, 0, 0],
 'afd3afb6fd29.jpg': [0, 0, 1, 0],
 '0b2ddf34a500.jpg': [0, 0, 1, 0],
 '632cbfdeb992.jpg': [0, 0, 1, 0],
 'a6c6487951d4.jpg': [0, 1, 0, 0],
 'eb8850b40d1d.jpg': [0, 1, 0, 0],
 'b8e83a405b7b.jpg': [0, 0, 1, 0],
 'ac40a08f5ee4.jpg': [0, 0, 1, 0],
 '8ccb32515935.jpg': [0, 1, 0, 0],
 'e66a1c4f9490.jpg': [0, 1, 0, 0],
 '94dfc338828b.jpg': [1, 0, 0, 0],
 '6a7cf7cd8eb0.jpg': [0, 1, 0, 0],
 'f0290980da7c.jpg':

In [260]:
# Number of entries in the filename -> label mapping.
len(final_csv_dict)

2188

In [261]:
# Write the table to ..\data\labels.csv (without the index column) and the
# filename -> label mapping to a .pt file next to it.
labels_csv_path = '../data/labels.csv'
labels_pt_path = '../data/labels.pt'
final_csv.to_csv(labels_csv_path, index=False)
torch.save(final_csv_dict, labels_pt_path)

In [2]:
# Load the filename -> label mapping written by torch.save.
# weights_only is passed explicitly because its default differs between torch
# releases (and triggers a FutureWarning when omitted). False keeps full pickle
# loading, which the stored dict may need if it holds NumPy scalars.
# NOTE(review): unpickling can execute arbitrary code — only load files you
# produced yourself.
labels = torch.load('../data/labels.pt', weights_only=False)

  labels = torch.load('../data/labels.pt')


In [3]:
# Element-wise total of all label vectors (one running sum per position).
sum_labels = np.zeros(4)
for label_vector in labels.values():
    sum_labels = sum_labels + np.asarray(label_vector)
sum_labels

array([682., 542., 566., 398.])

In [7]:
# Using the .npy index arrays in ../splits/, sum the label vectors that fall in
# the training, validation and test sets. An entry's position in `labels`
# (its enumeration order) is what gets matched against the indices.
train_indices = np.load('../splits/train_indices.npy')
val_indices = np.load('../splits/val_indices.npy')
test_indices = np.load('../splits/test_indices.npy')

print(train_indices)

# Sets give constant-time membership tests; `n in ndarray` scans the whole
# array for every key.
train_index_set = set(train_indices.ravel().tolist())
val_index_set = set(val_indices.ravel().tolist())
test_index_set = set(test_indices.ravel().tolist())

train_labels = np.zeros(4)
val_labels = np.zeros(4)
test_labels = np.zeros(4)

# A position is credited to the first split that contains it (train, then
# validation, then test); positions in no split are ignored.
for n, label_vector in enumerate(labels.values()):
    if n in train_index_set:
        train_labels += label_vector
    elif n in val_index_set:
        val_labels += label_vector
    elif n in test_index_set:
        test_labels += label_vector

train_labels, val_labels, test_labels

[1274 2064 1117 ... 1848 1283  936]


(array([477., 379., 396., 279.]),
 array([137., 109., 113.,  79.]),
 array([68., 54., 57., 40.]))

In [2]:
# # resize all images in ..\data\final\ to 256x256
# for filename in os.listdir('../data/final/'):
#     if filename.endswith('.jpg'):
#         img = Image.open(f'../data/final/{filename}')
#         img = img.resize((256, 256))
#         img.save(f'../data/final/{filename}')