## Data organisation example - BRSET

In [1]:
import os
import shutil
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Preprocess label_df

In [2]:
# All data paths in this notebook are resolved against the directory the
# notebook is started from.
working_dir = os.getcwd()
label_df = pd.read_csv(os.path.join(working_dir, "data/BRSET/labels.csv"))
# Keep only the image identifier and the DR_ICDR label column. The explicit
# .copy() gives label_df its own data, so the in-place relabelling of DR_ICDR
# further down cannot trigger SettingWithCopyWarning or be applied to a
# temporary slice of the full table.
label_df = label_df[["image_id", "DR_ICDR"]].copy()
print("Shape:", label_df.shape)
label_df.head()

Shape: (16266, 2)


Unnamed: 0,image_id,DR_ICDR
0,img00001,0
1,img00002,0
2,img00003,0
3,img00004,0
4,img00005,0


In [3]:
# Collapse the DR_ICDR grades into three classes: 2 and 3 become 1, 4 becomes 2
# (0 and 1 are left as they are). Classes 0 / 1 / 2 are what the noDR / NPDR /
# PDR selections further down filter on.
#
# The untouched grades are kept in DR_ICDR_raw and the mapping is always
# computed from them, so running this cell a second time gives the same result
# instead of pushing the already remapped class 2 down to class 1.
if "DR_ICDR_raw" not in label_df.columns:
    label_df["DR_ICDR_raw"] = label_df["DR_ICDR"]
# A dict replace substitutes every key against the original values, so the
# 4 -> 2 result is not picked up again by the 2 -> 1 rule.
label_df["DR_ICDR"] = label_df["DR_ICDR_raw"].replace({2: 1, 3: 1, 4: 2})

print("Unique values:", pd.unique(label_df.DR_ICDR))

Unique values: [0 1 2]


In [4]:
# Load the predefined train/test assignment. The path is built from working_dir
# like the other data paths in this notebook, instead of an absolute path that
# only exists on one machine.
# NOTE(review): assumes the notebook is started from the repository root, as the
# labels.csv path above already does.
split_DR3class = pd.read_csv(
    os.path.join(working_dir, "data/BRSET/retfound_split_DR3class.csv")
).iloc[:, 1:]  # the first CSV column is dropped
train_indices = split_DR3class["index"][split_DR3class.split == "Train"]
test_indices = split_DR3class["index"][split_DR3class.split == "Test"]

# A row assigned to both splits would leak test data into training.
assert set(train_indices).isdisjoint(test_indices), "train and test rows overlap"

# The values in "index" are used as positional row numbers into label_df.
label_train = label_df.iloc[train_indices, :]
label_test = label_df.iloc[test_indices, :]

print("train-test size:",
      label_train.shape[0],
      label_test.shape[0])

train-test size: 11386 4880


In [5]:
# Carve the validation set out of the predefined test rows: 15% of them,
# stratified by class, with a fixed seed.
# The rows are taken from label_df via test_indices rather than from label_test
# itself, so re-running this cell reproduces the same split instead of
# splitting an already reduced label_test a second time.
label_holdout = label_df.iloc[test_indices, :]
label_test, label_val = train_test_split(
    label_holdout,
    stratify=label_holdout.DR_ICDR,
    test_size=0.15,
    random_state=42,
)

print("train-val-test size:",
      label_train.shape[0],
      label_val.shape[0],
      label_test.shape[0])

train-val-test size: 11386 732 4148


In [12]:
# NOTE: disabled alternative, kept for reference only - a purely random
# stratified split of label_df instead of the predefined split file. Nothing in
# this cell is executed.
# NOTE(review): the output recorded under this cell (10409 / 2603 / 3254) does
# not match test_size=0.3 on 16266 rows, so it presumably comes from an earlier
# run with different parameters - TODO confirm, or clear the stale output.
# label_train, label_test = train_test_split(label_df, stratify=label_df.DR_ICDR, test_size=0.3, random_state=0)
# label_train, label_val = train_test_split(label_train, stratify=label_train.DR_ICDR, test_size=0.2, random_state=0)

# print("train-val-test size:", 
#       label_train.shape[0],
#       label_val.shape[0], 
#       label_test.shape[0])

train-val-test size: 10409 2603 3254


In [6]:
# Class balance per split: count the rows of each DR_ICDR class, then print the
# counts divided by the split size.
brset_label_train_counts = np.unique(label_train.DR_ICDR, return_counts=True)[1]
brset_label_val_counts = np.unique(label_val.DR_ICDR, return_counts=True)[1]
brset_label_test_counts = np.unique(label_test.DR_ICDR, return_counts=True)[1]

for caption, class_counts in (
    ("Train label percentage:", brset_label_train_counts),
    ("Validation label percentage:", brset_label_val_counts),
    ("Test label percentage:", brset_label_test_counts),
):
    print(caption, class_counts / class_counts.sum())

Train label percentage: [0.9336905  0.04198138 0.02432812]
Validation label percentage: [0.93852459 0.03825137 0.02322404]
Test label percentage: [0.93828351 0.03760849 0.024108  ]


### Separate images to train folder

In [7]:
# Image ids of the training rows, one Series per class (0, 1 and 2).
train_class = label_train['DR_ICDR']
noDR_train = label_train.loc[train_class.eq(0), 'image_id']
NPDR_train = label_train.loc[train_class.eq(1), 'image_id']
PDR_train = label_train.loc[train_class.eq(2), 'image_id']
# Cell output: number of training rows covered by the three classes.
sum(map(len, (noDR_train, NPDR_train, PDR_train)))

11386

In [9]:
# Copy every training image into data/BRSET/train/<class>/.
# NOTE: shutil.copy only copies bytes - the targets get a .png name but still
# contain the original .jpg data.
train_image_dir = os.path.join(working_dir, 'data/BRSET/images')
for class_name, image_ids in (('noDR', noDR_train), ('NPDR', NPDR_train), ('PDR', PDR_train)):
    class_dir = os.path.join(working_dir, 'data/BRSET/train', class_name)
    # Create the target folder on demand; without it the first copy fails with
    # FileNotFoundError on a fresh checkout.
    os.makedirs(class_dir, exist_ok=True)
    for i in image_ids:
        shutil.copy(os.path.join(train_image_dir, '{}.jpg'.format(i)),
                    os.path.join(class_dir, '{}.png'.format(i)))


In [10]:
# Report how many entries each training class folder holds.
# os.listdir counts every entry in the folder, whatever its type.
working_dir = os.getcwd()
for caption, class_dir in (
    ("Train set w/o DR size:", "data/BRSET/train/noDR/"),
    ("Train set w NPDR size:", "data/BRSET/train/NPDR/"),
    ("Train set w PDR size:", "data/BRSET/train/PDR/"),
):
    entries = os.listdir(os.path.join(working_dir, class_dir))
    print(caption, len(entries))

Train set w/o DR size: 10631
Train set w NPDR size: 478
Train set w PDR size: 277


### Separate images to validation folder

In [11]:
# Image ids of the validation rows, one Series per class (0, 1 and 2).
val_class = label_val['DR_ICDR']
noDR_val = label_val.loc[val_class.eq(0), 'image_id']
NPDR_val = label_val.loc[val_class.eq(1), 'image_id']
PDR_val = label_val.loc[val_class.eq(2), 'image_id']
# Cell output: number of validation rows covered by the three classes.
sum(map(len, (noDR_val, NPDR_val, PDR_val)))

732

In [12]:
# Copy every validation image into data/BRSET/val/<class>/.
# NOTE: shutil.copy only copies bytes - the targets get a .png name but still
# contain the original .jpg data.
val_image_dir = os.path.join(working_dir, 'data/BRSET/images')
for class_name, image_ids in (('noDR', noDR_val), ('NPDR', NPDR_val), ('PDR', PDR_val)):
    class_dir = os.path.join(working_dir, 'data/BRSET/val', class_name)
    # Create the target folder on demand; without it the first copy fails with
    # FileNotFoundError on a fresh checkout.
    os.makedirs(class_dir, exist_ok=True)
    for i in image_ids:
        shutil.copy(os.path.join(val_image_dir, '{}.jpg'.format(i)),
                    os.path.join(class_dir, '{}.png'.format(i)))


In [13]:
# Report how many entries each validation class folder holds.
# os.listdir counts every entry in the folder, whatever its type.
working_dir = os.getcwd()
for caption, class_dir in (
    ("Validation set w/o DR size:", "data/BRSET/val/noDR/"),
    ("Validation set w NPDR size:", "data/BRSET/val/NPDR/"),
    ("Validation set w PDR size:", "data/BRSET/val/PDR/"),
):
    entries = os.listdir(os.path.join(working_dir, class_dir))
    print(caption, len(entries))

Validation set w/o DR size: 687
Validation set w NPDR size: 28
Validation set w PDR size: 17


### Separate images to test folder

In [14]:
# Image ids of the test rows, one Series per class (0, 1 and 2).
test_class = label_test['DR_ICDR']
noDR_test = label_test.loc[test_class.eq(0), 'image_id']
NPDR_test = label_test.loc[test_class.eq(1), 'image_id']
PDR_test = label_test.loc[test_class.eq(2), 'image_id']
# Cell output: number of test rows covered by the three classes.
sum(map(len, (noDR_test, NPDR_test, PDR_test)))

4148

In [15]:
# Copy every test image into data/BRSET/test/<class>/.
# NOTE: shutil.copy only copies bytes - the targets get a .png name but still
# contain the original .jpg data.
test_image_dir = os.path.join(working_dir, 'data/BRSET/images')
for class_name, image_ids in (('noDR', noDR_test), ('NPDR', NPDR_test), ('PDR', PDR_test)):
    class_dir = os.path.join(working_dir, 'data/BRSET/test', class_name)
    # Create the target folder on demand; without it the first copy fails with
    # FileNotFoundError on a fresh checkout.
    os.makedirs(class_dir, exist_ok=True)
    for i in image_ids:
        shutil.copy(os.path.join(test_image_dir, '{}.jpg'.format(i)),
                    os.path.join(class_dir, '{}.png'.format(i)))


In [16]:
# Report how many entries each test class folder holds.
# os.listdir counts every entry in the folder, whatever its type.
working_dir = os.getcwd()
for caption, class_dir in (
    ("Test set w/o DR size:", "data/BRSET/test/noDR/"),
    ("Test set w NPDR size:", "data/BRSET/test/NPDR/"),
    ("Test set w PDR size:", "data/BRSET/test/PDR/"),
):
    entries = os.listdir(os.path.join(working_dir, class_dir))
    print(caption, len(entries))

Test set w/o DR size: 3892
Test set w NPDR size: 156
Test set w PDR size: 100


In [17]:
# Total number of directory entries per split, summed over the three class
# folders; these should equal the split sizes printed earlier.
working_dir = os.getcwd()


def _n_entries(rel_dir):
    """Return how many entries the folder working_dir/rel_dir contains."""
    return len(os.listdir(os.path.join(working_dir, rel_dir)))


train_total = sum(map(_n_entries, (
    "data/BRSET/train/noDR/",
    "data/BRSET/train/NPDR/",
    "data/BRSET/train/PDR/",
)))
print("Train set size:", train_total)

val_total = sum(map(_n_entries, (
    "data/BRSET/val/noDR/",
    "data/BRSET/val/NPDR/",
    "data/BRSET/val/PDR/",
)))
print("Validation set size:", val_total)

test_total = sum(map(_n_entries, (
    "data/BRSET/test/noDR/",
    "data/BRSET/test/NPDR/",
    "data/BRSET/test/PDR/",
)))
print("Test set size:", test_total)

Train set size: 11386
Validation set size: 732
Test set size: 4148
