## Data organisation example - EyePACS

In [1]:
import os
import shutil
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Count the raw EyePACS images available in the train and test source folders.
working_dir = os.getcwd()
train_image_dir = os.path.join(working_dir, "data/EyePACS/images/train_small/")
test_image_dir = os.path.join(working_dir, "data/EyePACS/images/test_small/")
print("Total eyepacs train images:", len(os.listdir(train_image_dir)))
print("Total eyepacs test images:", len(os.listdir(test_image_dir)))

Total eyepacs train images: 3513
Total eyepacs test images: 5358


## Load and preprocess the label tables

In [3]:
# Read the train and test label CSVs and keep only the image name and its grade.
working_dir = os.getcwd()
train_csv_path = os.path.join(working_dir, "data/EyePACS/train_small_Labels.csv")
test_csv_path = os.path.join(working_dir, "data/EyePACS/test_small_Labels.csv")
label_train = pd.read_csv(train_csv_path)
label_test = pd.read_csv(test_csv_path)

# Drop every column except the two used by the rest of the notebook.
kept_columns = ["image", "level"]
label_train = label_train.loc[:, kept_columns]
label_test = label_test.loc[:, kept_columns]

# Report the table sizes, then preview the first rows of each table.
for shape_caption, label_frame in (
    ("Train Data Shape:", label_train),
    ("Test Data Shape:", label_test),
):
    print(shape_caption, label_frame.shape)

for preview_caption, label_frame in (
    ("Display train labels:", label_train),
    ("Display test labels:", label_test),
):
    print(preview_caption)
    print(label_frame.head())

Train Data Shape: (3513, 2)
Test Data Shape: (5358, 2)
Display train labels:
         image  level
0    6221_left      0
1  32412_right      1
2  24863_right      0
3   18648_left      0
4  16862_right      0
Display test labels:
         image  level
0   18121_left      0
1  32524_right      2
2  18746_right      0
3   33129_left      0
4  15609_right      0


In [4]:
# List the distinct grade values that occur in each label table.
for caption, label_frame in (
    ("Train data unique values:", label_train),
    ("Test data unique values:", label_test),
):
    print(caption, label_frame["level"].unique())

Train data unique values: [0 1 2 3 4]
Test data unique values: [0 2 1 4 3]


In [5]:
# Hold out a stratified 20% of the labelled training images as a validation subset.
# Caution: `label_train` is rebound to the smaller training subset here, so running
# this cell again without reloading the labels would split that subset a second time.
# NOTE(review): image names look like "<id>_<left|right>"; an image-level split can
# place two images sharing an id in different subsets -- confirm this is acceptable.
label_train, label_val = train_test_split(
    label_train,
    test_size=0.2,
    stratify=label_train["level"],
    random_state=0,
)

# Row counts of the three subsets, in train / validation / test order.
split_sizes = [len(subset) for subset in (label_train, label_val, label_test)]
print("train-val-test size:", *split_sizes)

train-val-test size: 2810 703 5358


In [6]:
# Compare the share of each grade across the train, validation and test subsets.
# np.unique(..., return_counts=True) returns (values, counts); only the counts are kept.
label_train_counts = np.unique(label_train["level"], return_counts=True)[1]
print("Train label percentage:", label_train_counts / label_train_counts.sum())

label_val_counts = np.unique(label_val["level"], return_counts=True)[1]
print("Validation label percentage:", label_val_counts / label_val_counts.sum())

label_test_counts = np.unique(label_test["level"], return_counts=True)[1]
print("Test label percentage:", label_test_counts / label_test_counts.sum())

Train label percentage: [0.73451957 0.06975089 0.15053381 0.02491103 0.0202847 ]
Validation label percentage: [0.73541963 0.06970128 0.15078236 0.02418208 0.01991465]
Test label percentage: [0.73796193 0.07017544 0.14669653 0.02258305 0.02258305]


## Separate images to train folder

In [7]:
# Image names of the training subset, one Series per grade:
# level 0 -> noDR, levels 1-3 -> NPDR1..NPDR3, level 4 -> PDR.
noDR_train, NPDR1_train, NPDR2_train, NPDR3_train, PDR_train = (
    label_train.loc[label_train['level'].eq(grade), 'image'] for grade in range(5)
)

# Sanity check: the five groups together should account for every training row.
sum(map(len, (noDR_train, NPDR1_train, NPDR2_train, NPDR3_train, PDR_train)))

2810

In [8]:
# Copy every training image from the raw train_small folder into
# data/EyePACS/train/<class>/, where <class> is the folder for its grade.
# Each class folder is created first (a no-op when it already exists) because
# shutil.copy fails when the destination directory is missing.
train_src_dir = os.path.join(working_dir, 'data/EyePACS/images/train_small')
train_dst_root = os.path.join(working_dir, 'data/EyePACS/train')
train_images_by_class = {
    'noDR': noDR_train,
    'NPDR1': NPDR1_train,
    'NPDR2': NPDR2_train,
    'NPDR3': NPDR3_train,
    'PDR': PDR_train,
}

for class_name, image_names in train_images_by_class.items():
    class_dir = os.path.join(train_dst_root, class_name)
    os.makedirs(class_dir, exist_ok=True)
    for i in image_names:
        file_name = '{}.jpeg'.format(i)
        shutil.copy(os.path.join(train_src_dir, file_name), os.path.join(class_dir, file_name))


In [9]:
# Total number of files across the five class folders of the train set.
working_dir = os.getcwd()
train_class_dirs = (
    "data/EyePACS/train/noDR/",
    "data/EyePACS/train/NPDR1/",
    "data/EyePACS/train/NPDR2/",
    "data/EyePACS/train/NPDR3/",
    "data/EyePACS/train/PDR/",
)
train_file_total = sum(
    len(os.listdir(os.path.join(working_dir, class_dir))) for class_dir in train_class_dirs
)
print("Train set size:", train_file_total)

Train set size: 2810


In [10]:
# Number of files in each class folder of the train set.
working_dir = os.getcwd()
train_size_report = (
    ("Train set w/o DR size:", "data/EyePACS/train/noDR/"),
    ("Train set w NPDR1 size:", "data/EyePACS/train/NPDR1/"),
    ("Train set w NPDR2 size:", "data/EyePACS/train/NPDR2/"),
    ("Train set w NPDR3 size:", "data/EyePACS/train/NPDR3/"),
    ("Train set w PDR size:", "data/EyePACS/train/PDR/"),
)
for caption, class_dir in train_size_report:
    print(caption, len(os.listdir(os.path.join(working_dir, class_dir))))

Train set w/o DR size: 2064
Train set w NPDR1 size: 196
Train set w NPDR2 size: 423
Train set w NPDR3 size: 70
Train set w PDR size: 57


## Separate images to validation folder

In [11]:
# Image names of the validation subset, one Series per grade:
# level 0 -> noDR, levels 1-3 -> NPDR1..NPDR3, level 4 -> PDR.
noDR_val, NPDR1_val, NPDR2_val, NPDR3_val, PDR_val = (
    label_val.loc[label_val['level'].eq(grade), 'image'] for grade in range(5)
)

# Sanity check: the five groups together should account for every validation row.
sum(map(len, (noDR_val, NPDR1_val, NPDR2_val, NPDR3_val, PDR_val)))

703

In [12]:
# Copy every validation image into data/EyePACS/val/<class>/, where <class> is the
# folder for its grade. The files are read from the raw train_small folder because
# the validation rows were split off from the training labels.
# Each class folder is created first (a no-op when it already exists) because
# shutil.copy fails when the destination directory is missing.
val_src_dir = os.path.join(working_dir, 'data/EyePACS/images/train_small')
val_dst_root = os.path.join(working_dir, 'data/EyePACS/val')
val_images_by_class = {
    'noDR': noDR_val,
    'NPDR1': NPDR1_val,
    'NPDR2': NPDR2_val,
    'NPDR3': NPDR3_val,
    'PDR': PDR_val,
}

for class_name, image_names in val_images_by_class.items():
    class_dir = os.path.join(val_dst_root, class_name)
    os.makedirs(class_dir, exist_ok=True)
    for i in image_names:
        file_name = '{}.jpeg'.format(i)
        shutil.copy(os.path.join(val_src_dir, file_name), os.path.join(class_dir, file_name))


In [13]:
# Total number of files across the five class folders of the validation set.
working_dir = os.getcwd()
val_class_dirs = (
    "data/EyePACS/val/noDR/",
    "data/EyePACS/val/NPDR1/",
    "data/EyePACS/val/NPDR2/",
    "data/EyePACS/val/NPDR3/",
    "data/EyePACS/val/PDR/",
)
val_file_total = sum(
    len(os.listdir(os.path.join(working_dir, class_dir))) for class_dir in val_class_dirs
)
print("Validation set size:", val_file_total)

Validation set size: 703


In [14]:
# Number of files in each class folder of the validation set.
working_dir = os.getcwd()
val_size_report = (
    ("Validation set w/o DR size:", "data/EyePACS/val/noDR/"),
    ("Validation set w NPDR1 size:", "data/EyePACS/val/NPDR1/"),
    ("Validation set w NPDR2 size:", "data/EyePACS/val/NPDR2/"),
    ("Validation set w NPDR3 size:", "data/EyePACS/val/NPDR3/"),
    ("Validation set w PDR size:", "data/EyePACS/val/PDR/"),
)
for caption, class_dir in val_size_report:
    print(caption, len(os.listdir(os.path.join(working_dir, class_dir))))

Validation set w/o DR size: 517
Validation set w NPDR1 size: 49
Validation set w NPDR2 size: 106
Validation set w NPDR3 size: 17
Validation set w PDR size: 14


## Separate images to test folder

In [15]:
# Image names of the test set, one Series per grade:
# level 0 -> noDR, levels 1-3 -> NPDR1..NPDR3, level 4 -> PDR.
noDR_test, NPDR1_test, NPDR2_test, NPDR3_test, PDR_test = (
    label_test.loc[label_test['level'].eq(grade), 'image'] for grade in range(5)
)

# Sanity check: the five groups together should account for every test row.
sum(map(len, (noDR_test, NPDR1_test, NPDR2_test, NPDR3_test, PDR_test)))

5358

In [16]:
# Copy every test image from the raw test_small folder into
# data/EyePACS/test/<class>/, where <class> is the folder for its grade.
# Each class folder is created first (a no-op when it already exists) because
# shutil.copy fails when the destination directory is missing.
test_src_dir = os.path.join(working_dir, 'data/EyePACS/images/test_small')
test_dst_root = os.path.join(working_dir, 'data/EyePACS/test')
test_images_by_class = {
    'noDR': noDR_test,
    'NPDR1': NPDR1_test,
    'NPDR2': NPDR2_test,
    'NPDR3': NPDR3_test,
    'PDR': PDR_test,
}

for class_name, image_names in test_images_by_class.items():
    class_dir = os.path.join(test_dst_root, class_name)
    os.makedirs(class_dir, exist_ok=True)
    for i in image_names:
        file_name = '{}.jpeg'.format(i)
        shutil.copy(os.path.join(test_src_dir, file_name), os.path.join(class_dir, file_name))


In [17]:
# Total number of files across the five class folders of the test set.
working_dir = os.getcwd()
test_class_dirs = (
    "data/EyePACS/test/noDR/",
    "data/EyePACS/test/NPDR1/",
    "data/EyePACS/test/NPDR2/",
    "data/EyePACS/test/NPDR3/",
    "data/EyePACS/test/PDR/",
)
test_file_total = sum(
    len(os.listdir(os.path.join(working_dir, class_dir))) for class_dir in test_class_dirs
)
print("Test set size:", test_file_total)

Test set size: 5358


In [18]:
# Number of files in each class folder of the test set.
working_dir = os.getcwd()
test_size_report = (
    ("Test set w/o DR size:", "data/EyePACS/test/noDR/"),
    ("Test set w NPDR1 size:", "data/EyePACS/test/NPDR1/"),
    ("Test set w NPDR2 size:", "data/EyePACS/test/NPDR2/"),
    ("Test set w NPDR3 size:", "data/EyePACS/test/NPDR3/"),
    ("Test set w PDR size:", "data/EyePACS/test/PDR/"),
)
for caption, class_dir in test_size_report:
    print(caption, len(os.listdir(os.path.join(working_dir, class_dir))))

Test set w/o DR size: 3954
Test set w NPDR1 size: 376
Test set w NPDR2 size: 786
Test set w NPDR3 size: 121
Test set w PDR size: 121
