## Data organisation example - Messidor2

In [10]:
import os
import shutil
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [11]:
# Report how many entries the raw Messidor-2 image folder contains.
working_dir = os.getcwd()
raw_image_dir = os.path.join(working_dir, "data/messidor2/IMAGES/")
n_raw_images = len(os.listdir(raw_image_dir))
print("Total messidor2 images:", n_raw_images)

Total messidor2 images: 1748


## Preprocess label_df

In [12]:
# Read the Messidor-2 label table and keep only the two columns used below:
# the image file name and the adjudicated DR grade.
working_dir = os.getcwd()
labels_csv_path = os.path.join(working_dir, "data/messidor2/messidor_data.csv")
label_df = pd.read_csv(labels_csv_path).loc[:, ["image_id", "adjudicated_dr_grade"]]
print("Shape:", label_df.shape)
label_df.head()

Shape: (1748, 2)


Unnamed: 0,image_id,adjudicated_dr_grade
0,20051020_43808_0100_PP.png,0.0
1,20051020_43832_0100_PP.png,1.0
2,20051020_43882_0100_PP.png,1.0
3,20051020_43906_0100_PP.png,2.0
4,20051020_44261_0100_PP.png,0.0


In [13]:
# Rewrite the lower-case ".jpg" extension as ".JPG" in the image names
# (presumably to match the file names on disk -- confirm against IMAGES/).
# regex=False makes ".jpg" a literal match: interpreted as a regular
# expression the "." would match any character, and the default value of
# `regex` differs between pandas versions, so it is spelled out here.
label_df["image_id"] = label_df["image_id"].str.replace(".jpg", ".JPG", regex=False)
# Show the last row as a quick check of the renamed entries.
label_df.iloc[-1, :]

  label_df.image_id = label_df.image_id.str.replace(".jpg", ".JPG")


image_id                IM004832.JPG
adjudicated_dr_grade             0.0
Name: 1747, dtype: object

In [14]:
# Discard the rows that have no adjudicated DR grade, then show the
# resulting shape so the number of dropped rows can be read off.
label_df = label_df.dropna(subset=["adjudicated_dr_grade"])
label_df.shape

(1744, 2)

In [15]:
# Collapse the original grades into 3 classes:
#   grade 0        -> class 0 (unchanged)
#   grades 1, 2, 3 -> class 1
#   grade 4        -> class 2
# Both masks are computed from the current grades before anything is
# overwritten. NOTE: this rewrites the column in place, so running the cell a
# second time would fold the new class 2 into class 1.
grade_values = label_df["adjudicated_dr_grade"]
to_class_1 = grade_values.isin([2, 3])
to_class_2 = grade_values == 4
label_df.loc[to_class_1, "adjudicated_dr_grade"] = 1
label_df.loc[to_class_2, "adjudicated_dr_grade"] = 2

print("Unique values:", label_df["adjudicated_dr_grade"].unique())

Unique values: [0. 1. 2.]


In [16]:
# Stratified split on the class label: first hold out 30% of the rows as the
# test set, then take 20% of what remains as the validation set.
label_train, label_test = train_test_split(
    label_df,
    test_size=0.3,
    random_state=0,
    stratify=label_df["adjudicated_dr_grade"],
)
label_train, label_val = train_test_split(
    label_train,
    test_size=0.2,
    random_state=0,
    stratify=label_train["adjudicated_dr_grade"],
)

print("train-val-test size:", len(label_train), len(label_val), len(label_test))

train-val-test size: 976 244 524


In [17]:
# Print the share of each class in every split; with stratified splitting the
# three rows should be nearly identical.
label_train_counts = np.unique(label_train["adjudicated_dr_grade"], return_counts=True)[1]
print("Train label percentage:", label_train_counts / label_train_counts.sum())

label_val_counts = np.unique(label_val["adjudicated_dr_grade"], return_counts=True)[1]
print("Validation label percentage:", label_val_counts / label_val_counts.sum())

label_test_counts = np.unique(label_test["adjudicated_dr_grade"], return_counts=True)[1]
print("Test label percentage:", label_test_counts / label_test_counts.sum())

Train label percentage: [0.5829918  0.39651639 0.0204918 ]
Validation label percentage: [0.58196721 0.39754098 0.0204918 ]
Test label percentage: [0.58396947 0.39694656 0.01908397]


## Separate images to train folder

In [18]:
# Image names of the training split, one Series per class
# (0 -> noDR, 1 -> NPDR, 2 -> PDR); the cell displays their combined count.
train_classes = label_train["adjudicated_dr_grade"]
noDR_train = label_train.loc[train_classes == 0, "image_id"]
NPDR_train = label_train.loc[train_classes == 1, "image_id"]
PDR_train = label_train.loc[train_classes == 2, "image_id"]
sum(len(ids) for ids in (noDR_train, NPDR_train, PDR_train))

976

In [19]:
# Copy every training image from the raw IMAGES folder into its class folder
# under data/messidor2/train/<class>/.
raw_images_root = os.path.join(working_dir, "data/messidor2/IMAGES")
train_root = os.path.join(working_dir, "data/messidor2/train")

for class_name, image_ids in (("noDR", noDR_train), ("NPDR", NPDR_train), ("PDR", PDR_train)):
    class_dir = os.path.join(train_root, class_name)
    # shutil.copy fails when the destination folder is missing, so create it
    # first; exist_ok=True keeps the cell safe to re-run.
    os.makedirs(class_dir, exist_ok=True)
    for image_id in image_ids:
        shutil.copy(os.path.join(raw_images_root, image_id), os.path.join(class_dir, image_id))


In [20]:
# Total number of files across the three class folders of the train set.
working_dir = os.getcwd()
train_class_dirs = (
    "data/messidor2/train/noDR/",
    "data/messidor2/train/NPDR/",
    "data/messidor2/train/PDR/",
)
train_total = sum(len(os.listdir(os.path.join(working_dir, d))) for d in train_class_dirs)
print("Train set size:", train_total)

Train set size: 976


In [21]:
# Number of files in each class folder of the train set.
working_dir = os.getcwd()
train_noDR_files = os.listdir(os.path.join(working_dir, "data/messidor2/train/noDR/"))
train_NPDR_files = os.listdir(os.path.join(working_dir, "data/messidor2/train/NPDR/"))
train_PDR_files = os.listdir(os.path.join(working_dir, "data/messidor2/train/PDR/"))
print("Train set w/o DR size:", len(train_noDR_files))
print("Train set w NPDR size:", len(train_NPDR_files))
print("Train set w PDR size:", len(train_PDR_files))

Train set w/o DR size: 569
Train set w NPDR size: 387
Train set w PDR size: 20


## Separate images to validation folder

In [22]:
# Image names of the validation split, one Series per class
# (0 -> noDR, 1 -> NPDR, 2 -> PDR); the cell displays their combined count.
val_classes = label_val["adjudicated_dr_grade"]
noDR_val = label_val.loc[val_classes == 0, "image_id"]
NPDR_val = label_val.loc[val_classes == 1, "image_id"]
PDR_val = label_val.loc[val_classes == 2, "image_id"]
sum(len(ids) for ids in (noDR_val, NPDR_val, PDR_val))

244

In [23]:
# Copy every validation image from the raw IMAGES folder into its class
# folder under data/messidor2/val/<class>/.
raw_images_root = os.path.join(working_dir, "data/messidor2/IMAGES")
val_root = os.path.join(working_dir, "data/messidor2/val")

for class_name, image_ids in (("noDR", noDR_val), ("NPDR", NPDR_val), ("PDR", PDR_val)):
    class_dir = os.path.join(val_root, class_name)
    # shutil.copy fails when the destination folder is missing, so create it
    # first; exist_ok=True keeps the cell safe to re-run.
    os.makedirs(class_dir, exist_ok=True)
    for image_id in image_ids:
        shutil.copy(os.path.join(raw_images_root, image_id), os.path.join(class_dir, image_id))


In [24]:
# Total number of files across the three class folders of the validation set.
working_dir = os.getcwd()
val_class_dirs = (
    "data/messidor2/val/noDR/",
    "data/messidor2/val/NPDR/",
    "data/messidor2/val/PDR/",
)
val_total = sum(len(os.listdir(os.path.join(working_dir, d))) for d in val_class_dirs)
print("Validation set size:", val_total)

Validation set size: 244


In [25]:
# Number of files in each class folder of the validation set.
working_dir = os.getcwd()
val_noDR_files = os.listdir(os.path.join(working_dir, "data/messidor2/val/noDR/"))
val_NPDR_files = os.listdir(os.path.join(working_dir, "data/messidor2/val/NPDR/"))
val_PDR_files = os.listdir(os.path.join(working_dir, "data/messidor2/val/PDR/"))
print("Validation set w/o DR size:", len(val_noDR_files))
print("Validation set w NPDR size:", len(val_NPDR_files))
print("Validation set w PDR size:", len(val_PDR_files))

Validation set w/o DR size: 142
Validation set w NPDR size: 97
Validation set w PDR size: 5


## Separate images to test folder

In [26]:
# Image names of the test split, one Series per class
# (0 -> noDR, 1 -> NPDR, 2 -> PDR); the cell displays their combined count.
test_classes = label_test["adjudicated_dr_grade"]
noDR_test = label_test.loc[test_classes == 0, "image_id"]
NPDR_test = label_test.loc[test_classes == 1, "image_id"]
PDR_test = label_test.loc[test_classes == 2, "image_id"]
sum(len(ids) for ids in (noDR_test, NPDR_test, PDR_test))

524

In [27]:
# Copy every test image from the raw IMAGES folder into its class folder
# under data/messidor2/test/<class>/.
raw_images_root = os.path.join(working_dir, "data/messidor2/IMAGES")
test_root = os.path.join(working_dir, "data/messidor2/test")

for class_name, image_ids in (("noDR", noDR_test), ("NPDR", NPDR_test), ("PDR", PDR_test)):
    class_dir = os.path.join(test_root, class_name)
    # shutil.copy fails when the destination folder is missing, so create it
    # first; exist_ok=True keeps the cell safe to re-run.
    os.makedirs(class_dir, exist_ok=True)
    for image_id in image_ids:
        shutil.copy(os.path.join(raw_images_root, image_id), os.path.join(class_dir, image_id))


In [28]:
# Total number of files across the three class folders of the test set.
working_dir = os.getcwd()
test_class_dirs = (
    "data/messidor2/test/noDR/",
    "data/messidor2/test/NPDR/",
    "data/messidor2/test/PDR/",
)
test_total = sum(len(os.listdir(os.path.join(working_dir, d))) for d in test_class_dirs)
print("Test set size:", test_total)

Test set size: 524


In [29]:
# Number of files in each class folder of the test set.
working_dir = os.getcwd()
test_noDR_files = os.listdir(os.path.join(working_dir, "data/messidor2/test/noDR/"))
test_NPDR_files = os.listdir(os.path.join(working_dir, "data/messidor2/test/NPDR/"))
test_PDR_files = os.listdir(os.path.join(working_dir, "data/messidor2/test/PDR/"))
print("Test set w/o DR size:", len(test_noDR_files))
print("Test set w NPDR size:", len(test_NPDR_files))
print("Test set w PDR size:", len(test_PDR_files))

Test set w/o DR size: 306
Test set w NPDR size: 208
Test set w PDR size: 10
