## Data organisation example - BRSET

In [1]:
import os
import shutil
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# from PIL import Image
# import torchvision.transforms.functional as TF
# import torch

### Check number of document in images folder

In [3]:
working_dir = os.getcwd()
len(os.listdir(os.path.join(working_dir, "data/BRSET/images/")))

4652

### Preprocess label_df

In [2]:
working_dir = os.getcwd()
label_df = pd.read_csv(os.path.join(working_dir, "data/BRSET/small_brazil_labels.csv"))
label_df = label_df[["image_id", "DR_ICDR"]]
print("Shape:", label_df.shape)
label_df.head()

Shape: (4652, 2)


Unnamed: 0,image_id,DR_ICDR
0,img10061,0.0
1,img00495,3.0
2,img06661,0.0
3,img15417,0.0
4,img13888,0.0


In [4]:
print("Unique values:", pd.unique(label_df.DR_ICDR))

Unique values: [0. 3. 4. 2. 1.]


In [5]:
label_train, label_test = train_test_split(label_df, stratify=label_df.DR_ICDR, test_size=0.3, random_state=0)
label_train, label_val = train_test_split(label_train, stratify=label_train.DR_ICDR, test_size=0.2, random_state=0)

print("train-val-test size:", 
      label_train.shape[0],
      label_val.shape[0], 
      label_test.shape[0])

train-val-test size: 2604 652 1396


In [6]:
# check label distribution in train, val and test data
_, brset_label_train_counts = np.unique(label_train.DR_ICDR, return_counts=True)
print("Train label percentage:", brset_label_train_counts / sum(brset_label_train_counts))

_, brset_label_val_counts = np.unique(label_val.DR_ICDR, return_counts=True)
print("Validation label percentage:", brset_label_val_counts / sum(brset_label_val_counts))

_, brset_label_test_counts = np.unique(label_test.DR_ICDR, return_counts=True)
print("Test label percentage:", brset_label_test_counts / sum(brset_label_test_counts))

Train label percentage: [0.93202765 0.01036866 0.01958525 0.01228879 0.02572965]
Validation label percentage: [0.93251534 0.0107362  0.01993865 0.01226994 0.02453988]
Test label percentage: [0.93266476 0.01002865 0.02005731 0.01217765 0.02507163]


In [7]:
# save label_train.csv for computing train image mean & std
label_train.to_csv("data/BRSET/small_brset_train_labels.csv")

### Separate images to train folder

In [8]:
noDR_train = label_train.loc[label_train['DR_ICDR']==0, 'image_id']
NPDR1_train = label_train.loc[label_train['DR_ICDR']==1, 'image_id']
NPDR2_train = label_train.loc[label_train['DR_ICDR']==2, 'image_id']
NPDR3_train = label_train.loc[label_train['DR_ICDR']==3, 'image_id']
PDR_train = label_train.loc[label_train['DR_ICDR']==4, 'image_id']

len(noDR_train) + len(NPDR1_train) + len(NPDR2_train) + len(NPDR3_train) + len(PDR_train)

2604

In [9]:
# for i in noDR_train:
#     shutil.copy(os.path.join(working_dir, 'data/BRSET/images/{}.jpg'.format(i)), os.path.join(working_dir, 'data/BRSET/train/noDR/{}.png'.format(i)))
    
# for i in NPDR1_train:
#     shutil.copy(os.path.join(working_dir, 'data/BRSET/images/{}.jpg'.format(i)), os.path.join(working_dir, 'data/BRSET/train/NPDR1/{}.png'.format(i)))

# for i in NPDR2_train:
#     shutil.copy(os.path.join(working_dir, 'data/BRSET/images/{}.jpg'.format(i)), os.path.join(working_dir, 'data/BRSET/train/NPDR2/{}.png'.format(i)))

# for i in NPDR3_train:
#     shutil.copy(os.path.join(working_dir, 'data/BRSET/images/{}.jpg'.format(i)), os.path.join(working_dir, 'data/BRSET/train/NPDR3/{}.png'.format(i)))
   
# for i in PDR_train:
#     shutil.copy(os.path.join(working_dir, 'data/BRSET/images/{}.jpg'.format(i)), os.path.join(working_dir, 'data/BRSET/train/PDR/{}.png'.format(i)))


In [10]:
# check number of images in train, val and test sets
working_dir = os.getcwd()

print("Train set size:", \
    len(os.listdir(os.path.join(working_dir, "data/BRSET/train/noDR/"))) + \
    len(os.listdir(os.path.join(working_dir, "data/BRSET/train/NPDR1/"))) + \
    len(os.listdir(os.path.join(working_dir, "data/BRSET/train/NPDR2/"))) + \
    len(os.listdir(os.path.join(working_dir, "data/BRSET/train/NPDR3/"))) + \
    len(os.listdir(os.path.join(working_dir, "data/BRSET/train/PDR/"))))

working_dir = os.getcwd()
print("Train set w/o DR size:", len(os.listdir(os.path.join(working_dir, "data/BRSET/train/noDR/"))))
print("Train set w NPDR1 size:", len(os.listdir(os.path.join(working_dir, "data/BRSET/train/NPDR1/"))))
print("Train set w NPDR2 size:", len(os.listdir(os.path.join(working_dir, "data/BRSET/train/NPDR2/"))))
print("Train set w NPDR3 size:", len(os.listdir(os.path.join(working_dir, "data/BRSET/train/NPDR3/"))))
print("Train set w PDR size:", len(os.listdir(os.path.join(working_dir, "data/BRSET/train/PDR/"))))

Train set size: 2604
Train set w/o DR size: 2427
Train set w NPDR1 size: 27
Train set w NPDR2 size: 51
Train set w NPDR3 size: 32
Train set w PDR size: 67


### Separate images to val folder

In [11]:
noDR_val = label_val.loc[label_val['DR_ICDR']==0, 'image_id']
NPDR1_val = label_val.loc[label_val['DR_ICDR']==1, 'image_id']
NPDR2_val = label_val.loc[label_val['DR_ICDR']==2, 'image_id']
NPDR3_val = label_val.loc[label_val['DR_ICDR']==3, 'image_id']
PDR_val = label_val.loc[label_val['DR_ICDR']==4, 'image_id']

len(noDR_val) + len(NPDR1_val) + len(NPDR2_val) + len(NPDR3_val) + len(PDR_val)

652

In [12]:
for i in noDR_val:
    shutil.copy(os.path.join(working_dir, 'data/BRSET/images/{}.jpg'.format(i)), os.path.join(working_dir, 'data/BRSET/val/noDR/{}.png'.format(i)))
    
for i in NPDR1_val:
    shutil.copy(os.path.join(working_dir, 'data/BRSET/images/{}.jpg'.format(i)), os.path.join(working_dir, 'data/BRSET/val/NPDR1/{}.png'.format(i)))

for i in NPDR2_val:
    shutil.copy(os.path.join(working_dir, 'data/BRSET/images/{}.jpg'.format(i)), os.path.join(working_dir, 'data/BRSET/val/NPDR2/{}.png'.format(i)))

for i in NPDR3_val:
    shutil.copy(os.path.join(working_dir, 'data/BRSET/images/{}.jpg'.format(i)), os.path.join(working_dir, 'data/BRSET/val/NPDR3/{}.png'.format(i)))
   
for i in PDR_val:
    shutil.copy(os.path.join(working_dir, 'data/BRSET/images/{}.jpg'.format(i)), os.path.join(working_dir, 'data/BRSET/val/PDR/{}.png'.format(i)))


In [13]:
# check number of images in train, val and test sets
working_dir = os.getcwd()

print("Validation set size:", \
    len(os.listdir(os.path.join(working_dir, "data/BRSET/val/noDR/"))) + \
    len(os.listdir(os.path.join(working_dir, "data/BRSET/val/NPDR1/"))) + \
    len(os.listdir(os.path.join(working_dir, "data/BRSET/val/NPDR2/"))) + \
    len(os.listdir(os.path.join(working_dir, "data/BRSET/val/NPDR3/"))) + \
    len(os.listdir(os.path.join(working_dir, "data/BRSET/val/PDR/"))))

working_dir = os.getcwd()
print("Validation set w/o DR size:", len(os.listdir(os.path.join(working_dir, "data/BRSET/val/noDR/"))))
print("Validation set w NPDR1 size:", len(os.listdir(os.path.join(working_dir, "data/BRSET/val/NPDR1/"))))
print("Validation set w NPDR2 size:", len(os.listdir(os.path.join(working_dir, "data/BRSET/val/NPDR2/"))))
print("Validation set w NPDR3 size:", len(os.listdir(os.path.join(working_dir, "data/BRSET/val/NPDR3/"))))
print("Validation set w PDR size:", len(os.listdir(os.path.join(working_dir, "data/BRSET/val/PDR/"))))

Validation set size: 652
Validation set w/o DR size: 608
Validation set w NPDR1 size: 7
Validation set w NPDR2 size: 13
Validation set w NPDR3 size: 8
Validation set w PDR size: 16


### Separate images to test folder

In [8]:
noDR_test = label_test.loc[label_test['DR_ICDR']==0, 'image_id']
NPDR1_test = label_test.loc[label_test['DR_ICDR']==1, 'image_id']
NPDR2_test = label_test.loc[label_test['DR_ICDR']==2, 'image_id']
NPDR3_test = label_test.loc[label_test['DR_ICDR']==3, 'image_id']
PDR_test = label_test.loc[label_test['DR_ICDR']==4, 'image_id']

len(noDR_test) + len(NPDR1_test) + len(NPDR2_test) + len(NPDR3_test) + len(PDR_test)

1396

In [9]:
# for i in noDR_test:
#     shutil.copy(os.path.join(working_dir, 'data/BRSET/images/{}.jpg'.format(i)), os.path.join(working_dir, 'data/BRSET/test/noDR/{}.png'.format(i)))
    
# for i in NPDR1_test:
#     shutil.copy(os.path.join(working_dir, 'data/BRSET/images/{}.jpg'.format(i)), os.path.join(working_dir, 'data/BRSET/test/NPDR1/{}.png'.format(i)))

# for i in NPDR2_test:
#     shutil.copy(os.path.join(working_dir, 'data/BRSET/images/{}.jpg'.format(i)), os.path.join(working_dir, 'data/BRSET/test/NPDR2/{}.png'.format(i)))

# for i in NPDR3_test:
#     shutil.copy(os.path.join(working_dir, 'data/BRSET/images/{}.jpg'.format(i)), os.path.join(working_dir, 'data/BRSET/test/NPDR3/{}.png'.format(i)))
   
# for i in PDR_test:
#     shutil.copy(os.path.join(working_dir, 'data/BRSET/images/{}.jpg'.format(i)), os.path.join(working_dir, 'data/BRSET/test/PDR/{}.png'.format(i)))


In [7]:
# check number of images in train, val and test sets
working_dir = os.getcwd()

print("Test set size:", \
    len(os.listdir(os.path.join(working_dir, "data/BRSET/test/noDR/"))) + \
    len(os.listdir(os.path.join(working_dir, "data/BRSET/test/NPDR1/"))) + \
    len(os.listdir(os.path.join(working_dir, "data/BRSET/test/NPDR2/"))) + \
    len(os.listdir(os.path.join(working_dir, "data/BRSET/test/NPDR3/"))) + \
    len(os.listdir(os.path.join(working_dir, "data/BRSET/test/PDR/"))))

working_dir = os.getcwd()
print("Test set w/o DR size:", len(os.listdir(os.path.join(working_dir, "data/BRSET/test/noDR/"))))
print("Test set w NPDR1 size:", len(os.listdir(os.path.join(working_dir, "data/BRSET/test/NPDR1/"))))
print("Test set w NPDR2 size:", len(os.listdir(os.path.join(working_dir, "data/BRSET/test/NPDR2/"))))
print("Test set w NPDR3 size:", len(os.listdir(os.path.join(working_dir, "data/BRSET/test/NPDR3/"))))
print("Test set w PDR size:", len(os.listdir(os.path.join(working_dir, "data/BRSET/test/PDR/"))))

Test set size: 1396
Test set w/o DR size: 1302
Test set w NPDR1 size: 14
Test set w NPDR2 size: 28
Test set w NPDR3 size: 17
Test set w PDR size: 35


### Compute mean image color

In [9]:
from tqdm import tqdm

image_mean = 0
image_std = 0
for image_name in tqdm(label_train.image_id):
    img_path = os.path.join(working_dir, "data/BRSET/images/", image_name + ".jpg")
    img = TF.to_tensor(Image.open(img_path))
    image_mean += torch.mean(img, dim=(1, 2))
    image_std += torch.std(img, dim=(1, 2))

  0%|          | 0/2604 [00:00<?, ?it/s]


NameError: name 'TF' is not defined

In [None]:
image_mean = image_mean / label_train.shape[0]
image_std = image_std / label_train.shape[0]

print("BRSET image means are:", image_mean)
print("BRSET image stds are:", image_std)

In [None]:
messidor_img_mean = torch.tensor([0.2859, 0.1341, 0.0471])
messidor_img_std = torch.tensor([0.3263, 0.1568, 0.0613])

In [None]:
avg_img_mean = (image_mean + messidor_img_mean) / 2
avg_img_std = (image_std + messidor_img_std) / 2
print("Avg mean:", avg_img_mean)
print("Avg std:", avg_img_std)