In [1]:
from pathlib import Path
from shutil import copyfile
from sklearn.model_selection import train_test_split
import numpy as np
import csv

In [2]:
def get_files(dir_path, ext):
    """Return the base names of all ``*.<ext>`` files under ``dir_path``.

    The directory is searched recursively, and only the final path component
    of each match is kept, so files living in different sub-folders are
    reported without their parent directories.

    Args:
        dir_path: Root directory to search.
        ext: File extension to match, without the leading dot.

    Returns:
        A list of file-name strings, in the order the search yields them.
    """
    pattern = '*.{}'.format(ext)
    names = []
    for match in Path(dir_path).rglob(pattern):
        names.append(str(match.name))
    return names

In [3]:
# Base names of every .bmp file found (recursively) under the GlaS data folder.
files = get_files('data/GlaS', 'bmp')
# Display the collected names; raw images and their `_anno` counterparts appear side by side.
files

['train_4.bmp',
 'testB_20.bmp',
 'train_31_anno.bmp',
 'testA_4_anno.bmp',
 'train_26_anno.bmp',
 'train_35_anno.bmp',
 'train_21.bmp',
 'testA_40.bmp',
 'testB_14.bmp',
 'train_44.bmp',
 'testA_27_anno.bmp',
 'testB_10_anno.bmp',
 'testA_34_anno.bmp',
 'train_74.bmp',
 'testA_11_anno.bmp',
 'testA_58_anno.bmp',
 'train_58.bmp',
 'train_32_anno.bmp',
 'train_76_anno.bmp',
 'testA_16.bmp',
 'testA_7.bmp',
 'testA_55.bmp',
 'train_40_anno.bmp',
 'testA_9.bmp',
 'testA_20_anno.bmp',
 'train_17.bmp',
 'testA_59_anno.bmp',
 'train_15.bmp',
 'train_55_anno.bmp',
 'train_69_anno.bmp',
 'train_17_anno.bmp',
 'train_71_anno.bmp',
 'testA_14_anno.bmp',
 'train_28_anno.bmp',
 'train_20_anno.bmp',
 'train_65.bmp',
 'train_68.bmp',
 'testA_47.bmp',
 'train_84_anno.bmp',
 'testA_45.bmp',
 'train_46_anno.bmp',
 'train_79_anno.bmp',
 'train_85.bmp',
 'train_30.bmp',
 'testA_17_anno.bmp',
 'train_38.bmp',
 'testA_28.bmp',
 'train_14_anno.bmp',
 'train_22.bmp',
 'train_8.bmp',
 'testA_56_anno.bmp',
 't

In [4]:
# Number of .bmp names collected in `files`.
len(files)

330

In [5]:
# Dataset folder and image file extension.
# NOTE(review): no visible cell reads these module-level variables; the calls in
# this notebook pass the same literals directly. Confirm whether they are still needed.
data_dir = 'data/GlaS'
ext = 'bmp'

In [6]:
def csv_reader(fname):
    """Read a whole CSV file into memory.

    Args:
        fname: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV record; each entry is the list of that
        record's field strings, exactly as stored (no whitespace stripping).
    """
    # newline='' hands raw line endings to the csv module, as its documentation
    # requires; otherwise line breaks inside quoted fields get rewritten by the
    # text layer before the parser sees them.
    with open(fname, 'r', newline='') as f:
        return list(csv.reader(f))

In [7]:
# Preview the grade table. Note in the output that the grade column headers and
# values carry a leading space (e.g. ' benign').
csv_reader('data/GlaS/Grade.csv')

[['name',
  'patient ID',
  ' grade (GlaS)',
  ' grade (Sirinukunwattana et al. 2015)'],
 ['testA_1', '4', ' benign', ' adenomatous'],
 ['testA_10', '10', ' benign', ' healthy'],
 ['testA_11', '9', ' benign', ' healthy'],
 ['testA_12', '11', ' malignant', ' poorly differentiated'],
 ['testA_13', '7', ' malignant', ' moderately differentiated'],
 ['testA_14', '7', ' malignant', ' moderately differentiated'],
 ['testA_15', '7', ' malignant', ' moderately differentiated'],
 ['testA_16', '4', ' malignant', ' moderately differentiated'],
 ['testA_17', '7', ' malignant', ' moderately differentiated'],
 ['testA_18', '9', ' benign', ' healthy'],
 ['testA_19', '11', ' benign', ' adenomatous'],
 ['testA_2', '3', ' benign', ' healthy'],
 ['testA_20', '9', ' benign', ' healthy'],
 ['testA_21', '9', ' benign', ' healthy'],
 ['testA_22', '11', ' malignant', ' moderately-to-poorly differentated'],
 ['testA_23', '4', ' malignant', ' poorly differentiated'],
 ['testA_24', '7', ' malignant', ' moderatel

In [8]:
def get_mask(file, files):
    """Find the annotation-mask file name that belongs to an image.

    The mask is the first entry of ``files`` whose name contains the image
    name (with ``.bmp`` removed) followed by ``_anno``.

    Args:
        file: Image file name, e.g. ``'train_4.bmp'``.
        files: Iterable of candidate file names to search.

    Returns:
        The first matching file name from ``files``.

    Raises:
        IndexError: If no entry of ``files`` matches. The message names the
            image and the pattern that was searched for.
    """
    name = file.replace('.bmp', '') + '_anno'
    # Stop at the first hit instead of filtering the whole list just to take element 0.
    for candidate in files:
        if name in candidate:
            return candidate
    # Same exception type as the former `[...][0]` lookup, but with a message
    # that says which image has no mask.
    raise IndexError('no annotation mask matching {!r} found for {!r}'.format(name, file))

In [9]:
def get_label(file, csv):
    """Look up the label stored for an image in the parsed grade table.

    Args:
        file: Image file name; every ``.bmp`` occurrence is removed to obtain
            the key compared against the first column.
        csv: Parsed table rows (lists of strings). This parameter shadows the
            ``csv`` module inside the function, which is not used here.

    Returns:
        The third column of the first row whose first column equals the key,
        returned exactly as stored (no stripping), or ``None`` when no row matches.
    """
    key = file.replace('.bmp', '')
    # Lazily scan the rows; `next` stops at the first match and falls back to None.
    hits = (row[2] for row in csv if row[0] == key)
    return next(hits, None)

In [18]:
def write_splits_csv(name, files, masks, labels):
    """Write one dataset split as a CSV file with ``file, mask, label`` rows.

    Args:
        name: Destination path of the CSV file. Missing parent directories
            are created.
        files: Image file names.
        masks: Mask file names, aligned with ``files``.
        labels: Labels, aligned with ``files``.

    The three sequences are zipped, so rows are written up to the length of
    the shortest one. No header row is written. Fields are comma-delimited and
    use ``|`` as the quote character with minimal quoting.
    """
    # A fresh checkout may not have the output folder yet; create it rather
    # than failing inside open().
    Path(name).parent.mkdir(parents=True, exist_ok=True)
    # newline='' lets the csv module control line endings; without it, Windows
    # text mode turns the writer's '\r\n' into '\r\r\n' (blank rows on read-back).
    with open(name, 'w', newline='') as out:
        filewriter = csv.writer(out, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        filewriter.writerows(zip(files, masks, labels))

In [19]:
def create_glas_split(val_size=0.2, random_state=42,
                      data_dir='data/GlaS', out_dir='data/splits/glas'):
    """Create train/val/test split CSVs for the GlaS images.

    Images whose name contains ``'train'`` are divided into a training and a
    validation part; images whose name contains ``'test'`` form the test set.
    Names containing ``'anno'`` are treated as masks and never as images.
    For each split a ``<split>.csv`` file with ``file, mask, label`` rows is
    written to ``out_dir``.

    Args:
        val_size: Passed to ``train_test_split`` as ``test_size``; the share
            of the training images held out for validation.
        random_state: Seed passed to ``train_test_split`` so that re-running
            the notebook reproduces the same split. Pass ``None`` for an
            unseeded (different every run) split.
        data_dir: Folder holding the ``.bmp`` files and ``Grade.csv``.
        out_dir: Folder that receives ``train.csv``, ``test.csv`` and
            ``val.csv``; created if it does not exist.
    """
    root = Path(data_dir)
    files = get_files(root, 'bmp')
    # Named `grades` rather than `csv` so the csv module is not shadowed here.
    grades = csv_reader(str(root / 'Grade.csv'))

    # Directory traversal order depends on the filesystem; sort first so that a
    # fixed seed yields the same split on every machine.
    images = sorted(f for f in files if 'anno' not in f)
    train_pool = [f for f in images if 'train' in f]
    test_files = [f for f in images if 'test' in f]
    train_files, val_files = train_test_split(
        train_pool, test_size=val_size, random_state=random_state)

    # Resolve masks and labels for every split before writing anything, so a
    # missing mask aborts the run without leaving a partial set of CSVs.
    splits = []
    for split_name, split_files in (('train', train_files),
                                    ('test', test_files),
                                    ('val', val_files)):
        masks = [get_mask(f, files) for f in split_files]
        labels = [get_label(f, grades) for f in split_files]
        splits.append((split_name, split_files, masks, labels))

    target = Path(out_dir)
    target.mkdir(parents=True, exist_ok=True)
    for split_name, split_files, masks, labels in splits:
        out_path = str(target / '{}.csv'.format(split_name))
        write_splits_csv(out_path, split_files, masks, labels)

In [20]:
# Generate train.csv, test.csv and val.csv under data/splits/glas with the default validation share (0.2).
create_glas_split()