In [None]:
import h5py
import pandas as pd
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split

In [None]:
# Load the per-side clinical tables from the PAPILA dataset root.
path = 'your_path/data/PAPILA/'

# OD is the right side, OS is the left side.
od_meta = pd.read_csv(f'{path}ClinicalData/patient_data_od.csv')
os_meta = pd.read_csv(f'{path}ClinicalData/patient_data_os.csv')
od_meta.head()

In [None]:
# Preview the first five rows of the OS table.
os_meta.head(n=5)

In [None]:
# Derive the OS image file name for each row: drop the first character of the
# ID and wrap the remainder as RET<rest>OS.jpg.
ids = os_meta['ID'].values
os_path = [f'RET{patient_id[1:]}OS.jpg' for patient_id in ids]
os_meta['Path'] = os_path

In [None]:
# Derive the OD image file name for each row: drop the first character of the
# ID and wrap the remainder as RET<rest>OD.jpg.
ids = od_meta['ID'].values
od_path = [f'RET{patient_id[1:]}OD.jpg' for patient_id in ids]
od_meta['Path'] = od_path

In [None]:
# Stack the OD and OS tables (OD rows first) and keep only the columns that
# the following cells work with.
subcolumns = ['ID', 'Age', 'Gender', 'Diagnosis', 'Path']
meta_all = pd.concat([od_meta, os_meta])[subcolumns]
meta_all

In [None]:
# Save the combined OD + OS metadata next to the source clinical tables.
meta_all.to_csv(f'{path}ClinicalData/patient_meta_concat.csv')

In [None]:
# Gender is coded 0 for male and 1 for female.
# (Diagnosis is coded 0 for healthy, 1 for glaucoma, and 2 for suspicious.)

# Compare the codes numerically instead of matching the text '0.0' / '1.0':
# if the column is parsed as integers its text form is '0' / '1', and a
# string match would silently leave every row unmapped.
gender_code = pd.to_numeric(meta_all['Gender'], errors='coerce').values

# Any value other than 0 or 1 keeps its original text form, as before.
sex = meta_all['Gender'].values.astype('str')
sex[gender_code == 0] = 'M'
sex[gender_code == 1] = 'F'
meta_all['Sex'] = sex
meta_all

In [None]:
# Derive both age groupings from one integer copy of the Age column.
# The cast raises if Age contains missing values; ages are assumed to be
# non-negative (a negative age would fall into group 0 of both columns).
age_years = meta_all['Age'].values.astype('int')

# Five age bands: 0-19 -> 0, 20-39 -> 1, 40-59 -> 2, 60-79 -> 3, 80+ -> 4.
# np.digitize assigns every band in one pass, so a band label can never be
# re-interpreted as an age by a later rule.
meta_all['Age_multi'] = np.digitize(age_years, [20, 40, 60, 80])

# Binary split: under 60 -> 0, 60 and over -> 1.
# The previous `between(0, 60)` was inclusive and put age 60 in group 0 before
# the `>= 60` rule ran; age 60 now lands in group 1, in line with both that
# rule and the 60-79 band above.
meta_all['Age_binary'] = (age_years >= 60).astype('int')
meta_all

In [None]:
# Binary task: keep only the rows diagnosed as healthy (0) or glaucoma (1).

meta_binary = meta_all[meta_all['Diagnosis'].isin([0.0, 1.0]).values]
len(meta_binary)

In [None]:
def split_712(all_meta, patient_ids):
    """Partition metadata rows into train / validation / test sets by patient ID.

    The IDs are split first (30% held out, then 66% of the held-out IDs go to
    test and the rest to validation, with fixed random seeds), and each row of
    ``all_meta`` follows its ``ID`` into the matching partition.

    Args:
        all_meta: DataFrame with an ``ID`` column.
        patient_ids: Collection of IDs to distribute over the partitions.

    Returns:
        Tuple ``(train_meta, val_meta, test_meta)`` of DataFrames.
    """
    train_ids, holdout_ids = train_test_split(patient_ids, test_size=0.3, random_state=5)
    val_ids, test_ids = train_test_split(holdout_ids, test_size=0.66, random_state=15)
    return tuple(
        all_meta[all_meta.ID.isin(id_group)]
        for id_group in (train_ids, val_ids, test_ids)
    )

# Split by unique ID so that all rows sharing an ID land in the same partition.
# Note: despite the `sub_` prefix, these three names hold metadata DataFrames.
sub_train, sub_val, sub_test = split_712(
    all_meta=meta_binary,
    patient_ids=np.unique(meta_binary['ID']),
)

In [None]:
# Write the three partitions under the dataset root configured in `path`,
# instead of a second hard-coded placeholder ('/yourpath/...') that has to be
# edited separately and does not match the root used by the other cells.
split_dir = path + 'split/'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(split_dir, exist_ok=True)

sub_train.to_csv(split_dir + 'new_train.csv')
sub_val.to_csv(split_dir + 'new_val.csv')
sub_test.to_csv(split_dir + 'new_test.csv')