In [25]:
import pandas as pd
import numpy as np

In [26]:
index_class = {0:'No_DR', 1:'Mild', 2:'Moderate', 3:'Severe', 4:'Proliferative_DR'}
RAW_DATA_FOLDER = './resized_traintest15_train19'

# All labeled images (2015 train + 2015 test + 2019 train)

In [27]:
def read_dataset_dataframe(csv_path = 'labels/traintestLabels15_trainLabels19.csv', add_extension_flag = True):
    '''
    Read the dataframe that contain image filename and labels
    '''
    df = pd.read_csv(csv_path)
    df.columns = ['filename', 'class']

    mod_df = df.copy()
    mod_df['class'] = mod_df['class'].astype(str)
    
    if add_extension_flag :
        def add_extension(x):
            return x+'.jpg'
        mod_df['filename'] = mod_df['filename'].apply(add_extension)
    return mod_df

df = read_dataset_dataframe()

In [28]:
def split_train_validation_from_df(df, validation_ratio = 0.2, seed = 25):
    '''
    Split the DataFrame containing image filenames and their class
    into smaller DataFrame for training and validation.
    The function does stratified sampling.
    Inputs are: dataframe df, ratio for validation data, and a seed for random state
    '''
    train_df = pd.DataFrame()
    validation_df = pd.DataFrame()

    for c in df['class'].unique():
        subset = df[df['class'] == c]
        subset = subset.sample(frac = 1, random_state = seed)
        
        n_train = int((1 - validation_ratio) * subset.shape[0])
        train_subset = subset.iloc[:n_train,:]
        validation_subset = subset.iloc[n_train:,:]
        
        train_df = pd.concat([train_df, train_subset])
        validation_df = pd.concat([validation_df, validation_subset])
        
    print("Train dataframe shape:", train_df.shape)
    print("Train dataframe class counts:")
    print(train_df['class'].value_counts().sort_index())

    print("Validation dataframe shape:", validation_df.shape)
    print("Validation dataframe class counts:")
    print(validation_df['class'].value_counts().sort_index())

    return train_df, validation_df

train_df, validation_df = split_train_validation_from_df(df)

Train dataframe shape: (73890, 2)
Train dataframe class counts:
0    53718
1     5260
2    11321
3     1824
4     1767
Name: class, dtype: int64
Validation dataframe shape: (18474, 2)
Validation dataframe class counts:
0    13430
1     1315
2     2831
3      456
4      442
Name: class, dtype: int64


In [29]:
tmp = df['class'].value_counts().sort_index()
tmp 

0    67148
1     6575
2    14152
3     2280
4     2209
Name: class, dtype: int64

In [30]:
# Organize data into forlders (obsolete)

# create directories to organize the data
# folder_1 = 'data'
# print(folder_1)
# os.makedirs(folder_1, exist_ok=True)
# for level in index_class.keys():
#     folder_2 = os.path.join(folder_1,f'{level}_{index_class[level]}')
#     print(folder_2)
#     os.makedirs(folder_2, exist_ok=True)

# raw_data_path = pathlib.Path(RAW_DATA_FOLDER)
# raw_data_path

# for item in raw_data_path.glob('**/*') :
#     if item.is_file():
#         fname = item.name.split('.')[0]
#         image_class = df[df['filename'] == fname]['diagnosis'].values[0]
#         dest_folder = os.path.join('data',f'{image_class}_{index_class[image_class]}')
#         shutil.move(str(item), dest_folder)

In [31]:
# check for any filename other than jpg
# for item in raw_data_path.glob('**/*') :
#     if (item.is_file()) and (item.name.split('.')[-1] != 'jpg'):
#         print(item)

In [32]:
# Split data into train / validation / test dataset (obsolete)
# import splitfolders
# os.makedirs('organized_data', exist_ok=True)
# splitfolders.ratio('data', output = 'organized_data', seed = 25, ratio=(0.8,0.1,0.1))

# Read data directly from directory with help of the dataframe

In [33]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.data import Dataset
from tensorflow.data.experimental import AUTOTUNE

RAW_DATA_FOLDER = './resized_traintest15_train19/'
BATCH_SIZE = 8
SEED = 25
IMG_SIZE = 256
tf.random.set_seed(SEED)

In [34]:
def make_train_generator(shuffle_flag = True):
    '''
    Make an image data generator for training
    '''
    train_datagen = ImageDataGenerator(
        rescale= 1./255
    )
    train_image_data_flow = train_datagen.flow_from_dataframe(
        train_df,
        directory = RAW_DATA_FOLDER,
        x_col='filename',
        y_col='class',
        target_size=(IMG_SIZE, IMG_SIZE),
        class_mode='sparse',
        batch_size=BATCH_SIZE,
        shuffle=shuffle_flag, seed = SEED,
        validate_filenames=False
    )
    return train_image_data_flow

train_ds = Dataset.from_generator(
    make_train_generator, 
    output_signature=(
        tf.TensorSpec(shape=(None, 256, 256, 3), dtype=tf.float32),
        tf.TensorSpec(shape=(None,), dtype=tf.float32)
    )
)
train_ds = train_ds.cache().prefetch(AUTOTUNE)

In [35]:
def make_validation_generator(shuffle_flag = False):
    '''
    Make an image data generator for validation
    '''
    validation_datagen = ImageDataGenerator(
        rescale= 1./255
    )
    validation_image_data_flow = validation_datagen.flow_from_dataframe(
        validation_df,
        directory = RAW_DATA_FOLDER,
        x_col='filename',
        y_col='class',
        target_size=(IMG_SIZE, IMG_SIZE),
        class_mode='sparse',
        batch_size=BATCH_SIZE,
        shuffle=shuffle_flag, seed = SEED,
        validate_filenames=False
    )
    return validation_image_data_flow

validation_ds = Dataset.from_generator(
    make_validation_generator,
    output_signature=(
        tf.TensorSpec(shape=(None, 256, 256, 3), dtype=tf.float32),
        tf.TensorSpec(shape=(None,), dtype=tf.float32)
    )
)
validation_ds = validation_ds.cache().prefetch(AUTOTUNE)

In [36]:
from tensorflow.keras.layers import Input, Dropout, Dense, GlobalAveragePooling2D
from tensorflow.keras.applications import VGG19
from tensorflow.keras.applications.vgg19 import preprocess_input

vgg19_model = VGG19(
    include_top=False, weights='imagenet', 
    input_shape=(IMG_SIZE, IMG_SIZE, 3)
)
vgg19_model.trainable = False

In [38]:
# test the train_ds, validation_ds
image_batch, label_batch = next(iter(train_ds))
print(image_batch.shape)

x = preprocess_input(image_batch)
x = vgg19_model(x, training=False)#
x = GlobalAveragePooling2D()(x)
x = Dropout(0.2)(x)
x = Dense(5, activation = 'softmax')(x)
# model = tf.keras.Model(inputs, outputs)

# model.compile(
#     optimizer = 'adam', 
#     loss = 'categorical_crossentropy', 
#     metrics = ['accuracy']
# )

print(x.shape)
print(x)

Found 73890 non-validated image filenames belonging to 5 classes.
(8, 256, 256, 3)


NotFoundError: No algorithm worked! [Op:Conv2D]

In [39]:
tf.TensorSpec.from_tensor(image_batch) , tf.TensorSpec.from_tensor(label_batch)

(TensorSpec(shape=(8, 256, 256, 3), dtype=tf.float32, name=None),
 TensorSpec(shape=(8,), dtype=tf.float32, name=None))

In [40]:
inputs = Input(shape=(IMG_SIZE, IMG_SIZE, 3))
x = preprocess_input(inputs)
x = vgg19_model(x, training=False)
x = GlobalAveragePooling2D()(x)
x = Dropout(0.2)(x)
outputs = Dense(5, activation = 'softmax')(x)
model = tf.keras.Model(inputs, outputs)

model.compile(
    optimizer = 'adam', 
    loss = 'sparse_categorical_crossentropy', 
    metrics = ['accuracy']
)

model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 256, 256, 3)]     0         
_________________________________________________________________
tf.__operators__.getitem (Sl (None, 256, 256, 3)       0         
_________________________________________________________________
tf.nn.bias_add (TFOpLambda)  (None, 256, 256, 3)       0         
_________________________________________________________________
vgg19 (Functional)           (None, 8, 8, 512)         20024384  
_________________________________________________________________
global_average_pooling2d (Gl (None, 512)               0         
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense (Dense)                (None, 5)                 2565  

In [41]:
model.predict(image_batch), label_batch

NotFoundError:  No algorithm worked!
	 [[node model/vgg19/block1_conv1/Relu (defined at <ipython-input-41-76a90f9ab628>:1) ]] [Op:__inference_predict_function_927]

Function call stack:
predict_function


In [15]:
model.fit(
    train_ds,
    steps_per_epoch = 1,
    batch_size = BATCH_SIZE,
    validation_data = validation_ds,
    validation_steps = 1,
    epochs = 1
)

NameError: name 'model' is not defined

# Verify whether one patient has 2 different diagnosis on his/her eyes

In [None]:
data15 = df[df['year'] == 2015]

def get_patientID(row):
    tmp = row['filename'].split('_')
    return int(tmp[0])

def get_eye(row):
    tmp = row['filename'].split('_')
    return tmp[1]

    
data15.insert(3, 'patient_id', data15.apply(get_patientID, axis = 1), True)
data15.insert(4, 'eye', data15.apply(get_eye, axis = 1), True)
data15.head()

In [None]:
patient_eye_diagnose = pd.pivot_table(
    data15, 
    index = 'patient_id',
    columns = 'eye',
    values = 'diagnosis',
    aggfunc = 'sum').reset_index()

patient_eye_diagnose.head()

In [None]:
(patient_eye_diagnose['left'] != patient_eye_diagnose['right']).sum()

There are 5563 patients with different diagnosis on their eyes

In [None]:
a = patient_eye_diagnose[patient_eye_diagnose['left'] != patient_eye_diagnose['right']]['patient_id']
filter = data15['patient_id'].apply(lambda x: x in set(a))