## Part 1 - First Dataset Preparation

In [None]:
import pandas as pd
import numpy as np
import cv2
import os
import random
import tensorflow as tf
import re

# Edge length (in pixels) that every loaded image is resized to
IMG_SIZE = 224
# Directory that the image filenames from the metadata table are resolved against
DATA_PATH = "/kaggle/input/ocular-disease-recognition-odir5k/preprocessed_images"

# Metadata table of the first dataset; the cells below read its 'labels' and 'filename' columns
data = pd.read_csv(
    "/kaggle/input/ocular-disease-recognition-odir5k/full_df.csv"
)

In [None]:
# Short class code -> human-readable disease name.
# The insertion order of this mapping also defines the integer label of each class below.
class_short2full = {
    "D": "Diabetic Retinopathy",
    "G": "Glaucoma",
    "C": "Cataract",
    "A": "Age Related Macular Degeneration",
    "H": "Hypertension",
    "M": "Myopia",
    "N": "Normal",
}

# Short class code -> integer label (D=0, G=1, C=2, A=3, H=4, M=5, N=6)
class_dict = {
    short_code: label_index
    for label_index, short_code in enumerate(class_short2full)
}


In [None]:
# Derive a 'class' column from 'labels': keep only the runs of ASCII letters found in
# each raw label string and join them with single spaces.
data["class"] = data["labels"].map(
    lambda raw_label: " ".join(re.findall("[a-zA-Z]+", raw_label))
)


In [None]:
# Every class code the label space covers, in label-index order:
# D: Diabetic Retinopathy, G: Glaucoma, C: Cataract, A: Age-related Macular Degeneration,
# H: Hypertension, M: Myopia, N: Normal
CLASSES = [
    "D", "G", "C", "A", "H", "M", "N",
]

# Class codes whose images are NOT loaded from this dataset (Diabetic Retinopathy and Normal)
EXCLUDE_CLASSES = [
    "D", "N",
]

In [None]:
# Class code -> array of image filenames whose 'class' value equals exactly that code
dict_img_list = {
    short_code: data.loc[data["class"] == short_code, "filename"].values
    for short_code in class_short2full
}

In [None]:
def create_dataset(img_list, class_label, max_images=None, data_path=None, img_size=None):
    """Load, convert and resize the images of one class into [image, label] pairs.

    Args:
        img_list: Iterable of image filenames, resolved relative to ``data_path``.
        class_label: Label stored next to every successfully loaded image.
        max_images: Optional cap on the number of images loaded; ``None`` means no cap.
        data_path: Directory containing the image files. Defaults to the notebook-level
            ``DATA_PATH`` constant.
        img_size: Edge length of the square output images. Defaults to the notebook-level
            ``IMG_SIZE`` constant.

    Returns:
        A list of ``[image, class_label]`` pairs, where ``image`` is the RGB array
        produced by resizing to ``(img_size, img_size)``. Files that OpenCV cannot
        read are left out.
    """
    dataset = []
    skipped = 0

    for img in img_list:
        # Stop as soon as the optional cap is reached (only loaded images count)
        if max_images is not None and len(dataset) >= max_images:
            break

        # Resolve the notebook-level defaults lazily, so they are only needed when used
        if data_path is None:
            data_path = DATA_PATH
        if img_size is None:
            img_size = IMG_SIZE

        image = cv2.imread(os.path.join(data_path, img))

        # cv2.imread signals an unreadable file by returning None
        if image is None:
            skipped += 1
            continue

        # OpenCV loads images as BGR; convert to RGB before resizing
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (img_size, img_size))
        dataset.append([np.array(image), class_label])

    # Make dropped files visible instead of silently shrinking the class
    if skipped:
        print(f"WARNING: skipped {skipped} unreadable image(s) for class label {class_label}")

    return dataset

In [None]:
# Accumulates the [image, label] pairs of every class
dataset = []
# Optional cap on the number of images loaded per class; None loads every listed image
max_images_per_class = None

print("START building dataset")

for i, class_ in enumerate(CLASSES):
    # Progress line is printed for every class, including excluded ones
    print(f"[{i+1}/{len(CLASSES)}] adding {class_short2full[class_]} to dataset ...")

    if class_ in EXCLUDE_CLASSES:
        # Excluded classes contribute exactly one all-zero (black) placeholder image
        # carrying their label instead of real images
        empty_label = class_dict[class_]
        dataset.append([np.zeros((IMG_SIZE, IMG_SIZE, 3), dtype=np.uint8), empty_label])
    else:
        img_list = dict_img_list[class_]
        class_label = class_dict[class_]
        # Pass the configured cap through so that changing it above actually takes effect
        dataset += create_dataset(img_list, class_label, max_images=max_images_per_class)

# Shuffle with a fixed seed: the train/val/test split below uses random_state=42, but its
# outcome also depends on the input order, so an unseeded shuffle made every run differ.
random.Random(42).shuffle(dataset)

print("COMPLETE building dataset")


In [None]:
# Total number of [image, label] samples collected for the first dataset
len(dataset)

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Split configuration
image_size = 224   # edge length of the square images
num_classes = 7    # width of the one-hot label vectors
train_ratio = 0.7  # share of samples used for training
val_ratio = 0.15   # share of samples used for validation; the remainder becomes the test set

# Separate the [image, label] pairs into an image array and a label vector
train_x = np.array([sample[0] for sample in dataset]).reshape(-1, image_size, image_size, 3)
train_y = np.array([sample[1] for sample in dataset])

# Absolute sample counts per split; the test split absorbs any rounding remainder
num_images = len(train_x)
num_train = int(num_images * train_ratio)
num_val = int(num_images * val_ratio)
num_test = num_images - num_train - num_val

# Stage 1: take exactly `num_train` samples for training
x_train, x_remaining, y_train, y_remaining = train_test_split(
    train_x, train_y, train_size=num_train, random_state=42
)

# Stage 2: divide what is left so that the test split receives exactly `num_test` samples
x_val, x_test, y_val, y_test = train_test_split(
    x_remaining, y_remaining, test_size=num_test, random_state=42
)

# One-hot encode the integer labels of all three splits
y_train, y_val, y_test = (
    to_categorical(split_labels, num_classes)
    for split_labels in (y_train, y_val, y_test)
)

# Summary of the split sizes
print(f"Number of images - Train: {len(x_train)}, Validation: {len(x_val)}, Test: {len(x_test)}")


In [None]:
# Report the array shape of every split of the first dataset
for heading, split_array in (
    ("Shape of x_train:", x_train),
    ("Shape of x_val:", x_val),
    ("Shape of y_val:", y_val),
    ("Shape of y_train:", y_train),
    ("Shape of x_test:", x_test),
    ("Shape of y_test:", y_test),
):
    print(heading, split_array.shape)

In [None]:
# Number of output classes
NUM_CLASSES = 7

# Encode only labels that are still 1-D integer class indices.
# The split cell above already one-hot encodes y_train / y_val / y_test; passing an
# (N, NUM_CLASSES) one-hot matrix through to_categorical again would interpret every
# 0/1 entry as a class index and return a rank-3 (N, NUM_CLASSES, NUM_CLASSES) array,
# which breaks the per-class counting and the saved label files further down.
if y_train.ndim == 1:
    y_train = to_categorical(y_train, num_classes=NUM_CLASSES)
if y_val.ndim == 1:
    y_val = to_categorical(y_val, num_classes=NUM_CLASSES)
if y_test.ndim == 1:
    y_test = to_categorical(y_test, num_classes=NUM_CLASSES)


In [None]:
# Report the shape of the label arrays of the first dataset
for heading, label_array in (
    ("Shape of y_train:", y_train),
    ("Shape of y_val:", y_val),
    ("Shape of y_test:", y_test),
):
    print(heading, label_array.shape)

In [None]:
# Recover the integer class index of every one-hot encoded training label
y_train_labels = np.argmax(y_train, axis=1)

# Count the training samples per class. `minlength` keeps classes that have no training
# sample visible as 0 instead of silently dropping trailing classes from the report.
train_class_counts = np.bincount(y_train_labels, minlength=y_train.shape[-1])

# Print the number of training images for each class
for class_label, count in enumerate(train_class_counts):
    print(f"Class {class_label}: {count} images")

## Part 2 - Second Dataset Preparation

The same procedure as in part 1 was applied here, so it is not commented in detail.

In [None]:
# Metadata table of the second dataset; the cells below read its 'labels' and 'filename' columns
data_v2 = pd.read_csv("/kaggle/input/combined-tsne-new-1/combined_tsne_new-1.csv")

In [None]:
# Derive a 'class' column from 'labels': keep only the runs of ASCII letters found in
# each raw label string and join them with single spaces.
data_v2["class"] = data_v2["labels"].map(
    lambda raw_label: " ".join(re.findall("[a-zA-Z]+", raw_label))
)

In [None]:
# Every class code the label space covers, in label-index order (unchanged from Part 1)
CLASSES = [
    "D", "G", "C", "A", "H", "M", "N",
]

# For the second dataset only "D" and "N" images are loaded; all other classes are excluded.
# NOTE: this overwrites the Part 1 value of EXCLUDE_CLASSES.
EXCLUDE_CLASSES = [
    "G", "C", "A", "H", "M",
]

In [None]:
# Class code -> array of second-dataset filenames whose 'class' value equals exactly that code.
# NOTE: this overwrites the Part 1 value of dict_img_list.
dict_img_list = {
    short_code: data_v2.loc[data_v2["class"] == short_code, "filename"].values
    for short_code in class_short2full
}

In [None]:
# Accumulates the [image, label] pairs of the second dataset
dataset_v2 = []
# Optional cap on the number of images loaded per class; None loads every listed image
max_images_per_class = None

print("START building dataset")
for i, class_ in enumerate(CLASSES):
    # Progress line is printed for every class, including excluded ones
    print(f"[{i+1}/{len(CLASSES)}] adding {class_short2full[class_]} to dataset ...")
    if class_ in EXCLUDE_CLASSES:
        # One all-zero placeholder image per excluded class. It must go into dataset_v2:
        # appending to `dataset` would mutate the Part 1 list after it was already split
        # and leave this dataset without the placeholders.
        empty_label = class_dict[class_]
        dataset_v2.append([np.zeros((IMG_SIZE, IMG_SIZE, 3), dtype=np.uint8), empty_label])
    else:
        img_list = dict_img_list[class_]
        class_label = class_dict[class_]
        # NOTE(review): create_dataset resolves filenames against DATA_PATH by default -
        # confirm that the images listed in data_v2 are located in that directory.
        dataset_v2 += create_dataset(img_list, class_label, max_images=max_images_per_class)

# Shuffle with a fixed seed: the split below uses random_state=42, but its outcome also
# depends on the input order, so an unseeded shuffle made every run differ.
random.Random(42).shuffle(dataset_v2)

print("COMPLETE building dataset")

In [None]:
# Total number of [image, label] samples collected for the second dataset
len(dataset_v2)

In [None]:
# Split configuration (same values as for the first dataset)
image_size = 224   # edge length of the square images
num_classes = 7    # width of the one-hot label vectors
train_ratio = 0.7  # share of samples used for training
val_ratio = 0.15   # share of samples used for validation; the remainder becomes the test set

# Separate the [image, label] pairs into an image array and a label vector
train_x_v2 = np.array([sample[0] for sample in dataset_v2]).reshape(-1, image_size, image_size, 3)
train_y_v2 = np.array([sample[1] for sample in dataset_v2])

# Absolute sample counts per split; the test split absorbs any rounding remainder
num_images = len(train_x_v2)
num_train = int(num_images * train_ratio)
num_val = int(num_images * val_ratio)
num_test = num_images - num_train - num_val

# Stage 1: take exactly `num_train` samples for training
x_train_v2, x_remaining_v2, y_train_v2, y_remaining_v2 = train_test_split(
    train_x_v2, train_y_v2, train_size=num_train, random_state=42
)

# Stage 2: divide what is left so that the test split receives exactly `num_test` samples
x_val_v2, x_test_v2, y_val_v2, y_test_v2 = train_test_split(
    x_remaining_v2, y_remaining_v2, test_size=num_test, random_state=42
)

# One-hot encode the integer labels of all three splits
y_train_v2, y_val_v2, y_test_v2 = (
    to_categorical(split_labels, num_classes)
    for split_labels in (y_train_v2, y_val_v2, y_test_v2)
)

# Summary of the split sizes
print(f"Number of images - Train: {len(x_train_v2)}, Validation: {len(x_val_v2)}, Test: {len(x_test_v2)}")


In [None]:
# Report the array shape of every split of the second dataset
for heading, split_array in (
    ("Shape of x_train:", x_train_v2),
    ("Shape of x_val:", x_val_v2),
    ("Shape of y_val:", y_val_v2),
    ("Shape of y_train:", y_train_v2),
    ("Shape of x_test:", x_test_v2),
    ("Shape of y_test:", y_test_v2),
):
    print(heading, split_array.shape)

In [None]:
# Encode only labels that are still 1-D integer class indices.
# The split cell above already one-hot encodes the *_v2 labels; passing an
# (N, NUM_CLASSES) one-hot matrix through to_categorical again would interpret every
# 0/1 entry as a class index and return a rank-3 (N, NUM_CLASSES, NUM_CLASSES) array.
if y_train_v2.ndim == 1:
    y_train_v2 = to_categorical(y_train_v2, num_classes=NUM_CLASSES)
if y_val_v2.ndim == 1:
    y_val_v2 = to_categorical(y_val_v2, num_classes=NUM_CLASSES)
if y_test_v2.ndim == 1:
    y_test_v2 = to_categorical(y_test_v2, num_classes=NUM_CLASSES)

In [None]:
# Report the shape of the label arrays of the second dataset
for heading, label_array in (
    ("Shape of y_train:", y_train_v2),
    ("Shape of y_val:", y_val_v2),
    ("Shape of y_test:", y_test_v2),
):
    print(heading, label_array.shape)

In [None]:
# Recover the integer class index of every one-hot encoded training label (second dataset)
y_train_labels = np.argmax(y_train_v2, axis=1)

# Count the training samples per class. `minlength` keeps classes that have no training
# sample visible as 0 instead of silently dropping trailing classes from the report.
train_class_counts = np.bincount(y_train_labels, minlength=y_train_v2.shape[-1])

# Print the number of training images for each class
for class_label, count in enumerate(train_class_counts):
    print(f"Class {class_label}: {count} images")

# **Part 3 - Concatenating the Datasets**

In [None]:
import numpy as np

# Stack each split of the second dataset in front of the matching split of the first
# dataset (np.concatenate joins along the first axis, i.e. the sample axis, by default).

# Training split
combined_train_x = np.concatenate([x_train_v2, x_train])
combined_train_y = np.concatenate([y_train_v2, y_train])

# Validation split
combined_val_x = np.concatenate([x_val_v2, x_val])
combined_val_y = np.concatenate([y_val_v2, y_val])

# Test split
combined_test_x = np.concatenate([x_test_v2, x_test])
combined_test_y = np.concatenate([y_test_v2, y_test])

# Persist every combined split as a .npy file for later use
for npy_path, split_array in (
    ('/kaggle/working/x_train.npy', combined_train_x),
    ('/kaggle/working/y_train.npy', combined_train_y),
    ('/kaggle/working/x_val.npy', combined_val_x),
    ('/kaggle/working/y_val.npy', combined_val_y),
    ('/kaggle/working/x_test.npy', combined_test_x),
    ('/kaggle/working/y_test.npy', combined_test_y),
):
    np.save(npy_path, split_array)

In [None]:
# Recover the integer class index of every one-hot encoded label of the combined training split
combined_train_labels = np.argmax(combined_train_y, axis=1)

# Count the combined training samples per class. `minlength` keeps classes that have no
# training sample visible as 0 instead of silently dropping trailing classes from the report.
train_class_counts = np.bincount(combined_train_labels, minlength=combined_train_y.shape[-1])

# Print the number of combined training images for each class
for class_label, count in enumerate(train_class_counts):
    print(f"Class {class_label}: {count} images")