In [None]:
# Importing necessary libraries for ResNet combined synthetic and real data analysis
import pandas as pd
import numpy as np
import cv2
import os
import random
import tensorflow as tf
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Directory holding the preprocessed ODIR-5K images, and the square edge length
# (in pixels) every image is resized to before being fed to the network.
DATA_PATH = "/kaggle/input/ocular-disease-recognition-odir5k/preprocessed_images"
IMG_SIZE = 224

# Seed every RNG the notebook relies on (Python's `random` for the shuffles, NumPy,
# and TensorFlow for weight initialisation / dropout). Without this, the
# `random_state` passed to train_test_split further down is not enough to make the
# split repeatable, because the order of its input changes on every run.
# Bit-exact GPU results may still vary with hardware/driver.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Loading the dataset from a CSV file into a pandas DataFrame
data = pd.read_csv("/kaggle/input/ocular-disease-recognition-odir5k/full_df.csv")


In [None]:
# Short class code -> full disease name.
# Insertion order is significant: it determines the numeric index each class
# receives in class_dict below.
class_short2full = dict(
    G="Glaucoma",
    C="Cataract",
    A="Age Related Macular Degeneration",
    H="Hypertension",
    M="Myopia",
)

# Short class code -> integer label used as the model target
# (G=0, C=1, A=2, H=3, M=4), derived from the ordering above.
class_dict = {short_code: index for index, short_code in enumerate(class_short2full)}

# Derive a plain class code from the raw "labels" column: keep only the
# alphabetic runs of each value and join them with single spaces.
_LABEL_LETTERS = re.compile("[a-zA-Z]+")


def _letters_only(raw_label):
    """Return the alphabetic runs found in ``raw_label``, space-separated."""
    return " ".join(_LABEL_LETTERS.findall(raw_label))


data["class"] = data["labels"].apply(_letters_only)

# Class codes listed in the order of their numeric index in class_dict.
CLASSES = sorted(class_dict, key=class_dict.get)


## Part 1 - Real Dataset Preparation


In [None]:
# For every class code, collect the filenames of the rows in `data` whose
# "class" column equals exactly that code.
dict_img_list = {}
for short_code in class_short2full:
    rows_for_class = data["class"] == short_code
    dict_img_list[short_code] = data.loc[rows_for_class, "filename"].values

def create_dataset(img_list, class_label, max_images=None):
    """Load, colour-convert and resize the images of one class.

    Each filename in ``img_list`` is resolved against ``DATA_PATH`` and read
    with OpenCV. Successfully read images are converted from BGR to RGB and
    resized to ``IMG_SIZE`` x ``IMG_SIZE``.

    Args:
        img_list: Iterable of image filenames (relative to ``DATA_PATH``).
        class_label: Label stored alongside every image that is loaded.
        max_images: Optional cap on the number of images loaded. ``None``
            means no cap. Files that cannot be read do not count toward it.

    Returns:
        A list of ``[image, class_label]`` pairs, in ``img_list`` order.
        Unreadable files are left out; if any were left out, a one-line
        warning with the count and an example filename is printed so that a
        wrong directory or filename column does not go unnoticed.
    """
    dataset = []
    skipped = []

    for img in img_list:
        # Checked before touching the file so that max_images=0 reads nothing.
        if max_images is not None and len(dataset) >= max_images:
            break

        image = cv2.imread(os.path.join(DATA_PATH, img))

        # cv2.imread signals a missing/corrupt file by returning None
        # instead of raising; remember the name and move on.
        if image is None:
            skipped.append(img)
            continue

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))
        dataset.append([image, class_label])

    if skipped:
        print(
            f"  WARNING: skipped {len(skipped)} unreadable image(s) "
            f"for class label {class_label} (e.g. {skipped[0]})"
        )

    return dataset


In [None]:
print("START building real dataset")

# Accumulate [image, label] pairs class by class, following the order of CLASSES
dataset = []
for i, class_ in enumerate(CLASSES):
    # Progress line: 1-based position and the full disease name
    print(f"[{i+1}/{len(CLASSES)}] adding {class_short2full[class_]} to dataset ...")

    # Filenames and integer label for the class currently being processed
    img_list, class_label = dict_img_list[class_], class_dict[class_]

    # max_images=None: no per-class cap, every readable image is used
    dataset.extend(create_dataset(img_list, class_label, max_images=None))

# In-place shuffle so the samples are no longer grouped by class
random.shuffle(dataset)

print("COMPLETE building real dataset")
print(f"Total real dataset size: {len(dataset)}")


## Part 2 - Synthetic Dataset Preparation (t-SNE Selected)


In [None]:
# Read the CSV that lists the t-SNE selected images
data_v2 = pd.read_csv("/kaggle/input/combined-tsne-new-1/combined_tsne_new-1.csv")


def _alpha_tokens(raw_label):
    """Return the alphabetic runs found in ``raw_label``, space-separated."""
    return " ".join(re.findall("[a-zA-Z]+", raw_label))


# Same label clean-up as for the real data
data_v2["class"] = data_v2["labels"].apply(_alpha_tokens)

# Class code -> filenames of the rows whose "class" equals exactly that code
dict_img_list_v2 = {}
for short_code in class_short2full:
    rows_for_class = data_v2["class"] == short_code
    dict_img_list_v2[short_code] = data_v2.loc[rows_for_class, "filename"].values

print("START building synthetic dataset")

# NOTE(review): create_dataset resolves these filenames against DATA_PATH, and
# files that cannot be read there are skipped - confirm the filenames in this
# CSV actually resolve under DATA_PATH (compare the size printed below with
# the number of CSV rows).
dataset_v2 = []
for i, class_ in enumerate(CLASSES):
    print(f"[{i+1}/{len(CLASSES)}] adding {class_short2full[class_]} to synthetic dataset ...")

    # Filenames and integer label for the class currently being processed
    img_list, class_label = dict_img_list_v2[class_], class_dict[class_]

    # max_images=None: no per-class cap
    dataset_v2.extend(create_dataset(img_list, class_label, max_images=None))

# In-place shuffle so the samples are no longer grouped by class
random.shuffle(dataset_v2)

print("COMPLETE building synthetic dataset")
print(f"Total synthetic dataset size: {len(dataset_v2)}")


## Part 3 - Combining Real and Synthetic Data


In [None]:
# Concatenate the real and synthetic datasets into a single pool of [image, label] pairs
combined_dataset = dataset + dataset_v2

print(f"Combined dataset size: {len(combined_dataset)}")
print(f"Real data: {len(dataset)} samples")
print(f"Synthetic data: {len(dataset_v2)} samples")

# Fail early with an actionable message instead of an opaque error from the splitter
if not combined_dataset:
    raise ValueError(
        "Combined dataset is empty - check DATA_PATH and the filename columns of both CSVs"
    )

# Shuffle the combined dataset
random.shuffle(combined_dataset)

# Parameters for data splitting.
# image_size and num_classes are taken from the loader constants instead of being
# repeated as literals: if IMG_SIZE changed while a literal 224 stayed here, the
# reshape below would silently regroup pixels into the wrong number of images.
image_size = IMG_SIZE
num_classes = len(CLASSES)
train_ratio = 0.7
val_ratio = 0.15

# Preparing predictors (N, H, W, 3) and integer targets (N,)
train_x = np.array([image for image, _ in combined_dataset]).reshape(-1, image_size, image_size, 3)
train_y = np.array([label for _, label in combined_dataset])
assert len(train_x) == len(train_y), "image/label count mismatch after stacking"

# Calculating the number of images for each split; the test split takes the remainder
num_images = len(train_x)
num_train = int(num_images * train_ratio)
num_val = int(num_images * val_ratio)
num_test = num_images - num_train - num_val

# NOTE(review): the splits are not stratified by class, and real and synthetic
# samples are mixed before splitting, so validation/test may contain synthetic
# images - confirm this is the intended evaluation protocol.

# Splitting the dataset into train and remaining (validation + test)
x_train, x_remaining, y_train, y_remaining = train_test_split(
    train_x, train_y, train_size=num_train, random_state=42
)

# Further splitting the remaining data into validation and test
x_val, x_test, y_val, y_test = train_test_split(
    x_remaining, y_remaining, test_size=num_test, random_state=42
)
assert len(x_train) + len(x_val) + len(x_test) == num_images, "split sizes do not add up"

# Convert integer labels to one-hot vectors of length num_classes
y_train = to_categorical(y_train, num_classes)
y_val = to_categorical(y_val, num_classes)
y_test = to_categorical(y_test, num_classes)

# Print the number of images in each split
print(f"Number of images - Train: {len(x_train)}, Validation: {len(x_val)}, Test: {len(x_test)}")


## Part 4 - ResNet50 Model Training on Combined Dataset


In [None]:
# Keras building blocks used by this cell (tfa is used by the metrics defined in the next cell)
from tensorflow.keras import Sequential
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, Dropout, Flatten, GlobalAveragePooling2D
import tensorflow_addons as tfa

# Input edge length and output width come from the notebook-level constants so the
# network cannot drift from the data loader (IMG_SIZE) or the label set (CLASSES).
image_size = IMG_SIZE
num_outputs = len(CLASSES)

# Load the ResNet50 model pre-trained on ImageNet, excluding the top layer
# NOTE(review): the images built above are raw RGB arrays as returned by OpenCV and no
# tf.keras.applications.resnet50.preprocess_input call is visible in this notebook -
# confirm that feeding unnormalised pixels to ImageNet weights is intentional.
resnet = ResNet50(weights="imagenet", include_top=False, input_shape=(image_size, image_size, 3))

# Fine-tune the whole backbone: marking the model trainable applies to all of its layers
resnet.trainable = True

model = Sequential(
    [
        # Pre-trained convolutional backbone (without its classification head)
        resnet,
        # Dropout (rate 0.5) applied to the backbone's feature maps, before pooling
        Dropout(0.5),
        # Average each feature map over its spatial dimensions -> one vector per image
        GlobalAveragePooling2D(),
        # The pooled output is already flat, so this layer changes nothing; it is
        # kept so the layer stack stays the same as before.
        Flatten(),
        # Normalise the pooled features before the dense head
        tf.keras.layers.BatchNormalization(),
        # Dense head: 512 -> 256 -> 128 units with ReLU
        Dense(512, activation="relu"),
        Dense(256, activation="relu"),
        Dense(128, activation="relu"),
        # One softmax output per class
        Dense(num_outputs, activation="softmax"),
    ]
)

# Display the summary of the model architecture
model.summary()


In [None]:
# Callback classes used below
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Define the metrics to be tracked during training
METRICS = [
    tf.keras.metrics.AUC(name="auc"),  # Area under the ROC curve
    tf.keras.metrics.CategoricalAccuracy(name="acc"),  # Accuracy on one-hot targets
    tfa.metrics.F1Score(num_classes=len(CLASSES), average="weighted", name="f1"),  # Weighted F1
    tf.keras.metrics.AUC(name="prc", curve="PR"),  # Area under the Precision-Recall curve
]

# Compile the model with Adam optimizer, categorical crossentropy loss, and the specified metrics
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss="categorical_crossentropy",
    metrics=METRICS,
)

# Upper bound on the number of training epochs
epochs = 50

# Early stopping waits this many epochs without val_loss improvement.
early_stopping_patience = epochs // 5
# The LR scheduler must react sooner than early stopping. It previously waited 15
# epochs against early stopping's 10 on the same monitor, so training was usually
# stopped before the learning rate could ever be reduced.
reduce_lr_patience = max(1, early_stopping_patience // 2)

# Halve the learning rate when val_loss plateaus
reduce_lr = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,  # New LR = old LR * factor
    patience=reduce_lr_patience,  # Epochs without improvement before reducing LR
    verbose=1,  # Print a message when the LR is reduced
    min_delta=0.0001,  # Minimum change that counts as an improvement
    cooldown=0,  # Epochs to wait after a reduction before monitoring resumes
    min_lr=1e-7,  # Lower bound on the learning rate
)

# Stop when val_loss has not improved for early_stopping_patience epochs
early_stopping_cb = EarlyStopping(
    monitor="val_loss",
    patience=early_stopping_patience,
    restore_best_weights=True,  # Roll back to the weights of the best epoch
    verbose=1,
)

# Keep only the best model (by val_loss) on disk
checkpoint_cb = ModelCheckpoint(
    "/kaggle/working/resnet_combined_model.h5",
    monitor="val_loss",
    save_best_only=True,
)

# List of callbacks to be used during model training
callbacks = [checkpoint_cb, early_stopping_cb, reduce_lr]

# Train the model with callbacks
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=epochs,
    validation_data=(x_val, y_val),
    callbacks=callbacks,
)

# Summarise the run: printing the History object itself only shows its repr,
# so tabulate the per-epoch metrics it recorded instead.
history_df = pd.DataFrame(history.history)
best_epoch = int(history_df["val_loss"].idxmin())
print(f"Best epoch: {best_epoch + 1} (val_loss={history_df.loc[best_epoch, 'val_loss']:.4f})")
print(history_df.tail())


In [None]:
# Persist every split (images and their one-hot labels) as .npy files for later reuse.
# Keys are the destination paths; entries are written in the order listed.
arrays_by_path = {
    '/kaggle/working/x_train_combined.npy': x_train,
    '/kaggle/working/y_train_combined.npy': y_train,
    '/kaggle/working/x_val_combined.npy': x_val,
    '/kaggle/working/y_val_combined.npy': y_val,
    '/kaggle/working/x_test_combined.npy': x_test,
    '/kaggle/working/y_test_combined.npy': y_test,
}
for npy_path, split_array in arrays_by_path.items():
    np.save(npy_path, split_array)

print("Combined datasets saved successfully!")
