In [None]:
import os
import re
import cv2
import math
import random
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib as mpl
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from collections import Counter
from sklearn.manifold import TSNE
from matplotlib.patches import Rectangle
from matplotlib.colors import ListedColormap
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

## Loading Prepared Dataset

In [2]:
# Read the pre-split arrays from the Kaggle input dataset, one (inputs, labels) pair per split.
# Training split
x_train, y_train = (
    np.load('/kaggle/input/5-class-train-test-val/x_train.npy'),
    np.load('/kaggle/input/5-class-train-test-val/y_train.npy'),
)
# Validation split
x_val, y_val = (
    np.load('/kaggle/input/5-class-train-test-val/x_val.npy'),
    np.load('/kaggle/input/5-class-train-test-val/y_val.npy'),
)
# Test split
x_test, y_test = (
    np.load('/kaggle/input/5-class-train-test-val/x_test.npy'),
    np.load('/kaggle/input/5-class-train-test-val/y_test.npy'),
)

In [None]:
# Short class code -> full disease name.
# NOTE(review): the following cell rebuilds both `class_short2full` and `class_dict`
# (using "AMD" as the name for "A"), so when the notebook runs top to bottom these
# definitions are overwritten.
class_short2full = dict([
    ("G", "Glaucoma"),
    ("C", "Cataract"),
    ("A", "Age Related Macular Degeneration"),
    ("H", "Hypertension"),
    ("M", "Myopia"),
])

# Short class code -> integer class index (G=0 through M=4)
class_dict = dict(G=0, C=1, A=2, H=3, M=4)

In [5]:
# Full class names; their position in this list is the class index
labels_long = ["Glaucoma", "Cataract", "AMD", "Hypertension", "Myopia"]

# One-letter code per class, taken from the initial of each full name
labels_short = [name[0] for name in labels_long]

# Short code -> full name, built by pairing the two lists position by position
class_short2full = dict(zip(labels_short, labels_long))

# Short code -> 0-based index, in the insertion order of class_short2full
class_dict = {code: idx for idx, code in enumerate(class_short2full)}

# Inverse lookup: index -> short code
class_dict_rev = {idx: code for code, idx in class_dict.items()}

# Total number of distinct classes
NUM_CLASSES = len(class_dict)

## Model Training

In [None]:
# Side length, in pixels, of the square 3-channel inputs the network is built for
image_size = 224

# Import necessary layers from TensorFlow Keras
from tensorflow.keras.layers import Dropout, GlobalAveragePooling2D
from tensorflow.keras.applications.vgg19 import VGG19

# VGG19 convolutional base initialised with ImageNet weights; the original
# classifier head is dropped (include_top=False) so a custom head can be attached.
vgg = VGG19(weights="imagenet", include_top=False, input_shape=(image_size, image_size, 3))

# Fine-tune the whole backbone: every VGG19 layer stays trainable
for layer in vgg.layers:
    layer.trainable = True

# Import the Sequential API for model creation
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense

# Stack: VGG19 features -> dropout -> global average pooling -> dense head
model = Sequential()
model.add(vgg)

# Dropout (rate 0.6) applied to the backbone's feature maps to limit overfitting
model.add(Dropout(0.6))
# Average each feature map down to a single value
model.add(GlobalAveragePooling2D())

# NOTE(review): the pooled output should already be 2-D, so this Flatten is likely a
# no-op; it is kept so the layer stack matches previously saved checkpoints — confirm
# before removing.
model.add(Flatten())

# Normalise the pooled features before the dense head
model.add(tf.keras.layers.BatchNormalization())

# Dense head with progressively narrower ReLU layers
model.add(Dense(256, activation="relu"))
# A Dropout layer could be inserted here to further reduce overfitting if desired
model.add(Dense(128, activation="relu"))
model.add(Dense(64, activation="relu"))

# Output layer: one softmax unit per class. The width comes from NUM_CLASSES rather
# than a literal 5 so it cannot drift from the label tables and from the F1 metric,
# which is also configured with NUM_CLASSES.
model.add(Dense(NUM_CLASSES, activation="softmax"))

# Display the summary of the model architecture
model.summary()

In [None]:
# Upper bound on the number of training epochs
epochs = 35

# Callback classes used below
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Shrink the learning rate when the monitored quantity stops improving.
# No `monitor` is passed, so the callback's default monitored quantity is used.
reduce_lr = ReduceLROnPlateau(
    factor=0.75,       # new_lr = lr * factor each time the callback fires
    patience=10,       # epochs without improvement tolerated before reducing
    min_delta=0.0001,  # smallest change that counts as an improvement
    cooldown=0,        # epochs to wait after a reduction before monitoring resumes
    min_lr=1e-6,       # floor below which the learning rate is never pushed
    verbose=1,         # log a message whenever the rate is reduced
)

# Stop training after `patience` epochs without improvement and roll back to the best weights.
# With epochs = 35 the patience evaluates to 3.
early_stopping_cb = EarlyStopping(
    patience=epochs // 10,
    restore_best_weights=True,
    verbose=1,
)

# Write a checkpoint only when the monitored quantity improves on the best seen so far
checkpoint_cb = ModelCheckpoint(
    "/kaggle/working/ocir_model_initial.h5",
    save_best_only=True,
)

# Callbacks actually handed to model.fit. `early_stopping_cb` is built above but
# deliberately left out; add it to this list to enable early stopping.
callbacks = [checkpoint_cb, reduce_lr]

In [None]:
# Metrics tracked during training and evaluation. The `name` of each metric is the
# key under which it appears in the training history (with a "val_" prefix for
# validation), and the list order is the order model.evaluate() reports them in.
METRICS = [
    tf.keras.metrics.AUC(name="auc"),  # Area under the ROC curve
    # CategoricalAccuracy compares the argmax of the prediction with the argmax of the
    # one-hot label, which is the right notion of accuracy for a softmax classifier.
    # BinaryAccuracy was used here before: it thresholds every output element at 0.5
    # and scores the elements independently, so a 5-class model that predicts nothing
    # confidently would still score about 80%.
    tf.keras.metrics.CategoricalAccuracy(name="acc"),
    tfa.metrics.F1Score(num_classes=NUM_CLASSES, average="weighted", name="f1"),  # Class-weighted F1
    tf.keras.metrics.AUC(name="prc", curve="PR"),  # Area under the precision-recall curve
]

# Compile with Adam at a small learning rate (suited to fine-tuning pretrained weights)
# and categorical cross-entropy, the loss that pairs with one-hot labels and softmax.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss="categorical_crossentropy",
    metrics=METRICS,
)

In [None]:
# Fit on the training split and validate on (x_val, y_val); `callbacks` and `epochs`
# come from the callback-configuration cell.
history = model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=epochs,
    validation_data=(x_val, y_val),
    callbacks=callbacks,
)
# Prints the repr of the returned object; the per-epoch values live in history.history
print(history)

In [None]:
import matplotlib.pyplot as plt

# Per-epoch series recorded by model.fit; the keys are the metric names given at
# compile time, with a "val_" prefix for the validation counterpart.
training_accuracy = history.history['acc']
validation_accuracy = history.history['val_acc']

training_loss = history.history['loss']
validation_loss = history.history['val_loss']

training_auc = history.history['auc']
validation_auc = history.history['val_auc']

training_f1 = history.history['f1']
validation_f1 = history.history['val_f1']

training_prc = history.history['prc']
validation_prc = history.history['val_prc']

# (training series, validation series, display name) for each figure, in plotting order
_curves = [
    (training_accuracy, validation_accuracy, 'Accuracy'),
    (training_loss, validation_loss, 'Loss'),
    (training_auc, validation_auc, 'AUC'),
    (training_f1, validation_f1, 'F1'),
    (training_prc, validation_prc, 'PRC'),
]

# One figure per metric: training and validation curves against the epoch index
for _train_series, _val_series, _metric_name in _curves:
    plt.plot(_train_series, label=f'Training {_metric_name}')
    plt.plot(_val_series, label=f'Validation {_metric_name}')
    plt.title(f'Training and Validation {_metric_name}')
    plt.xlabel('Epochs')
    plt.ylabel(_metric_name)
    plt.legend()
    plt.show()

In [None]:
# Score the trained model on the held-out test split. The unpacking below assumes the
# result is the loss followed by the compiled metrics in the order of the METRICS list
# (auc, acc, f1, prc).
_test_scores = model.evaluate(x_test, y_test)
loss, auc, acc, f1, prc = _test_scores

# Report the test accuracy
print("Accuracy:", acc)

## Post-Processing and Evaluation

In [None]:
# Seed value intended for reproducibility.
# NOTE(review): it is not passed to any random number generator in the cells visible here.
SEED = 42

# Central colour palette shared by the plots below
COLORS = dict(
    fig_bg="#f6f5f5",     # figure / axes background
    plot_neut="#ddbea9",  # neutral plot colour
    plot_text="#343a40",  # text colour
    # ordered hex colours used for per-class colours and for the main colormap
    cmap_color_list=[
        "#001219", "#005F73", "#0A9396", "#94D2BD", "#E9D8A6",
        "#EE9B00", "#CA6702", "#BB3E03", "#AE2012", "#9B2226",
    ],
    # one colour per data split
    split=dict(
        train="#264653",
        val="#2a9d8f",
        test="#e9c46a",
    ),
)

# One colour per class code: zip pairs each key of class_short2full with the next
# palette colour and stops once the class codes run out.
COLORS["class"] = dict(zip(class_short2full, COLORS["cmap_color_list"]))

# Continuous colormaps interpolated from hex colour lists
COLORS["cmap"] = mpl.colors.LinearSegmentedColormap.from_list("", COLORS["cmap_color_list"])
COLORS["cmap_pos"] = mpl.colors.LinearSegmentedColormap.from_list(
    "",
    ["#F0F3F8", "#D1DBE9", "#A2B7D2", "#7493BC", "#6487B4", "#3D5A80"],
)

# Class colours as a plain list, in class order
colors_class_list = [*COLORS["class"].values()]

# Text-style keyword sets for plot titles, labels and annotations.
# Every entry shares the serif family and normal style; only weight and size differ,
# so the table below lists (entry name, weight, size) and the comprehension expands
# each row into the full keyword dictionary.
FONT_KW = {
    entry: {
        "fontname": "serif",
        "weight": weight,
        "size": size,
        "style": "normal",
    }
    for entry, weight, size in (
        ("plot_title",          "bold",   "25"),  # main plot title
        ("plot_title_small",    "bold",   "16"),  # smaller plot title
        ("plot_subtitle",       "bold",   "12"),  # plot subtitle
        ("subplot_title",       "bold",   "18"),  # subplot title
        ("subplot_title_small", "bold",   "12"),  # smaller subplot title
        ("plot_label",          "bold",   "16"),  # axis / tick labels
        ("plot_label_small",    "bold",   "12"),  # smaller axis / tick labels
        ("plot_text",           "normal", "12"),  # general plot text
        ("plot_text_small",     "normal", "8"),   # smaller plot text
    )
}



In [None]:
def _draw_confusion_heatmap(ax, matrix, fmt, title, tick_names, cmap):
    """Draw one annotated confusion-matrix heatmap on ``ax`` using the notebook styling."""
    sns.heatmap(
        matrix,
        annot=True,                      # write the value inside every cell
        annot_kws=FONT_KW["plot_text"],
        fmt=fmt,
        linewidths=3.0,                  # gap between cells ...
        linecolor=COLORS["fig_bg"],      # ... drawn in the background colour
        cmap=cmap,
        cbar=False,
        square=True,
        xticklabels=tick_names,
        yticklabels=tick_names,
        ax=ax,
    )

    ax.set_title(title, **FONT_KW["subplot_title_small"])
    ax.set_xlabel("Predicted Labels", **FONT_KW["plot_label_small"])
    ax.set_ylabel("True Labels", **FONT_KW["plot_label_small"])
    ax.set_facecolor(COLORS["fig_bg"])
    ax.tick_params(axis="both", length=0)

    # Re-apply the tick labels at the cell centres (offset 0.5) with the label font
    centres = np.arange(len(tick_names)) + 0.5
    ax.set_yticks(centres, tick_names, **FONT_KW["plot_label_small"])
    ax.set_xticks(centres, tick_names, **FONT_KW["plot_label_small"])

    # Tint each class name with that class's colour on both axes
    for class_, i in class_dict.items():
        ax.get_xticklabels()[i].set_color(COLORS["class"][class_])
        ax.get_yticklabels()[i].set_color(COLORS["class"][class_])


def plot_confusion_matrix(y_true, y_pred, figsize=(16,6), cmap="Blues", suptitle=None):
    """Plot the raw and the row-normalised confusion matrix side by side.

    Args:
        y_true: Integer class indices of the ground truth, one per sample.
        y_pred: Integer class indices predicted by the model, one per sample.
        figsize: Size of the two-panel figure, passed to ``plt.subplots``.
        cmap: Colormap used for both heatmaps.
        suptitle: Optional title drawn above both panels.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Class codes ordered by their index, so matrix rows/columns and tick labels line up
    class_codes = sorted(class_dict, key=class_dict.get)
    class_indices = [class_dict[code] for code in class_codes]
    tick_names = [class_short2full[code] for code in class_codes]

    # Passing `labels` pins the matrix to one row/column per known class. Without it,
    # a class missing from both y_true and y_pred shrinks the matrix and the tick
    # labels no longer match the cells.
    cm = confusion_matrix(y_true, y_pred, labels=class_indices)
    cm_norm = confusion_matrix(y_true, y_pred, labels=class_indices, normalize="true")

    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=figsize)
    fig.patch.set_facecolor(COLORS["fig_bg"])

    # Left panel: raw counts
    _draw_confusion_heatmap(ax1, cm, "d", "Confusion Matrix", tick_names, cmap)

    # Right panel: per-true-class rates. Drop the decimal only when every rate is
    # exactly 0 or 1 (i.e. already a whole percentage).
    pct_fmt = ".0%" if np.allclose(cm_norm, cm_norm.astype(int)) else ".1%"
    _draw_confusion_heatmap(ax2, cm_norm, pct_fmt, "Confusion Matrix (Normalized)", tick_names, cmap)

    # If a suptitle is provided, display it at the top of the figure
    if suptitle is not None:
        fig.suptitle(suptitle, y=0.98, **FONT_KW["plot_title_small"])

    plt.show()

In [None]:
# Run the model on the validation inputs and keep, for every sample, the index of the
# highest-scoring class.
y_val_pred = model.predict(x_val).argmax(axis=1)

In [None]:
# Confusion matrices (raw and normalised) for the validation split.
# The argmax over axis 1 turns each row of y_val into a single class index
# (presumably y_val is one-hot encoded — confirm against the dataset), so it can be
# compared with the predicted indices.
plot_confusion_matrix(
    y_true=np.argmax(y_val, axis=1),
    y_pred=y_val_pred,
    figsize=(15, 4),
    cmap=COLORS["cmap_pos"],
    suptitle="Model Performance (Validation)",
)

In [None]:
# Per-class scores for every test sample; kept because the ROC AUC computation in the
# next cell needs the scores rather than the hard labels.
y_test_prob = model.predict(x_test)

# Hard prediction: index of the highest-scoring class for each sample
y_test_pred = y_test_prob.argmax(axis=1)


In [None]:
# Integer ground-truth labels for the test split, derived once and reused below
y_test_true = np.argmax(y_test, axis=1)

# Confusion matrices (raw and normalised) for the test split
plot_confusion_matrix(
    y_test_true,
    y_test_pred,
    figsize=(15, 4),
    cmap=COLORS["cmap_pos"],
    suptitle="Model Performance (Test)",
)

# Precision / recall / F1 per class, labelled with the full class names in index order
report = classification_report(
    y_test_true,
    y_test_pred,
    target_names=[class_short2full[k] for k in class_dict.keys()],
)

# Multiclass ROC AUC (one-vs-rest), computed from the predicted scores rather than
# the hard labels
test_roc_auc = roc_auc_score(y_test_true, y_test_prob, multi_class='ovr')

print(report)
print(f"     roc auc       {np.round(test_roc_auc, 2)}")


In [None]:
import pickle  # Also used by the later cells that load .pkl files

# Folder that is scanned for pickled training histories
folder_path = "/kaggle/working/"

# Every ".pkl" file in the folder. The listing is sorted so the report order is
# stable, since os.listdir returns entries in arbitrary order.
filepaths = [
    os.path.join(folder_path, file)
    for file in sorted(os.listdir(folder_path))
    if file.endswith(".pkl")
]

# Print the best value of every recorded metric, file by file
for filepath in filepaths:
    filename = os.path.basename(filepath)

    # NOTE(security): pickle.load can execute arbitrary code; only point this at
    # files produced by your own runs.
    with open(filepath, "rb") as f:
        data = pickle.load(f)

    # Accept both shapes a saved history may have: a plain {metric: [values]} dict, or
    # an object exposing that dict as `.history` (which is how the cell that exports
    # history.pkl to .mat reads it).
    data = getattr(data, "history", data)

    # Best value per metric: the minimum for losses, the maximum for everything else.
    # Empty series are skipped because min()/max() would raise on them.
    results = {
        key: (min(values) if key in ("val_loss", "loss") else max(values))
        for key, values in data.items()
        if len(values) > 0
    }

    print(f"Results for {filename}:")
    for key, value in results.items():
        print(f" {key}: {value:.4f}")
    print("------\n")


In [None]:
import scipy.io as sio  # Import the scipy.io module for saving MATLAB .mat files

# Function to convert .pkl files in a directory to .mat files
def convert_pkl_to_mat(directory):
    """Convert every ``.pkl`` file directly inside ``directory`` to a MATLAB ``.mat`` file.

    Each pickle is loaded and written next to the original under the same base name
    with a ``.mat`` extension, stored under the single variable name ``"data"``.
    Files without the ``.pkl`` suffix are ignored and subdirectories are not searched.

    Args:
        directory: Path of the folder to scan.

    Returns:
        None. One confirmation line is printed per converted file.
    """
    for filename in os.listdir(directory):
        # Only pickle files are converted
        if not filename.endswith(".pkl"):
            continue

        filepath = os.path.join(directory, filename)

        # NOTE(security): pickle.load can execute arbitrary code; only use this on
        # trusted files.
        with open(filepath, "rb") as file:
            data = pickle.load(file)

        # Same location and base name, ".mat" extension
        mat_filepath = os.path.splitext(filepath)[0] + ".mat"

        # `sio` is already the scipy.io module, so savemat is called on it directly.
        # The previous `sio.io.savemat` raised AttributeError on every conversion.
        sio.savemat(mat_filepath, {"data": data})

        print(f"Converted {filename} to {os.path.basename(mat_filepath)}")

# Entry point of the script
# The guard holds when this code runs as the top-level module; the conversion is then
# applied to every .pkl file found directly in the Kaggle working directory.
if __name__ == "__main__":
    dir_path = "/kaggle/working/"
    convert_pkl_to_mat(dir_path)


In [None]:
# Load the training history from 'history.pkl' when that file exists. No cell in this
# notebook writes it, so when it is absent the `history` already in memory from
# model.fit is used instead of failing with FileNotFoundError.
if os.path.exists('history.pkl'):
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    with open('history.pkl', 'rb') as file:
        history = pickle.load(file)

# The history may be an object exposing the per-epoch metrics as `.history`, or the
# bare {metric: [values]} dict (the shape the pickle-summary cell expects); accept both.
history_metrics = getattr(history, 'history', history)

# Re-key the per-epoch series under descriptive names for the MATLAB export
data = {
    # Accuracy
    'training_accuracy': history_metrics['acc'],
    'validation_accuracy': history_metrics['val_acc'],

    # Loss
    'training_loss': history_metrics['loss'],
    'validation_loss': history_metrics['val_loss'],

    # AUC (area under the ROC curve)
    'training_auc': history_metrics['auc'],
    'validation_auc': history_metrics['val_auc'],

    # F1 score
    'training_f1': history_metrics['f1'],
    'validation_f1': history_metrics['val_f1'],

    # PRC (area under the precision-recall curve)
    'training_prc': history_metrics['prc'],
    'validation_prc': history_metrics['val_prc'],
}

# Write everything to a single .mat file under the variable name 'data'
sio.savemat('history.mat', {'data': data})


In [None]:
# Save the model
# Writes the trained model to "ocular_disease_model.h5"; the path is relative, so the
# file lands in the current working directory.
model.save("ocular_disease_model.h5")

# Reached only if model.save() returned without raising
print("Model saved successfully!")