In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import string
# import re
# from nltk.corpus import stopwords
# from wordcloud import WordCloud, STOPWORDS
# from nltk.stem import SnowballStemmer

import os
import itertools
import cv2
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix , classification_report

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense,  BatchNormalization, Activation, Dropout  
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam , Adamax
from tensorflow.keras import regularizers

# import warnings
# warnings.filterwarnings("ignore")





2025-03-19 21:56:53.122111: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742446613.133820     587 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742446613.137038     587 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1742446613.145997     587 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742446613.146018     587 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1742446613.146019     587 computation_placer.cc:177] computation placer alr

In [2]:
# Load the label table and keep only the patients whose two fundus images are on disk.
# (Update the two paths below if the dataset lives somewhere else.)
dataset_path = "ocular-disease-recognition/full_df.csv"
image_dir = "ocular-disease-recognition/preprocessed_images"

# Read the CSV file into a DataFrame
df = pd.read_csv(dataset_path)


def _image_exists(file_name):
    """Return True if `file_name` is present inside `image_dir`."""
    return os.path.exists(os.path.join(image_dir, file_name))


# One boolean per row: True only when both the left and the right image exist.
# Built as an explicit bool array (instead of a row-wise DataFrame.apply) so the
# mask is always a proper boolean indexer, even for an empty table.
has_both_images = np.array(
    [
        _image_exists(left) and _image_exists(right)
        for left, right in zip(df['Left-Fundus'], df['Right-Fundus'])
    ],
    dtype=bool,
)

df = df[has_both_images].reset_index(drop=True)

In [3]:
from sklearn.preprocessing import LabelEncoder

# Learn an integer code for every distinct value in `labels` and keep the fitted
# encoder around so the codes can be mapped back to label strings later.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df['labels'])
df['labels_encoded'] = encoded_labels

# Show the new column as the cell output.
df['labels_encoded']

0       6
1       6
2       2
3       2
4       2
       ..
6063    2
6064    2
6065    2
6066    2
6067    4
Name: labels_encoded, Length: 6068, dtype: int64

In [4]:
# Number of distinct encoded classes.
df['labels_encoded'].nunique()

8

In [5]:
# Map the integer codes back through the fitted encoder to recover the label strings.
decoded = label_encoder.inverse_transform(df['labels_encoded'])
df['decoded_labels'] = decoded

# Show the decoded column as the cell output.
df['decoded_labels']

0       ['N']
1       ['N']
2       ['D']
3       ['D']
4       ['D']
        ...  
6063    ['D']
6064    ['D']
6065    ['D']
6066    ['D']
6067    ['H']
Name: decoded_labels, Length: 6068, dtype: object

In [6]:
# Group every image under `train_dir` into two clusters based on its mean colour.
# Result: `eyes`, a DataFrame with one row per readable image (file_path, cluster).
train_dir = "ocular-disease-recognition/preprocessed_images"

file_paths = []
features = []
unreadable_paths = []

for root, dirs, files in os.walk(train_dir):
    # os.walk yields entries in filesystem order; sorting makes the row order of
    # `eyes` (and so any seeded split of it) identical on every machine.
    dirs.sort()
    for file in sorted(files):
        file_path = os.path.join(root, file)

        # cv2.imread returns None (it does not raise) when a file is corrupt or
        # not an image; skip those instead of crashing inside cv2.resize.
        image = cv2.imread(file_path)
        if image is None:
            unreadable_paths.append(file_path)
            continue

        image = cv2.resize(image, (100, 100))
        # Average over rows, then over columns -> one mean value per colour channel.
        avg_color = image.mean(axis=0).mean(axis=0)

        # Record path and feature together so the two lists can never get out of step.
        file_paths.append(file_path)
        features.append(avg_color)

if unreadable_paths:
    print(f"Skipped {len(unreadable_paths)} unreadable file(s) under {train_dir}")
if not features:
    # Fail with a clear message rather than a cryptic error from PCA on an empty array.
    raise FileNotFoundError(f"No readable images found under {train_dir}")

features = np.array(features)

# Project the per-channel means onto their two principal components ...
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(features)

# ... and split the images into two groups in that 2-D space.
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(reduced_features)

eyes = pd.DataFrame({
    'file_path': file_paths,
    'cluster': clusters
})


In [7]:
# Give the two k-means cluster ids the class names used from here on.
eyes['cluster'] = eyes['cluster'].replace({0: 'X', 1: 'N'})

# Stratified 60/20/20 split: set 40% aside as a holdout, then cut the holdout in half.
train_data, holdout = train_test_split(
    eyes, test_size=0.4, random_state=42, stratify=eyes['cluster']
)
test, valid_data = train_test_split(
    holdout, test_size=0.5, random_state=42, stratify=holdout['cluster']
)

print(f"Training set shape: {train_data.shape}")
print(f"Validation set shape: {valid_data.shape}")
print(f"Testing set shape: {test.shape}")

# Numeric target for the classifier: 'X' -> 0, anything else -> 1.
for split in (train_data, valid_data, test):
    split['label'] = (split['cluster'] != 'X').astype('int64')

# Function to parse and preprocess images
def parse_image(filename, label):
    """Read one JPEG file and return it as a 224x224, 3-channel tensor scaled by 1/255.

    Args:
        filename: Path of the image file to read.
        label: Value returned unchanged alongside the image.

    Returns:
        Tuple ``(image, label)``.
    """
    raw_bytes = tf.io.read_file(filename)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(pixels, (224, 224))
    # Normalize
    return resized / 255.0, label

# Function to create TF dataset with batching and prefetching
def prepare_dataset(df, batch_size=8, shuffle=True):
    """Build a batched, prefetched ``tf.data`` pipeline of (image, label) pairs.

    Args:
        df: DataFrame with a ``file_path`` column (image paths) and a
            ``label`` column (read as-is; no conversion is done here).
        batch_size: Number of examples per batch.
        shuffle: When true, shuffle the examples with a buffer covering the
            whole frame.

    Returns:
        A ``tf.data.Dataset`` whose elements are batches of ``parse_image``
        outputs.
    """
    file_paths = df['file_path'].values
    labels = df['label'].values

    dataset = tf.data.Dataset.from_tensor_slices((file_paths, labels))

    if shuffle:
        # Shuffle the (path, label) pairs *before* decoding. Shuffling after
        # the map would make the buffer hold len(df) decoded 224x224x3 float
        # images in memory and delay the first batch until all are loaded.
        dataset = dataset.shuffle(buffer_size=len(df))

    dataset = dataset.map(parse_image, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    return dataset

Training set shape: (3835, 2)
Validation set shape: (1279, 2)
Testing set shape: (1278, 2)


In [8]:
# Create datasets: only the training split is shuffled; validation and test keep
# their row order.
batch_size = 2
train_dataset, valid_dataset, test_dataset = (
    prepare_dataset(frame, batch_size=batch_size, shuffle=do_shuffle)
    for frame, do_shuffle in ((train_data, True), (valid_data, False), (test, False))
)

# Model Architecture
# Four convolutional stages with decreasing filter counts (512 -> 256 -> 128 -> 64),
# each ending in 2x2 max pooling, followed by a small dense head with a single
# sigmoid output for the binary label.
# `input_shape` has to match what parse_image produces: 224x224 images, 3 channels.
input_shape = (224, 224, 3)
model = Sequential([
    # Declare the input explicitly instead of passing `input_shape=` to the first
    # Conv2D, which Keras warns about when used inside a Sequential model.
    keras.Input(shape=input_shape),
    Conv2D(512, (3, 3), padding='same', activation='relu'),
    Conv2D(512, (3, 3), padding='same', activation='relu'),
    Conv2D(512, (3, 3), padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(256, (3, 3), padding='same', activation='relu'),
    Conv2D(256, (3, 3), padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    Conv2D(128, (3, 3), padding='same', activation='relu'),
    Conv2D(128, (3, 3), padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    Conv2D(64, (3, 3), padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # single unit: probability of class 1
])

# Compile: binary cross-entropy to go with the single sigmoid output, tracked by accuracy.
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=Adamax(learning_rate=0.001),
)

# Stop when validation loss has gone 5 epochs without improving, and roll the
# model back to the weights from its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Train for at most 20 epochs, validating after each one.
history = model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=20,
    callbacks=[early_stopping],
)

I0000 00:00:1742446620.672345     587 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9571 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080, pci bus id: 0000:06:00.0, compute capability: 8.6
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20


I0000 00:00:1742446645.438171     708 service.cc:152] XLA service 0x7f50100061e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1742446645.438259     708 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2025-03-19 21:57:25.512680: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1742446645.868797     708 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1742446656.803478     708 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1918/1918[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m274s[0m 135ms/step - accuracy: 0.7793 - loss: 1.0187 - val_accuracy: 0.9414 - val_loss: 0.1941
Epoch 2/20
[1m1918/1918[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 129ms/step - accuracy: 0.9163 - loss: 0.2213 - val_accuracy: 0.9437 - val_loss: 0.1424
Epoch 3/20
[1m1918/1918[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 129ms/step - accuracy: 0.9201 - loss: 0.2070 - val_accuracy: 0.9656 - val_loss: 0.1279
Epoch 4/20
[1m1918/1918[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 128ms/step - accuracy: 0.9335 - loss: 0.1749 - val_accuracy: 0.9547 - val_loss: 0.1215
Epoch 5/20
[1m1918/1918[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 126ms/step - accuracy: 0.9382 - loss: 0.1751 - val_accuracy: 0.9656 - val_loss: 0.1020
Epoch 6/20
[1m1918/1918[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 124ms/step - accuracy: 0.9499 - loss: 0.1412 - val_accuracy: 0.9578 - val_loss: 0.1158
Epo

In [9]:
# Score the trained model on the held-out test split; evaluate() returns the
# loss followed by the compiled metrics (here: accuracy).
evaluation = model.evaluate(test_dataset)
test_loss, test_acc = evaluation
print(f"Test Accuracy: {test_acc:.4f}")

[1m639/639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 54ms/step - accuracy: 0.9821 - loss: 0.0760
Test Accuracy: 0.9812
