In [6]:
import os
import sys
import pandas as pd
from pathlib import Path
from typing import List, Tuple, Optional, Dict
import torch
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from PIL import Image
from collections import Counter

In [7]:
# Show DataFrames in full when they are printed in this notebook.
pd.options.display.expand_frame_repr = False  # keep wide frames on one line instead of wrapping
pd.options.display.max_columns = None         # no cap on the number of columns shown
pd.options.display.width = None               # let pandas work out the display width itself
pd.options.display.max_colwidth = None        # never truncate the contents of a cell


In [8]:
# Image file extensions the dataset loaders accept, written in lower case and
# without the leading dot (file names are normalised the same way before the
# membership check).
valid_file_extensions = [
    "jpg",
    "jpeg",
    "png",
]

In [9]:
class CombinedDRDataSet(Dataset):
    """Index of diabetic-retinopathy images gathered from one or more datasets.

    The constructor walks every supported dataset named in ``root_directories``
    and fills three parallel lists: ``image_paths``, ``labels`` and ``sources``.
    Only MFIDDR is wired up at the moment.

    NOTE(review): no ``__getitem__`` is defined here, so samples cannot be
    fetched by index (and therefore not through a ``DataLoader``) yet;
    ``img_transform`` and ``label_transform`` are stored but not applied by any
    method of this class.
    """

    def __init__(self, 
            root_directories: Dict[str, str],
            split: str="train", 
            img_transform: Optional[transforms.Compose] = None,
            label_transform: Optional[transforms.Compose] = None):
        """Store the configuration and index the images of every known dataset.

        Args:
            root_directories: Mapping of dataset name to dataset root path.
            split: ``"train"`` selects the training images; any other value
                selects the test images.
            img_transform: Transform pipeline kept for the images.
            label_transform: Transform pipeline kept for the labels.

        Raises:
            FileNotFoundError: If the image directory of a requested dataset
                does not exist.
        """
        self.root_directories = root_directories # dictionary containing dataset name : dataset path
        self.split = split 
        self.img_transform = img_transform
        self.label_transform = label_transform
        self.image_paths = []
        self.labels = []
        # to track which dataset each image comes from
        self.sources = []

        if "MFIDDR" in self.root_directories:
            self.load_MFIDDR()
        # TODO: hook up the remaining datasets once their loaders exist:
        # if "idrid" in self.root_directories:
        #     self.load_IDRID()
        # if "deepdrid" in self.root_directories:
        #     self.load_DEEPDRID()

    def __len__(self) -> int:
        """Return the number of indexed images."""
        return len(self.image_paths)
    
    def get_labels(self):
        """Return the list of labels, parallel to ``image_paths``."""
        return self.labels
    
    def get_sources(self):
        """Return the list of dataset names, parallel to ``image_paths``."""
        return self.sources
    
    def load_MFIDDR(self):
        """Index the MFIDDR sample images for the configured split.

        Every file in the split directory whose extension is listed in
        ``valid_file_extensions`` is appended to ``image_paths``; its label is
        the file name without the extension and its source is ``"MFIDDR"``.

        Raises:
            FileNotFoundError: If the split's image directory does not exist.
        """
        MFIDDR_ROOT = Path(self.root_directories["MFIDDR"])

        print(f"MFIDDR_ROOT: {MFIDDR_ROOT}")
        print(f"MFIDDR_ROOT exists: {MFIDDR_ROOT.exists()}")

        if self.split == "train":
            image_directory = MFIDDR_ROOT / "sample" / "train-examples"
            print(image_directory)
        else:
            image_directory = MFIDDR_ROOT / "sample" / "test-examples"

        if not image_directory.exists():
            # Fail here with a clear message instead of letting os.listdir
            # raise the same exception type with a less helpful one.
            message = f"ERROR: MFIDDR path was not found at {image_directory}"
            print(message)
            raise FileNotFoundError(message)

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order identical from run to run and machine to machine.
        for image_file_name in sorted(os.listdir(image_directory)):
            filename, file_extension = os.path.splitext(image_file_name)
            # drop the leading dot and lower-case so the extension matches the
            # format used in valid_file_extensions
            file_extension = file_extension.lstrip('.').lower()
            if file_extension in valid_file_extensions:
                self.image_paths.append(str(image_directory / image_file_name))
                self.labels.append(filename)
                self.sources.append("MFIDDR")

    def load_labels_from_csv(self, csv_paths_dict: Dict[str, str]):
        """Read the label CSV of each dataset and print a short preview.

        A CSV that does not exist is reported and skipped; the remaining
        entries are still processed.

        NOTE(review): the loaded rows are only printed. They are not merged
        into ``self.labels`` yet.

        Args:
            csv_paths_dict: Mapping of dataset name to the path of its label CSV.
        """
        if len(self.labels) == 0:
            self.labels = [None] * len(self.image_paths)

        for dataset_name, csv_path in csv_paths_dict.items():
            # checking if the csv file exists
            if not os.path.exists(csv_path):
                print(f"FileNotFoundError: CSV not found at {csv_path}")
                continue

            # Loading happens inside the loop so that every existing CSV is
            # read and a missing one is never handed to pd.read_csv.
            labels_df = pd.read_csv(csv_path)
            print(f"Loaded labels for {dataset_name}: {len(labels_df)} rows")

            print(labels_df.tail(5))


In [10]:
# Dataset roots, keyed by dataset name. The MFIDDR location can be overridden
# with the MFIDDR_ROOT environment variable so the notebook also runs on
# machines where the data lives elsewhere; without it the original path is used.
root_directories = {
    "MFIDDR": os.environ.get(
        "MFIDDR_ROOT",
        "D:/Zayaan/D_git/Eval-Foundation-Models-DiabeticRetinopathy-Grading/datasets/MFIDDR",
    )
}

# Training-time preprocessing: fixed 224x224 size, light augmentation, then
# tensor conversion and per-channel normalisation.
# NOTE(review): the mean/std values look like the usual ImageNet statistics;
# confirm they match the backbone that will consume these images.
train_transformations = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                        std=[0.229, 0.224, 0.225])
])

train_dataset = CombinedDRDataSet(root_directories=root_directories, split="train", img_transform=train_transformations)

# Label CSVs, keyed by the same dataset names as root_directories.
train_csv_paths = {
    "MFIDDR": f"{root_directories['MFIDDR']}/sample/train_fourpic_label.csv"
}

train_dataset.load_labels_from_csv(train_csv_paths)

# Debugging note (11/11/25): a length of 0 here means no image files were
# picked up from the split directory, i.e. the preprocessing step went wrong.
print("TRAIN DATASET LENGTH:", len(train_dataset))

print("Labels", train_dataset.get_labels())
print("Sources", train_dataset.get_sources())

MFIDDR_ROOT: D:\Zayaan\D_git\Eval-Foundation-Models-DiabeticRetinopathy-Grading\datasets\MFIDDR
MFIDDR_ROOT exists: True
D:\Zayaan\D_git\Eval-Foundation-Models-DiabeticRetinopathy-Grading\datasets\MFIDDR\sample\train-examples
Loaded labels for MFIDDR: 6462 rows
                       id  level  age                     id1                     id2                     id3                     id4
6457   3324_15236635_left      4   40   3324_15236635_left_05   3324_15236635_left_06   3324_15236635_left_07   3324_15236635_left_08
6458  3418_57689745_right      4   44  3418_57689745_right_01  3418_57689745_right_02  3418_57689745_right_03  3418_57689745_right_04
6459   3418_57689745_left      4   44   3418_57689745_left_05   3418_57689745_left_06   3418_57689745_left_07   3418_57689745_left_08
6460  3441_70906177_right      4   53  3441_70906177_right_01  3441_70906177_right_02  3441_70906177_right_03  3441_70906177_right_04
6461   3441_70906177_left      4   53   3441_70906177_left_05   3441