### To Create the Data for adding to the S3 Bucket

In [1]:
# importing the required libraries
import os
import shutil

import cv2
import numpy as np
import pandas as pd
from PIL import Image

# Turn on pandas' copy-on-write mode for the whole notebook
pd.set_option("mode.copy_on_write", True)

Making a local copy of the custom functions so that they can be changed as needed.

In [2]:
def crop_image(img, threshold=15, resize_flag=False, desired_size=(512, 512)):
    """Crop the dark border off an image and optionally resize the result.

    A row (column) counts as content when the sum of all pixel values in it
    is greater than ``threshold`` times the number of pixels in that row
    (column). The image is cropped to the span from the first to the last
    content row and column, both inclusive.

    Parameters
    ----------
    img : PIL.Image.Image or array-like
        Image to crop; it is converted with ``np.array`` and may be 2-D
        (single channel) or 3-D (channels on the last axis).
    threshold : int or float, default 15
        Average per-pixel sum a row/column must exceed to count as content.
    resize_flag : bool, default False
        When true, the cropped image is resized to ``desired_size``.
    desired_size : tuple, default (512, 512)
        Size forwarded to ``Image.resize`` (PIL order: width, height).

    Returns
    -------
    PIL.Image.Image
        The cropped (and possibly resized) image. When no row or no column
        exceeds the threshold the image is returned uncropped.
    """
    # Convert image to numpy array
    img_np = np.array(img)

    # Number of rows and columns
    x_dim, y_dim = img_np.shape[:2]

    # Collapse the colour axis when there is one; 2-D input has none to sum over
    pixel_sums = img_np.sum(axis=2) if img_np.ndim == 3 else img_np

    # Total intensity per row and per column
    x_arr = pixel_sums.sum(axis=1)
    y_arr = pixel_sums.sum(axis=0)

    # Indices of the rows / columns that count as content
    x_idx = np.where(x_arr > threshold * y_dim)[0]
    y_idx = np.where(y_arr > threshold * x_dim)[0]

    if x_idx.size and y_idx.size:
        # Slice ends are exclusive, so add 1 to keep the last content row/column
        new_img = img_np[x_idx[0] : x_idx[-1] + 1, y_idx[0] : y_idx[-1] + 1]
    else:
        # Nothing exceeds the threshold: keep the whole image rather than
        # failing with an IndexError on the empty index arrays
        new_img = img_np

    # converting back to image
    new_img = Image.fromarray(new_img)

    # resizing the image
    if resize_flag:
        new_img = new_img.resize(desired_size)

    return new_img

In [3]:
def resize_image(img, desired_size=(512, 512)):
    """Return ``img`` resized to ``desired_size``.

    Thin wrapper: ``desired_size`` is forwarded unchanged to ``img.resize``
    and whatever that call returns is handed back.
    """
    return img.resize(desired_size)

In [4]:
# luminosity and noise removal function
def luminosity_image(image_path, desired_size):
    """Load an image, equalise its lightness, denoise it and resize it.

    Steps: read the file with OpenCV, apply CLAHE (clip limit 5.0, 8x8
    tiles) to the L channel in LAB space, blur with a 5x5 Gaussian kernel,
    convert to RGB, wrap in a PIL image and resize with ``resize_image``.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.
    desired_size : tuple
        Size forwarded to ``resize_image``.

    Returns
    -------
    The resized image returned by ``resize_image`` (a PIL image in RGB
    channel order).

    Raises
    ------
    OSError
        If OpenCV cannot read ``image_path``.
    """
    # Load the image; cv2.imread signals failure by returning None
    image = cv2.imread(image_path)
    if image is None:
        raise OSError("cv2.imread could not read image: " + str(image_path))

    # Convert to LAB color space
    lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
    l, a, b = cv2.split(lab)

    # Apply CLAHE to L channel
    clahe = cv2.createCLAHE(clipLimit=5.0, tileGridSize=(8, 8))
    l_eq = clahe.apply(l)

    # Merge back LAB channels
    lab_eq = cv2.merge((l_eq, a, b))
    enhanced_image = cv2.cvtColor(lab_eq, cv2.COLOR_LAB2BGR)

    # Image noise removal using Gaussian filter
    filtered_image = cv2.GaussianBlur(enhanced_image, (5, 5), 0)

    # OpenCV arrays are BGR while PIL reads the channels as RGB; swap them so
    # the result matches images loaded with Image.open
    filtered_image = cv2.cvtColor(filtered_image, cv2.COLOR_BGR2RGB)

    # Convert to PIL image
    filtered_image = Image.fromarray(filtered_image)

    # Resize the image
    filtered_image = resize_image(filtered_image, desired_size)

    return filtered_image

In [5]:
def normalize_image(img):
    """Standardise each channel of an image and rescale it to ``uint8``.

    Every channel is shifted by its own mean and divided by its own
    standard deviation (both taken over the two spatial axes). The
    standardised values are then min-max scaled over the whole image to
    [0, 255] and truncated to ``uint8``.

    Parameters
    ----------
    img : PIL.Image.Image or array-like
        Image to normalise; converted with ``np.array``.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array with the same shape as the input. Constant channels
        are treated as all-zero after standardisation, and an image whose
        standardised values are all equal is returned as all zeros.
    """
    # Convert image to numpy array
    img_np = np.array(img)

    # Calculate mean and standard deviation (std) channel-wise
    mean_channels = np.mean(img_np, axis=(0, 1))
    std_channels = np.std(img_np, axis=(0, 1))

    # A constant channel has std == 0; dividing by 1 instead turns it into
    # zeros rather than NaN, which would otherwise poison the min/max below
    safe_std = np.where(std_channels == 0, 1.0, std_channels)

    # Broadcasting over the trailing channel axis standardises all channels at once
    normalized_image = ((img_np - mean_channels) / safe_std).astype(np.float32)

    # Scale values to be within [0, 255]
    lowest = np.min(normalized_image)
    value_range = np.max(normalized_image) - lowest
    if value_range == 0:
        # every value is identical, so there is nothing to stretch
        return np.zeros(img_np.shape, dtype=np.uint8)
    normalized_image = (normalized_image - lowest) / value_range * 255

    # Clip and return the normalized image
    normalized_image = np.clip(normalized_image, 0, 255)
    return normalized_image.astype(np.uint8)

### Loading the Base Data

In [6]:
# Folder holding the mapping CSV plus the train/ and test/ image sub-folders read below
source = "../../aws_s3/Raw_IDRID/"

In [7]:
# Load the mapping file: one row per image with its name, grades and train/test split

mapping = pd.read_csv(source + "train_split_key_recoded.csv")
# preview five random rows
mapping.sample(5)

Unnamed: 0,Image name,Retinopathy grade,Risk of macular edema,Retinopathy grade new,split
398,IDRiD_186,2,2,2,train
316,IDRiD_106,2,2,2,train
35,IDRiD_242,2,2,2,train
491,IDRiD_390,4,2,3,train
154,IDRiD_466,4,2,3,train


In [9]:
# Sanity check: each split folder must hold as many files as the mapping lists
test_size = int((mapping["split"] == "test").sum())
train_size = int((mapping["split"] == "train").sum())

# test files first, then train files
for split_dir, expected_count in (("test/", test_size), ("train/", train_size)):
    assert len(os.listdir(source + split_dir)) == expected_count

### Resizing Data

Here the images are simply resized to 512x512 and 224x224, and each size is saved to its own directory as NumPy (`.npy`) files.

In [14]:
# Output root for the plainly resized arrays
base_directory = "../../aws_s3_temp/Raw/"
# Target sizes and, position by position, the folder each size is written to
desired_sizes = [(512, 512), (224, 224)]
out_directories = ["Idrid_512_Raw", "Idrid_224_Raw"]

In [15]:
# One pass per target size; sizes and output folders are paired by position
for desired_size, out_directory in zip(desired_sizes, out_directories):
    out_path = os.path.join(base_directory, out_directory)

    # start from an empty output directory (shutil instead of shelling out to `rm -rf`)
    if os.path.exists(out_path):
        shutil.rmtree(out_path)
    # makedirs also creates base_directory when it does not exist yet
    os.makedirs(out_path)

    # images and labels collected per split
    collected = {"train": ([], []), "test": ([], [])}

    for split, (images, labels) in collected.items():
        # sorted() keeps the sample order reproducible; os.listdir order is arbitrary
        for file in sorted(os.listdir(source + split + "/")):
            image_path = source + split + "/" + file
            file_name = file.split(".")[0]

            # validate that the file exists in the mapping and belongs to this split
            rows = mapping[mapping["Image name"] == file_name]
            assert len(rows) > 0, f"{file} is not listed in the mapping file"
            assert rows["split"].values[0] == split, f"{file} is not a {split} file"

            # Get the label
            label = rows["Retinopathy grade new"].values[0]

            # Load the image; the context manager closes the file handle afterwards
            with Image.open(image_path) as image:
                # force 3 channels so the shape check below holds, then resize
                resized = resize_image(image.convert("RGB"), desired_size)

                # convert to numpy array
                img_np = np.array(resized)

            # Append the image and label to the data of this split
            images.append(img_np)
            labels.append(label)

    X_train, y_train = collected["train"]
    X_test, y_test = collected["test"]

    # validating the number of collected samples
    assert len(X_train) == train_size
    assert len(y_train) == train_size
    assert len(X_test) == test_size
    assert len(y_test) == test_size

    # converting the lists to numpy arrays
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)
    y_test = np.array(y_test)

    # PIL sizes are (width, height) while image arrays are (height, width, channels)
    image_shape = (desired_size[1], desired_size[0], 3)

    # validating the dimensions of the arrays
    assert X_train.shape == (train_size,) + image_shape
    assert y_train.shape == (train_size,)
    assert X_test.shape == (test_size,) + image_shape
    assert y_test.shape == (test_size,)

    # saving the numpy arrays
    np.save(os.path.join(out_path, "X_train.npy"), X_train)
    np.save(os.path.join(out_path, "y_train.npy"), y_train)
    np.save(os.path.join(out_path, "X_test.npy"), X_test)
    np.save(os.path.join(out_path, "y_test.npy"), y_test)

### Image Enhancement and Noise Removal

In [16]:
# Output root for the enhanced and denoised arrays
base_directory = "../../aws_s3_temp/IENR/"
# Target sizes and, position by position, the folder each size is written to
desired_sizes = [(512, 512), (224, 224)]
out_directories = ["Idrid_512_IENR", "Idrid_224_IENR"]

In [17]:
# One pass per target size; sizes and output folders are paired by position
for desired_size, out_directory in zip(desired_sizes, out_directories):
    out_path = os.path.join(base_directory, out_directory)

    # start from an empty output directory (shutil instead of shelling out to `rm -rf`)
    if os.path.exists(out_path):
        shutil.rmtree(out_path)
    # makedirs also creates base_directory when it does not exist yet
    os.makedirs(out_path)

    # images and labels collected per split
    collected = {"train": ([], []), "test": ([], [])}

    for split, (images, labels) in collected.items():
        # sorted() keeps the sample order reproducible; os.listdir order is arbitrary
        for file in sorted(os.listdir(source + split + "/")):
            image_path = source + split + "/" + file
            file_name = file.split(".")[0]

            # validate that the file exists in the mapping and belongs to this split
            rows = mapping[mapping["Image name"] == file_name]
            assert len(rows) > 0, f"{file} is not listed in the mapping file"
            assert rows["split"].values[0] == split, f"{file} is not a {split} file"

            # Get the label
            label = rows["Retinopathy grade new"].values[0]

            # enhance, denoise and resize in one call
            image = luminosity_image(image_path, desired_size)

            # convert to numpy array
            img_np = np.array(image)

            # Append the image and label to the data of this split
            images.append(img_np)
            labels.append(label)

    X_train, y_train = collected["train"]
    X_test, y_test = collected["test"]

    # validating the number of collected samples
    assert len(X_train) == train_size
    assert len(y_train) == train_size
    assert len(X_test) == test_size
    assert len(y_test) == test_size

    # converting the lists to numpy arrays
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)
    y_test = np.array(y_test)

    # PIL sizes are (width, height) while image arrays are (height, width, channels)
    image_shape = (desired_size[1], desired_size[0], 3)

    # validating the dimensions of the arrays
    assert X_train.shape == (train_size,) + image_shape
    assert y_train.shape == (train_size,)
    assert X_test.shape == (test_size,) + image_shape
    assert y_test.shape == (test_size,)

    # saving the numpy arrays
    np.save(os.path.join(out_path, "X_train.npy"), X_train)
    np.save(os.path.join(out_path, "y_train.npy"), y_train)
    np.save(os.path.join(out_path, "X_test.npy"), X_test)
    np.save(os.path.join(out_path, "y_test.npy"), y_test)

### Color Normalization

In [18]:
# Output root for the colour-normalised arrays
base_directory = "../../aws_s3_temp/CN/"
# Target sizes and, position by position, the folder each size is written to
desired_sizes = [(512, 512), (224, 224)]
out_directories = ["Idrid_512_CN", "Idrid_224_CN"]

In [19]:
# One pass per target size; sizes and output folders are paired by position
for desired_size, out_directory in zip(desired_sizes, out_directories):
    out_path = os.path.join(base_directory, out_directory)

    # start from an empty output directory (shutil instead of shelling out to `rm -rf`)
    if os.path.exists(out_path):
        shutil.rmtree(out_path)
    # makedirs also creates base_directory when it does not exist yet
    os.makedirs(out_path)

    # images and labels collected per split
    collected = {"train": ([], []), "test": ([], [])}

    for split, (images, labels) in collected.items():
        # sorted() keeps the sample order reproducible; os.listdir order is arbitrary
        for file in sorted(os.listdir(source + split + "/")):
            image_path = source + split + "/" + file
            file_name = file.split(".")[0]

            # validate that the file exists in the mapping and belongs to this split
            rows = mapping[mapping["Image name"] == file_name]
            assert len(rows) > 0, f"{file} is not listed in the mapping file"
            assert rows["split"].values[0] == split, f"{file} is not a {split} file"

            # Get the label
            label = rows["Retinopathy grade new"].values[0]

            # Load the image; the context manager closes the file handle afterwards
            with Image.open(image_path) as image:
                # force 3 channels so the shape check below holds, then resize
                resized = resize_image(image.convert("RGB"), desired_size)

                # do color normalization (returns a numpy array)
                img_np = normalize_image(resized)

            # Append the image and label to the data of this split
            images.append(img_np)
            labels.append(label)

    X_train, y_train = collected["train"]
    X_test, y_test = collected["test"]

    # validating the number of collected samples
    assert len(X_train) == train_size
    assert len(y_train) == train_size
    assert len(X_test) == test_size
    assert len(y_test) == test_size

    # converting the lists to numpy arrays
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)
    y_test = np.array(y_test)

    # PIL sizes are (width, height) while image arrays are (height, width, channels)
    image_shape = (desired_size[1], desired_size[0], 3)

    # validating the dimensions of the arrays
    assert X_train.shape == (train_size,) + image_shape
    assert y_train.shape == (train_size,)
    assert X_test.shape == (test_size,) + image_shape
    assert y_test.shape == (test_size,)

    # saving the numpy arrays
    np.save(os.path.join(out_path, "X_train.npy"), X_train)
    np.save(os.path.join(out_path, "y_train.npy"), y_train)
    np.save(os.path.join(out_path, "X_test.npy"), X_test)
    np.save(os.path.join(out_path, "y_test.npy"), y_test)

### Cropping

In [20]:
# Output root for the cropped arrays
base_directory = "../../aws_s3_temp/Crop/"
# Target sizes and, position by position, the folder each size is written to
desired_sizes = [(512, 512), (224, 224)]
out_directories = ["Idrid_512_Crop", "Idrid_224_Crop"]

In [21]:
# One pass per target size; sizes and output folders are paired by position
for desired_size, out_directory in zip(desired_sizes, out_directories):
    out_path = os.path.join(base_directory, out_directory)

    # start from an empty output directory (shutil instead of shelling out to `rm -rf`)
    if os.path.exists(out_path):
        shutil.rmtree(out_path)
    # makedirs also creates base_directory when it does not exist yet
    os.makedirs(out_path)

    # images and labels collected per split
    collected = {"train": ([], []), "test": ([], [])}

    for split, (images, labels) in collected.items():
        # sorted() keeps the sample order reproducible; os.listdir order is arbitrary
        for file in sorted(os.listdir(source + split + "/")):
            image_path = source + split + "/" + file
            file_name = file.split(".")[0]

            # validate that the file exists in the mapping and belongs to this split
            rows = mapping[mapping["Image name"] == file_name]
            assert len(rows) > 0, f"{file} is not listed in the mapping file"
            assert rows["split"].values[0] == split, f"{file} is not a {split} file"

            # Get the label
            label = rows["Retinopathy grade new"].values[0]

            # Load the image; the context manager closes the file handle afterwards
            with Image.open(image_path) as image:
                # force 3 channels so the shape check below holds, then crop and resize
                cropped = crop_image(
                    image.convert("RGB"),
                    threshold=15,
                    resize_flag=True,
                    desired_size=desired_size,
                )

                # convert to numpy array
                img_np = np.array(cropped)

            # Append the image and label to the data of this split
            images.append(img_np)
            labels.append(label)

    X_train, y_train = collected["train"]
    X_test, y_test = collected["test"]

    # validating the number of collected samples
    assert len(X_train) == train_size
    assert len(y_train) == train_size
    assert len(X_test) == test_size
    assert len(y_test) == test_size

    # converting the lists to numpy arrays
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)
    y_test = np.array(y_test)

    # PIL sizes are (width, height) while image arrays are (height, width, channels)
    image_shape = (desired_size[1], desired_size[0], 3)

    # validating the dimensions of the arrays
    assert X_train.shape == (train_size,) + image_shape
    assert y_train.shape == (train_size,)
    assert X_test.shape == (test_size,) + image_shape
    assert y_test.shape == (test_size,)

    # saving the numpy arrays
    np.save(os.path.join(out_path, "X_train.npy"), X_train)
    np.save(os.path.join(out_path, "y_train.npy"), y_train)
    np.save(os.path.join(out_path, "X_test.npy"), X_test)
    np.save(os.path.join(out_path, "y_test.npy"), y_test)