This notebook reads the zipped image files and converts them all to a standard size.
Once converted, they are stored in the relevant folder.

In [1]:
# importing all the required libraries
import numpy as np
import pandas as pd
import zipfile
import os
import shutil
from PIL import Image
import pickle

### Configurations
Modify the configurations below to suit your needs; beyond this, the rest of the code should work as is.

In [2]:
# Notebook-wide settings
# Turn on pandas' copy-on-write mode
pd.options.mode.copy_on_write = True
# Target size handed to Image.resize further down
desired_size = (512, 512)  # Modify this if required

Specify the paths to the folders on your system before running this notebook.

In [3]:
# Repo paths: these are relative paths to files that live in the repo, so there is no need to change them
path_combine = "../02_Data/Extra/messidor_combine.parquet"  # combined labels table (parquet), read below with pd.read_parquet
path_mapping = "../02_Data/01_messidor_mapping.parquet"  # mapping parquet file; the labels table is written here at the end of the notebook

In [4]:
# Absolute paths: edit the defaults below, or set the MESSIDOR_SOURCE_DIR /
# MESSIDOR_DEST_DIR environment variables, to match your system.
# os.path.join(..., "") guarantees a trailing separator: the rest of the notebook
# builds paths by plain string concatenation (e.g. path_source + "temp_dir/"),
# which silently produces wrong paths when the trailing slash is missing.
path_source = os.path.join(
    os.environ.get("MESSIDOR_SOURCE_DIR", "/Users/revanth/Downloads/Messidor/"), ""
)  # path to the source folder (Regular Folder not Zip)
path_dest = os.path.join(
    os.environ.get("MESSIDOR_DEST_DIR", "/Users/revanth/Documents/Messidor_Data/"), ""
)  # path to the destination folder (Regular Folder not Zip)

There is no need to change any of the code beyond this point.

## Loading Mapping File

In [6]:
# Read the combined mapping file into a DataFrame
labels = pd.read_parquet(path=path_combine)
# Preview five randomly chosen rows
labels.sample(n=5)

Unnamed: 0,Image_ID,Department,Retinopathy_Grade,Risk_of_Macular_Edema,Data_Source,Include,Split
293,20051021_57798_0100_PP.tif,Service Ophtalmologie Lariboisière,0,0,Messidor,True,Train
754,20060411_61196_0200_PP.tif,CHU de St Etienne,3,1,Messidor,True,Train
320,20051021_58316_0100_PP.tif,Service Ophtalmologie Lariboisière,0,0,Messidor,True,Train
1190,20051202_54498_0400_PP.tif,LaTIM - CHU de BREST,2,2,Messidor,True,Train
341,20051205_35110_0400_PP.tif,LaTIM - CHU de BREST,3,0,Messidor,False,Train


In [7]:
# Add a placeholder column for the original image size.
# Every row starts as None; the image-processing loop further down stores each
# image's `img.size` value in this column.
labels["Original_Size"] = None

## Identifying Files

In [8]:
# List every entry (files and sub-folders) directly inside the path_source folder
files = os.listdir(path_source)
# Display the listing; the entries appear in the order os.listdir returned them (not sorted here)
files

['Annotation_Base33.xls',
 'Annotation_Base32.xls',
 'Annotation_Base24.xls',
 'Annotation_Base31.xls',
 'Annotation_Base21.xls',
 '.DS_Store',
 'Annotation_Base34.xls',
 'Annotation_Base22.xls',
 'Annotation_Base23.xls',
 'Base11.zip',
 'Base12.zip',
 'Base13.zip',
 'Base14.zip',
 'Base24.zip',
 'Base31.zip',
 'resize_samples',
 'Base33.zip',
 'Base32.zip',
 'Base22.zip',
 'Base23.zip',
 'Base21.zip',
 'Base34.zip',
 'Annotation_Base12.xls',
 'Annotation_Base13.xls',
 'Annotation_Base11.xls',
 'Annotation_Base14.xls']

In [9]:
# Keep only the archives named "Base*.zip", in sorted order
zip_files = sorted(
    name for name in files if name.startswith("Base") and name.endswith(".zip")
)
zip_files

['Base11.zip',
 'Base12.zip',
 'Base13.zip',
 'Base14.zip',
 'Base21.zip',
 'Base22.zip',
 'Base23.zip',
 'Base24.zip',
 'Base31.zip',
 'Base32.zip',
 'Base33.zip',
 'Base34.zip']

In [10]:
# We should have 12 zip files; fail early and report what was actually found
# so a missing or misnamed archive is easy to spot.
EXPECTED_ZIP_COUNT = 12
assert len(zip_files) == EXPECTED_ZIP_COUNT, (
    f"Expected {EXPECTED_ZIP_COUNT} Base*.zip files in the source folder, "
    f"found {len(zip_files)}: {zip_files}"
)

### Extracting the zip files

In [11]:
# Create a fresh temporary directory to extract the archives into.
temp_dir = path_source + "temp_dir/"

# Delete the temp directory if it is left over from a previous run.
# shutil.rmtree is used instead of shelling out to `rm -rf`: it works with paths
# that contain spaces or shell metacharacters, does not depend on a Unix shell,
# and raises on failure instead of silently returning a non-zero exit status.
if os.path.exists(temp_dir):
    shutil.rmtree(temp_dir)
os.mkdir(temp_dir)

In [12]:
# Unpack each archive into the temporary directory, reporting every archive once it is done
for file in zip_files:
    archive_path = path_source + file
    with zipfile.ZipFile(archive_path, mode="r") as archive:
        archive.extractall(temp_dir)
        print(f"Files from {file} extracted successfully")

Files from Base11.zip extracted successfully
Files from Base12.zip extracted successfully
Files from Base13.zip extracted successfully
Files from Base14.zip extracted successfully
Files from Base21.zip extracted successfully
Files from Base22.zip extracted successfully
Files from Base23.zip extracted successfully
Files from Base24.zip extracted successfully
Files from Base31.zip extracted successfully
Files from Base32.zip extracted successfully
Files from Base33.zip extracted successfully
Files from Base34.zip extracted successfully


In [13]:
# If temp_dir has sub-folders, move their files up into temp_dir itself so that
# every extracted file ends up in one flat directory.
top_level = os.path.normpath(temp_dir)

for root, _subdirs, names in os.walk(temp_dir):
    # Files that already sit directly in temp_dir do not need to be moved.
    if os.path.normpath(root) == top_level:
        continue

    for name in names:
        # os.walk lists only non-directory entries in `names`, so no directory
        # check is needed here (the previous check tested the bare file name
        # against the current working directory, which was unrelated to `root`).
        shutil.move(os.path.join(root, name), os.path.join(temp_dir, name))

### Cleanup

In [18]:
# Clean up temp_dir: remove its sub-folders and delete the `.xls` and
# `.DS_Store` files left at the top level.
# A single pass over the top level is enough because every sub-folder is removed
# wholesale; this also avoids deleting directories while a walk is still
# iterating over them.
for entry in os.listdir(temp_dir):
    entry_path = os.path.join(temp_dir, entry)
    if os.path.isdir(entry_path):
        # Deleting the sub-folders
        shutil.rmtree(entry_path)
    elif entry.endswith(".xls") or entry == ".DS_Store":
        # deleting all the excel files and DS_Store files
        os.remove(entry_path)

In [19]:
# Check that all the files were extracted with the expected names
num_base = 0  # files whose name contains "Base"
unique_start = set()  # the names of those files, displayed in a later cell
num_non_base = 0  # files whose name does not contain "Base"

# Build the lookup once: a set gives constant-time membership tests instead of
# scanning the whole Image_ID column again for every file.
known_image_ids = set(labels["Image_ID"])

for name in os.listdir(temp_dir):
    if "Base" in name:
        num_base += 1
        # os.listdir returns bare entry names, which never contain "/",
        # so the name is recorded as is.
        unique_start.add(name)
    else:
        num_non_base += 1
    if name not in known_image_ids:
        print(f"Image {name} not found in the labels")

print(f"Number of Base Images: {num_base}")
print(f"Number of Non-Base Images: {num_non_base}")

Number of Base Images: 0
Number of Non-Base Images: 1200


In [20]:
# We should have 1200 non-base images and 0 base images; the messages report
# the actual numbers so a failed check can be diagnosed without re-running.
assert num_base == 0, (
    f"{num_base} files contain 'Base' in their name: {sorted(unique_start)}"
)
assert num_non_base == 1200, (
    f"Expected 1200 non-base images, found {num_non_base}"
)

In [21]:
# View the file names that contain "Base" (collected by the name check above);
# an empty set means no such file was found.
unique_start

set()

## Image Processing

In [26]:
# Check if the target folder exists; if not, create it.
# os.makedirs also creates any missing parent folders.
if not os.path.exists(path_dest):
    os.makedirs(path_dest)
    print("Destination folder created successfully")
else:
    print("Destination folder already exists")

output_kinds = ("Resized", "Raw")
splits = ("Train", "Test")

# Delete any existing output folders so every run starts from a clean slate.
# shutil.rmtree replaces the former `rm -rf` shell call: it copes with spaces and
# shell metacharacters in the path and raises if the deletion fails.
for kind in output_kinds:
    kind_dir = path_dest + kind + "/"
    if os.path.exists(kind_dir):
        shutil.rmtree(kind_dir)
        print(f"Existing {kind} folder deleted successfully")
    os.mkdir(kind_dir)

# Create the Train and Test sub-folders inside each output folder.
for kind in output_kinds:
    for split in splits:
        os.mkdir(path_dest + kind + "/" + split + "/")
    print(f"Train and Test folders created successfully in {kind} folder")

# Folder paths used by the image-processing and sanity-check cells.
path_train_resized = path_dest + "Resized/" + "Train/"
path_test_resized = path_dest + "Resized/" + "Test/"
path_train_raw = path_dest + "Raw/" + "Train/"
path_test_raw = path_dest + "Raw/" + "Test/"

Destination folder already exists
Existing Resized folder deleted successfully
Existing Raw folder deleted successfully
Train and Test folders created successfully in Resized folder
Train and Test folders created successfully in Raw folder


In [27]:
# Go through temp_dir, record each image's original size, and save the raw and
# resized copies of every included image into its Train/Test folder.
nums = 0  # to keep track of the number of images processed
err_num = 0  # to keep track of the number of images that caused an error

MAX_ERRORS = 10  # stop once more than this many images have failed
PROGRESS_EVERY = 100  # report progress each time this many images are done

# Output folder for each split value found in the labels.
raw_dirs = {"Train": path_train_raw, "Test": path_test_raw}
resized_dirs = {"Train": path_train_resized, "Test": path_test_resized}

for image_name in os.listdir(temp_dir):
    if not image_name.endswith(".tif"):
        continue

    try:
        # Locate the image's row in the labels dataframe.
        matches = labels.index[labels["Image_ID"] == image_name]
        if len(matches) == 0:
            raise KeyError(f"{image_name} is not listed in the labels")
        idx = matches[0]

        # Open inside the try block so unreadable files are counted as errors,
        # and inside `with` so the file handle is always released.
        with Image.open(temp_dir + image_name) as img:
            # add original size to the labels dataframe
            labels.at[idx, "Original_Size"] = img.size

            # if the image is not marked as include, skip it
            if not labels.at[idx, "Include"]:
                continue

            # get the split of the image
            img_split = labels.at[idx, "Split"]

            # Save the raw image, then the resized one, in the split's folders.
            # Images whose split is neither Train nor Test are counted but not saved.
            if img_split in raw_dirs:
                img.save(raw_dirs[img_split] + image_name)
                img.resize(desired_size).save(resized_dirs[img_split] + image_name)

        nums += 1
        # Report only right after the counter changes, so a skipped file can
        # never cause the same progress line to be printed twice.
        if nums % PROGRESS_EVERY == 0:
            print(f"processed {nums} images")

    except Exception as exc:  # KeyboardInterrupt / SystemExit still propagate
        print(f"Error processing {image_name}: {exc!r}")
        err_num += 1
        if err_num > MAX_ERRORS:
            # breaking so that we can fix the error before trying more images
            print("Too many errors!, breaking")
            break

print(f"Processed {nums} images with {err_num} errors")

processed 100 images
processed 200 images
processed 300 images
processed 400 images
processed 500 images
processed 600 images
processed 700 images
processed 800 images
processed 900 images
processed 1000 images
processed 1100 images
Processed 1187 images with 0 errors


## Sanity Checks

In [28]:
# Every image marked Include in the labels should have been processed
expected_images = labels["Include"].sum()
assert nums == expected_images, (
    f"Processed {nums} images, expected {expected_images} included images"
)

# We should have 0 errors
assert err_num == 0, f"{err_num} images failed to process"

In [29]:
# Each output folder should hold exactly as many files as there are included
# images in the matching split of the labels dataframe.
folder_checks = [
    (path_train_resized, "Train"),
    (path_train_raw, "Train"),
    (path_test_resized, "Test"),
    (path_test_raw, "Test"),
]

for folder, split in folder_checks:
    expected = len(labels[(labels["Split"] == split) & labels["Include"]])
    found = len(os.listdir(folder))
    assert found == expected, (
        f"{folder} holds {found} files, expected {expected} for the {split} split"
    )

## Saving and Cleanup

In [30]:
# Preview five random rows of the labels dataframe, now including Original_Size
labels.sample(n=5)

Unnamed: 0,Image_ID,Department,Retinopathy_Grade,Risk_of_Macular_Edema,Data_Source,Include,Split,Original_Size
759,20060411_61478_0200_PP.tif,CHU de St Etienne,3,1,Messidor,True,Train,"(1440, 960)"
841,20051202_55484_0400_PP.tif,LaTIM - CHU de BREST,2,0,Messidor,True,Train,"(1440, 960)"
1092,20051020_64703_0100_PP.tif,Service Ophtalmologie Lariboisière,0,0,Messidor,True,Train,"(2240, 1488)"
56,20051214_56392_0100_PP.tif,Service Ophtalmologie Lariboisière,0,0,Messidor,True,Train,"(2240, 1488)"
829,20060410_47351_0200_PP.tif,CHU de St Etienne,2,0,Messidor,True,Train,"(1440, 960)"


In [32]:
# Save the labels dataframe as the repo's mapping parquet file
labels.to_parquet(path_mapping, index=False)

# Save the labels as parquet and excel in both output folders.
# os.path.join is used so the result is well-formed whether or not path_dest
# ends with a separator (plain concatenation with "/Raw/" produced a double slash).
for subfolder in ("Raw", "Resized"):
    target_dir = os.path.join(path_dest, subfolder)
    labels.to_parquet(os.path.join(target_dir, "messidor_mapping.parquet"), index=False)
    labels.to_excel(os.path.join(target_dir, "messidor_mapping.xlsx"), index=False)

In [33]:
# Clean up: delete the temporary extraction directory and everything still inside it
shutil.rmtree(temp_dir)