To read the zipped image files and convert them all to a standard size.
once converted, they are converted to a numpy array and saved as a parquet file.

In [1]:
# importing all the required libraries
import numpy as np
import pandas as pd
import zipfile
import os
import shutil
from PIL import Image
import pickle

In [2]:
# setting default configuration options
pd.set_option("mode.copy_on_write", True)

## Identifying Files

In [3]:
path = "/Users/revanth/Downloads/Messidor/"  # update this if required

In [4]:
# lsiting all the files in the directory
files = os.listdir(path)
files

['Annotation_Base33.xls',
 'Annotation_Base32.xls',
 'Annotation_Base24.xls',
 'Annotation_Base31.xls',
 'Annotation_Base21.xls',
 '.DS_Store',
 'Annotation_Base34.xls',
 'Annotation_Base22.xls',
 'Annotation_Base23.xls',
 'Base11.zip',
 'Base12.zip',
 'Base13.zip',
 'Base14.zip',
 'Base24.zip',
 'Base31.zip',
 'Base33.zip',
 'Base32.zip',
 'Base22.zip',
 'Base23.zip',
 'Base21.zip',
 'Base34.zip',
 'Annotation_Base12.xls',
 'Annotation_Base13.xls',
 'Annotation_Base11.xls',
 'Annotation_Base14.xls',
 'temp_dir']

In [5]:
# filtering for only zip files
zip_files = [
    file for file in files if file.startswith("Base") and file.endswith(".zip")
]
zip_files.sort()
zip_files

['Base11.zip',
 'Base12.zip',
 'Base13.zip',
 'Base14.zip',
 'Base21.zip',
 'Base22.zip',
 'Base23.zip',
 'Base24.zip',
 'Base31.zip',
 'Base32.zip',
 'Base33.zip',
 'Base34.zip']

### Extracting the zip files

In [8]:
# creating a temporary directory to extract the files

# delete the temp directory if it already exists
if os.path.exists(path + "temp_dir/"):
    os.system("rm -rf " + path + "temp_dir/")
os.mkdir(path + "temp_dir/")
temp_dir = path + "temp_dir/"

In [9]:
# extracting the contents of the zip files to the temporary directory
for file in zip_files:
    with zipfile.ZipFile(path + file, "r") as zip_ref:
        zip_ref.extractall(temp_dir)
        print(f"{file} extracted successfully")

Base11.zip extracted successfully
Base12.zip extracted successfully
Base13.zip extracted successfully
Base14.zip extracted successfully
Base21.zip extracted successfully
Base22.zip extracted successfully
Base23.zip extracted successfully
Base24.zip extracted successfully
Base31.zip extracted successfully
Base32.zip extracted successfully
Base33.zip extracted successfully
Base34.zip extracted successfully


In [10]:
# if temp_dir has sub-folders, extract the contents of the subfolders to the temp_dir

# traverse root directory, and list directories as dirs and files as files
for root, dirs, files in os.walk(temp_dir):
    path = root.split(os.sep)

    for file in files:
        if not os.path.isdir(file):

            # move file from nested folder into the base folder
            shutil.move(os.path.join(root, file), os.path.join(temp_dir, file))

In [13]:
for root, dirs, files in os.walk(temp_dir):
    # Deleting the sub-folders
    for dir in dirs:
        shutil.rmtree(os.path.join(root, dir))

    # deleting all the excel files
    for file in files:
        if file.endswith(".xls"):
            os.remove(os.path.join(root, file))

## Image Loading

In [15]:
# empty dict to store the extracted images
images = {}
desired_size = (1440, 960)  # Modify this if required
nums = 0
path = temp_dir
for files in os.listdir(path):
    if files.endswith(".tif"):
        img = Image.open(path + files)
        images[files] = {
            "Image_ID": files,
            "Original_Size": img.size,
            "Image": img.resize(desired_size),
        }
        nums += 1
    if nums % 100 == 0:
        print(f"processed {nums} images")

processed 100 images
processed 200 images
processed 300 images
processed 400 images
processed 500 images
processed 600 images
processed 700 images
processed 800 images
processed 900 images
processed 1000 images
processed 1100 images
processed 1200 images


## Sanity Checks

In [16]:
# We should ahve 1200 images
assert len(images) == 1200

In [17]:
# check if all of them are loaded with the correct names
num_base = 0
unique_start = set()
num_non_base = 0

for name in images.keys():
    if "Base" in name:
        num_base += 1
        unique_start.add(name.split("/")[0])
    else:
        num_non_base += 1

print(f"Number of Base Images: {num_base}")
print(f"Number of Non-Base Images: {num_non_base}")

Number of Base Images: 0
Number of Non-Base Images: 1200


In [18]:
# view images which are loaded with the incorrect names
unique_start

set()

### Image Processing

In [19]:
# convert the list of images to a dataframe
df = pd.DataFrame(images).T
df.reset_index(drop=True, inplace=True)
df.sample(5)

Unnamed: 0,Image_ID,Original_Size,Image
142,20051117_37051_0400_PP.tif,"(1440, 960)",<PIL.Image.Image image mode=RGB size=1440x960 ...
830,20051213_62251_0100_PP.tif,"(2240, 1488)",<PIL.Image.Image image mode=RGB size=1440x960 ...
845,20060410_40846_0200_PP.tif,"(1440, 960)",<PIL.Image.Image image mode=RGB size=1440x960 ...
624,20060523_48990_0100_PP.tif,"(2240, 1488)",<PIL.Image.Image image mode=RGB size=1440x960 ...
1057,20051205_33006_0400_PP.tif,"(1440, 960)",<PIL.Image.Image image mode=RGB size=1440x960 ...


In [20]:
# cleaning the names
def clean_name(name):
    if "Base" in name:
        return name.split("/")[1]
    return name


df["Image_ID"] = df["Image_ID"].apply(clean_name)

In [21]:
# viewing count of images with different sizes
df["Original_Size"].value_counts()

Original_Size
(1440, 960)     588
(2240, 1488)    400
(2304, 1536)    212
Name: count, dtype: int64

In [22]:
# converting image to numpy array
df["Image"] = df["Image"].apply(lambda x: np.array(x))

In [23]:
# merging with the labels
labels = pd.read_parquet("../02_Data/Extra/messidor_mapping.parquet")
labels.sample(5)

Unnamed: 0,Image_ID,Department,Retinopathy_Grade,Risk_of_Macular_Edema,Data_Source
7,20051202_37199_0400_PP.tif,LaTIM - CHU de BREST,1,0,Messidor
797,20060412_52351_0200_PP.tif,CHU de St Etienne,0,0,Messidor
996,20060523_49859_0100_PP.tif,Service Ophtalmologie Lariboisière,3,0,Messidor
528,20051205_58458_0400_PP.tif,LaTIM - CHU de BREST,0,0,Messidor
995,20060523_49809_0100_PP.tif,Service Ophtalmologie Lariboisière,3,2,Messidor


In [24]:
# merging the labels
df_final = pd.merge(
    df, labels, on="Image_ID", how="inner", validate="1:1", indicator=True
)

In [25]:
df_final["_merge"].value_counts()

_merge
both          1200
left_only        0
right_only       0
Name: count, dtype: int64

In [26]:
df_final.sample(5)

Unnamed: 0,Image_ID,Original_Size,Image,Department,Retinopathy_Grade,Risk_of_Macular_Edema,Data_Source,_merge
962,20051020_44261_0100_PP.tif,"(2240, 1488)","[[[1, 0, 1], [2, 0, 2], [2, 0, 2], [1, 0, 1], ...",Service Ophtalmologie Lariboisière,0,0,Messidor,both
693,20060412_61151_0200_PP.tif,"(1440, 960)","[[[0, 1, 6], [1, 0, 5], [1, 0, 2], [0, 0, 0], ...",CHU de St Etienne,2,0,Messidor,both
628,20060411_58413_0200_PP.tif,"(1440, 960)","[[[2, 0, 3], [1, 1, 1], [0, 1, 0], [0, 2, 1], ...",CHU de St Etienne,0,0,Messidor,both
639,20051214_41358_0100_PP.tif,"(2240, 1488)","[[[2, 0, 2], [2, 0, 2], [2, 0, 2], [3, 1, 3], ...",Service Ophtalmologie Lariboisière,2,0,Messidor,both
270,20060523_43267_0100_PP.tif,"(2240, 1488)","[[[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 0, 1], ...",Service Ophtalmologie Lariboisière,2,1,Messidor,both


In [27]:
# dropping the merge column
df_final.drop(columns="_merge", inplace=True)

In [28]:
# saving the final dataframe as a pickle file
with open("/Users/revanth/Documents/messidor_base.pkl", "wb") as f:
    pickle.dump(df_final, f)

In [29]:
# cleaning up the temporary directory
shutil.rmtree(temp_dir)