In [33]:
# importing required libraries
# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import matplotlib.pyplot as plt
import numpy as np
import time
import warnings
import pandas as pd
import os
import parquet
from PIL import Image

# Tensor Flow
# tensorflow
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import preprocess_input

In [35]:
tf.config.list_physical_devices("CPU")

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [2]:
# Setting default options
# setting default configuration options
pd.set_option("mode.copy_on_write", True)
warnings.simplefilter(action="ignore")  # , category=FutureWarning)

## Loading Data

In [3]:
path = "../aws_s3/Messidor_Data/Base/"

In [4]:
test_files = os.listdir(path + "Test")
print(test_files[:5])

['20060530_52988_0100_PP.tif', '20060530_55468_0100_PP.tif', '20051202_56050_0400_PP.tif', '20051202_36970_0400_PP.tif', '20060411_59941_0200_PP.tif']


In [5]:
train_files = os.listdir(path + "Train")
print(train_files[:5])

['20060410_47186_0200_PP.tif', '20051020_63337_0100_PP.tif', '20060529_56730_0100_PP.tif', '20051117_37232_0400_PP.tif', '20060412_61450_0200_PP.tif']


In [6]:
mapping = pd.read_parquet("../02_Data/01_messidor_mapping.parquet")
mapping.sample(5)

Unnamed: 0,Image_ID,Department,Retinopathy_Grade,Risk_of_Macular_Edema,Data_Source,Split,Original_Size
1067,20051130_60186_0400_PP.tif,LaTIM - CHU de BREST,0,0,Messidor,Train,"[2304, 1536]"
690,20060410_47042_0200_PP.tif,CHU de St Etienne,0,0,Messidor,Train,"[1440, 960]"
241,20060522_46104_0100_PP.tif,Service Ophtalmologie Lariboisière,0,0,Messidor,Train,"[2240, 1488]"
1192,20060412_59636_0200_PP.tif,CHU de St Etienne,1,0,Messidor,Train,"[1440, 960]"
371,20051021_59589_0100_PP.tif,Service Ophtalmologie Lariboisière,0,0,Messidor,Train,"[2240, 1488]"


In [13]:
X_train = np.array(
    [
        (np.array(Image.open(path + "Train/" + i))[None, ...] / 255.0).flatten()
        for i in train_files
    ]
)
y_train = np.array(
    [
        mapping.loc[mapping["Image_ID"] == i, "Retinopathy_Grade"].values[0]
        for i in train_files
    ]
)

In [14]:
X_test = np.array(
    [
        (np.array(Image.open(path + "Test/" + i))[None, ...] / 255.0).flatten()
        for i in test_files
    ]
)
y_test = np.array(
    [
        mapping.loc[mapping["Image_ID"] == i, "Retinopathy_Grade"].values[0]
        for i in test_files
    ]
)

In [15]:
X_train.shape

(1176, 786432)

In [16]:
y_train.shape

(1176,)

### Random Forest

In [17]:
# Initializing the Model
model_rf_raw = RandomForestClassifier(n_estimators=100)

# Fitting the Model
t = time.time()
model_rf_raw.fit(X_train, y_train)
print(f"Training Time: {time.time() - t:.2f} Seconds")

Training Time: 41.04 Seconds


In [18]:
# Predicting the Test Data
t = time.time()
y_pred = model_rf_raw.predict(X_test)
print(f"Prediction Time: {time.time() - t:0.2f} Seconds")

Prediction Time: 0.02 Seconds


In [19]:
# Model Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")

Accuracy: 33.33%
Confusion Matrix: 
[[6 0 0 0]
 [5 0 1 0]
 [3 0 1 2]
 [3 0 2 1]]


### Logistic Regression

In [20]:
# Initializing the Model
model_lr_raw = LogisticRegression(
    max_iter=200, multi_class="multinomial", solver="lbfgs", random_state=42
)

# Fitting the Model
t = time.time()
model_lr_raw.fit(X_train, y_train)
print(f"Training Time: {time.time() - t:0.2f} Seconds")

Training Time: 182.65 Seconds


In [21]:
t = time.time()
y_pred = model_lr_raw.predict(X_test)
print(f"Prediction Time: {time.time() - t:0.2f} Seconds")

Prediction Time: 0.02 Seconds


In [22]:
# Model Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")

Accuracy: 29.17%
Confusion Matrix: 
[[5 0 1 0]
 [2 0 3 1]
 [0 1 0 5]
 [2 0 2 2]]


### RESNET

In [23]:
# Load a pretrained model
model = tf.keras.applications.resnet50.ResNet50(
    include_top=False,  # take any input shape, not just 224x224
    weights="imagenet",  # use the imagenet pretrained model
    pooling="avg",  # use global average pooling to get a 2048 vector from our image
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [26]:
%%capture
def preprocess(img):
    processed_img = preprocess_input(  # ResNet has its own preprocesser
        tf.image.resize(  # Images need to be in (224, 224) dimension
            np.array(img), (224, 224)  # Turn PIL image object into array
        )
    )[None]
    return processed_img


# use resnet to get the embeddings
X_train_resnet = np.array(
    [model.predict(preprocess(Image.open(path + "Train/" + i))) for i in train_files]
).reshape(-1, 2048)
X_test_resnet = np.array(
    [model.predict(preprocess(Image.open(path + "Test/" + i))) for i in test_files]
).reshape(-1, 2048)

### Random Forest - ResNet Optimized

In [27]:
# Initializing the Model
model_rf_resnet = RandomForestClassifier(n_estimators=100)

# Fitting the Model
t = time.time()
model_rf_resnet.fit(X_train_resnet, y_train)
print(f"Training Time: {time.time() - t:.2f} Seconds")

Training Time: 2.64 Seconds


In [28]:
# predicting the test data
t = time.time()
y_pred = model_rf_resnet.predict(X_test_resnet)
print(f"Prediction Time: {time.time() - t:0.2f} Seconds")

Prediction Time: 0.00 Seconds


In [29]:
# Model Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")

Accuracy: 37.50%
Confusion Matrix: 
[[6 0 0 0]
 [5 0 0 1]
 [4 0 1 1]
 [4 0 0 2]]


### Logistic Regression - ResNet Optimized

In [30]:
# Initializing the Model
model_lr_resnet = LogisticRegression(
    max_iter=200, multi_class="multinomial", solver="lbfgs", random_state=42
)

# Fitting the Model
t = time.time()
model_lr_resnet.fit(X_train_resnet, y_train)
print(f"Training Time: {time.time() - t:0.2f} Seconds")

Training Time: 0.47 Seconds


In [31]:
# predicting the test data
t = time.time()
y_pred = model_lr_resnet.predict(X_test_resnet)
print(f"Prediction Time: {time.time() - t:0.2f} Seconds")

Prediction Time: 0.00 Seconds


In [32]:
# Model Evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")

Accuracy: 37.50%
Confusion Matrix: 
[[5 1 0 0]
 [4 0 2 0]
 [4 0 0 2]
 [0 1 1 4]]
