# APTOS 2019 Blindness Detection


다음 커널을 base로 작성하였습니다.


https://www.kaggle.com/carlolepelaars/efficientnetb5-with-keras-aptos-2019/data

상수 정의

In [None]:
import pandas as pd
import os.path

# Dataset locations (Kaggle kernel layout).
# Training images : ../input/aptos-train-dataset/aptos-train-images/aptos-train-images/
# Original data   : ../input/aptos2019-blindness-detection
TRAIN_DATA_PATH = "../input/aptos-train-dataset"
TEST_DATA_PATH = "../input/aptos2019-blindness-detection"
PREPROCESSED_IMAGE_PATH = "./preprocessed"
MODEL_PATH = "./models"

TRAIN_CSV_FILE_PATH = os.path.join(TRAIN_DATA_PATH, "train.csv")
TRAIN_IMAGE_FILE_PATH = "../input/aptos-train-dataset/aptos-train-images/aptos-train-images"

TEST_CSV_FILE_PATH = "../input/aptos2019-blindness-detection/test.csv"
TEST_IMAGE_FILE_PATH = "../input/aptos2019-blindness-detection/test_images"

FOLDED_DATASETS_PATH = "../input/APTOS_data_files"

# Smoke-test switch: used to check that the whole kernel runs end to end,
# including that the submission step completes normally. When set, fewer
# epochs are used and training samples a fraction of each fold.
CHECK_KERNEL_VALID = True

# Image geometry used for resizing and for the network input.
IMG_WIDTH, IMG_HEIGHT, IMG_CHANNELS = 224, 224, 3

BATCH_SIZE = 32
NUM_FOLDS = 6

# Full run: 40 epochs; smoke test: 6.
EPOCHS = 6 if CHECK_KERNEL_VALID else 40

# When set, existing .h5 weight files are neither copied in nor reused;
# training regenerates them.
GENERATE_WEIGHTS = True

# Fold indices handled by this kernel run.
ASSIGNED_FOLD_JOBS = list(range(NUM_FOLDS))

def constants_check():
    """Sanity-check the configured paths before any heavy work starts.

    Reads both CSV files (so a wrong path fails right here), prints how many
    entries the train and test image directories contain, and asserts that
    each directory holds more than 100 entries.
    """
    for csv_path in (TRAIN_CSV_FILE_PATH, TEST_CSV_FILE_PATH):
        pd.read_csv(csv_path)

    train_imgs_count, test_imgs_count = (
        len(os.listdir(image_dir))
        for image_dir in (TRAIN_IMAGE_FILE_PATH, TEST_IMAGE_FILE_PATH)
    )

    print("train images : ", train_imgs_count)
    print("test images : ", test_imgs_count)

    assert train_imgs_count > 100
    assert test_imgs_count > 100


constants_check()

In [None]:
from pathlib import Path
import shutil

# Make sure the output directory for model files exists
# (exist_ok makes a separate existence check unnecessary).
Path(MODEL_PATH).mkdir(parents=True, exist_ok=True)


# Copy files prepared in advance into MODEL_PATH. Unless the purpose of this
# run is to generate weights, the weight files are copied too; when
# GENERATE_WEIGHTS is set the .h5 files are left out.

pre_models_path = "../input/aptos-data-files"

if os.path.isdir(pre_models_path):
    for fname in os.listdir(pre_models_path):
        filepath = os.path.join(pre_models_path, fname)
        print(filepath)
        if not os.path.isfile(filepath):
            continue
        # Test the extension itself: a substring search for "h5" would also
        # match unrelated names and would miss a name that starts with "h5".
        if GENERATE_WEIGHTS and fname.endswith(".h5"):
            continue
        destfilepath = os.path.join(MODEL_PATH, fname)
        print("copy file ", filepath, " to ", destfilepath)
        shutil.copy(filepath, destfilepath)

In [None]:
# Load the train and test tables; later cells reuse these two frames.
df_train = pd.read_csv(TRAIN_CSV_FILE_PATH)
df_test = pd.read_csv(TEST_CSV_FILE_PATH)
# Last expression of the cell: displays the first rows of the training table.
df_train.head()

In [None]:
import cv2
#from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Preview: an n x n grid of random training images after the
# "4 * image - 4 * blur + 128" contrast filter (no cropping or resizing).
n = 3
fix, ax = plt.subplots(n, n, figsize=(16, 16))

# df_sample and n are reused by the preprocessing preview cell further down.
df_sample = df_train.sample(n * n)

axidx = 0
for _, row in df_sample.iterrows():
    imgpath = os.path.join(TRAIN_IMAGE_FILE_PATH, row['id_code'])

    # OpenCV loads BGR; matplotlib expects RGB.
    im = cv2.cvtColor(cv2.imread(imgpath), cv2.COLOR_BGR2RGB)
    blurred = cv2.GaussianBlur(im, (0, 0), 10)
    im = cv2.addWeighted(im, 4, blurred, -4, 128)

    grid_row, grid_col = divmod(axidx, n)
    ax[grid_row][grid_col].imshow(im)
    axidx += 1

## Preprocessing

In [None]:
from pathlib import Path
import cv2
import numpy as np

from PIL import Image, ImageChops

default_ratio = 1.0

def crop_image_from_gray(img, tol=7):
    """
    Crop away the dark border around an image.

    Pixels whose gray value is above `tol` count as content; every row and
    column that contains no content pixel is dropped.

    img : 2-D (single plane) or 3-D (RGB) image array
    tol : gray-level threshold separating background from content

    Returns the cropped array. A 3-D image without any pixel above `tol`
    is returned unchanged; input of any other rank yields None.
    """
    if img.ndim == 2:
        # Single-plane image: the image is its own gray map.
        keep = img > tol
        return img[np.ix_(keep.any(1), keep.any(0))]

    if img.ndim == 3:
        keep = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) > tol
        # Row/column selector shared by all three colour planes.
        window = np.ix_(keep.any(1), keep.any(0))
        if img[:, :, 0][window].shape[0] == 0:
            # Image is so dark that everything would be cropped away:
            # hand back the original instead.
            return img
        planes = [img[:, :, channel][window] for channel in range(3)]
        return np.stack(planes, axis=-1)

    return None

#from : https://www.kaggle.com/carlolepelaars/efficientnetb5-with-keras-aptos-2019/data
def preprocess_image(path):
    """
    Load one image file and run the whole preprocessing pipeline on it.

    Steps, in the order they are applied:
    1. Read the file and convert OpenCV's BGR channel order to RGB
    2. Crop the dark border (crop_image_from_gray)
    3. Enhance local contrast:
       4 * image - 4 * GaussianBlur(image, sigma=10) + 128
    4. Resize to IMG_WIDTH x IMG_HEIGHT

    path : path of the image file to load

    Returns the processed image array.
    Raises FileNotFoundError when OpenCV cannot read `path`.
    """
    im = cv2.imread(path)
    if im is None:
        # cv2.imread reports failure by returning None; without this check
        # the problem only surfaces as an opaque error inside cvtColor.
        raise FileNotFoundError(f"cannot read image file: {path}")

    im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    im = crop_image_from_gray(im)
    im = cv2.addWeighted(im, 4, cv2.GaussianBlur(im, (0, 0), 10), -4, 128)
    # cv2.resize takes the target size as (width, height).
    im = cv2.resize(im, (IMG_WIDTH, IMG_HEIGHT))
    return im
    


In [None]:
# Same random sample as the raw preview cell, now run through the full
# preprocess_image pipeline; each tile is titled with its id_code.
fix, ax = plt.subplots(n, n, figsize=(20, 20))

axidx = 0
for _, row in df_sample.iterrows():
    filename = row['id_code']
    imgpath = os.path.join(TRAIN_IMAGE_FILE_PATH, filename)
    im = preprocess_image(imgpath)

    tile = ax[axidx // n][axidx % n]
    tile.imshow(im)
    tile.set_title(filename)
    axidx += 1

In [None]:
import os
import sys
sys.path.append(os.path.abspath('../input/efficientnet/efficientnet-master/efficientnet-master/'))

from efficientnet import EfficientNetB5

## Metrics

In [None]:
def get_preds_and_labels(model, generator):
    """
    Run the model over one full pass of a generator.

    model : object with a predict(x) method
    generator : batch iterator that exposes `samples` and `batch_size`
                and yields (x, y) pairs from next()

    Returns (predictions, labels), both flattened to 1-D, in the order the
    batches were drawn.
    """
    # Derive the step count from the generator's own batch size, so the pass
    # covers exactly `samples` items however the generator was configured.
    steps = int(np.ceil(generator.samples / generator.batch_size))

    preds = []
    labels = []
    for _ in range(steps):
        x, y = next(generator)
        preds.append(model.predict(x))
        labels.append(y)
    # Flatten list of numpy arrays
    return np.concatenate(preds).ravel(), np.concatenate(labels).ravel()



## Evaluation

submission 평가가 QWK(Quadratic Weighted Kappa) 기반으로 이루어진다.

### Cohen's Kappa
두 연구자간 동일한 결과를 내놓는지를 수치화하는 방법이다. <br>
더 자세히 설명하면, 두 연구자 간 일치한 결과 중에서 우연히 일치할 가능성을 제외하고, 실제로 평가가 일치한 결과가 어느 정도인지 보여주는 지표이다.<br>
nominal(category간 거리가 같은)한 범주에 사용된다.

### Cohen's weighted Kappa
Cohen's Kappa와는 다르게, ordinal(순서가 있는, 예를 들어 관절염의 5 단계(1:없음, 2:경증 ... 5:심각) 등을 표현 시) 변수를 대상으로 할 경우에는 Cohen's weighted Kappa를 사용한다. <br>
순서(또는 단계)가 있는 변수를 판단하므로 범주(카테고리)간 거리는 서로 다르고, 두 연구자간 결과가 다를 경우에도 다름의 크기에 가중치 차이가 있을 것이다.<br>
이런 식으로 각각 다른 비중(weight)을 두어 불일치 정도를 평가하는 것이다.

각 범주간 차이에 비중을 부여하는 방법으로는 값의 차이를 그대로 사용하는 linear 방법과, 제곱해서 사용하는 quadratic 방법이 있다.<br>
(1과 3의 차이 : linear = 2, quadratic = 4)




QWK가 개선되는 경우 모델을 저장하는 custom callback을 정의해서 train시 사용한다.

> **sklearn.metrics.cohen_kappa_score(y1, y2, labels=None, weights=None, sample_weight=None)**

Cohen’s kappa: a statistic that measures inter-annotator agreement.



In [None]:
from sklearn.metrics import cohen_kappa_score

from keras.callbacks import Callback

class Metrics(Callback):
    """
    A custom Keras callback that scores the validation data with the
    Quadratic Weighted Kappa (QWK) metric after every epoch and saves the
    model whenever the score equals the best value seen so far.
    """
    def __init__(self, model, val_generator, model_save_filepath):
        """
        model : model to evaluate and save
        val_generator : validation batch generator passed to
                        get_preds_and_labels
        model_save_filepath : destination path for model.save()
        """
        # Initialise the Callback base class first, then assign our own
        # attributes so the base initialiser cannot overwrite them.
        super().__init__()
        self.model = model
        self.val_generator = val_generator
        self.model_save_filepath = model_save_filepath

    def on_train_begin(self, logs=None):
        """
        Initialize list of QWK scores on validation data
        """
        self.val_kappas = []

    def on_epoch_end(self, epoch, logs=None):
        """
        Gets QWK score on the validation data
        """
        # Get predictions and convert to integer grades 0..4.
        y_pred, labels = get_preds_and_labels(self.model, self.val_generator)
        # Clip BEFORE the unsigned cast: casting a negative prediction to
        # uint8 first would wrap it to a large value that then clips to 4.
        y_pred = np.clip(np.rint(y_pred), 0, 4).astype(np.uint8)
        # We can use sklearns implementation of QWK straight out of the box
        # as long as we specify weights as 'quadratic'
        _val_kappa = cohen_kappa_score(labels, y_pred, weights='quadratic')
        self.val_kappas.append(_val_kappa)
        print(f"val_kappa: {round(_val_kappa, 4)}")
        # The newest score was just appended, so equality with the maximum
        # means "best so far" (ties included).
        if _val_kappa == max(self.val_kappas):
            print("Validation Kappa has improved. Saving model.")
            self.model.save(self.model_save_filepath)
        return

## Modeling

callback 정의

In [None]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint


def get_callbacks(model, val_generator, model_save_filepath):
    """Assemble the callbacks used while training one fold.

    Returns a list with, in order:
    - early stopping on val_loss (patience 15),
    - learning-rate reduction when val_loss plateaus (factor 0.8, patience 4),
    - the QWK Metrics callback, which saves the model to
      `model_save_filepath`.
    """
    return [
        EarlyStopping(monitor='val_loss', mode='auto', verbose=1, patience=15),
        ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=4),
        Metrics(model, val_generator, model_save_filepath),
    ]

EfficientNetB5를 base로 하고, 출력은 linear로 뽑는다.<br>
output(진단 등급 0~4)이 ordinal(순서형)이기 때문에 분류가 아닌 회귀로 보고, linear로 출력하며 loss는 MSE로 한다.

In [None]:
from keras import backend as K
from keras.activations import elu
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, GlobalAveragePooling2D, Dropout


def build_model_effnet_b5(load_weights = True):
    """
    A custom implementation of EfficientNetB5
    for the APTOS 2019 competition
    (Regression)

    The EfficientNetB5 base (without its top) is followed by global average
    pooling, dropout 0.5, Dense(5, elu) and a single linear output unit.
    The model is compiled with MSE loss and Adam(0.0001).

    load_weights : when true, load the ImageNet "notop" weights into the
                   base network; pass False when fold weights will be
                   loaded into the whole model afterwards.

    Returns the compiled model.
    """

    # Load in EfficientNetB5. Keras image shapes are (height, width, channels).
    effnet_b5 = EfficientNetB5(weights=None,
                        include_top=False,
                        input_shape=(IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS))
    if load_weights:
        effnet_b5.load_weights('../input/efficientnet-keras-weights-b0b5/efficientnet-b5_imagenet_1000_notop.h5')

    model = Sequential()
    model.add(effnet_b5)
    model.add(GlobalAveragePooling2D())
    model.add(Dropout(0.5))
    model.add(Dense(5, activation=elu))
    model.add(Dense(1, activation="linear"))
    model.compile(loss='mse',
                  optimizer=Adam(0.0001),
                  metrics=['mse', 'acc'])
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print().
    model.summary()
    return model

# Names of the model architectures trained and used for the submission.
models_list = ["effnet_b5"]

def get_model(m, load_weights = True):
    """
    Build the model registered under the name `m`.

    m : model name; currently only "effnet_b5" is known
    load_weights : forwarded to the model builder

    Returns the compiled model.
    Raises ValueError for an unknown name (instead of silently returning
    None, which would only fail later at the first use of the model).
    """
    if m == "effnet_b5":
        return build_model_effnet_b5(load_weights)
    raise ValueError("unknown model name: %r" % (m,))

In [None]:
def get_total_batch(num_samples, batch_size):
    """Return how many batches are needed to cover `num_samples` items.

    A trailing partial batch counts as one extra batch.
    """
    full_batches, leftover = divmod(num_samples, batch_size)
    return full_batches + 1 if leftover > 0 else full_batches

In [None]:
from keras.backend.tensorflow_backend import set_session
from keras.backend.tensorflow_backend import clear_session
from keras.backend.tensorflow_backend import get_session

# Reset Keras Session
def reset_keras():
    """Clear the Keras state and close the session that was current on entry."""
    # Fetch the session handle before clearing, then close that handle.
    active_session = get_session()
    clear_session()
    active_session.close()

In [None]:
import os
import gc
import psutil 

from keras import backend as K
from keras.preprocessing.image import ImageDataGenerator


def train_one_fold(model_name, fold_index):
    """
    Train one model on one cross-validation fold.

    Reads fold_<fold_index>_train.csv and fold_<fold_index>_val.csv from
    MODEL_PATH, wraps them in Keras generators (augmentation on the training
    side only) and fits the model. The callbacks from get_callbacks save the
    model to MODEL_PATH/<model_name>_<fold_index>.h5.

    model_name : model name understood by get_model
    fold_index : index of the fold to train

    Returns None. When a weight file for this fold already exists it is
    deleted and retrained if GENERATE_WEIGHTS is set, otherwise the fold is
    skipped.
    """
    model_save_filename = ("%s_%d.h5" % (model_name , fold_index))
    model_save_filepath = os.path.join(MODEL_PATH, model_save_filename)

    # Decide whether there is anything to do BEFORE building the model and
    # the generators, so a skipped fold costs nothing.
    if os.path.exists(model_save_filepath):
        if GENERATE_WEIGHTS:
            os.remove(model_save_filepath)
        else:
            print(">>>>>>>>>>", model_save_filepath, " already trained... skip!")
            return

    # load folded dataframe
    df_train_filename = ("fold_%d_train.csv" % fold_index)
    df_val_filename = ("fold_%d_val.csv" % fold_index)

    dataframe_train = pd.read_csv(os.path.join(MODEL_PATH, df_train_filename))
    dataframe_val = pd.read_csv(os.path.join(MODEL_PATH, df_val_filename))

    # Smoke test: keep only 1/80 of each split.
    if CHECK_KERNEL_VALID:
        dataframe_train = dataframe_train.sample(int(dataframe_train.shape[0] / 80))
        dataframe_val = dataframe_val.sample(int(dataframe_val.shape[0] / 80))

    print("Data Counts: train=", dataframe_train.shape[0], " validation=", dataframe_val.shape[0])

    # Add Image augmentation to our generator
    train_datagen = ImageDataGenerator(rescale = 1./255,
                                       rotation_range=360,
                                       horizontal_flip=True,
                                       vertical_flip=True)

    val_datagen = ImageDataGenerator(rescale = 1./255)

    # NOTE(review): the previous version passed
    # preprocessing_function=preprocess_image to flow_from_dataframe. That
    # method does not document such a parameter, and preprocess_image expects
    # a file path rather than an image array, so the argument could not have
    # applied the preprocessing. It is left out here; if the crop/contrast
    # preprocessing is wanted during training, an array-based function has to
    # be given to the ImageDataGenerator constructor instead - TODO confirm.
    generator_options = dict(x_col='id_code',
                             y_col='diagnosis',
                             directory=TRAIN_IMAGE_FILE_PATH,
                             # Keras expects (height, width) here.
                             target_size=(IMG_HEIGHT, IMG_WIDTH),
                             batch_size=BATCH_SIZE,
                             class_mode='other')

    # Use the dataframe to define train and validation generators
    train_generator = train_datagen.flow_from_dataframe(dataframe_train,
                                                        **generator_options)
    val_generator = val_datagen.flow_from_dataframe(dataframe_val,
                                                    **generator_options)

    train_steps = get_total_batch(train_generator.samples, BATCH_SIZE)
    val_steps = get_total_batch(val_generator.samples, BATCH_SIZE)
    print("Steps : train=", train_steps, " validation=", val_steps)

    model = get_model(model_name)

    # make callbacks
    callbacks = get_callbacks(model=model, val_generator=val_generator, model_save_filepath=model_save_filepath)

    # Single training phase over the whole network.
    model.fit_generator(train_generator,
                        steps_per_epoch = train_steps,
                        epochs = EPOCHS,
                        validation_data = val_generator,
                        validation_steps = val_steps,
                        callbacks = callbacks)
    
    
    
    
def train_models():
    """Train every model in models_list on every fold in ASSIGNED_FOLD_JOBS.

    After each fold the Keras session is cleared and the garbage collector
    is run repeatedly before the next model is built.
    """
    banner = "========================================================"
    jobs = ((name, fold) for name in models_list for fold in ASSIGNED_FOLD_JOBS)

    for _m, fold_index in jobs:
        print("")
        print(banner)
        print("Model : ", _m, "/ fold : ", fold_index)
        print(banner)
        print("")

        train_one_fold(_m, fold_index)

        # clear used memory
        K.clear_session()
        for _ in range(20):
            gc.collect()


train_models()

## Submission

최고 QWK값을 기록한 weight를 사용한다.

In [None]:
'''
def load_sub_models():
    sub_models = []
    for _m in models_list:
        print("Model ", _m, " : ")
        for _, _, filenames in os.walk(MODEL_PATH):
            for fname in filenames:
                if fname.find(_m) >= 0 and fname.find(".h5") >= 0:                    
                    model = get_model(_m, load_weights = False)
                    
                    fpath = os.path.join(MODEL_PATH, fname)
                    print(">>>>>>>>>> Loading weight file :", fpath)
                    model.load_weights(fpath)
                    
                    sub_models.append(model)
                    
    return sub_models

sub_models = load_sub_models()
'''

def get_sub_model(model_name, fold_index):
    """
    Rebuild a model and load the weights saved for one fold.

    model_name : model name understood by get_model
    fold_index : fold whose weight file should be loaded

    Returns the model with weights loaded from
    MODEL_PATH/<model_name>_<fold_index>.h5.
    """
    model_save_filename = ("%s_%d.h5" % (model_name , fold_index))
    model_save_filepath = os.path.join(MODEL_PATH, model_save_filename)

    # The ImageNet weights are skipped: the fold weights replace them anyway.
    # (This used to reference an undefined name `m` instead of `model_name`.)
    model = get_model(model_name, load_weights = False)
    model.load_weights(model_save_filepath)

    return model

In [None]:
import numpy as np
import scipy as sp
from functools import partial

class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa score

    Four thresholds split continuous predictions into the grades 0..4.
    """
    def __init__(self):
        # Replaced by the optimizer result object once fit() has run.
        self.coef_ = 0

    def _kappa_loss(self, coef, X, y):
        """
        Get loss according to
        using current coefficients

        Returns the negated QWK of the thresholded predictions, so that
        minimising the loss maximises the kappa.
        """
        X_p = self.predict(X, coef)
        ll = cohen_kappa_score(y, X_p, weights='quadratic')
        return -ll

    def fit(self, X, y):
        """
        Optimize rounding thresholds

        X : continuous predictions
        y : true grades

        Stores the Nelder-Mead optimisation result in self.coef_.
        """
        # Import the submodule explicitly rather than relying on it having
        # been loaded as a side effect of some other import.
        from scipy import optimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        self.coef_ = optimize.minimize(loss_partial, initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """
        Make predictions with specified thresholds

        X : continuous predictions (array or sequence)
        coef : four thresholds

        Returns a new array shaped like X, with X's dtype, holding the
        grades 0..4. A value gets the index of the first threshold it is
        strictly below, or 4 if it is below none of them.
        """
        X = np.asarray(X)
        # np.select takes the first condition that holds, which reproduces
        # an if/elif chain over the thresholds, all elements at once.
        below = [X < coef[k] for k in range(4)]
        grades = np.select(below, [0, 1, 2, 3], default=4)
        return grades.astype(X.dtype)

    def coefficients(self):
        """
        Return the fitted thresholds.

        Raises RuntimeError when fit() has not been called yet.
        """
        try:
            return self.coef_['x']
        except TypeError:
            raise RuntimeError("OptimizedRounder.fit() must be called before coefficients()") from None

In [None]:
# Create the threshold optimiser used for the final rounding of predictions.
# NOTE(review): the thresholds are fitted on three hard-coded
# (prediction, label) pairs rather than on model output - this looks like a
# placeholder; presumably the fit should use validation predictions and
# labels. Confirm before relying on the resulting coefficients.
optR = OptimizedRounder()
optR.fit([0.2, 3.2, 4.8], [0, 3, 4])

In [None]:
import numpy as np
from tqdm import tqdm

# Preprocess test images
# The buffer is laid out (N, height, width, channels), which is the layout of
# the arrays produced by cv2.resize inside preprocess_image.
N = df_test.shape[0]
x_test = np.empty((N, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), dtype=np.uint8)
# tqdm shows progress while the images are processed one by one.
for i, image_id in enumerate(tqdm(df_test['id_code'])):
    x_test[i] = preprocess_image(f'{TEST_IMAGE_FILE_PATH}/{image_id}.png')

In [None]:
# Sanity check: show the shape of the preprocessed test image array.
print(x_test.shape)

In [None]:
# Test-time augmentation (TTA): every fold model predicts the test set
# TTA_STEPS times under random augmentation; the predictions are averaged
# per model, then over all models, and finally rounded to grades.
TTA_STEPS = 4
predictions = []

from keras.preprocessing.image import ImageDataGenerator

# The ids in test.csv carry no file extension (the test preprocessing cell
# appends ".png" as well), but the generator needs real file names.
test_ids = df_test['id_code'].astype(str)
df_tta = pd.DataFrame({
    'id_code': test_ids.where(test_ids.str.endswith('.png'), test_ids + '.png')
})

# Same augmentation and 1/255 rescaling as the training generator.
tta_datagen = ImageDataGenerator(rescale=1./255,
                                 rotation_range=360,
                                 horizontal_flip=True,
                                 vertical_flip=True)

steps = get_total_batch(df_tta.shape[0], BATCH_SIZE)

for _m in models_list:
    for fold_index in ASSIGNED_FOLD_JOBS:

        model = get_sub_model(_m, fold_index)

        # The test set has no labels (y_col/class_mode None), and shuffling
        # must be off so that row k of every prediction array belongs to
        # row k of df_test.
        # NOTE(review): the previous version passed
        # preprocessing_function=preprocess_image here; flow_from_dataframe
        # does not document such a parameter and preprocess_image expects a
        # path, so it is omitted. Whatever preprocessing training really
        # applies must be mirrored here - TODO confirm.
        flow = tta_datagen.flow_from_dataframe(df_tta,
                                               x_col='id_code',
                                               y_col=None,
                                               directory=TEST_IMAGE_FILE_PATH,
                                               # Keras expects (height, width).
                                               target_size=(IMG_HEIGHT, IMG_WIDTH),
                                               batch_size=BATCH_SIZE,
                                               class_mode=None,
                                               shuffle=False)
        preds = []
        for _ in range(TTA_STEPS):
            flow.reset()
            pred = model.predict_generator(generator=flow,
                                           steps=steps,
                                           verbose=1)
            preds.append(pred)

        # Average the augmented passes of this model.
        predictions.append(np.mean(preds, axis=0))

        # clear used memory
        K.clear_session()
        for _ in range(20):
            gc.collect()

# Average over all models/folds and flatten (N, 1) to (N,).
pred = np.mean(predictions, axis=0).ravel()

# Round the continuous predictions to grades with the fitted thresholds.
coefficients = optR.coefficients()
y_test = optR.predict(pred, coefficients).astype(int)
df_test['diagnosis'] = y_test
# Remove .png, .jpeg, .jpg from ids (dot escaped; regex requested explicitly).
df_test['id_code'] = df_test['id_code'].str.replace(r'\.(png|jpeg|jpg)$', '', regex=True)
df_test.to_csv('submission.csv', index=False)

In [None]:
# Variant of the preceding submission cell that loads all fold models up
# front and then runs test-time augmentation (TTA) over them. It writes the
# same submission.csv, so only one of the two cells needs to be run.
TTA_STEPS = 4
predictions = []

from keras.preprocessing.image import ImageDataGenerator

# The ids in test.csv carry no file extension, but the generator needs real
# file names.
test_ids = df_test['id_code'].astype(str)
df_tta = pd.DataFrame({
    'id_code': test_ids.where(test_ids.str.endswith('.png'), test_ids + '.png')
})

# Same augmentation and 1/255 rescaling as the training generator.
tta_datagen = ImageDataGenerator(rescale=1./255,
                                 rotation_range=360,
                                 horizontal_flip=True,
                                 vertical_flip=True)

# One model per (architecture, fold) pair; `submodels` was previously never
# defined.
submodels = [get_sub_model(_m, fold_index)
             for _m in models_list
             for fold_index in ASSIGNED_FOLD_JOBS]

steps = get_total_batch(df_tta.shape[0], BATCH_SIZE)

for model in submodels:
    # No labels for the test set; shuffle off keeps predictions aligned with
    # the rows of df_test.
    # NOTE(review): preprocessing_function=preprocess_image was passed here
    # before; flow_from_dataframe does not document such a parameter and
    # preprocess_image expects a path, so it is omitted - TODO confirm what
    # preprocessing training applies and mirror it.
    flow = tta_datagen.flow_from_dataframe(df_tta,
                                           x_col='id_code',
                                           y_col=None,
                                           directory=TEST_IMAGE_FILE_PATH,
                                           # Keras expects (height, width).
                                           target_size=(IMG_HEIGHT, IMG_WIDTH),
                                           batch_size=BATCH_SIZE,
                                           class_mode=None,
                                           shuffle=False)
    preds = []
    for _ in range(TTA_STEPS):
        flow.reset()
        pred = model.predict_generator(generator=flow, steps=steps, verbose=1)
        preds.append(pred)

    # Average the augmented passes of this model.
    predictions.append(np.mean(preds, axis=0))

# Average over all models and flatten (N, 1) to (N,).
pred = np.mean(predictions, axis=0).ravel()

# Round the continuous predictions to grades with the fitted thresholds.
coefficients = optR.coefficients()
y_test = optR.predict(pred, coefficients).astype(int)
df_test['diagnosis'] = y_test
# Remove .png, .jpeg, .jpg from ids (dot escaped; regex requested explicitly).
df_test['id_code'] = df_test['id_code'].str.replace(r'\.(png|jpeg|jpg)$', '', regex=True)
df_test.to_csv('submission.csv', index=False)