# 大腸NBI血管構造分類

## 前準備
### 主要パッケージを読み込む
loggerの設定も行う

In [None]:
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from IPython.display import display
from logging import basicConfig, getLogger, INFO
# Configure logging at INFO level with a "<time> <level> :<message>" layout.
basicConfig(level=INFO, format='%(asctime)s %(levelname)s :%(message)s')
# Logger used for the progress messages emitted by this notebook.
logger = getLogger(__name__)

### データディレクトリの指定

In [None]:
# Root directory of the image data; handed to tut_utils.create_dataset_df.
DATA_ROOT = pathlib.Path('Data/Images/ColonNBI/ColonNBI_original')
# Class labels; their count also determines the size of the model's output layer.
# NOTE(review): presumably these match sub-directory names under DATA_ROOT — confirm in tut_utils.
CLASS_LABELS = ('ColonNBI_I', 'ColonNBI_II_IIIA', 'ColonNBI_IIIB')
# Image file extension; handed to tut_utils.create_dataset_df.
IMAGE_EXT = '.png'

### 画像ファイルを基にpd.DataFrameを作成する

In [None]:
import tut_utils
# Build the dataset table from DATA_ROOT, CLASS_LABELS and IMAGE_EXT.
# NOTE(review): later cells read the 'filepath', 'class' and 'class_label'
# columns, so create_dataset_df is assumed to provide them — confirm in tut_utils.
df_dataset = tut_utils.create_dataset_df(DATA_ROOT, CLASS_LABELS, IMAGE_EXT)
display(df_dataset)

### クラスごとの画像数を確認する

In [None]:
# Count the rows per value of the 'class_label' column.
df_dataset['class_label'].value_counts()

### 各クラスの画像を表示してみる

In [None]:
# Display example images for each class via the tut_utils helper.
# NOTE(review): n_rows=1 presumably means one row of images per class — confirm in tut_utils.
tut_utils.show_images_each_class(df_dataset, n_rows=1)

### データ読み込み用の関数を作成

In [None]:
import tensorflow as tf
# Image shape used throughout: load_img resizes to it and uses the third entry
# to choose grayscale (1) vs RGB loading; it is also the MobileNetV2 input_shape.
IMG_SHAPE = (224, 224, 3)


def load_img(filepath):
    """Load one image file as an array resized to the model input size.

    The image is read in grayscale when ``IMG_SHAPE[2] == 1`` and in RGB
    otherwise, and resized to the first two entries of ``IMG_SHAPE``.

    Args:
        filepath: Path of the image file to read.

    Returns:
        A NumPy array with at least three dimensions, so that a
        single-channel image keeps an explicit trailing channel axis.
    """
    color_mode = 'grayscale' if IMG_SHAPE[2] == 1 else 'rgb'
    image = tf.keras.preprocessing.image.load_img(
        filepath,
        color_mode=color_mode,
        # target_size is documented as (height, width): pass only the spatial
        # part of IMG_SHAPE rather than the full 3-tuple.
        target_size=IMG_SHAPE[:2])
    return np.atleast_3d(image)


def load_dataset(df_train, df_test):
    """Read the images of a train/test split and divide them by 255.

    Args:
        df_train: DataFrame whose 'filepath' and 'class' columns describe
            the training rows.
        df_test: DataFrame with the same columns for the test rows.

    Returns:
        Tuple ``(train_data, train_labels, test_data, test_labels)``. The
        data entries are the stacked image arrays divided by 255; the label
        entries are the 'class' columns of the respective frames.
    """

    def _stack_scaled(filepaths):
        # One array per file, stacked along a new leading axis, then scaled.
        return np.stack([load_img(path) for path in filepaths]) / 255

    return (
        _stack_scaled(df_train['filepath']),
        df_train['class'],
        _stack_scaled(df_test['filepath']),
        df_test['class'],
    )

## Data augmentation
いくつかの画像に対して実際にaugmentationを適用し表示する

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator


def create_generator():
    """Build the ImageDataGenerator used for random image augmentation.

    Returns:
        An ``ImageDataGenerator`` configured with rotation, width/height
        shift, horizontal/vertical flip, zoom, shear and channel shift,
        using the 'reflect' fill mode.
    """
    augmentation_options = dict(
        rotation_range=180,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        zoom_range=.1,
        shear_range=15,
        channel_shift_range=.25,
        fill_mode='reflect',
    )
    return ImageDataGenerator(**augmentation_options)


# Number of leading dataset rows that test_generator previews.
N_TEST = 3


def test_generator(df_dataset):
    """Show up to N_TEST images next to one random augmentation of each.

    For every previewed row a small figure with two panels is displayed:
    the loaded image ('pre-augmentation') and one sample drawn from the
    augmentation generator ('post-augmentation').

    Args:
        df_dataset: DataFrame with 'filepath' and 'class' columns; it must
            contain at least one row.
    """
    df_preview = df_dataset.iloc[:N_TEST]
    # load_dataset needs a non-empty second frame even though its output is
    # not used here. Reusing the first preview row keeps this working when
    # the dataset has exactly N_TEST rows (a slice past the preview rows
    # would be empty and could not be stacked).
    images, labels, _, _ = load_dataset(df_preview, df_preview.iloc[:1])
    flow = create_generator().flow(images, labels, batch_size=1, shuffle=False)
    # Iterate over what was actually loaded so that a dataset with fewer than
    # N_TEST rows does not raise an IndexError.
    for i, image in enumerate(images):
        _, (ax_pre, ax_post) = plt.subplots(1, 2, figsize=(4, 1.5))
        ax_pre.imshow(image.squeeze())
        ax_pre.set_title('pre-augmentation')
        ax_pre.axis('off')
        # flow[i] is an (images, labels) batch of size 1; every access draws
        # a new random transform.
        augmented = flow[i][0].squeeze()
        # Channel shift can push values outside [0, 1]; clip explicitly so
        # imshow does not have to (and does not warn).
        ax_post.imshow(np.clip(augmented, 0, 1))
        ax_post.set_title('post-augmentation')
        ax_post.axis('off')
        plt.show()


# Preview the augmentation on the first N_TEST rows of the dataset.
test_generator(df_dataset)

## モデル作成
[ImageNet](http://www.image-net.org/)で事前学習されたMobileNetV2を基にモデルを作成する。
[参考](https://www.tensorflow.org/tutorials/images/transfer_learning)
<div class="alert alert-block alert-warning">
<b>注意:</b> BatchNormalizationレイヤーのmomentumは手動で再設定する</div>

In [None]:
import tensorflow as tf
import tensorflow.python
import tensorflow.python.keras
from tensorflow import keras
from tensorflow.keras import layers
# Momentum written to the BatchNormalization layers of the pretrained base model.
BN_MOMENTUM = 0.90


def is_batch_normalization(layer):
    """Tell whether ``layer`` is a batch normalization layer.

    The decision is based purely on the class name of ``layer``: only an
    object whose class is named exactly 'BatchNormalization' matches.
    This is a pragmatic check rather than a robust type test.

    Args:
        layer: The object (typically a Keras layer) to inspect.

    Returns:
        True if the class name is 'BatchNormalization', otherwise False.
    """
    class_name = layer.__class__.__name__
    return class_name == 'BatchNormalization'


def set_momentum(model, momentum):
    """Assign ``momentum`` to every batch normalization layer of ``model``.

    Only the layers listed directly in ``model.layers`` are visited.

    Args:
        model: Object exposing a ``layers`` iterable.
        momentum: Value stored in the ``momentum`` attribute of each
            matching layer.
    """
    bn_layers = filter(is_batch_normalization, model.layers)
    for bn_layer in bn_layers:
        bn_layer.momentum = momentum


def build_model_pretrained():
    """Create and compile the classifier on top of a pretrained MobileNetV2.

    Returns:
        A compiled ``tf.keras.Sequential`` model consisting of MobileNetV2
        (ImageNet weights, without its top), global average pooling and a
        Dense layer with one output per entry of ``CLASS_LABELS``.
    """
    backbone = tf.keras.applications.MobileNetV2(
        input_shape=IMG_SHAPE, include_top=False, weights='imagenet')
    # Overwrite the momentum of the backbone's BatchNormalization layers.
    set_momentum(backbone, BN_MOMENTUM)

    pooling = tf.keras.layers.GlobalAveragePooling2D()
    # No activation on the head: the loss is configured with from_logits=True.
    logits_head = tf.keras.layers.Dense(len(CLASS_LABELS))
    classifier = tf.keras.Sequential([backbone, pooling, logits_head])

    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(1e-4),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'])
    return classifier


# Build one model instance up front; the cross-validation loop builds a fresh model per fold.
model = build_model_pretrained()

## K-Fold 交差検証
trainingデータに対する`loss`の値を、早期終了(EarlyStopping)の基準として使用する。
<div class="alert alert-block alert-warning">
<b>注意:</b> 今回、trainingデータを用いて学習を終了させていますが、本来はvalidationデータ(trainingでもtestでもないデータ)を用いた方がよいです。
</div>

In [None]:
from tqdm.keras import TqdmCallback
from sklearn.model_selection import StratifiedKFold
# Number of folds for the stratified cross-validation.
K_FOLD = 3
# NOTE(review): shuffle=True without random_state, so fold assignment differs between runs.
kfold = StratifiedKFold(n_splits=K_FOLD, shuffle=True)

# Batch size passed to datagen.flow during training.
BATCH_SIZE = 8
# Upper bound on the number of training epochs per fold.
EPOCHS = 128
# Patience passed to the EarlyStopping callback.
PATIENCE = 16


def evaluate_multiclass(model, data, index):
    """Run ``model`` on ``data`` and tabulate the per-sample predictions.

    Args:
        model: Object whose ``predict(data)`` returns one row of logits
            per sample.
        data: Input passed unchanged to ``model.predict``.
        index: Index assigned to the returned DataFrame.

    Returns:
        DataFrame with the columns 'pred_logits' (raw model output per
        sample), 'pred_proba' (softmax of the logits) and 'pred_class'
        (argmax of the probabilities along axis 1).
    """
    raw_scores = model.predict(data)
    class_proba = tf.nn.softmax(raw_scores).numpy()
    columns = {
        'pred_logits': list(raw_scores),
        'pred_proba': list(class_proba),
        'pred_class': np.argmax(class_proba, axis=1),
    }
    return pd.DataFrame(columns, index=index)


# K-fold cross-validation: train a fresh model on each fold's training rows
# and collect the predictions for that fold's held-out rows.
results = []
for i_iter, (train_index, test_index) in enumerate(
        kfold.split(df_dataset['filepath'], df_dataset['class'])):
    logger.info('%dth iteration of %d-fold CV', i_iter + 1, K_FOLD)
    # kfold.split yields positional indices, hence iloc.
    df_train = df_dataset.iloc[train_index]
    df_test = df_dataset.iloc[test_index]
    train_data, train_labels, test_data, test_labels = load_dataset(
        df_train, df_test)
    model = build_model_pretrained()
    datagen = create_generator()
    # Early stopping watches the training loss and restores the best weights.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='loss',
        restore_best_weights=True,
        patience=PATIENCE,
        min_delta=0.01)
    result = model.fit(datagen.flow(train_data,
                                    train_labels.values,
                                    shuffle=True,
                                    batch_size=BATCH_SIZE),
                       epochs=EPOCHS,
                       verbose=0,
                       callbacks=[early_stopping,
                                  TqdmCallback(verbose=1)])
    pd.DataFrame(result.history).plot(title='Training history', figsize=(5, 3))
    plt.show()
    # Index the predictions by the held-out rows' index labels rather than
    # their positions: the results are later joined onto df_dataset by index,
    # which would misalign if df_dataset did not have a default RangeIndex.
    results.append(evaluate_multiclass(model, test_data, df_test.index))

In [None]:
# Stack the per-fold prediction frames and attach them to the dataset rows
# (DataFrame.join aligns on the index).
df_predictions = pd.concat(results, axis=0)
df_result = df_dataset.join(df_predictions)
display(df_result)

## 評価
### 混同行列

In [None]:
# Confusion matrix of the cross-validated predictions.
df_cm = tut_utils.confusion_matrix(df_result)
# Accuracy = trace / total.
# NOTE(review): assumes rows and columns of df_cm share the same class order — confirm in tut_utils.
n_correct = df_cm.values.trace()
n_total = df_cm.values.sum()
print('Accuracy = {n} / {d} = {a:.03g}%'.format(n=n_correct,
                                                d=n_total,
                                                a=100 * n_correct / n_total))

display(df_cm)

### ROCカーブ

In [None]:
# Plot ROC curves from the cross-validated predictions via the tut_utils helper.
# NOTE(review): presumably one curve per class — confirm in tut_utils.
tut_utils.plot_roc_curves(df_result, figsize=(4, 3))
plt.show()