# 肺結節の良悪性判定(K-Fold 交差検証)

## 前準備
### 主要パッケージを読み込む
loggerの設定も行う

In [None]:
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from logging import basicConfig, getLogger, INFO
# Configure logging at INFO level; each record shows timestamp, level name
# and message. `logger` is the logger used by the cells below.
basicConfig(level=INFO, format='%(asctime)s %(levelname)s :%(message)s')
logger = getLogger(__name__)

### データディレクトリの指定

In [None]:
# Inputs handed to tut_utils.create_dataset_df to build the dataset table.
# NOTE(review): presumably DATA_ROOT holds the images of each class in
# CLASS_LABELS and IMAGE_EXT filters the files -- confirm in tut_utils.
DATA_ROOT = pathlib.Path('Data/Images/PN_Osaka/PN_64')
CLASS_LABELS = ('Benign', 'Malignant')
IMAGE_EXT = '.jpg'

### 画像ファイルを基にpd.DataFrameを作成する
クラスごとにDataFrameを作成した後に結合する。

In [None]:
import tut_utils
# Build the dataset table. Later cells read its 'filepath' and 'class'
# columns (and 'class_label' after the predictions are joined in).
df_dataset = tut_utils.create_dataset_df(DATA_ROOT, CLASS_LABELS, IMAGE_EXT)
display(df_dataset)

### データ読み込み用の関数を作成

In [None]:
import tensorflow as tf
# Image shape used as the network's input_shape and for resizing in load_img;
# the third entry selects grayscale (1) versus RGB loading.
IMG_SHAPE = (64, 64, 1)


def load_img(filepath):
    """Load one image file as an array with an explicit channel axis.

    The file is read with ``tf.keras.preprocessing.image.load_img``, in
    grayscale when ``IMG_SHAPE`` declares a single channel and in RGB
    otherwise, and resized to the height and width given by ``IMG_SHAPE``.

    Args:
        filepath: Path of the image file to read.

    Returns:
        The image converted by ``np.atleast_3d``, i.e. an array with at
        least three dimensions.
    """
    # load_img documents target_size as (height, width), so pass only those
    # two entries instead of the full (height, width, channels) tuple.
    pil_image = tf.keras.preprocessing.image.load_img(
        filepath,
        color_mode='grayscale' if IMG_SHAPE[2] == 1 else 'rgb',
        target_size=IMG_SHAPE[:2])
    return np.atleast_3d(pil_image)


def load_dataset(df_train, df_test):
    """Load the images and labels for a training and a test table.

    Args:
        df_train: Table with 'filepath' and 'class' columns (training rows).
        df_test: Table with 'filepath' and 'class' columns (test rows).

    Returns:
        Tuple ``(train_data, train_labels, test_data, test_labels)``. The
        data entries are the stacked images divided by 255; the label
        entries are the 'class' columns of the respective tables.
    """

    def _read_scaled(df):
        # Stack the per-file arrays along a new leading axis, then rescale.
        images = [load_img(path) for path in df['filepath']]
        return np.stack(images) / 255

    return (_read_scaled(df_train), df_train['class'],
            _read_scaled(df_test), df_test['class'])

## Data augmentation
いくつかの画像に対して実際にaugmentationを適用し表示する

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator


def create_generator():
    """Return a new ImageDataGenerator configured for augmentation.

    The generator applies rotations (range 180), small width/height shifts
    and zoom (0.05 each) and horizontal/vertical flips, filling pixels
    outside the input with the 'reflect' mode.
    """
    augmentation = dict(
        rotation_range=180,
        width_shift_range=0.05,
        height_shift_range=0.05,
        horizontal_flip=True,
        vertical_flip=True,
        zoom_range=.05,
        fill_mode='reflect',
    )
    return ImageDataGenerator(**augmentation)


# Number of leading dataset rows that test_generator previews.
N_TEST = 3


def test_generator(df_dataset):
    """Show the first N_TEST images beside an augmented version of each.

    Args:
        df_dataset: Dataset table; its first ``N_TEST`` rows are displayed.
    """

    def _show_panel(slot, image, title):
        # Draw one grayscale image without axes in the given subplot slot.
        plt.subplot(1, 2, slot)
        plt.imshow(image.squeeze(), cmap='gray')
        plt.title(title)
        plt.axis('off')

    preview_rows = df_dataset.iloc[:N_TEST]
    # load_dataset expects a second table; the row loaded for it is unused.
    spare_rows = df_dataset.iloc[N_TEST:(N_TEST + 1)]
    images, labels, _, _ = load_dataset(preview_rows, spare_rows)

    # batch_size=1 without shuffling, so batch i is built from image i.
    flow = create_generator().flow(images, labels,
                                   batch_size=1, shuffle=False)
    for i in range(N_TEST):
        plt.figure(figsize=(4, 1.5))
        _show_panel(1, images[i], 'pre-augmentation')
        _show_panel(2, flow[i][0], 'post-augmentation')
        plt.show()


# Preview the augmentation on the first rows of the dataset.
test_generator(df_dataset)

## ネットワーク作成関数を作成
関数にするのは交差検証のfold毎に新しいmodelを作成する必要があるため。

In [None]:
import tensorflow as tf
from tensorflow.keras import layers


def build_model():
    """Create and compile a fresh CNN for binary classification.

    The network has three stages of two 3x3 convolutions (4, 8 and 16
    filters) followed by batch normalization and 2x2 max pooling, then
    dropout (0.25), a 32-unit ReLU dense layer and a single output unit
    without activation. It is compiled with Adam and binary cross-entropy
    computed from logits.

    Returns:
        The compiled ``tf.keras.Sequential`` model; its predictions are
        logits, not probabilities.
    """

    def accuracy(y_true, y_pred):
        # The model emits logits. The built-in 'accuracy' metric would
        # threshold the raw output at 0.5; converting to probabilities
        # first makes it agree with the sigmoid > 0.5 rule in predict_binary.
        # The function is named 'accuracy' so the history key is unchanged.
        return tf.keras.metrics.binary_accuracy(y_true,
                                                tf.nn.sigmoid(y_pred))

    model = tf.keras.Sequential()
    model.add(layers.Conv2D(4, 3, activation='relu', input_shape=IMG_SHAPE))
    model.add(layers.Conv2D(4, 3, activation='relu'))
    model.add(layers.BatchNormalization(momentum=0.90))
    model.add(layers.MaxPooling2D(2))
    model.add(layers.Conv2D(8, 3, activation='relu'))
    model.add(layers.Conv2D(8, 3, activation='relu'))
    model.add(layers.BatchNormalization(momentum=0.90))
    model.add(layers.MaxPooling2D(2))
    model.add(layers.Conv2D(16, 3, activation='relu'))
    model.add(layers.Conv2D(16, 3, activation='relu'))
    model.add(layers.BatchNormalization(momentum=0.90))
    model.add(layers.MaxPooling2D(2))
    model.add(layers.Dropout(.25))
    model.add(layers.Flatten())
    model.add(layers.Dense(32, activation='relu'))
    # No activation on the output: the loss below expects logits.
    model.add(layers.Dense(1))
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  metrics=[accuracy])

    return model

## K-Fold 交差検証
分割にはsklearnの[StratifiedKFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html)を使う。
<div class="alert alert-block alert-warning">
<b>注意:</b> 今回、epoch数は決め打ちしてありますが、本来はvalidationデータを用いて学習を終了させる必要があります。
</div>

In [None]:
from sklearn.model_selection import StratifiedKFold
# Number of folds and the stratified splitter. No random_state is given, so
# the shuffled split is not fixed from one run to the next.
K_FOLD = 3
kfold = StratifiedKFold(n_splits=K_FOLD, shuffle=True)

from tqdm.keras import TqdmCallback
# Mini-batch size and (fixed) number of training epochs used for every fold.
BATCH_SIZE = 8
EPOCHS = 64


def predict_binary(model, data, index):
    """Run the model on ``data`` and tabulate its binary predictions.

    Args:
        model: Object whose ``predict(data)`` returns logits.
        data: Input passed unchanged to ``model.predict``.
        index: Index assigned to the rows of the returned table.

    Returns:
        ``pd.DataFrame`` with columns 'pred_logits' (squeezed model output),
        'pred_proba' (sigmoid of the logits) and 'pred_class'
        (True where the probability exceeds 0.5).
    """
    raw_scores = model.predict(data).squeeze()
    probabilities = tf.nn.sigmoid(raw_scores).numpy()
    columns = dict(
        pred_logits=raw_scores,
        pred_proba=probabilities,
        pred_class=probabilities > .5,
    )
    return pd.DataFrame(columns, index=index)


# Run the cross-validation: for every fold, train a fresh model on the
# training split and collect its predictions on the held-out split.
results = []
for i_iter, (train_index, test_index) in enumerate(
        kfold.split(df_dataset['filepath'], df_dataset['class'])):
    logger.info('{i}th iteration of {k}-fold CV'.format(i=i_iter + 1,
                                                        k=K_FOLD))
    # kfold.split yields positional indices, hence iloc.
    df_train = df_dataset.iloc[train_index]
    df_test = df_dataset.iloc[test_index]
    train_data, train_labels, test_data, test_labels = load_dataset(
        df_train, df_test)
    # A new model and generator per fold, so nothing learned on one fold
    # carries over to the next.
    model = build_model()
    datagen = create_generator()
    # Progress is reported through tqdm; fit's own output is silenced below.
    tqdm_cb = TqdmCallback(epochs=EPOCHS,
                           data_size=len(df_train),
                           batch_size=BATCH_SIZE,
                           verbose=1)
    result = model.fit(datagen.flow(train_data,
                                    train_labels,
                                    shuffle=True,
                                    batch_size=BATCH_SIZE),
                       epochs=EPOCHS,
                       verbose=0,
                       callbacks=[tqdm_cb])
    pd.DataFrame(result.history).plot(title='Training history', figsize=(5, 3))
    plt.show()
    print(model.evaluate(test_data, test_labels, verbose=0))
    # Index the predictions by the held-out rows' own index labels rather
    # than by their positions: the later df_dataset.join(...) matches on
    # labels, and the two only coincide for a default 0..N-1 index.
    results.append(predict_binary(model, test_data, df_test.index))

In [None]:
# Stack the per-fold prediction tables and attach them to the dataset rows
# by index.
df_result = df_dataset.join(pd.concat(results, axis=0))
display(df_result)

## 評価
### 混同行列

In [None]:
from sklearn import metrics


def confusion_matrix(df_result):
    """Build a labelled confusion matrix from the prediction table.

    Args:
        df_result: Table with the columns 'class' (true class value),
            'class_label' (display name of the true class) and
            'pred_class' (predicted class).

    Returns:
        ``pd.DataFrame`` whose rows ('Truth') and columns ('Prediction')
        are named by class label, ordered by ascending class value.
    """
    # Take one display name per class value, sorted by class value, so the
    # names follow the same order as the matrix rows/columns regardless of
    # the row order of df_result.
    # NOTE(review): assumes every class value maps to a single class_label.
    label_table = (df_result[['class', 'class_label']]
                   .drop_duplicates(subset='class')
                   .sort_values('class'))
    class_values = label_table['class'].to_numpy()
    class_labels = label_table['class_label'].to_numpy()

    # Pass the label order explicitly instead of relying on the default.
    cm = metrics.confusion_matrix(df_result['class'],
                                  df_result['pred_class'],
                                  labels=class_values)
    df_cm = pd.DataFrame(cm, index=class_labels, columns=class_labels)
    df_cm.index.name, df_cm.columns.name = 'Truth', 'Prediction'
    return df_cm


df_cm = confusion_matrix(df_result)
# Correct predictions lie on the diagonal of the confusion matrix.
n_correct = df_cm.values.trace()
n_total = df_cm.values.sum()
print('Accuracy = {n} / {d} = {a:.03g}%'.format(n=n_correct,
                                                d=n_total,
                                                a=100 * n_correct / n_total))

display(df_cm)

### ROCカーブ

In [None]:
# Plot ROC curves from the pooled cross-validation predictions; the plotting
# itself is implemented in tut_utils.
tut_utils.plot_roc_curves(df_result)
plt.show()

### カットオフ値
[Youden's J statistic](https://en.wikipedia.org/wiki/Youden's_J_statistic)をもとにカットオフ値を決める。

In [None]:
# Pick the threshold that maximizes Youden's J statistic,
# J = sensitivity + specificity - 1 = TPR - FPR.
fpr, tpr, thresholds = metrics.roc_curve(df_result['class'],
                                         df_result['pred_proba'])
# roc_curve already returns arrays, so they can be subtracted directly.
cutoff_idx = np.argmax(tpr - fpr)
cutoff_fpr, cutoff_tpr, cutoff = (fpr[cutoff_idx], tpr[cutoff_idx],
                                  thresholds[cutoff_idx])
# The variable used to be misspelled; keep the old name as an alias in case
# other cells still refer to it.
cufoff = cutoff
print('cut-off={:.3g}, sensitivity={:.3g}, specificity={:.3g}'.format(cutoff, cutoff_tpr, 1-cutoff_fpr))

### Probability Calibration curves

In [None]:
from sklearn.calibration import calibration_curve
# Calibration curve and Brier score of the pooled predictions, plus the
# distributions of the true classes and the predicted probabilities.
y_true = df_result['class']
y_proba = df_result['pred_proba']
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_true, y_proba, n_bins=10)
brier_score = metrics.brier_score_loss(y_true, y_proba)

fig, (ax_curve, ax_hist) = plt.subplots(2, 1)

# Top panel: dotted diagonal as reference, then the measured curve.
ax_curve.plot([0, 1], [0, 1], 'k:')
ax_curve.plot(mean_predicted_value, fraction_of_positives, 's-',
              label='Brier score = {:.3g}'.format(brier_score))
ax_curve.legend()
ax_curve.set_title('calibration curve')

# Bottom panel: step histograms of the labels and the probabilities.
for values, tag in ((y_true, 'true'), (y_proba, 'pred')):
    ax_hist.hist(values, histtype="step", label=tag)
ax_hist.set_title('distribution')
ax_hist.legend(loc='upper center')

fig.tight_layout()
plt.show()

## Decision Curve Analysis
まだ実装していません。