# びまん性肺疾患(6クラス, (train, validate, test))

## 前準備
### 主要パッケージを読み込む
loggerの設定も行う

In [None]:
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from logging import basicConfig, getLogger, INFO
# Emit INFO-and-above log records, each prefixed with a timestamp and level name.
basicConfig(level=INFO, format='%(asctime)s %(levelname)s :%(message)s')
# Logger named after the current module.
logger = getLogger(__name__)

### データディレクトリの指定

In [None]:
# Root directory of the image dataset.
DATA_ROOT = pathlib.Path('Data/Images/IP_Yamaguchi_original')
# Class names of the 6-class problem.
CLASSES = ('Consolidation', 'DiffuseNodular', 'Emphysema_all', 'GGO_all', 'Honeycombing', 'Normal')

### 画像ファイルを基にpd.DataFrameを作成する

In [None]:
# One frame per class: every PNG under DATA_ROOT/<class name>/ becomes a row
# holding its path, the class name and the integer class id (the position of
# the name in CLASSES).
dfs = [
    pd.DataFrame(
        [(str(path), name, class_id)
         for path in DATA_ROOT.glob(name + '/*.png')],
        columns=['filepath', 'class_label', 'class'],
    )
    for class_id, name in enumerate(CLASSES)
]
# Merge the per-class frames under a fresh 0..n-1 index.
df_dataset = pd.concat(dfs, ignore_index=True)
df_dataset

### データ読み込み用の関数を作成

In [None]:
import tensorflow as tf
# Shape of one image array: (height, width, channels).
IMG_SHAPE = (32, 32, 1)


def load_img(filepath):
    """Load one image file as a grayscale array with a trailing channel axis.

    Args:
        filepath: Path of the image file to read.

    Returns:
        np.ndarray of shape (height, width, 1) holding the pixel values as
        returned by the Keras loader (no scaling is applied here).
    """
    # ``target_size`` is documented as (height, width), so pass only the
    # spatial part of IMG_SHAPE instead of the full tuple including channels.
    img = tf.keras.preprocessing.image.load_img(
        filepath, color_mode='grayscale', target_size=IMG_SHAPE[:2])
    # Append the channel axis explicitly so the result matches IMG_SHAPE.
    return np.array(img)[..., np.newaxis]


def load_dataset(df):
    """Read every image listed in ``df`` and return scaled pixels plus labels.

    Args:
        df: DataFrame with a ``filepath`` column (image paths) and a
            ``class`` column (labels).

    Returns:
        Tuple ``(data, labels)``. ``data`` is the stack of all loaded images
        divided by 255; ``labels`` is the ``class`` column of ``df``,
        returned as the pandas Series it is.
    """
    images = [load_img(path) for path in df['filepath']]
    scaled = np.stack(images) / 255
    return scaled, df['class']

## Data augmentation
いくつかの画像に対して実際にaugmentationを適用し表示する

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

def create_generator():
    """Build the image generator used for data augmentation.

    Returns:
        ImageDataGenerator configured with ``rotation_range=180``, horizontal
        and vertical flips enabled, and ``fill_mode='reflect'``.
    """
    augmentation_options = {
        'rotation_range': 180,
        'horizontal_flip': True,
        'vertical_flip': True,
        'fill_mode': 'reflect',
    }
    return ImageDataGenerator(**augmentation_options)

# Number of leading dataset rows used for the augmentation preview.
N_TEST = 3


def test_generator(df_dataset):
    """Plot the first images of the dataset next to an augmented version.

    Loads up to ``N_TEST`` leading rows of ``df_dataset``, pushes them through
    the augmentation generator one at a time, and shows each original image
    side by side with its augmented counterpart.

    Args:
        df_dataset: DataFrame accepted by ``load_dataset`` (``filepath`` and
            ``class`` columns).
    """
    df_train = df_dataset.iloc[:N_TEST]
    train_data, train_labels = load_dataset(df_train)
    datagen = create_generator()
    # batch_size=1 with shuffle=False: the i-th batch is built from the i-th
    # image, so each augmented result lines up with its original.
    flow = datagen.flow(train_data, train_labels, batch_size=1, shuffle=False)
    # Iterate over the images actually loaded rather than range(N_TEST), so a
    # dataset with fewer than N_TEST rows does not raise an IndexError.
    for original in train_data:
        batch = next(flow)  # (images, labels) pair
        plt.figure(figsize=(4, 1.5))
        plt.subplot(1, 2, 1)
        plt.imshow(original.squeeze(), cmap='gray')
        plt.title('pre-augmentation')
        plt.axis('off')
        plt.subplot(1, 2, 2)
        plt.imshow(batch[0].squeeze(), cmap='gray')
        plt.title('post-augmentation')
        plt.axis('off')
        plt.show()


test_generator(df_dataset)

## ネットワーク作成関数を作成
交差検証ではfold毎に新しいmodelを作成する必要があるため、ネットワークの構築を関数にまとめておく。

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

def build_model():
    """Create and compile a fresh, untrained CNN classifier.

    The network is two stages of (Conv 3x3 + BatchNorm) x 2 followed by 2x2
    max pooling (4 filters, then 8 filters), then Flatten, Dense(32, relu)
    and a final Dense layer with one output per entry of CLASSES. The last
    layer has no activation, so the model outputs logits; the loss is
    configured with ``from_logits=True`` accordingly.

    Returns:
        Compiled ``tf.keras.Sequential`` model (Adam optimizer, sparse
        categorical cross-entropy loss, accuracy metric).
    """

    def conv_bn(filters, **conv_kwargs):
        # One 3x3 ReLU convolution followed by batch normalization.
        return [
            layers.Conv2D(filters, 3, activation='relu', **conv_kwargs),
            layers.BatchNormalization(),
        ]

    model = tf.keras.Sequential([
        *conv_bn(4, input_shape=IMG_SHAPE),
        *conv_bn(4),
        layers.MaxPooling2D(2),
        *conv_bn(8),
        *conv_bn(8),
        layers.MaxPooling2D(2),
        layers.Flatten(),
        layers.Dense(32, activation='relu'),
        layers.Dense(len(CLASSES)),
    ])
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=loss_fn,
                  metrics=['accuracy'])
    return model

## K-Fold 交差検証(train, validate, test)
各fold中でデータセットの$\frac{2}{4}$を学習用、$\frac{1}{4}$をvalidation(EarlyStopping)用、$\frac{1}{4}$を評価用に使用する

### DataFrameに交差検証用の列を追加する

In [None]:
import itertools
from sklearn.model_selection import StratifiedKFold

K_FOLD = 4
# shuffle=True with no random_state: the fold assignment differs between runs.
kfold = StratifiedKFold(n_splits=K_FOLD, shuffle=True)

# Keep only the held-out (second) index array of every split.
test_indices = [
    held_out
    for _, held_out in kfold.split(df_dataset['filepath'], df_dataset['class'])
]
# Map each sample index to the number of the split that holds it out.
index2fold = {
    idx: fold
    for fold, indices in enumerate(test_indices)
    for idx in indices
}

# Look up the fold number of every row through the frame's index.
df_dataset['fold'] = df_dataset.index.map(index2fold)
df_dataset

### train, validate, testを用いた交差検証を行う

In [None]:
from tqdm.keras import TqdmCallback
# Number of samples per training batch.
BATCH_SIZE = 8
# Upper bound on the number of training epochs per fold.
EPOCHS = 128
# Patience (in epochs) of the EarlyStopping callback.
ES_PATIENCE = 16  # early stopping
# Patience of the ReduceLROnPlateau callback: half of the early-stopping patience.
LR_PATIENCE = ES_PATIENCE // 2  # ReduceLROnPlateau


def evaluate(model, data, index):
    """Predict on ``data`` and collect per-sample logits and probabilities.

    Args:
        model: Object whose ``predict(data)`` returns one row of logits per
            sample.
        data: Input passed unchanged to ``model.predict``.
        index: Index for the returned frame, one entry per sample.

    Returns:
        DataFrame indexed by ``index`` with two columns, ``pred_logits`` and
        ``pred_proba``; each cell holds that sample's vector of logits /
        softmax probabilities.
    """
    logits = model.predict(data)
    # Deliberately no squeeze(): with a single sample it would drop the batch
    # axis, and the probabilities would no longer be one vector per row.
    predictions = tf.nn.softmax(logits).numpy()
    df_result = pd.DataFrame(
        {
            'pred_logits': list(logits),
            'pred_proba': list(predictions)
        },
        index=index)
    return df_result


# Cross-validation driver: for each fold, train a fresh model and predict on
# the held-out test fold. ``results`` collects one prediction frame per fold.
results = []
for test_fold in range(K_FOLD):
    logger.info('{i}th iteration of {k}-fold CV'.format(i=test_fold + 1,
                                                        k=K_FOLD))
    # The fold after the test fold (cyclically) is used for validation and the
    # remaining folds for training. K_FOLD must be at least 3, otherwise no
    # fold is left for training.
    val_fold = (test_fold + 1) % K_FOLD
    train_folds = set(range(K_FOLD)) - {test_fold, val_fold}
    df_train = df_dataset[df_dataset['fold'].isin(train_folds)]
    df_val = df_dataset[df_dataset['fold'] == val_fold]
    df_test = df_dataset[df_dataset['fold'] == test_fold]
    train_data, train_labels = load_dataset(df_train)
    val_data, val_labels = load_dataset(df_val)
    test_data, _ = load_dataset(df_test)  # test labels are joined back later

    # A new model per fold, so no weights carry over between folds.
    model = build_model()
    datagen = create_generator()
    # Stop once val_loss has not improved for ES_PATIENCE epochs and roll the
    # weights back to the best epoch.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', restore_best_weights=True, patience=ES_PATIENCE)
    # Halve the learning rate after LR_PATIENCE epochs without improvement.
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                     factor=0.5,
                                                     patience=LR_PATIENCE,
                                                     min_lr=1e-5)

    tqdm_cb = TqdmCallback(epochs=EPOCHS,
                           data_size=len(df_train),
                           batch_size=BATCH_SIZE,
                           verbose=1)
    # Labels are passed as plain NumPy arrays for both training and
    # validation; only the training data goes through augmentation.
    result = model.fit(datagen.flow(train_data,
                                    train_labels.values,
                                    shuffle=True,
                                    batch_size=BATCH_SIZE),
                       validation_data=(val_data, val_labels.values),
                       epochs=EPOCHS,
                       shuffle=True,
                       verbose=0,
                       callbacks=[early_stopping, reduce_lr, tqdm_cb])
    # DataFrame.plot opens its own figure when no ``ax`` is given, so no
    # separate plt.figure() call is made (it would only add an empty figure).
    pd.DataFrame(result.history).plot(title='Training history', figsize=(5, 3))
    plt.show()
    results.append(evaluate(model, test_data, df_test.index))

In [None]:
# Stack the per-fold prediction frames and attach them to the dataset rows
# through the shared index.
df_predictions = pd.concat(results, axis=0)
df_result = df_dataset.join(df_predictions)
# Predicted class = position of the largest probability.
df_result['pred_class'] = [np.argmax(proba) for proba in df_result['pred_proba']]

## 評価
### 混同行列

In [None]:
from sklearn import metrics
# Pin the label set to every class id so the matrix always has
# len(CLASSES) rows and columns, even when a class appears in neither the
# truth nor the predictions; it is labelled with CLASSES below.
cm = metrics.confusion_matrix(df_result['class'],
                              df_result['pred_class'],
                              labels=list(range(len(CLASSES))))
# Correct predictions sit on the diagonal.
n_correct, n_total = cm.trace(), cm.sum()
print('Accuracy = {n} / {d} = {a:.03g}%'.format(n=n_correct, d=n_total, a=100*n_correct / n_total))
df_cm = pd.DataFrame(cm, index=CLASSES, columns=CLASSES)
df_cm.index.name, df_cm.columns.name = 'Truth', 'Prediction'
display(df_cm)

### ROC

In [None]:
# One-vs-rest ROC curve per class, all drawn on one figure.
plt.figure(figsize=(4, 3))
for i, cls in enumerate(CLASSES):
    is_positive = df_result['class'] == i
    # Predicted probability of class i for every sample.
    scores = df_result['pred_proba'].map(lambda proba: proba[i])
    fpr, tpr, thresholds = metrics.roc_curve(is_positive, scores)
    auc = metrics.auc(fpr, tpr)
    curve_label = '{cls} (AUC = {auc:.03g})'.format(cls=cls, auc=auc)
    plt.plot(fpr, tpr, label=curve_label)
# Faint diagonal reference line.
plt.plot((0, 1), (0, 1), zorder=0, color='black', alpha=.1, linestyle='-')
plt.xlabel('1 - Specificity')
plt.ylabel('Sensitivity')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Classification report as a dict, shown as a table with one row per entry.
report_kwargs = {'target_names': CLASSES, 'output_dict': True}
report = metrics.classification_report(
    df_result['class'], df_result['pred_class'], **report_kwargs)
df_report = pd.DataFrame(report)
display(df_report.transpose())

### クラスごとに間違えている例を表示
#### 画像ごとにlossを計算

In [None]:
# Each cell of 'pred_logits' holds one logit vector. Stack them into a single
# dense 2-D array instead of handing the object-dtype Series to TensorFlow and
# relying on its implicit conversion.
logits = np.stack(df_result['pred_logits'].tolist())
per_sample_loss = tf.keras.losses.sparse_categorical_crossentropy(
    df_result['class'].values, logits, from_logits=True)
# Store plain NumPy values in the frame rather than a tensor object.
df_result['loss'] = per_sample_loss.numpy()
display(df_result)

#### lossの値が大きい画像を表示

In [None]:
# For every true class, show the N_SAMPLES images with the largest loss next
# to a bar chart of their predicted class probabilities.
N_SAMPLES = 2
for class_label, group in df_result.groupby('class_label'):
    print(class_label)
    worst = group.sort_values('loss', ascending=False).head(N_SAMPLES)
    worst_data, _ = load_dataset(worst)  # labels are not needed here
    for image, proba in zip(worst_data, worst['pred_proba']):
        fig, (ax_image, ax_proba) = plt.subplots(1, 2, figsize=(4, 1.5))
        # Left panel: the image itself.
        ax_image.imshow(image.squeeze(), cmap='gray')
        ax_image.axis('off')
        # Right panel: one horizontal bar per class.
        proba_frame = pd.DataFrame(proba, index=CLASSES)
        proba_frame.plot(ax=ax_proba, kind='barh', legend=False)
        plt.tight_layout()
        plt.show()