In [None]:
import warnings

# Global debug switch. Later cells read it to shrink the run (fewer epochs and
# folds, a 100-row sample of the data, the lighter augmentation pipeline).
DEBUG = False

# Outside of debug runs every Python warning is silenced to keep cell output short.
if not DEBUG:
    warnings.filterwarnings('ignore')

In [None]:
# Environment switch. When True, the packages below are installed into the
# kernel's environment, and later cells read the dataset from ./data2 (jpg
# images) instead of ./data (png images).
# NOTE(review): the flag name presumably refers to a hosted notebook service — confirm.
YADSPH = False
if YADSPH:
    %pip install tensorflow==2.3.1
    %pip install tensorflow-addons==0.11.2
    # NOTE(review): the three packages below are not version-pinned, so a fresh
    # install may pull releases that differ from the ones this notebook was
    # developed against — consider pinning once the working versions are known.
    %pip install efficientnet
    %pip install imgaug
    %pip install albumentations

In [None]:
#!g1.1
import warnings
if not DEBUG:
    warnings.filterwarnings('ignore')
import os
import gc
import cv2
import json
import time
import random
import numpy as np
import pandas as pd 
import tensorflow as tf
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
import albumentations as A
from sklearn.model_selection import StratifiedKFold
import tensorflow.keras.backend as K
from tensorflow.keras import Model, Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import Sequence
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import *
from tensorflow.keras.metrics import AUC, CategoricalAccuracy
from tqdm import tqdm
import efficientnet.tfkeras as efn
# Report which TensorFlow build the kernel picked up.
print('tensorflow version:', tf.__version__)

# Restrict this process to CUDA device 0, then list the GPUs TensorFlow can see.
# Iterating an empty list prints nothing, so no separate emptiness check is needed.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in gpu_devices:
    print('device available:', gpu_device)

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [None]:
# TWOCLS selects the single-output "None Opacity" task instead of the
# four-class study-level task; the version tag names the output directory.
TWOCLS = False
if TWOCLS:
    VER = 'vbin1'
else:
    VER = 'v14'

# Run configuration. It is dumped to params.json below, so key order matters
# for the saved file.
#   img_size : reference input sizes are 224=B0 240=B1 260=B2 300=B3 380=B4
#              456=B5 528=B6 600=B7
#   backbone : efficientnetbX => X from 0 to 7
#   metric   : monitored validation metric ('auc'; alternative 'categorical_accuracy')
PARAMS = dict(
    version=VER,
    folds=5,
    folds_train=2 if DEBUG else None,
    img_size=512,
    epochs=4 if DEBUG else 100,
    patience=2 if DEBUG else 10,
    decay=False,
    batch_size=4,
    backbone=3,
    seed=2021,
    aughard=not DEBUG,
    lr=.0005,
    lbl_smth=.0001,
    metric='auc',
    pseudo_th=None,
    comments='',
)

# Input locations: the YADSPH environment uses its own dataset layout; otherwise
# the image directory depends on whether pseudo-labelled images are included.
DATA_PATH = './data'
if YADSPH:
    DATASET_PATH = f'./data2/SIIM-COVID19-Resized/img_sz_{PARAMS["img_size"]}'
    IMGS_PATH = f'{DATASET_PATH}/train'
elif PARAMS['pseudo_th']:
    IMGS_PATH = f'{DATA_PATH}/train_{PARAMS["img_size"]}_psd'
else:
    IMGS_PATH = f'{DATA_PATH}/train_{PARAMS["img_size"]}'

# Output directory for checkpoints, histories and the parameter snapshot.
MDLS_PATH = f'./models_{VER}'
if not os.path.exists(MDLS_PATH):
    os.mkdir(MDLS_PATH)
with open(f'{MDLS_PATH}/params.json', 'w') as file:
    json.dump(PARAMS, file)
    
def seed_all(seed):
    """Seed every random number generator this notebook uses.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds Python's
    ``random`` module, NumPy's global generator and TensorFlow's global
    generator with the same value.

    Args:
        seed: integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)


# Fix the seeds for the whole run and start the wall-clock timer that is
# reported after training.
seed_all(PARAMS['seed'])
start_time = time.time()

In [None]:
# Per-image metadata table; its location depends on the environment.
meta_csv = (
    f'{DATASET_PATH}/meta_sz_{PARAMS["img_size"]}.csv' if YADSPH
    else f'{DATA_PATH}/train_meta_{PARAMS["img_size"]}.csv'
)
train_df = pd.read_csv(meta_csv)
display(train_df.head())
if DEBUG:
    # Debug runs work on a 100-row random sample.
    train_df = train_df.sample(100)

# Competition label tables: image-level boxes and study-level classes.
df_train_img = pd.read_csv(f'{DATA_PATH}/train_image_level.csv')
df_train_sty = pd.read_csv(f'{DATA_PATH}/train_study_level.csv')

# Derive the '<name>_image' id used as the merge key with the image-level table.
if YADSPH:
    # This metadata has no 'img' column and names its dimensions differently.
    train_df['img'] = train_df['image_id'].map(lambda image_id: image_id + '.jpg')
    train_df = train_df.rename(columns={'dim1': 'dim_x', 'dim0': 'dim_y'})
    image_ext = '.jpg'
else:
    image_ext = '.png'
train_df['id'] = train_df['img'].map(
    lambda img_file: img_file.split('/')[-1].replace(image_ext, '_image')
)

# Study ids carry a '_study' suffix; strip it to obtain the join key, then
# discard the original id column so it cannot clash with the image-level 'id'.
df_train_sty['StudyInstanceUID'] = df_train_sty['id'].map(
    lambda study_id: study_id.replace('_study', '')
)
df_train_sty = df_train_sty.drop(columns=['id'])

# image-level labels + study-level labels + per-image metadata
df_train_img = pd.merge(df_train_img, df_train_sty, on='StudyInstanceUID')
train_df = pd.merge(df_train_img, train_df, on='id')

# An image without any annotated box is flagged as "None Opacity".
train_df['None Opacity'] = train_df['boxes'].isna()
print(train_df.shape)
display(train_df.head())

In [None]:
# Optionally extend the training table with pseudo-labelled images. Predicted
# scores above PARAMS['pseudo_th'] become hard labels; rows where no study
# class passes the threshold are discarded.
if PARAMS['pseudo_th']:
    df_pseudo_img = pd.read_csv(f'{DATA_PATH}/pseudo_study_level.csv')
    # The study uid must be derived before 'id' is overwritten with the image id.
    df_pseudo_img['StudyInstanceUID'] = df_pseudo_img['id'].apply(lambda x: x.replace('_study', ''))
    df_pseudo_img['id'] = df_pseudo_img['img']
    df_pseudo_img['img'] = df_pseudo_img['img'].apply(lambda x: x.replace('_image', '.png'))
    cols = [
        'Negative for Pneumonia',
        'Typical Appearance',
        'Indeterminate Appearance',
        'Atypical Appearance'
    ]
    # Binarise the four class scores and the opacity score with the same threshold.
    df_pseudo_img[cols] = np.where(df_pseudo_img[cols] > PARAMS['pseudo_th'], 1, 0)
    df_pseudo_img['None Opacity'] = df_pseudo_img['None Opacity'] > PARAMS['pseudo_th']
    # Keep only rows that ended up with at least one positive class.
    df_pseudo_img = df_pseudo_img.drop(df_pseudo_img[df_pseudo_img[cols].sum(axis=1) == 0].index)
    print(df_pseudo_img.shape)
    display(df_pseudo_img.head())
    # pd.concat replaces DataFrame.append, which is deprecated and removed in
    # pandas 2.0; sort=False keeps append's column ordering.
    train_df = pd.concat([train_df, df_pseudo_img], sort=False)
print(train_df.shape)

In [None]:
def bar_plot(train_df, variable):
    """Plot and print how often each value of one column occurs.

    Args:
        train_df: DataFrame holding the column.
        variable: name of the column to summarise; also used as the plot title.

    Shows a bar chart of the value counts and prints the same counts as text.
    Returns nothing.
    """
    counts = train_df[variable].value_counts()

    plt.figure(figsize=(12, 3))
    plt.bar(counts.index, counts)
    plt.xticks(counts.index, counts.index.values)
    plt.ylabel("Frequency")
    plt.title(variable)
    plt.show()

    print("{}: \n{}".format(variable, counts))

# Collapse the four one-hot study columns into a single string label.
# Every row starts as negative; the positive classes are then applied in a
# fixed order, so a later class wins if a row has several columns set to 1.
train_df['target'] = 'Negative for Pneumonia'
for positive_class in ('Typical Appearance',
                       'Indeterminate Appearance',
                       'Atypical Appearance'):
    train_df.loc[train_df[positive_class] == 1, 'target'] = positive_class
bar_plot(train_df, 'target')

In [None]:
# Give the table a fresh 0..n-1 index (the old one is kept as a column); the
# fold assignment later writes rows by position through .loc.
train_df = train_df.reset_index()

# Column names of the four study-level classes, in model output order.
classes = [
    'Negative for Pneumonia',
    'Typical Appearance',
    'Indeterminate Appearance',
    'Atypical Appearance',
]
print('classes:\n', classes,
      '\nclasses labels:\n', np.unique(train_df[classes].values, axis=0))

# Maps the textual form of each one-hot vector, e.g. '[0, 1, 0, 0]', to its class name.
label2classes = {
    str([int(slot == hot) for slot in range(len(classes))]): class_name
    for hot, class_name in enumerate(classes)
}

# Record the class order next to the other run parameters.
PARAMS['classes'] = classes
with open(f'{MDLS_PATH}/params.json', 'w') as file:
    json.dump(PARAMS, file)

# Data generator and helpers

In [None]:
#!g1.1
class DataGenSIIM(Sequence):
    """Keras ``Sequence`` that reads images from disk and serves them in batches.

    Args:
        df: DataFrame with an ``'img'`` column (file names) plus the label
            columns (``classes``, and ``'None Opacity'`` when ``two_cls``).
        classes: label column names used for the multi-class target.
        imgs_path: directory that contains the image files.
        imgs_idxs: sequence of image file names this generator iterates over.
        img_size: side length of the square batch buffer.
        batch_size: maximum number of images per batch.
        mode: ``'fit'`` yields ``(X, y)``, ``'predict'`` yields ``X`` only.
        shuffle: reshuffle the sample order at construction and after every epoch.
        aug: optional callable invoked as ``aug(image=img)['image']``.
        resize: optional divisor applied to both image dimensions after loading.
            NOTE(review): the batch buffer is always ``img_size`` wide, so the
            resized image presumably still has to match it — confirm.
        two_cls: use the single ``'None Opacity'`` column as target.
    """

    def __init__(self, df, classes, imgs_path, imgs_idxs, img_size,
                 batch_size=8, mode='fit', shuffle=False, aug=None,
                 resize=None, two_cls=False):
        self.df = df
        self.classes = classes
        self.imgs_path = imgs_path
        self.imgs_idxs = imgs_idxs
        self.img_size = img_size
        self.batch_size = batch_size
        self.mode = mode
        self.shuffle = shuffle
        self.aug = aug
        self.resize = resize
        self.two_cls = two_cls
        # Builds the initial (optionally shuffled) sample order.
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch, counting a final partial batch."""
        # Ceil rather than floor: __getitem__ sizes its buffers for a short tail
        # batch, and flooring would silently drop those samples (which skews
        # validation metrics in particular).
        return int(np.ceil(len(self.imgs_idxs) / self.batch_size))

    def on_epoch_end(self):
        """Reset the sample order and reshuffle it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.imgs_idxs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __getitem__(self, index):
        """Return batch number ``index``.

        Returns:
            ``(X, y)`` in ``'fit'`` mode, ``X`` in ``'predict'`` mode. ``X`` is
            float32 of shape ``(n, img_size, img_size, 3)``; ``y`` is float32 of
            shape ``(n,)`` when ``two_cls`` else ``(n, len(classes))``.

        Raises:
            AttributeError: if ``mode`` is neither ``'fit'`` nor ``'predict'``.
        """
        if self.mode not in ('fit', 'predict'):
            raise AttributeError('fit mode parameter error')
        # Go through self.indexes so the order produced by on_epoch_end is
        # actually used; slicing imgs_idxs directly made shuffle=True a no-op.
        batch_pos = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        imgs_batch = [self.imgs_idxs[pos] for pos in batch_pos]
        batch_size = len(imgs_batch)
        X = np.zeros((batch_size, self.img_size, self.img_size, 3), dtype=np.float32)
        if self.mode == 'predict':
            for i, img_idx in enumerate(imgs_batch):
                X[i, ] = self.get_img(img_idx)
            return X
        if self.two_cls:
            y = np.zeros(batch_size, dtype=np.float32)
        else:
            y = np.zeros((batch_size, len(self.classes)), dtype=np.float32)
        for i, img_idx in enumerate(imgs_batch):
            X[i, ], y[i] = self.get_img(img_idx)
        return X, y

    def get_img(self, img_idx):
        """Load one image (and, in ``'fit'`` mode, its label).

        Args:
            img_idx: image file name, relative to ``imgs_path``.

        Returns:
            ``(img, label)`` in ``'fit'`` mode, otherwise ``img``. The image is
            float32, divided by 255, and passed through ``aug`` when one is set.

        Raises:
            FileNotFoundError: if the image cannot be read.
            IndexError: if ``df`` has no row for ``img_idx`` (``'fit'`` mode).
        """
        img_path = f'{self.imgs_path}/{img_idx}'
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None; fail here with the
            # offending path instead of crashing later on a None attribute access.
            raise FileNotFoundError(f'error load image: {img_path}')
        if self.resize:
            img = cv2.resize(img, (int(img.shape[1] / self.resize), int(img.shape[0] / self.resize)))
        img = img.astype(np.float32) / 255
        if self.mode == 'fit':
            label_cols = 'None Opacity' if self.two_cls else self.classes
            matches = self.df.loc[self.df['img'] == img_idx, label_cols].values
            if len(matches) == 0:
                raise IndexError(f'error load label: {img_path}')
            label = matches[0].astype(np.float32)
            if self.aug:
                img = self.aug(image=img)['image']
            return img, label
        if self.aug:
            img = self.aug(image=img)['image']
        return img

In [None]:
#!g1.1
# Both pipelines start with the same photometric jitter: half of the time,
# exactly one of brightness / contrast / gamma is applied.
aug_steps = [
    A.OneOf([
        A.RandomBrightness(limit=.2, p=1),
        A.RandomContrast(limit=.2, p=1),
        A.RandomGamma(p=1)
    ], p=.5),
]

if PARAMS['aughard']:
    # Heavy pipeline: blur, noise/affine, geometric distortions, flips and
    # rotations, cut-out holes and a final shift/scale/rotate.
    hole_side = int(.1 * PARAMS['img_size'])
    # The affine transform lives under a different name depending on the environment.
    affine_step = A.augmentations.geometric.transforms.Affine(p=.5) if YADSPH else A.IAAAffine(p=.5)
    aug_steps += [
        A.OneOf([
            A.Blur(blur_limit=3, p=1),
            A.MedianBlur(blur_limit=3, p=1)
        ], p=.25),
        A.OneOf([
            A.GaussNoise(0.002, p=.5),
            affine_step,
        ], p=.25),
        A.OneOf([
            A.ElasticTransform(alpha=120, sigma=120 * .05, alpha_affine=120 * .03, p=.5),
            A.GridDistortion(p=.5),
            A.OpticalDistortion(distort_limit=2, shift_limit=.5, p=1)
        ], p=.25),
        A.RandomRotate90(p=.5),
        A.HorizontalFlip(p=.5),
        A.VerticalFlip(p=.5),
        A.Cutout(num_holes=10, max_h_size=hole_side, max_w_size=hole_side, p=.25),
        A.ShiftScaleRotate(p=.5),
    ]
else:
    # Light pipeline: horizontal flip plus shift/scale without rotation.
    aug_steps += [
        A.HorizontalFlip(p=.5),
        A.ShiftScaleRotate(p=.25, rotate_limit=0),
    ]

aug = A.Compose(aug_steps)

In [None]:
#!g1.1
# Sanity check: build a generator over the whole table and display the
# augmented images of its first batch together with their labels.
imgs_idxs = train_df.img.values
test_datagen = DataGenSIIM(
    df=train_df,
    classes=classes,
    imgs_path=IMGS_PATH,
    imgs_idxs=imgs_idxs,
    img_size=PARAMS['img_size'],
    batch_size=PARAMS['batch_size'],
    mode='fit',
    shuffle=True,
    aug=aug,
    resize=None,
    two_cls=TWOCLS
)
Xt, yt = test_datagen[0]
print('test X: ', Xt.shape)
print('test y: ', yt.shape)
# Show at most four images, and never more than the batch actually holds.
bsize = min(4, PARAMS['batch_size'], len(Xt))
# squeeze=False always returns a 2-D array of axes; without it a single-column
# figure yields a bare Axes object and axes[j] fails when batch_size is 1.
fig, axes = plt.subplots(figsize=(16, 4), nrows=1, ncols=bsize, squeeze=False)
axes = axes[0]
for j in range(bsize):
    title = yt[j] if TWOCLS else classes[np.argmax(yt[j])]
    axes[j].imshow(Xt[j])
    axes[j].set_title(title)
    axes[j].axis('off')
plt.show()

In [None]:
#!g1.1
# EfficientNet constructors indexed by variant number (B0..B7).
EFNS = [efn.EfficientNetB0, efn.EfficientNetB1,
        efn.EfficientNetB2, efn.EfficientNetB3,
        efn.EfficientNetB4, efn.EfficientNetB5,
        efn.EfficientNetB6, efn.EfficientNetB7]

def get_model(params, classes=4, lr=.001, lbl_smth=.0001):
    """Build and compile an EfficientNet classifier.

    The network is an ImageNet-initialised EfficientNet backbone without its
    top, followed by global average pooling, a 64-unit ReLU layer and the
    output layer. It is compiled with Lookahead-wrapped Adam and reports AUC,
    accuracy and macro F1.

    Args:
        params: dict with ``'img_size'``, ``'backbone'`` (index into ``EFNS``)
            and ``'patience'``; ``'lr'`` and ``'lbl_smth'`` are used when present.
        classes: number of outputs. ``1`` builds a sigmoid head trained with
            binary cross-entropy, anything else a softmax head trained with
            categorical cross-entropy.
        lr: learning rate used only when ``params`` has no ``'lr'`` entry.
        lbl_smth: label smoothing used only when ``params`` has no
            ``'lbl_smth'`` entry.

    Returns:
        The compiled ``tf.keras`` model.
    """
    # Values in params win, as before; the keyword arguments act as fallbacks
    # instead of being ignored.
    learning_rate = params.get('lr', lr)
    smoothing = params.get('lbl_smth', lbl_smth)

    input_shape = (params['img_size'], params['img_size'], 3)
    enet = EFNS[params['backbone']](
        input_shape=input_shape,
        weights='imagenet',
        include_top=False
    )
    inp = Input(shape=input_shape)
    x = enet(inp)
    x = GlobalAveragePooling2D()(x)
    x = Dense(64, activation='relu')(x)
    if classes == 1:
        x = Dense(classes, activation='sigmoid')(x)
        loss = BinaryCrossentropy(label_smoothing=smoothing)
        auc = AUC(name='auc')
        accuracy = 'accuracy'
        # With a single output, threshold=None makes F1Score binarise by
        # arg-max, which marks every prediction positive; cut the sigmoid
        # output at 0.5 instead.
        f1_threshold = .5
    else:
        x = Dense(classes, activation='softmax')(x)
        loss = CategoricalCrossentropy(label_smoothing=smoothing)
        auc = AUC(name='auc', curve='ROC', multi_label=True)
        accuracy = CategoricalAccuracy()
        # Softmax outputs: the arg-max class is the prediction.
        f1_threshold = None
    f1 = tfa.metrics.F1Score(
        num_classes=classes,
        average='macro',
        threshold=f1_threshold
    )
    model = Model(inputs=inp, outputs=x)
    model.compile(
        optimizer=tfa.optimizers.Lookahead(
            tf.keras.optimizers.Adam(learning_rate=learning_rate),
            sync_period=max(6, int(params['patience'] / 4))
        ),
        loss=loss,
        metrics=[auc, accuracy, f1]
    )
    return model

In [None]:
#!g1.1
def get_lr_callback(batch_size=10, epochs=100, warmup=5, plot=False,
                    lr_start=1e-5, lr_max=1e-3, lr_decay=.95, lr_sus_ep=0):
    """Create a warm-up / sustain / exponential-decay learning-rate schedule.

    The rate rises linearly from ``lr_start`` to ``lr_max`` over ``warmup``
    epochs, stays at ``lr_max`` for ``lr_sus_ep`` epochs, then decays towards
    ``lr_start / 100`` by a factor of ``lr_decay`` per epoch.

    Args:
        batch_size: accepted for call compatibility; not used by the schedule.
        epochs: accepted for call compatibility; not used by the schedule.
        warmup: number of linear warm-up epochs.
        plot: when True return the plain ``epoch -> lr`` function instead of a
            Keras callback.
        lr_start: learning rate at epoch 0.
        lr_max: learning rate reached at the end of warm-up.
        lr_decay: multiplicative decay applied per epoch after the sustain phase.
        lr_sus_ep: number of epochs to hold ``lr_max`` before decaying.

    Returns:
        A ``LearningRateScheduler`` callback, or the schedule function itself
        when ``plot`` is True.
    """
    lr_min = lr_start / 100
    lr_ramp_ep = warmup

    def lr_scheduler(epoch):
        # The division below is only reached when lr_ramp_ep > epoch >= 0, so
        # warmup=0 never divides by zero.
        if epoch < lr_ramp_ep:
            return (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        if epoch < lr_ramp_ep + lr_sus_ep:
            return lr_max
        return (lr_max - lr_min) * lr_decay ** (epoch - lr_ramp_ep - lr_sus_ep) + lr_min

    if plot:
        return lr_scheduler
    return tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=False)
    
# When the warm-up/decay schedule is enabled, draw it over the configured
# number of epochs and put the start, peak and final rates in the title.
if PARAMS['decay']:
    lr_scheduler_plot = get_lr_callback(
        batch_size=PARAMS['batch_size'],
        epochs=PARAMS['epochs'],
        plot=True
    )
    xs = list(range(PARAMS['epochs']))
    y = list(map(lr_scheduler_plot, xs))
    plt.plot(xs, y)
    plt.title(f'lr schedule from {y[0]:.5f} to {max(y):.3f} to {y[-1]:.8f}')
    plt.show()

In [None]:
#!g1.1
def train_model(mparams, n_fold, train_datagen, val_datagen):
    """Train one fold and return the model with its best checkpoint loaded.

    Args:
        mparams: run parameters; reads ``'metric'``, ``'patience'``,
            ``'decay'``, ``'batch_size'`` and ``'epochs'`` (plus whatever
            ``get_model`` needs).
        n_fold: fold number, used in the checkpoint and history file names.
        train_datagen: generator of training batches.
        val_datagen: generator of validation batches.

    Returns:
        ``(model, history)`` — the model after reloading the best saved weights
        and the Keras history object returned by ``fit``.

    Side effects:
        Writes ``model_<n_fold>.hdf5`` and ``history_<n_fold>.json`` under
        ``MDLS_PATH``.
    """
    model = get_model(
        mparams,
        classes=1 if TWOCLS else 4
    )
    # Every callback watches the same validation metric, which is maximised.
    monitored = f'val_{mparams["metric"]}'
    checkpoint_path = f'{MDLS_PATH}/model_{n_fold}.hdf5'

    callbacks = [
        EarlyStopping(
            monitor=monitored,
            patience=mparams['patience'],
            verbose=0,
            restore_best_weights=True,
            mode='max'
        ),
        ModelCheckpoint(
            checkpoint_path,
            monitor=monitored,
            verbose=0,
            save_best_only=True,
            save_weights_only=True,
            mode='max'
        ),
    ]
    # Learning-rate policy: fixed warm-up/decay schedule, or reduce on plateau
    # after half the early-stopping patience.
    if mparams['decay']:
        callbacks.append(get_lr_callback(mparams['batch_size']))
        print('lr warmup and decay')
    else:
        callbacks.append(
            ReduceLROnPlateau(
                monitor=monitored,
                factor=.1,
                patience=int(mparams['patience'] / 2),
                verbose=0,
                min_lr=1e-7,
                mode='max'
            )
        )
        print('lr reduce on plateau')

    history = model.fit(
        train_datagen,
        validation_data=val_datagen,
        callbacks=callbacks,
        epochs=mparams['epochs'],
        verbose=1
    )

    # Store the history as strings so every value is JSON-serialisable.
    dict_to_save = {
        name: [np.format_float_positional(value) for value in values]
        for name, values in history.history.items()
    }
    with open(f'{MDLS_PATH}/history_{n_fold}.json', 'w') as file:
        json.dump(dict_to_save, file)

    model.load_weights(checkpoint_path)
    return model, history

# Train models

In [None]:
# Assign every row to a validation fold, stratified by the task's label column.
skf = StratifiedKFold(n_splits=PARAMS['folds'])
train_df['fold'] = -1
strat_labels = train_df['None Opacity'] if TWOCLS else train_df.target
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, y=strat_labels)):
    # val_idx holds row positions; .loc accepts them because the index was
    # reset to 0..n-1 in an earlier cell.
    train_df.loc[val_idx, 'fold'] = fold

In [None]:
#!g1.1
# Per-fold results at the best validation epoch; the results cell averages them.
epoch_by_folds = []
loss_by_folds = []
metric_by_folds = []

# Number of folds to actually train: two in debug runs, otherwise
# PARAMS['folds_train'] when set, else all folds.
if DEBUG:
    n_folds_train = 2
else:
    n_folds_train = PARAMS['folds_train'] or PARAMS['folds']
start_folds_train = 0

# Arguments shared by the training and validation generators.
datagen_kwargs = dict(
    df=train_df,
    classes=classes,
    imgs_path=IMGS_PATH,
    img_size=PARAMS['img_size'],
    batch_size=PARAMS['batch_size'],
    mode='fit',
    resize=None,
    two_cls=TWOCLS
)

for fold_num in range(start_folds_train, n_folds_train):
    print('=' * 20, 'FOLD:', fold_num, '=' * 20)
    target = 'None Opacity' if TWOCLS else 'target'
    in_val = train_df['fold'] == fold_num
    train_idxs = train_df.loc[~in_val, 'img'].values
    print('-' * 30, f'\nTRAIN STATS: {len(train_idxs)}\n',
          train_df.loc[~in_val, target].value_counts())
    val_idxs = train_df.loc[in_val, 'img'].values
    print('-' * 30, f'\nVAL STATS: {len(val_idxs)}\n',
          train_df.loc[in_val, target].value_counts(),
          '\n', '-' * 30)

    # Training data is shuffled and augmented; validation data is neither.
    train_datagen = DataGenSIIM(imgs_idxs=train_idxs, shuffle=True, aug=aug, **datagen_kwargs)
    val_datagen = DataGenSIIM(imgs_idxs=val_idxs, shuffle=False, aug=None, **datagen_kwargs)

    model, history = train_model(PARAMS, fold_num, train_datagen, val_datagen)

    # Keep only the metrics dict so the History object can be released below.
    fold_history = history.history
    metric_name = PARAMS['metric']
    plt.plot(fold_history['loss'], label='loss')
    plt.plot(fold_history['val_loss'], label='val_loss')
    plt.legend()
    plt.show()
    plt.plot(fold_history[f'{metric_name}'], label=f'{metric_name}')
    plt.plot(fold_history[f'val_{metric_name}'], label=f'val {metric_name}')
    plt.legend()
    plt.show()

    # Epoch (0-based) with the highest validation metric, and its loss/metric.
    best_epoch = np.argmax(fold_history[f'val_{metric_name}'])
    best_loss = fold_history['val_loss'][best_epoch]
    best_metric = fold_history[f'val_{metric_name}'][best_epoch]
    print('best epoch:', best_epoch,
          '| best loss:', best_loss,
          f'| best {PARAMS["metric"]}:', best_metric)
    epoch_by_folds.append(best_epoch)
    loss_by_folds.append(best_loss)
    metric_by_folds.append(best_metric)

    # Release the fold's model before building the next one. The History
    # callback returned by fit() keeps a reference to the model, so it has to
    # go too; clear_session() drops the global state Keras accumulates when
    # models are created in a loop.
    del train_datagen, val_datagen, model, history
    K.clear_session()
    gc.collect()

elapsed_time = time.time() - start_time
print(f'time elapsed: {elapsed_time // 60:.0f} min {elapsed_time % 60:.0f} sec')

In [None]:
# Summarise the run: parameters plus averages over the trained folds.
result = PARAMS.copy()
result['bavg_epoch'] = np.mean(epoch_by_folds)
result['bavg_loss'] = np.mean(loss_by_folds)
result[f'bavg_{PARAMS["metric"]}'] = np.mean(metric_by_folds)
result[f'{PARAMS["metric"]}_by_folds'] = ' '.join([f'{x:.4f}' for x in metric_by_folds])
# Flatten the class list to one string so the summary fits a single table row.
result['classes'] = ', '.join(classes)
with open(f'{MDLS_PATH}/params.json', 'w') as file:
    json.dump(result, file)

# Append the summary as one row to the tab-separated experiment log, creating
# the log on first use.
df_save = pd.DataFrame(result, index=[0])
if os.path.exists('results.csv'):
    df_old = pd.read_csv('results.csv', sep='\t', index_col=0)
    # pd.concat replaces DataFrame.append, which is deprecated and removed in
    # pandas 2.0; ignore_index renumbers the rows, sort=False keeps column order.
    df_save = pd.concat([df_old, df_save], ignore_index=True, sort=False)
df_save.to_csv('results.csv', sep='\t')

In [None]:
# Display the accumulated experiment log (one row per recorded run).
pd.read_csv('results.csv', sep='\t', index_col=0)