In [None]:
import os
import json
import cv2
import time
import shutil
from PIL import Image
import pandas as pd
import numpy as np
import pydicom
import glob
from pydicom.pixel_data_handlers.util import apply_voi_lut
import tensorflow as tf
import tensorflow_addons as tfa
import matplotlib.pyplot as plt
import albumentations as A
from sklearn.model_selection import StratifiedKFold
import tensorflow.keras.backend as K
from tensorflow.keras import Model, Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import Sequence
from tensorflow.keras.losses import binary_crossentropy, CategoricalCrossentropy
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import *
from tensorflow.keras.metrics import AUC, CategoricalAccuracy
from tqdm import tqdm
import efficientnet.tfkeras as efn
print('tensorflow version:', tf.__version__)

In [None]:
# Run configuration. Flip KAGGLE to read data and models from the Kaggle
# input mounts instead of the local working directory.
KAGGLE = False
# Model version -> fold numbers whose checkpoints are ensembled.
MDLS_FOLDS = {'v0': [0, 1, 2, 3]}
DATA_PATH = '../input/siim-covid19-detection' if KAGGLE else './data'
MDLS_PATHS = {
    ver: f'../input/siim-tfmodels-{ver}' if KAGGLE else f'./models_{ver}'
    for ver in MDLS_FOLDS
}
# One scratch directory per version for the resized PNG copies.
CACHE_PATHS = {ver: f'./cache_{ver}' for ver in MDLS_FOLDS}
# Flip modes handed to DataGenSIIM as `tta` (0 = none, 1 = rows reversed,
# 2 = columns reversed).
TTAS = [0, 1, 2]

start_time = time.time()

In [None]:
# Load the training-time parameters saved next to each version's checkpoints.
params_dict = {}
for ver in MDLS_FOLDS:
    with open(f'{MDLS_PATHS[ver]}/params.json') as file:
        params_dict[ver] = json.load(file)
# Report each version's own parameters (not the whole dictionary every time).
for ver, params in params_dict.items():
    print('version:', ver, '| loaded params:', params, '\n')

In [None]:
def read_xray(path, voi_lut=True, fix_monochrome=True):
    """Load a DICOM file and return its pixels as an 8-bit grayscale array.

    Args:
        path: Path of the DICOM file to read.
        voi_lut: When True, pass the raw pixels through ``apply_voi_lut``
            before scaling; otherwise use ``pixel_array`` as is.
        fix_monochrome: When True, invert images whose
            ``PhotometricInterpretation`` is ``MONOCHROME1``.

    Returns:
        ``np.uint8`` array, min-max scaled into ``[0, 255]``. An image with
        no dynamic range (all pixels equal) comes back as all zeros.
    """
    # `dcmread` is the supported reader; `read_file` is its deprecated alias.
    dicom = pydicom.dcmread(path)
    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to
    # "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom.pixel_array, dicom)
    else:
        data = dicom.pixel_array
    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    data = data - np.min(data)
    # Guard the normalisation: a constant image has peak 0 after the shift
    # and would otherwise produce 0/0 = NaN before the uint8 cast.
    peak = np.max(data)
    if peak > 0:
        data = data / peak
    data = (data * 255).astype(np.uint8)
    return data

def resize(array, size, keep_ratio=False, resample=Image.LANCZOS):
    """Turn a pixel array into a PIL image bounded by ``size`` x ``size``.

    Args:
        array: Pixel data accepted by ``Image.fromarray``.
        size: Target side length in pixels.
        keep_ratio: When True the image is adjusted in place with
            ``Image.thumbnail``; when False it is resized to exactly
            ``(size, size)``.
        resample: Resampling filter forwarded to PIL.

    Returns:
        The resulting ``PIL.Image``.
    """
    picture = Image.fromarray(array)
    target = (size, size)
    if not keep_ratio:
        return picture.resize(target, resample)
    picture.thumbnail(target, resample)
    return picture

In [None]:
# Index every test DICOM and derive the submission ids from its path.
# The file name gives the image id; the path component two levels above the
# file gives the study id (presumably test/<study>/<series>/<image>.dcm —
# verify against the dataset layout).
filepaths = glob.glob(f'{DATA_PATH}/test/**/*dcm', recursive=True)
test_df = pd.DataFrame({
    'path': filepaths,
    'image_id': [
        p.split('/')[-1].replace('.dcm', '') + '_image' for p in filepaths
    ],
    'study_id': [
        p.split('/')[-3].replace('.dcm', '') + '_study' for p in filepaths
    ],
})
test_df.head()

In [None]:
# Convert every test DICOM to a resized PNG in the per-version cache folder
# and record the PNG name plus the original pixel dimensions.
# NOTE(review): the three lists are re-created for each version, so after the
# loop they describe the last version processed; entries skipped via the
# empty-path branch are not appended, which would leave the lists shorter
# than `test_df`.
for ver, params in params_dict.items():
    counter = 0
    images_paths = []
    dim_x = []
    dim_y = []
    os.makedirs(CACHE_PATHS[ver], exist_ok=True)
    for file in tqdm(test_df.path, desc=f'test {ver}'):
        if file == '':
            counter += 1
            continue
        # Build the PNG name once from the base name; swapping only the
        # extension avoids rewriting a "dcm" substring inside the id itself.
        png_name = os.path.splitext(os.path.basename(file))[0] + '.png'
        xray = read_xray(file)
        im = resize(xray, size=params['img_size']) # keep_ratio=True to have original aspect ratio
        im.save(f'{CACHE_PATHS[ver]}/{png_name}')
        images_paths.append(png_name)
        dim_x.append(xray.shape[1])
        dim_y.append(xray.shape[0])
    print('files omitted:', counter)

In [None]:
# Attach the cached PNG name and the original DICOM width / height
# (array shape[1] / shape[0]) to every test row.
for column, values in (('img', images_paths), ('dim_x', dim_x), ('dim_y', dim_y)):
    test_df[column] = values
test_df

In [None]:
# EfficientNet constructors B0..B7; the list position is the variant number
# that `params['backbone']` selects in get_model.
EFNS = [getattr(efn, f'EfficientNetB{variant}') for variant in range(8)]

def get_model(params, classes=4, lr=.001, lbl_smth=.0001):
    """Build and compile the EfficientNet-based softmax classifier.

    Args:
        params: Dict of run parameters. ``img_size``, ``backbone`` (index
            into ``EFNS``) and ``patience`` are required; ``lr`` and
            ``lbl_smth`` override the keyword defaults when present.
        classes: Number of outputs of the softmax head.
        lr: Learning rate used when ``params`` has no ``lr`` entry.
        lbl_smth: Label smoothing used when ``params`` has no ``lbl_smth``
            entry.

    Returns:
        The compiled Keras ``Model``. The backbone is created with
        ``weights=None``, so the caller is expected to load weights.
    """
    # Values stored in `params` take precedence; the keyword arguments are
    # the fallback, so they are no longer silently ignored.
    learning_rate = params.get('lr', lr)
    label_smoothing = params.get('lbl_smth', lbl_smth)

    input_shape = (params['img_size'], params['img_size'], 3)
    enet = EFNS[params['backbone']](
        input_shape=input_shape,
        weights=None,
        include_top=False
    )
    inp = Input(shape=input_shape)
    x = enet(inp)
    x = GlobalAveragePooling2D()(x)
    x = Dense(64, activation='relu')(x)
    x = Dense(classes, activation='softmax')(x)
    model = Model(inputs=inp, outputs=x)
    loss = CategoricalCrossentropy(label_smoothing=label_smoothing)
    auc = AUC(name='auc', curve='ROC', multi_label=True)
    accuracy = CategoricalAccuracy()
    f1 = tfa.metrics.F1Score(
        num_classes=classes,
        average='macro',
        threshold=None
    )
    model.compile(
        optimizer=tfa.optimizers.Lookahead(
            tf.keras.optimizers.Adam(learning_rate=learning_rate),
            sync_period=max(6, int(params['patience'] / 4))
        ),
        loss=loss,
        metrics=[auc, accuracy, f1]
    )
    return model

In [None]:
class DataGenSIIM(Sequence):
    """Keras ``Sequence`` that yields batches of images read from disk.

    In ``'fit'`` mode each batch is ``(X, y)``; in ``'predict'`` mode it is
    ``X`` only, flipped according to ``tta``.

    Args:
        df: DataFrame with an ``img`` column and one column per entry of
            ``classes``; only consulted for labels in ``'fit'`` mode.
        classes: Label column names; their count sets the width of ``y``.
        imgs_path: Directory the image files are read from.
        imgs_idxs: Image file names, one per sample.
        img_size: Side length of the square batch buffer.
        batch_size: Maximum number of samples per batch.
        mode: ``'fit'`` or ``'predict'``.
        shuffle: Reshuffle the sample order at the end of every epoch.
        aug: Optional callable invoked as ``aug(image=img)['image']``.
        resize: Optional divisor applied to both image dimensions.
        tta: Flip mode applied in ``'predict'`` mode (see ``flip``).
    """

    def __init__(self, df, classes, imgs_path, imgs_idxs, img_size,
                 batch_size=8, mode='fit', shuffle=False, aug=None,
                 resize=None, tta=0):
        self.df = df
        self.classes = classes
        self.imgs_path = imgs_path
        self.imgs_idxs = imgs_idxs
        self.img_size = img_size
        self.batch_size = batch_size
        self.mode = mode
        self.shuffle = shuffle
        self.aug = aug
        self.resize = resize
        self.tta = tta
        # Initialise the sample order before the first batch is requested.
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (last one may be short)."""
        return int(np.ceil(len(self.imgs_idxs) / self.batch_size))

    def on_epoch_end(self):
        """Reset the sample order and reshuffle it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.imgs_idxs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __getitem__(self, index):
        """Build batch number ``index``.

        Returns:
            ``(X, y)`` in ``'fit'`` mode, ``X`` in ``'predict'`` mode.

        Raises:
            AttributeError: If ``mode`` is neither ``'fit'`` nor ``'predict'``.
        """
        # Pick samples through `self.indexes` so the order produced by
        # `on_epoch_end` (shuffled or not) is the one actually served.
        positions = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        imgs_batch = np.asarray(self.imgs_idxs)[positions]
        batch_size = len(imgs_batch)
        X = np.zeros((batch_size, self.img_size, self.img_size, 3), dtype=np.float32)
        if self.mode == 'fit':
            y = np.zeros((batch_size, len(self.classes)), dtype=np.float32)
            for i, img_idx in enumerate(imgs_batch):
                X[i, ], y[i] = self.get_img(img_idx)
            return X, y
        elif self.mode == 'predict':
            for i, img_idx in enumerate(imgs_batch):
                X[i, ] = self.get_img(img_idx)
            return X
        else:
            raise AttributeError('fit mode parameter error')

    def get_img(self, img_idx):
        """Load one image scaled to ``[0, 1]``, plus its label in ``'fit'`` mode.

        Raises:
            FileNotFoundError: If the image cannot be read.
            IndexError: In ``'fit'`` mode, if ``df`` has no row for the image.
        """
        img_path = f'{self.imgs_path}/{img_idx}'
        img = cv2.imread(img_path)
        if img is None:
            # Fail here with the offending path instead of a later
            # AttributeError on None.
            raise FileNotFoundError(f'error load image: {img_path}')
        if self.resize:
            img = cv2.resize(img, (int(img.shape[1] / self.resize), int(img.shape[0] / self.resize)))
        img = img.astype(np.float32) / 255
        if self.mode == 'fit':
            matches = self.df.loc[self.df['img'] == img_idx, self.classes].values
            if len(matches) == 0:
                raise IndexError(f'error load label: {img_path}')
            label = matches[0].astype(np.float32)
            if self.aug:
                img = self.aug(image=img)['image']
            return img, label
        else:
            if self.aug:
                img = self.aug(image=img)['image']
            img = self.flip(img, axis=self.tta)
            return img

    def flip(self, img, axis=0):
        """Return ``img`` flipped according to ``axis``.

        ``1`` reverses the first array axis, ``2`` the second, ``3`` both;
        any other value returns the image unchanged.
        """
        if axis == 1:
            return img[::-1, :, ]
        elif axis == 2:
            return img[:, ::-1, ]
        elif axis == 3:
            return img[::-1, ::-1, ]
        else:
            return img

In [None]:
# Build one network per (version, fold) pair and restore its trained weights.
# NOTE(review): the prediction cell further down rebuilds `models` from the
# same checkpoints, so this list is only used until then.
models = []
for ver, folds in MDLS_FOLDS.items():
    ver_params = params_dict[ver]
    ver_dir = MDLS_PATHS[ver]
    for n_fold in folds:
        checkpoint_path = f'{ver_dir}/model_{n_fold}.hdf5'
        model = get_model(ver_params)
        model.load_weights(checkpoint_path)
        models.append(model)
        print('ver:', ver, '| model loaded:', checkpoint_path)

In [None]:
# Sanity check: pull the first batch through the generator and show a few
# of the cached images.
# Preview the first configured version rather than a hard-coded key.
preview_ver = next(iter(params_dict))
imgs_idxs = test_df.img.values
test_datagen = DataGenSIIM(
    df=test_df,
    classes=params_dict[preview_ver]['classes'],
    imgs_path=CACHE_PATHS[preview_ver],
    imgs_idxs=imgs_idxs,
    # The cached PNGs were written at the version's `img_size`; the batch
    # buffer must use the same size or the image assignment cannot broadcast.
    img_size=params_dict[preview_ver]['img_size'],
    batch_size=100,
    mode='predict',
    shuffle=False,
    aug=None,
    resize=None,
    tta=0
)
Xt = test_datagen[0]
print('test X: ', Xt.shape)
# Show at most four images, fewer when the batch is smaller.
bsize = min(4, len(Xt))
if bsize:
    # squeeze=False keeps `axes` two-dimensional even for a single column.
    fig, axes = plt.subplots(figsize=(16, 4), nrows=1, ncols=bsize, squeeze=False)
    for j in range(bsize):
        axes[0, j].imshow(Xt[j])
        axes[0, j].axis('off')
    plt.show()

In [None]:
# Ensemble inference: every fold model of every version predicts the test
# set once per TTA flip mode; all prediction arrays are averaged at the end.
BATCH_SUBM = 64
preds = []
for ver, folds in MDLS_FOLDS.items():
    ver_params = params_dict[ver]

    # Restore this version's fold checkpoints.
    models = []
    for n_fold in folds:
        checkpoint_path = f'{MDLS_PATHS[ver]}/model_{n_fold}.hdf5'
        model = get_model(ver_params)
        model.load_weights(checkpoint_path)
        models.append(model)
        print('ver', ver, '-> model loaded', checkpoint_path)

    # Generator settings shared by all TTA passes of this version.
    datagen_kwargs = dict(
        df=test_df,
        classes=ver_params['classes'],
        imgs_path=CACHE_PATHS[ver],
        imgs_idxs=imgs_idxs,
        img_size=ver_params['img_size'],
        batch_size=BATCH_SUBM,
        mode='predict',
        shuffle=False,
        aug=None,
        resize=None,
    )
    for tta in TTAS:
        print(f'ver {ver} classes {params_dict[ver]["classes"]}')
        test_datagen = DataGenSIIM(tta=tta, **datagen_kwargs)
        for i, model in enumerate(models):
            preds.append(model.predict(test_datagen))
            print(f'ver {ver} | tta {tta} | model {i} -> prediction done')

# Mean over the (version, tta, fold) axis.
stacked_preds = np.asarray(preds)
preds = np.array(stacked_preds.mean(axis=0))
print('all done | preds shape:', preds.shape)

In [None]:
# Full study-level class name -> short name used in the submission strings.
# The insertion order defines the numeric label of each class below.
name2fname = dict([
    ('Negative for Pneumonia', 'negative'),
    ('Typical Appearance', 'typical'),
    ('Indeterminate Appearance', 'indeterminate'),
    ('Atypical Appearance', 'atypical'),
])
# Short name -> numeric label, numbered in insertion order.
name2label = {fname: idx for idx, fname in enumerate(name2fname.values())}
print(name2label)
# Numeric label -> short name (inverse of the mapping above).
label2name = dict(enumerate(name2fname.values()))
print(label2name)

In [None]:
# Store each class's averaged probability in a column named after its
# numeric label ('0', '1', ...).
cols_classes = list(map(str, name2label.values()))
for position, column in enumerate(cols_classes):
    test_df[column] = preds[:, position]
test_df.head()

In [None]:
# Average the image-level class probabilities within each study and expose
# the study id under the submission's `id` column.
study_df = (
    test_df
    .groupby(['study_id'])[cols_classes]
    .mean()
    .reset_index()
    .rename(columns={'study_id': 'id'})
)
study_df.head()

In [None]:
def get_predstring(row, thr=0):
    """Format one study's class confidences as a submission string.

    Args:
        row: Mapping with the confidences stored under the keys ``'0'`` to
            ``'3'``.
        thr: Only classes whose confidence is strictly greater than this
            value are emitted.

    Returns:
        Space-separated ``'<class name> <conf> 0 0 1 1'`` entries in label
        order, or an empty string when no class passes the threshold.
    """
    scored = ((idx, row[str(idx)]) for idx in range(4))
    return ' '.join(
        f'{label2name[idx]} {conf:0.2f} 0 0 1 1'
        for idx, conf in scored
        if conf > thr
    )

In [None]:
# Turn every study row into its submission string, then drop the raw
# per-class probability columns.
pred_strings = study_df.apply(get_predstring, axis=1)
study_df = study_df.assign(PredictionString=pred_strings).drop(columns=cols_classes)
study_df.head()

In [None]:
# Image-level rows: every image gets the same fixed "none" prediction string.
image_ids = test_df.image_id.tolist()
image_df = pd.DataFrame({
    'id': image_ids,
    'PredictionString': ['none 1 0 0 1 1'] * len(image_ids),
})
image_df.head()

In [None]:
# Stack the study-level rows on top of the image-level rows and write the
# result as the submission file (index column omitted).
submission_parts = (study_df, image_df)
subm_df = pd.concat(submission_parts)
subm_df.to_csv('submission.csv', index=False)
display(subm_df.head())
print('submission done:', subm_df.shape)

In [None]:
# Remove the temporary PNG caches. The existence check keeps the cell
# re-runnable: a second run no longer fails on an already-deleted folder.
for ver, cache_path in CACHE_PATHS.items():
    if os.path.isdir(cache_path):
        shutil.rmtree(cache_path)
        print(f'ver {ver} | path {cache_path} -> cache deleted')
    else:
        print(f'ver {ver} | path {cache_path} -> no cache to delete')