# Info

- building a baseline model using the cats vs. dogs architecture.
- this baseline model includes a preprocessing step that resizes every image to 448x448 with a single channel (grayscale) and applies CLAHE with default parameters
- this baseline model is built for multi-label classification:
    - output layer has a Sigmoid activation function 
    - loss function is binary_crossentropy
    - the chosen metric is AUC with the multi-label parameter set to True (compatible with the competition requirements)

# Setup

## import libraries and modules

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
from itertools import permutations
import warnings
import numpy as np
from pathlib import Path
import cv2
from collections import Counter
import seaborn as sns
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import warnings
import os
import cv2
import numpy as np
from random import shuffle
from pathlib import Path
from typing import Union
from multiprocessing import Pool
import pandas as pd
import tensorflow 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten,Conv2D, MaxPooling2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.image import ImageDataGenerator 
from tensorflow.keras.layers.experimental.preprocessing import Rescaling
from tensorflow.keras.metrics import AUC
from sklearn.model_selection import train_test_split


%matplotlib inline

## set constants

In [21]:
# CSV with the training metadata (loaded with pd.read_csv below).
TRAIN_META = 'train.csv'

# Raw training images and the directory that receives their resized copies.
TRAIN_IMG_DIR = Path('train') 
TRAIN_RESIZE_DIR = Path('train_resize') 

# Raw test images and the directory that receives their resized copies.
TEST_IMG_DIR = Path('test') 
TEST_RESIZE_DIR = Path('test_resize') 


# Metadata column whose value (plus a file suffix) forms the image file name.
CASE = 'StudyInstanceUID'
# Target size handed to cv2.resize during preprocessing.
NEW_SIZE = (448,448)
# Input shape of the model: NEW_SIZE plus a single channel.
IMG_SIZE = (448, 448, 1)
RANDOM_STATE = 42
# NOTE(review): VALIDATION_SIZE is not referenced by any visible cell.
VALIDATION_SIZE = 0.2

# NOTE(review): EXPECTED_X_SHAPE, CATS_LABEL, DOGS_LABEL, IMG_RESIZE, METRICS,
# OUT_DIM and AVAIL_CPU are not referenced by any visible cell; presumably
# leftovers from the cats vs. dogs notebook this one was derived from - confirm
# before removing.
EXPECTED_X_SHAPE = (23262, 80, 80, 3)
# Fraction of the training data passed to model.fit as validation_split.
TEST_VALIDATION_SIZE = 0.1
# Duplicate of the RANDOM_STATE assignment above (same value).
RANDOM_STATE = 42
CATS_LABEL = 0
DOGS_LABEL = 1
IMG_RESIZE = (80, 80)
# Scale factor given to the Rescaling layer.
RSCL = 1/255
# Activation of the convolutional layers and the hidden dense layer.
ACTIVATION = 'relu'
# Number of filters in every Conv2D layer.
N_FILTERS = 64
# Kernel size of every Conv2D layer, i.e. 1x1 convolutions.
# NOTE(review): a 1x1 kernel sees no neighbouring pixels - confirm this is intended.
FILTER2D_size = 1
METRICS = 'auc'
# Units of the hidden dense layer.
DENSE_DIM = 64
OUT_DIM = 1
# Activation of the output layer.
OUT_ACTIVATION = 'sigmoid'
OPTIMIZER = 'adam'
LOSS = 'binary_crossentropy'
EPOCHS = 10
BATCH_SIZE = 32
AVAIL_CPU = None

# Seed NumPy's global random number generator.
np.random.seed(RANDOM_STATE)

## load meta file

In [4]:
# Read the training metadata table from the CSV named by TRAIN_META.
df = pd.read_csv(TRAIN_META)

## get labels

In [5]:
# Use the names of all int-typed columns as the label columns.
# NOTE(review): assumes every integer column in the metadata is a label - verify.
labels = df.select_dtypes(int).columns

## functions

In [6]:
def show_img(img: np.array, ax=None, title: str='', cmap: str = 'gray'):
    """Draw an image with a title on a matplotlib axes.

    Parameters
    ----------
    img : image data accepted by ``Axes.imshow``.
    ax : axes to draw on. When None, a new figure with one axes is created.
    title : text placed above the image.
    cmap : colormap forwarded to ``imshow``.
    """
    # Only create a new figure when the caller did not supply an axes;
    # previously the supplied axes was silently discarded.
    if ax is None:
        _, ax = plt.subplots()
    ax.imshow(img, cmap=cmap)
    ax.set_title(title)
    
def preprocess(img_path: Path, processed_path: Path, size: tuple = None,
               scale: int = None, clip_limit: float = None,
               title_grid_size: tuple = None):
    """Read an image, resize it, optionally apply CLAHE, and write it out.

    Parameters
    ----------
    img_path : image file to read.
    processed_path : destination file for the processed image.
    size : target size passed to ``cv2.resize``; NEW_SIZE when None.
    scale : ``cv2.imread`` flag; ``cv2.IMREAD_GRAYSCALE`` when None.
    clip_limit : CLAHE clip limit. None leaves OpenCV's default.
    title_grid_size : CLAHE tile grid size. None leaves OpenCV's default.

    CLAHE is applied only when ``clip_limit`` or ``title_grid_size`` is given,
    so the two-argument call keeps its resize-only behaviour.
    NOTE(review): the notebook header says CLAHE with default parameters is
    part of the baseline - confirm whether it should be on by default.

    Raises
    ------
    ValueError : if the image cannot be read.
    """
    # Defaults are resolved at call time so the four extra parameters stay optional.
    size = NEW_SIZE if size is None else size
    scale = cv2.IMREAD_GRAYSCALE if scale is None else scale

    img = cv2.imread(str(img_path), scale)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f'could not read image: {img_path}')
    new_img = cv2.resize(img, size)

    if clip_limit is not None or title_grid_size is not None:
        clahe_kwargs = {}
        if clip_limit is not None:
            clahe_kwargs['clipLimit'] = clip_limit
        if title_grid_size is not None:
            clahe_kwargs['tileGridSize'] = title_grid_size
        # NOTE(review): CLAHE presumably needs a single-channel image, which
        # is what the default grayscale read produces - confirm for other flags.
        new_img = cv2.createCLAHE(**clahe_kwargs).apply(new_img)

    cv2.imwrite(str(processed_path), new_img)
    

def validate_file(record: Path):
    """Return True when *record* points to an existing regular file, else False."""
    # is_file() is already False for paths that do not exist.
    return record.is_file()


def numipy_train(meta: pd.DataFrame, img_dir: Path,
                 suffix: str = '.jpg', case_col: str = CASE, 
                 labels_col = labels, 
                 scale=cv2.IMREAD_GRAYSCALE, 
                 image_dir_temp_col = 'images', cpu: int=None):
    """Load the training images listed in *meta* together with their labels.

    Parameters
    ----------
    meta : metadata table with a case-id column and the label columns.
    img_dir : directory holding one ``<case id><suffix>`` file per row.
    suffix : file extension appended to each case id.
    case_col : name of the case-id column in *meta*.
    labels_col : label column(s) to return.
    scale : ``cv2.imread`` flag used for every image.
    image_dir_temp_col : kept for backward compatibility; the paths are now
        held in a local Series, so *meta* is no longer given this column.
    cpu : number of worker processes (None lets ``Pool`` decide).

    Returns
    -------
    (images, labels) : array of the loaded images and the matching label
        values. Rows whose image file does not exist are dropped from both.
    """
    img_dir = Path(img_dir)
    # Build each path explicitly instead of relying on Path / Series broadcasting.
    paths = meta[case_col].map(lambda case: img_dir / f'{case}{suffix}')
    exists = paths.map(validate_file).astype(bool)

    # Filter labels and paths with the same mask so they stay aligned.
    label_values = meta.loc[exists, labels_col].values
    jobs = [(str(path), scale) for path in paths[exists]]
    with Pool(cpu) as p:
        images = p.starmap(cv2.imread, jobs)
    return np.array(images), label_values

def numipy_test(img_dir: Path, labels: list, 
                 suffix: str = '.jpg', case_col: str = CASE, 
                 labels_col = labels, 
                 scale=cv2.IMREAD_GRAYSCALE, 
                 image_dir_temp_col = 'images', cpu: int=None):
    """Load every image with the given suffix from *img_dir* into an array.

    Parameters
    ----------
    img_dir : directory to scan (non-recursive) for ``*<suffix>`` files.
    labels, case_col, labels_col, image_dir_temp_col : accepted for signature
        parity with ``numipy_train``; not used here.
    suffix : file extension of the images to load.
    scale : ``cv2.imread`` flag used for every image.
    cpu : number of worker processes (None lets ``Pool`` decide).

    Returns
    -------
    Array of the loaded images, ordered by sorted file path so a caller can
    rebuild the matching file list with ``sorted(img_dir.glob('*' + suffix))``.
    """
    image_paths = sorted(img_dir.glob('*' + suffix))
    jobs = [(str(image), scale) for image in image_paths]
    with Pool(cpu) as p:
        images = p.starmap(cv2.imread, jobs)
    # The images were previously loaded and then discarded.
    return np.array(images)
    

def multi_preproccess(input_dir: Path, output_dir: Path, glob: str='*.jpg', 
                      size: tuple=NEW_SIZE, scale=cv2.IMREAD_GRAYSCALE, 
                      clip_limit=None, title_grid_size=None, 
                      cpu: int=None):
    """Run ``preprocess`` in a process pool over every matching image.

    Parameters
    ----------
    input_dir : directory scanned with the *glob* pattern.
    output_dir : directory for the processed files (created if missing);
        each output keeps the file name of its input.
    glob : pattern selecting the input files.
    size, scale, clip_limit, title_grid_size : forwarded positionally, in
        this order, to ``preprocess`` after the input and output paths.
    cpu : number of worker processes (None lets ``Pool`` decide).

    NOTE(review): each job carries six positional values, so ``preprocess``
    must accept six arguments - confirm its signature matches.
    """
    os.makedirs(output_dir, exist_ok=True)
    jobs = [
        (src, output_dir / src.name, size, scale, clip_limit, title_grid_size)
        for src in input_dir.glob(glob)
    ]
    with Pool(cpu) as workers:
        workers.starmap(preprocess, jobs)


# Preprocess

## preprocess train

In [7]:
# Preprocess every training image from TRAIN_IMG_DIR into TRAIN_RESIZE_DIR.
multi_preproccess(TRAIN_IMG_DIR, TRAIN_RESIZE_DIR)

## preprocess test

In [8]:
# Preprocess every test image from TEST_IMG_DIR into TEST_RESIZE_DIR.
multi_preproccess(TEST_IMG_DIR, TEST_RESIZE_DIR)

## convert train into numpy array

In [9]:
# X: array of the preprocessed training images; y: the matching label values.
X, y = numipy_train(df, TRAIN_RESIZE_DIR)

# basic CNN

## make sequential model architecture

In [10]:
# Baseline CNN, assembled layer by layer: input rescaling, three
# convolution + max-pooling stages, then a dense head with one sigmoid
# output unit per label.
model = Sequential()
model.add(Rescaling(RSCL, input_shape=IMG_SIZE, name='rescaling'))

# Stage 1
model.add(Conv2D(N_FILTERS, FILTER2D_size, activation=ACTIVATION, name='conv_1'))
model.add(MaxPooling2D(name='max_pool1'))
# Stage 2
model.add(Conv2D(N_FILTERS, FILTER2D_size, activation=ACTIVATION, name='conv_2'))
model.add(MaxPooling2D(name='max_pool2'))
# Stage 3
model.add(Conv2D(N_FILTERS, FILTER2D_size, activation=ACTIVATION, name='conv_3'))
model.add(MaxPooling2D(name='max_pool3'))

# Classification head
model.add(Flatten(name='flat'))
model.add(Dense(DENSE_DIM, activation=ACTIVATION, name='dense_1'))
model.add(Dense(len(labels), activation=OUT_ACTIVATION, name='out'))

## model summary

In [11]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
rescaling (Rescaling)        (None, 448, 448, 1)       0         
_________________________________________________________________
conv_1 (Conv2D)              (None, 448, 448, 64)      128       
_________________________________________________________________
max_pool1 (MaxPooling2D)     (None, 224, 224, 64)      0         
_________________________________________________________________
conv_2 (Conv2D)              (None, 224, 224, 64)      4160      
_________________________________________________________________
max_pool2 (MaxPooling2D)     (None, 112, 112, 64)      0         
_________________________________________________________________
conv_3 (Conv2D)              (None, 112, 112, 64)      4160      
_________________________________________________________________
max_pool3 (MaxPooling2D)     (None, 56, 56, 64)        0

## Callbacks

In [12]:
# Checkpoint callback: with save_best_only=False and save_weights_only=False
# it is configured to save the full model rather than only the best one or
# only the weights; the file name embeds the epoch and the validation loss.
checkpoint = ModelCheckpoint(
    filepath="Checkpoint/weights.{epoch:02d}-{val_loss:.2f}.hdf5",
    monitor='val_loss',
    verbose=1,
    save_best_only=False,
    save_weights_only=False,
    mode='auto',
)

# Early-stopping callback watching the validation loss with a patience of 2.
callback = EarlyStopping(
    monitor='val_loss',
    patience=2,
)

## compile model

In [13]:
# Multi-label setup: binary cross-entropy loss with a multi-label AUC metric.
# `metrics` is passed as a list, the form documented for Model.compile; a bare
# Metric object is not accepted by every Keras version.
model.compile(optimizer=OPTIMIZER, loss=LOSS, metrics=[AUC(multi_label=True)])

# train

In [19]:
# Train with the checkpoint and early-stopping callbacks, holding out
# TEST_VALIDATION_SIZE of the data passed here as the validation split.
# NOTE(review): X_train / y_train are not defined in any visible cell (the
# loader above produces X, y) - confirm where the split is created.
model.fit(X_train, y_train, validation_split=TEST_VALIDATION_SIZE, batch_size=BATCH_SIZE, epochs=EPOCHS, callbacks=[checkpoint, callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff9c6f3e6d0>

In [40]:
# Per-label accuracy on the first 1000 training samples: threshold the
# predictions at 0.5, compare with the true labels, and average over samples.
# This is measured on training data, not on held-out data.
np.mean(np.where(model.predict(X_train[:1000]) > 0.5, 1, 0) == y_train[:1000], axis=0)

array([0.997, 0.969, 0.867, 0.992, 0.975, 0.921, 0.895, 0.883, 0.745,
       0.749, 0.978])

In [15]:
import tensorflow as tf 

# Ask TensorFlow for the GPU device name once; a falsy result means no GPU
# was reported, in which case the install hint is printed instead.
gpu_name = tf.test.gpu_device_name()
if not gpu_name:
    print("Please install GPU version of TF")
else:
    print('Default GPU Device:{}'.format(gpu_name))

Please install GPU version of TF


In [16]:
# Show the GPU device name; the recorded output is an empty string.
tf.test.gpu_device_name()

''

In [17]:
# List the physical devices TensorFlow can see; the recorded output shows only the CPU.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [18]:
# Same device listing as the previous cell, repeated after re-importing TensorFlow.
import tensorflow as tf
tf.config.list_physical_devices()


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [None]:
# Set the CUDA device-selection environment variables for this process.
# NOTE(review): an empty CUDA_VISIBLE_DEVICES normally hides every GPU, and
# these variables are typically only honoured when set before TensorFlow
# initialises its devices - confirm this cell's intent and position.
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""
