In [1]:
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf

from sklearn.utils import class_weight
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os

# Input / output locations, relative to the notebook's working directory.
_DATA_ROOT = os.path.join( '..', 'data' )
MASTER_FILE = os.path.join( _DATA_ROOT, 'training_set.csv' )  # CSV with one row per image file
DATA_DIR = os.path.join( _DATA_ROOT, 'train' )                # joined with each file id to locate an image
RESULTS_DIR = 'results'                                       # the cross-validation CSV is written here

## PARAMS
# Side length (pixels) every image is resized to; also the model input size.
IMAGE_SIZE = 224

2023-09-07 02:04:39.290920: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 01. Data Management:

Load the master file and group its rows by patient, keeping each patient's image files and label:


In [None]:
# keep only the columns the pipeline uses
image_records = pd.read_csv( MASTER_FILE )[ ['file', 'label', 'patient_id'] ]

# one row per patient: the list of its files plus a single label
# (np.unique sorts, so [0] picks the first label in sorted order if a patient has several)
patient_table = image_records.groupby('patient_id').agg({
    'file': list,
    'label': lambda labels: np.unique(labels)[0],
})

# {patient_id: {'file': [...], 'label': ...}}
master_data = patient_table.to_dict(orient='index')
print("Number of patients:", len(master_data) )

Number of patients: 12086


Define a custom `tf.data` pipeline that loads each image together with its one-hot label:

In [None]:
# Class labels; the position of each entry is its index in the one-hot label vector
# built in preprocess_patient_path, so the order must not change.
CLASS_NAMES = np.array( ['N', 'P', 'T'] )
# Passed as num_parallel_calls to the dataset map calls so tf.data chooses the parallelism.
autotune = tf.data.AUTOTUNE

def get_data_from_ids( split_ids ):
    """
    Build one '<label>-<image path>' record for every image of the given patients.

    split_ids: iterable of patient ids, each a key of `master_data`.
    Returns a list of strings, patients in the order given and, within a
    patient, files in the order stored in `master_data`.
    """
    return [
        f"{master_data[patient_id]['label']}-{os.path.join( DATA_DIR, file_name )}"
        for patient_id in split_ids
        for file_name in master_data[patient_id]['file']
    ]

def load_image( image_path, img_size=(IMAGE_SIZE, IMAGE_SIZE)):
    """
    Read the image stored at `image_path` and resize it.

    image_path: path of the image file (string or scalar string tensor).
    img_size: (height, width) the image is resized to.
    Returns the resized 3-channel image tensor; pixel values are not rescaled here.
    """
    raw_bytes = tf.io.read_file( image_path )
    # expand_animations=False keeps animated files as a single frame
    decoded = tf.io.decode_image( raw_bytes, channels=3, expand_animations=False )
    return tf.image.resize( decoded, img_size )

def preprocess_patient_path( patient_data ):
    """
    Turn one '<label>-<image path>' record into an (image, one-hot label) pair.

    patient_data: scalar string tensor in the format produced by get_data_from_ids.
    Returns (image, label): the image loaded by load_image and an int8 vector
    with a 1 at the position of the record's label in CLASS_NAMES.
    """
    # split on the FIRST '-' only: the label never contains a hyphen, but the
    # file path may, and an unlimited split would cut the path at parts[1]
    parts = tf.strings.split( patient_data, '-', maxsplit=1 )

    # get label
    one_hot = parts[0] == CLASS_NAMES
    patient_label = tf.cast(one_hot, dtype=tf.int8)

    # get image
    patient_image = load_image( parts[1] )
    return patient_image, patient_label

def define_dataset(train_ids, valid_ids):
    """
    Build the (unbatched) training and validation datasets for one split.

    train_ids / valid_ids: patient ids belonging to each side of the split.
    Returns (train_ds, val_ds); both yield (image, one-hot label) pairs and are
    reshuffled on every iteration.
    """

    def _build_split( split_ids ):
        # '<label>-<path>' records for every image of the patients in this split
        records = get_data_from_ids( split_ids )
        split_ds = tf.data.Dataset.from_tensor_slices( records )

        # shuffle the cheap string records (full-size buffer) before decoding any image
        split_ds = split_ds.shuffle( buffer_size=len(records), reshuffle_each_iteration=True )

        # decode the image and the label of each record
        return split_ds.map( preprocess_patient_path, num_parallel_calls=autotune )

    train_ds = _build_split( train_ids )
    val_ds = _build_split( valid_ids )
    return train_ds, val_ds

def visualize_data_samples( dataset, sample_size=3):
    """
    Show the first `sample_size` images of an unbatched dataset with their class as title.

    dataset: yields (image, one-hot label) pairs; images are expected in the 0-255 range.
    sample_size: number of images to draw, one per column.
    """
    # squeeze=False always returns a 2-D array of axes, so indexing also works
    # for sample_size=1 (where plt.subplots would otherwise return a bare Axes)
    fig, ax = plt.subplots(1, sample_size, figsize=(15, 5), squeeze=False)
    for i, (sample_image, sample_label) in enumerate( dataset.take(sample_size) ):
        # imshow expects floats in [0, 1]
        ax[0, i].imshow(sample_image / 255 )
        ax[0, i].set_title( CLASS_NAMES[np.argmax(sample_label.numpy())] )
    plt.show();
    return None


#### DEFINE SOME VARIABLES
# Patient ids and their labels, aligned by position; used to build the stratified folds.
X_data = np.array( list(master_data) )
Y_data = np.array( [ patient['label'] for patient in master_data.values() ] )

## 02. Metrics and Evaluation:

Custom metrics we are going to track:

In [None]:
def macro_f1(y_true, y_pred):
    """
    Macro-averaged F1 score of one batch.

    y_true: one-hot labels, shape (batch, n_classes).
    y_pred: per-class scores of the same shape; logits or probabilities both work
            because only the arg-max of each row is used.
    Returns a scalar tensor: the mean of the per-class F1 scores over the classes
    that occur in the batch (as a label or as a prediction).

    Note: Keras averages this value over batches, so the epoch value is an
    approximation of the dataset-level macro F1.
    """
    n_classes = K.shape(y_pred)[-1]

    # hard predictions: rounding raw scores is meaningless for logits, arg-max is not
    pred_one_hot = K.one_hot(K.argmax(y_pred, axis=-1), n_classes)
    true_one_hot = K.cast(y_true, pred_one_hot.dtype)

    # per-class counts (reduce over the batch axis only)
    true_positives = K.sum(true_one_hot * pred_one_hot, axis=0)
    predicted_positives = K.sum(pred_one_hot, axis=0)
    possible_positives = K.sum(true_one_hot, axis=0)

    # F1 = 2TP / (2TP + FP + FN) = 2TP / (predicted + actual)
    support = predicted_positives + possible_positives
    f1_per_class = 2 * true_positives / (support + K.epsilon())

    # a class absent from both labels and predictions has no defined F1: leave it out of the mean
    present = K.cast(K.greater(support, 0), f1_per_class.dtype)
    return K.sum(f1_per_class * present) / (K.sum(present) + K.epsilon())

Full evaluation of the model:

In [None]:
from sklearn.metrics import f1_score, confusion_matrix

def evaluate_model(dataset, model):
    """ 
    Calculate the main stats for the model:
    - F1 Score (for each class)
    - ACC Score (for each class): fraction of the samples of a class that are
      predicted as that class, i.e. the diagonal of the row-normalised confusion matrix
    - Confusion Matrix (row-normalised, shown as a heatmap)

    dataset: batched dataset yielding (images, one-hot labels).
    model: model to evaluate; its output is reduced with arg-max over axis 1.
    Returns a flat dict with keys 'f1_<class>' and 'acc_<class>' for every class in CLASS_NAMES.
    """
    # fix the label set so every output has one entry per class, even when a
    # class is missing from this dataset or is never predicted
    class_indices = np.arange( len(CLASS_NAMES) )

    # get the results for the whole dataset
    y_true, y_pred = [], []
    for batch_images, batch_labels in dataset:
        # use the model passed in (not a global) and run it in inference mode
        model_outputs = tf.argmax( model( batch_images, training=False ), axis=1)
        true_values = tf.argmax( batch_labels, axis=1)
        y_pred.append( model_outputs )
        y_true.append( true_values )
    y_true = tf.concat(y_true, axis=0).numpy()
    y_pred = tf.concat(y_pred, axis=0).numpy()

    # calculate statistics for the data
    f1_scores = f1_score( y_true=y_true, y_pred=y_pred, labels=class_indices, average=None)
    conf_matrix = confusion_matrix( y_true=y_true, y_pred=y_pred, labels=class_indices, normalize='true' )

    # compute acc for each class: the matrix is already row-normalised, so the
    # diagonal is the per-class score (dividing again would give NaN for an empty row)
    acc_scores = conf_matrix.diagonal()
    
    # show the metrics
    print( "F1 Scores:", {CLASS_NAMES[i]: f1_scores[i] for i in range(len(CLASS_NAMES))} )
    print( "Acc Scores:", {CLASS_NAMES[i]: acc_scores[i] for i in range(len(CLASS_NAMES))} )
    print()

    # confusion matrix display
    df_cm = pd.DataFrame(conf_matrix, index = list(CLASS_NAMES), columns = list(CLASS_NAMES))
    plt.figure(figsize = (10,7))
    sns.heatmap(df_cm, annot=True)
    plt.show();

    # create the results data
    results = {}
    for metric_name, metric_values in zip( ['f1', 'acc'], [f1_scores, acc_scores] ):
        for i, class_name in enumerate(CLASS_NAMES):
            results[ f'{metric_name}_{class_name}'] = metric_values[i]
    return results


## 03. Model definition:

In [None]:
def build_cnn_model( n_head_layers, num_classes=3 ):
    """
    Build the classifier: ResNet50 backbone (ImageNet weights) + dense classification head.

    n_head_layers: number of Dense/Dropout/BatchNorm blocks in the head. Block widths
                   are 2**(6 + i) for i in range(n_head_layers), widest block first.
    num_classes: size of the final Dense layer, which outputs logits (no activation).

    Returns a tf.keras.Model whose top-level layers are, in order:
    InputLayer, Preprocessing, the ResNet50 backbone, ConvProcessing, ClassHead.
    The backbone is kept as ONE nested layer at index 2, which is what
    config_model_for_head_training / config_model_for_fine_tuning index into.
    """
    
    # preprocess backbone model: flatten the feature map and normalise it for the head
    preprocessing_block = tf.keras.Sequential([
        layers.Flatten(),
        layers.BatchNormalization()
    ], name='ConvProcessing')
    
    # define a classification head (widest block first, logits layer last)
    head_layers = []
    for i in reversed(range(n_head_layers)):
        head_block = tf.keras.Sequential([
            layers.Dense(2**(6 + i), activation='relu'),
            layers.Dropout(0.5),
            layers.BatchNormalization(),
        ])
        head_layers.append( head_block )
    head_layers.append( layers.Dense( num_classes ) )
    classification_head = tf.keras.Sequential(head_layers, name='ClassHead')
    
    # build the model
    input_image = layers.Input([IMAGE_SIZE, IMAGE_SIZE, 3], name='InputLayer')
    x = layers.Lambda( keras.applications.resnet50.preprocess_input, name='Preprocessing')(input_image)
    
    # add resnet backbone as a nested model: wiring it in through `input_tensor`
    # and taking `.output` would spread its layers over the outer graph, and
    # model.layers[2] would then be a single ResNet layer instead of the backbone
    backbone = keras.applications.ResNet50(
        input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
        include_top=False,
        weights='imagenet'
    )
    x = backbone(x)
    x = preprocessing_block(x)
    output_label = classification_head(x)
    
    return tf.keras.Model(inputs=[input_image], outputs=[output_label], name='ResNetBackbone')

def _get_backbone_layers( model ):
    """ Returns the list of convolutional-backbone layers of a model built by build_cnn_model"""
    candidate = model.layers[2]

    # backbone stored as a nested model: its own layers are the backbone
    if hasattr(candidate, 'layers'):
        return list(candidate.layers)

    # otherwise the backbone layers sit directly in the outer model (index 2 is
    # then an ordinary layer with no `layers` attribute): the backbone is
    # everything except the named input / preprocessing / head layers
    non_backbone = {'InputLayer', 'Preprocessing', 'ConvProcessing', 'ClassHead'}
    return [layer for layer in model.layers if layer.name not in non_backbone]

def config_model_for_head_training( model ):
    """
    Freezes the convolutional layers of the backbone.

    model: model built by build_cnn_model; modified in place.
    Returns the same model. It must be (re)compiled afterwards for the change to apply.
    """
    for layer in _get_backbone_layers( model ):
        layer.trainable = False
    return model

def config_model_for_fine_tuning( n_unfrozen_layers, model ):
    """
    Unfreezes the last part of the conv backbone.

    n_unfrozen_layers: how many layers, counted from the end of the backbone, become
                       trainable. Values <= 0 leave the backbone untouched.
    model: model built by build_cnn_model; modified in place.
    Returns the same model. It must be (re)compiled afterwards for the change to apply.
    """
    # guard: a slice of [-0:] is the WHOLE list and would unfreeze every layer
    if n_unfrozen_layers <= 0:
        return model

    for layer in _get_backbone_layers( model )[-n_unfrozen_layers:]:
        layer.trainable = True
    return model

def plot_model_training_hist( history ):
    """
    Plot the train / validation curves of one training run, one panel per metric.

    history: object returned by model.fit; its `history` dict must contain
             'accuracy', 'val_accuracy', 'macro_f1' and 'val_macro_f1'.
    """
    # (panel title, y-axis label, key of the metric in history.history)
    panels = [
        ('Accuracy Metric', 'Accuracy', 'accuracy'),
        ('F1-Score Metric', 'F1-Score', 'macro_f1'),
    ]

    fig, ax = plt.subplots(1, 2, figsize=(12, 5))
    for axis, (title, y_label, metric_key) in zip(ax, panels):
        axis.set_title(title)
        axis.plot( history.history[metric_key], label='Train' )
        axis.plot( history.history[f'val_{metric_key}'], label='Valid' )
        axis.set_ylabel(y_label)
        axis.set_xlabel('Epoch')
        axis.legend(loc='best')
    plt.show()
    return None



## 04. Training:
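Grid search over the head size and the number of unfrozen backbone layers; each combination is evaluated with stratified 3-fold cross-validation split at patient level: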

In [None]:
from sklearn.model_selection import StratifiedKFold
from itertools import product

# The model outputs logits, hence from_logits=True.
LOSS_FUNCT_TO_USE = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
METRICS_TO_TRACK = [ 'accuracy', macro_f1 ]

## TRAINING PARAMS
BATCH_SIZE = 64
# Pretrained layers are updated with a small learning rate so the ImageNet
# features are adjusted gradually instead of being overwritten.
FINE_TUNING_LEARNING_RATE = 1e-5

## TRAINING PHASES
EPOCHS_HEAD_TRAINING = 10
EPOCHS_FINE_TUNING = 5

## MODEL PARAMS
HEAD_SIZES = [ 2, 3, 4 ]
UNFROZEN_LAYERS = [ 5, 10 ]

## RESULTS
# create the output folder up front: failing at to_csv would lose the whole grid search
os.makedirs( RESULTS_DIR, exist_ok=True )
training_results = []

for n_head_layers, n_unfrozen_layers in product(HEAD_SIZES, UNFROZEN_LAYERS):
    param_results = []

    # folds are built over patients, so all images of a patient stay on one side of the split
    skf = StratifiedKFold(n_splits=3)
    for i, (train_idx, valid_idx) in enumerate(skf.split(X_data, Y_data)):
        callback_head = keras.callbacks.EarlyStopping('val_macro_f1', mode='max', restore_best_weights=True, verbose=1, patience=2)
        callback_fine = keras.callbacks.EarlyStopping('val_macro_f1', mode='max', restore_best_weights=True, verbose=1, patience=2)

        print("=" * 100)
        print(f"\t Head size: {n_head_layers} ; Unfrozen Layer: {n_unfrozen_layers} || Training for Fold", i+1)
        print("=" * 100)

        # get the ids and build the datasets
        train_ids, valid_ids = X_data[train_idx], X_data[valid_idx]
        train_ds, valid_ds = define_dataset(train_ids, valid_ids)
    
        # visualize samples
        visualize_data_samples( train_ds )
        print("=" * 100)

        # batch data; prefetch overlaps image loading with the training step
        train_ds = train_ds.batch(BATCH_SIZE).prefetch(autotune)
        valid_ds = valid_ds.batch(BATCH_SIZE).prefetch(autotune)

        # compute the class_weights in the training data. The weights are applied per
        # training sample (image), so they are balanced on one label per image rather
        # than one label per patient (patients can have different numbers of images)
        Y_train = np.array( [ master_data[pid]['label'] for pid in train_ids for _ in master_data[pid]['file'] ] )
        train_weigths = class_weight.compute_class_weight( 'balanced', classes=CLASS_NAMES, y=Y_train )
        # key = position in CLASS_NAMES = index of the class in the one-hot label
        train_weigths_dict = dict(enumerate(train_weigths)) 

        # build the model
        cnn_model = build_cnn_model( n_head_layers, num_classes=3 )

        with tf.device('/device:GPU:0'):
    
            # Training ---- Part 1: Training Classification Head
            print( " - Training Classification Head..." )
            cnn_model = config_model_for_head_training( cnn_model )
            cnn_model.compile(optimizer='adam', loss=LOSS_FUNCT_TO_USE, metrics=METRICS_TO_TRACK)
        
            history_head = cnn_model.fit(
                train_ds,
                validation_data=valid_ds,
                epochs=EPOCHS_HEAD_TRAINING,
                class_weight=train_weigths_dict,
                callbacks=[callback_head]
            )
            plot_model_training_hist(history_head)
            print("=" * 100)
        
            # Training ---- Part 2: Fine Tuning
            print( " - Training Fine Tuning..." )
            cnn_model = config_model_for_fine_tuning( n_unfrozen_layers, cnn_model )
            # recompile so the new trainable flags take effect, with the reduced learning rate
            cnn_model.compile(
                optimizer=keras.optimizers.Adam(learning_rate=FINE_TUNING_LEARNING_RATE),
                loss=LOSS_FUNCT_TO_USE,
                metrics=METRICS_TO_TRACK
            )
        
            history_fine = cnn_model.fit(
                train_ds,
                validation_data=valid_ds,
                epochs=EPOCHS_FINE_TUNING,
                class_weight=train_weigths_dict,
                callbacks=[callback_fine]
            )
    
            plot_model_training_hist(history_fine)
            print("=" * 100)

        # get and store the results
        fold_results = evaluate_model(valid_ds, cnn_model)
        # +1 turns the 0-based position of the best epoch into an epoch number
        fold_best_epoch_head = np.argmax( history_head.history['val_macro_f1'] ) + 1
        fold_best_epoch_fine = np.argmax( history_fine.history['val_macro_f1'] ) + 1
        fold_results['best_epoch_head'] = fold_best_epoch_head
        fold_results['best_epoch_fine'] = fold_best_epoch_fine

        param_results.append(fold_results)
        del cnn_model, callback_head, callback_fine
        # release the graph / layer state of this fold before the next model is built
        K.clear_session()

    # calculate the training results: mean of every metric over the folds
    param_results = pd.DataFrame.from_records(param_results)
    cv_results = param_results.mean(axis=0).to_dict()

    # add the parameters of this combination
    cv_results['head_layers'] = n_head_layers
    cv_results['unfrozen_layers'] = n_unfrozen_layers
    training_results.append( cv_results )

training_results = pd.DataFrame.from_records( training_results )
training_results.to_csv( os.path.join(RESULTS_DIR, 'resnet_cnn.csv') )

training_results

Output hidden; open in https://colab.research.google.com to view.