# Model predictions for the multitask model

In [2]:
# Imports
import pandas as pd
import numpy as np
import ast
import re
import statistics
from sklearn.metrics import roc_auc_score

In [28]:
## Loading the data

# Model predictions
MT_TD = pd.read_csv('Predictions/MT_preds_PadChest_TD.csv', index_col=0)
MT_PD = pd.read_csv('Predictions/MT_preds_PadChest_PD.csv', index_col=0)

# True labels
test_padchest = pd.read_csv('../Data/Data_splits/pathology_detection-test.csv', index_col=0)
annotations = pd.read_csv('../Data/Annotations/Annotations_aggregated.csv', index_col=0)
padchest_test_labels_ALL = pd.concat([test_padchest, annotations])   # Concatenating the tube and pathology test sets

ann_labels = pd.read_csv('../Data/Data_splits/tube_detection-test.csv', index_col=0)

## Area Under the ROC Curve (AUC)

In [52]:
# Function for reading the predictions, which are strings, as numpy arrays
def str2array(s):
    # Remove space after [
    s=re.sub('\[ +', '[', s.strip())
    # Replace commas and spaces
    s=re.sub('[,\s]+', ', ', s)
    return np.array(ast.literal_eval(s))


# Function to arrange preds nicely in a df
def get_preds_multiclass_PD(orig_pred_df, true_labels_df, print_auc=True):
    
    pathologies = ['Effusion', 'Pneumothorax', 'Atelectasis', 'Cardiomegaly', 'Pneumonia']
    all_preds = []
    
    for row_number in range(len(orig_pred_df)):
        for p_idx, p in enumerate(pathologies):
            preds = [[str2array(i["Preds_model1"]) for idx, i in orig_pred_df.iterrows()][row_number][:,p_idx],
                     [str2array(i["Preds_model2"]) for idx, i in orig_pred_df.iterrows()][row_number][:,p_idx],
                     [str2array(i["Preds_model3"]) for idx, i in orig_pred_df.iterrows()][row_number][:,p_idx]]
            all_preds.append(preds)
            
    # Constructing a df with the preds and 'true' labels
    preds_df = pd.DataFrame(list(zip(list(true_labels_df['Effusion']),
                                     list(true_labels_df['Pneumothorax']),
                                     list(true_labels_df['Atelectasis']),
                                     list(true_labels_df['Cardiomegaly']),
                                     list(true_labels_df['Pneumonia']),
                                     list(all_preds[0][0]),
                                     list(all_preds[0][1]),
                                     list(all_preds[0][2]),
                                     list(all_preds[1][0]),
                                     list(all_preds[1][1]),
                                     list(all_preds[1][2]),
                                     list(all_preds[2][0]),
                                     list(all_preds[2][1]),
                                     list(all_preds[2][2]),
                                     list(all_preds[3][0]),
                                     list(all_preds[3][1]),
                                     list(all_preds[3][2]),
                                     list(all_preds[4][0]),
                                     list(all_preds[4][1]),
                                     list(all_preds[4][2]))),
                            columns = ['Effusion', 'Pneumothorax', 'Atelectasis', 'Cardiomegaly',
                                       'Pneumonia', 'preds_Effusion_model1', 'preds_Effusion_model2', 'preds_Effusion_model3',
                                       'preds_Pneumothorax_model1', 'preds_Pneumothorax_model2', 'preds_Pneumothorax_model3',
                                       'preds_Atelectasis_model1', 'preds_Atelectasis_model2', 'preds_Atelectasis_model3',
                                       'preds_Cardiomegaly_model1', 'preds_Cardiomegaly_model2', 'preds_Cardiomegaly_model3',
                                       'preds_Pneumonia_model1', 'preds_Pneumonia_model2', 'preds_Pneumonia_model3'])


    if print_auc:
        # Computing the auc for each pathology separately
        for p in pathologies:
            print(p)
            auc_list = []
            for i in range(3):
                #print(i+1)
                auc = roc_auc_score(preds_df[p], preds_df['preds_' + str(p) + '_model' + str(i+1)])
                auc_list.append(auc)
            #print(auc_list)

    #        print("Average AUC:", round(sum(auc_list)/3 * 100, 5))
            print("Average auc:", round(sum(auc_list)/len(auc_list)*100, 1), "with standard deviation:", round(statistics.stdev(auc_list)*100,1))
            print()

    return preds_df

# Function to arrange preds nicely in a df
def get_preds_multiclass_TD(orig_pred_df, true_labels_df, print_auc=True):
    tube_types = ['Chest_drain_tube', 'NSG_tube', 'Endotracheal_tube', 'Tracheostomy_tube']
    all_preds = []
    
    for row_number in range(len(orig_pred_df)):
        for t_idx, tube in enumerate(tube_types):
            preds = [[str2array(i["Preds_model1"]) for idx, i in orig_pred_df.iterrows()][row_number][:,t_idx],
                     [str2array(i["Preds_model2"]) for idx, i in orig_pred_df.iterrows()][row_number][:,t_idx],
                     [str2array(i["Preds_model3"]) for idx, i in orig_pred_df.iterrows()][row_number][:,t_idx]]
            all_preds.append(preds)

            
    # Constructing a df with the preds and 'true' labels
    preds_df = pd.DataFrame(list(zip(list(true_labels_df['Chest_drain_Ann']),
                                     list(true_labels_df['NSG_tube_Ann']),
                                     list(true_labels_df['Endotracheal_tube_Ann']),
                                     list(true_labels_df['Tracheostomy_tube_Ann']),
                                     list(true_labels_df['Chest_drain_tube']),
                                     list(true_labels_df['NSG_tube']),
                                     list(true_labels_df['Endotracheal_tube']),
                                     list(true_labels_df['Tracheostomy_tube']),
                                     list(all_preds[0][0]),
                                     list(all_preds[0][1]),
                                     list(all_preds[0][2]),
                                     list(all_preds[1][0]),
                                     list(all_preds[1][1]),
                                     list(all_preds[1][2]),
                                     list(all_preds[2][0]),
                                     list(all_preds[2][1]),
                                     list(all_preds[2][2]),
                                     list(all_preds[3][0]),
                                     list(all_preds[3][1]),
                                     list(all_preds[3][2]))),
                            columns = ['Chest_drain_Ann', 'NSG_tube_Ann', 'Endotracheal_tube_Ann', 'Tracheostomy_tube_Ann',
                                       'Chest_drain_tube_PadChest', 'NSG_tube_PadChest', 'Endotracheal_tube_PadChest', 'Tracheostomy_tube_PadChest',
                                       'preds_CheD_model1', 'preds_CheD_model2', 'preds_CheD_model3',
                                       'preds_NSG_model1', 'preds_NSG_model2', 'preds_NSG_model3',
                                       'preds_Endo_model1', 'preds_Endo_model2', 'preds_Endo_model3',
                                       'preds_Trach_model1', 'preds_Trach_model2', 'preds_Trach_model3',])


    ## From here, one can return the preds_df if you want to see the predictions nicely
    
    if print_auc:
        # Computing the auc for each tube separately
        print('CHEST DRAIN TUBE')
        preds_df_tube = preds_df[preds_df['Chest_drain_Ann'] != -1]
        auc_with_anns = [roc_auc_score(preds_df_tube['Chest_drain_Ann'], preds_df_tube['preds_CheD_model1']), roc_auc_score(preds_df_tube['Chest_drain_Ann'], preds_df_tube['preds_CheD_model2']), roc_auc_score(preds_df_tube['Chest_drain_Ann'], preds_df_tube['preds_CheD_model3'])]
        auc_with_padchest = [roc_auc_score(preds_df_tube['Chest_drain_tube_PadChest'], preds_df_tube['preds_CheD_model1']), roc_auc_score(preds_df_tube['Chest_drain_tube_PadChest'], preds_df_tube['preds_CheD_model2']), roc_auc_score(preds_df_tube['Chest_drain_tube_PadChest'], preds_df_tube['preds_CheD_model3'])]
        print("Annotations Average auc:", round(sum(auc_with_anns)/len(auc_with_anns)*100, 1), "with standard deviation:", round(statistics.stdev(auc_with_anns)*100,1))
        print("PadChest Average auc:", round(sum(auc_with_padchest)/len(auc_with_padchest)*100, 1), "with standard deviation:", round(statistics.stdev(auc_with_padchest)*100,1))
        #print(auc_with_anns)
        #print(auc_with_padchest)
        print()

        print('NSG TUBE')
        preds_df_tube = preds_df[preds_df['NSG_tube_Ann'] != -1]
        auc_with_anns = [roc_auc_score(preds_df_tube['NSG_tube_Ann'], preds_df_tube['preds_NSG_model1']), roc_auc_score(preds_df_tube['NSG_tube_Ann'], preds_df_tube['preds_NSG_model2']), roc_auc_score(preds_df_tube['NSG_tube_Ann'], preds_df_tube['preds_NSG_model3'])]
        auc_with_padchest = [roc_auc_score(preds_df_tube['NSG_tube_PadChest'], preds_df_tube['preds_NSG_model1']), roc_auc_score(preds_df_tube['NSG_tube_PadChest'], preds_df_tube['preds_NSG_model2']), roc_auc_score(preds_df_tube['NSG_tube_PadChest'], preds_df_tube['preds_NSG_model3'])]
        print("Annotations Average auc:", round(sum(auc_with_anns)/len(auc_with_anns)*100, 1), "with standard deviation:", round(statistics.stdev(auc_with_anns)*100,1))
        print("PadChest Average auc:", round(sum(auc_with_padchest)/len(auc_with_padchest)*100, 1), "with standard deviation:", round(statistics.stdev(auc_with_padchest)*100,1))
        #print(auc_with_anns)
        #print(auc_with_padchest)
        print()

        print('ENDOTRACHEAL TUBE')
        preds_df_tube = preds_df[preds_df['Endotracheal_tube_Ann'] != -1]
        auc_with_anns = [roc_auc_score(preds_df_tube['Endotracheal_tube_Ann'], preds_df_tube['preds_Endo_model1']), roc_auc_score(preds_df_tube['Endotracheal_tube_Ann'], preds_df_tube['preds_Endo_model2']), roc_auc_score(preds_df_tube['Endotracheal_tube_Ann'], preds_df_tube['preds_Endo_model3'])]
        auc_with_padchest = [roc_auc_score(preds_df_tube['Endotracheal_tube_PadChest'], preds_df_tube['preds_Endo_model1']), roc_auc_score(preds_df_tube['Endotracheal_tube_PadChest'], preds_df_tube['preds_Endo_model2']), roc_auc_score(preds_df_tube['Endotracheal_tube_PadChest'], preds_df_tube['preds_Endo_model3'])]
        print("Annotations Average auc:", round(sum(auc_with_anns)/len(auc_with_anns)*100, 1), "with standard deviation:", round(statistics.stdev(auc_with_anns)*100,1))
        print("PadChest Average auc:", round(sum(auc_with_padchest)/len(auc_with_padchest)*100, 1), "with standard deviation:", round(statistics.stdev(auc_with_padchest)*100,1))
        #print(auc_with_anns)
        #print(auc_with_padchest)
        print()

        print('TRACHEOSTOMY TUBE')
        preds_df_tube = preds_df[preds_df['Tracheostomy_tube_Ann'] != -1]
        auc_with_anns = [roc_auc_score(preds_df_tube['Tracheostomy_tube_Ann'], preds_df_tube['preds_Trach_model1']), roc_auc_score(preds_df_tube['Tracheostomy_tube_Ann'], preds_df_tube['preds_Trach_model2']), roc_auc_score(preds_df_tube['Tracheostomy_tube_Ann'], preds_df_tube['preds_Trach_model3'])]
        auc_with_padchest = [roc_auc_score(preds_df_tube['Tracheostomy_tube_PadChest'], preds_df_tube['preds_Trach_model1']), roc_auc_score(preds_df_tube['Tracheostomy_tube_PadChest'], preds_df_tube['preds_Trach_model2']), roc_auc_score(preds_df_tube['Tracheostomy_tube_PadChest'], preds_df_tube['preds_Trach_model3'])]
        print("Annotations Average auc:", round(sum(auc_with_anns)/len(auc_with_anns)*100, 1), "with standard deviation:", round(statistics.stdev(auc_with_anns)*100,1))
        print("PadChest Average auc:", round(sum(auc_with_padchest)/len(auc_with_padchest)*100, 1), "with standard deviation:", round(statistics.stdev(auc_with_padchest)*100,1))
        #print(auc_with_anns)
        #print(auc_with_padchest)
        print()
    
    return preds_df

### MultiTask: Pathology Detection - DenseNet121 (fine-tuned on PadChest), predictions on PadChest, detecting 5 pathologies

In [21]:
preds_df = get_preds_multiclass_PD(MT_PD, padchest_test_labels_ALL)
#preds_df

Effusion
Average auc: 94.3 with standard deviation: 0.1

Pneumothorax
Average auc: 84.2 with standard deviation: 0.8

Atelectasis
Average auc: 86.8 with standard deviation: 0.3

Cardiomegaly
Average auc: 89.4 with standard deviation: 0.1

Pneumonia
Average auc: 80.1 with standard deviation: 0.5



### MultiTask: Tube Detection - DenseNet121 (fine-tuned on PadChest), predictions on PadChest, detecting 4 tubes

In [53]:
preds_df = get_preds_multiclass_TD(MT_TD, ann_labels)
#preds_df

CHEST DRAIN TUBE
Annotations Average auc: 35.7 with standard deviation: 5.2
PadChest Average auc: 38.9 with standard deviation: 3.8

NSG TUBE
Annotations Average auc: 74.5 with standard deviation: 0.3
PadChest Average auc: 69.8 with standard deviation: 0.9

ENDOTRACHEAL TUBE
Annotations Average auc: 66.2 with standard deviation: 1.6
PadChest Average auc: 66.3 with standard deviation: 1.2

TRACHEOSTOMY TUBE
Annotations Average auc: 56.9 with standard deviation: 1.9
PadChest Average auc: 56.9 with standard deviation: 1.9



## Implementation of Class-Wise Calibration Error (CWCE)

Binary Expected Calibration Error:
$$ \text{binary-ECE}  = \sum_{i=1}^M \frac{|B_{i}|}{N} |
        \bar{y}(B_{i}) - \bar{p}(B_{i})| $$

Class-wise Expected Calibration Error:
$$ \text{class-$j$-ECE}  = \sum_{i=1}^M \frac{|B_{i,j}|}{N}
        |\bar{y}_j(B_{i,j}) - \bar{p}_j(B_{i,j})|,
        \text{classwise-ECE}  = \frac{1}{K}\sum_{j=1}^K \text{class-$j$-ECE} $$

In [24]:
def binary_ECE(y_true, probs, power=1, bins=10):
    r"""
    Binary Expected Calibration Error
    
    Parameters
    ----------
    y_true : indicator vector (n_samples, )
        True labels.
    probs : matrix (n_samples, )
        Predicted probabilities for positive class.
        
    Returns
    -------
    score : float
    """

    create_bins = np.linspace(start=0, stop=1, num=bins + 1)   # Returns 'num' evenly spaced samples, calculated over the interval [start, stop]
    #print('bins created: ', create_bins)
    idx_bins = np.digitize(x=probs, bins=create_bins)   # Return the indices of the bins to which each value in input array belongs
    idx_bins -= 1   # Need to subtract 1 from the bin indices to start at 0
    
    
    # Function for computing the ECE for one bin
    def bin_func(y, p, idx_bins):
        probs_bin_mean = np.mean(p[idx_bins])   # Mean of probs in bin i
        true_bin_mean = np.mean(y[idx_bins])   # Mean of true values in bin i
        diff = np.abs(probs_bin_mean - true_bin_mean)   # Absolute difference between the two bin means
        diff_power = diff ** power   # Raising the diff according to the L_p calibration error specified, typically power = 1
        ece = diff_power * np.sum(idx_bins) / len(p)   # Multiplying by the fraction of probs in that bin
        return ece
        
    # Computing the binary ECE for each bin and summing them
    ece = 0
    
    for i in np.unique(idx_bins):   # Looping through the unique bins (len(bins))
        ece += bin_func(y_true, probs, idx_bins == i)   # Summing the error for each bin

    return ece


def classwise_ECE(y_true, probs, classes_list, power=1, bins=10, print_ece=False):
    r"""Classwise Expected Calibration Error
    
    Parameters
    ----------
    y_true : label indicator matrix (n_samples, n_classes)
        True labels.
    probs : matrix (n_samples, n_classes)
        Predicted probabilities.
        
    Returns
    -------
    score : float
    """

    n_classes = len(classes_list)
    
    # Computing the binary ECE for each class
    class_eces = []
    for c in range(n_classes):   # Looping through the classes
        binary_ece = binary_ECE(y_true[:, c], probs[:, c], power=power, bins=bins)
        if print_ece:
            print('ECE for {}: {}'.format(classes_list[c], round(binary_ece, 3)))
        class_eces.append(binary_ece)
    
    #if print_ece:
        #print()
        #print('Average Class-Wise ECE: ', round(np.mean(class_eces), 3))
    
    return class_eces
    # Right now, not printing the average class-wise calibration error

    
def classwise_ECE_three_models_TD(df_orig, df_y_true, classes_list, power=1, bins=10):
        
    # Creating the preds df
    preds_df = get_preds_multiclass_TD(df_orig, df_y_true, print_auc=False)
    
    all_model_eces_ann = []
    all_model_eces_pad = []
    
    for i in range(3):
        probs_model_df = preds_df[['preds_CheD_model'+str(i+1), 'preds_NSG_model'+str(i+1), 'preds_Endo_model'+str(i+1), 'preds_Trach_model'+str(i+1)]]
        y_true_ann_df = preds_df[['Chest_drain_Ann', 'NSG_tube_Ann', 'Endotracheal_tube_Ann', 'Tracheostomy_tube_Ann']]
        y_true_pad_df = preds_df[['Chest_drain_tube_PadChest', 'NSG_tube_PadChest', 'Endotracheal_tube_PadChest', 'Tracheostomy_tube_PadChest']]
        
        class_eces_ann = classwise_ECE(y_true_ann_df.to_numpy(), probs_model_df.to_numpy(), classes_list=classes_list, power=power, bins=bins)
        all_model_eces_ann.append(class_eces_ann)
        
        class_eces_pad = classwise_ECE(y_true_pad_df.to_numpy(), probs_model_df.to_numpy(), classes_list=classes_list, power=power, bins=bins)
        all_model_eces_pad.append(class_eces_pad)
        
    #print(all_model_eces_ann)
    #print(all_model_eces_pad)
    
    for c_idx, c in enumerate(classes_list):
        print('Class: ', c)
        print('Average CWCE Ann: ', round(sum([all_model_eces_ann[i][c_idx] for i in range(3)]) / 3, 5), 'with standard deviation: ', round(statistics.stdev([all_model_eces_ann[i][c_idx] for i in range(3)]), 5))
        print('Average CWCE Pad: ', round(sum([all_model_eces_pad[i][c_idx] for i in range(3)]) / 3, 5), 'with standard deviation: ', round(statistics.stdev([all_model_eces_pad[i][c_idx] for i in range(3)]), 5))
        print()
    
    return preds_df
    # Right now, not printing the average class-wise calibration error

    
def classwise_ECE_three_models_PD(df_orig, df_y_true, classes_list, power=1, bins=10):
        
    # Creating the preds df
    preds_df = get_preds_multiclass_PD(df_orig, df_y_true, print_auc=False)
    
    all_model_eces = []
    
    for i in range(3):
        probs_model_df = preds_df[['preds_Effusion_model'+str(i+1), 'preds_Pneumothorax_model'+str(i+1), 'preds_Atelectasis_model'+str(i+1), 'preds_Cardiomegaly_model'+str(i+1), 'preds_Pneumonia_model'+str(i+1)]]
        y_true_ann_df = preds_df[['Effusion', 'Pneumothorax', 'Atelectasis', 'Cardiomegaly', 'Pneumonia']]
        
        class_eces = classwise_ECE(y_true_ann_df.to_numpy(), probs_model_df.to_numpy(), classes_list=classes_list, power=power, bins=bins)
        all_model_eces.append(class_eces)
        
    #print(all_model_eces)
    
    for c_idx, c in enumerate(classes_list):
        print('Class: ', c)
        print('Average CWCE: ', round(sum([all_model_eces[i][c_idx] for i in range(3)]) / 3, 5), 'with standard deviation: ', round(statistics.stdev([all_model_eces[i][c_idx] for i in range(3)]), 5))
        print()
    
    return preds_df
    # Right now, not printing the average class-wise calibration error
        
    

### MultiTask: Pathology Detection - DenseNet121 (fine-tuned on PadChest), predictions on PadChest, detecting 5 pathologies

In [25]:
pathologies = ['Effusion', 'Pneumothorax', 'Atelectasis', 'Cardiomegaly', 'Pneumonia']
preds_df = classwise_ECE_three_models_PD(MT_PD, padchest_test_labels_ALL, classes_list=pathologies, power=1, bins=10)

Class:  Effusion
Average CWCE:  0.00808 with standard deviation:  0.00057

Class:  Pneumothorax
Average CWCE:  0.00018 with standard deviation:  0.00018

Class:  Atelectasis
Average CWCE:  0.0025 with standard deviation:  2e-05

Class:  Cardiomegaly
Average CWCE:  0.00633 with standard deviation:  0.00037

Class:  Pneumonia
Average CWCE:  0.00399 with standard deviation:  0.00123



### MultiTask: Tube Detection - DenseNet121 (fine-tuned on PadChest), predictions on PadChest, detecting 4 tubes

In [27]:
tube_types = ['Chest_drain_tube', 'NSG_tube', 'Endotracheal_tube', 'Tracheostomy_tube']
preds_df = classwise_ECE_three_models_TD(MT_TD, ann_labels, classes_list=tube_types, power=1, bins=10)

Class:  Chest_drain_tube
Average CWCE Ann:  0.01001 with standard deviation:  0.0019
Average CWCE Pad:  0.05188 with standard deviation:  0.00113

Class:  NSG_tube
Average CWCE Ann:  0.21601 with standard deviation:  0.00694
Average CWCE Pad:  0.3222 with standard deviation:  0.00373

Class:  Endotracheal_tube
Average CWCE Ann:  0.17787 with standard deviation:  0.00518
Average CWCE Pad:  0.1839 with standard deviation:  0.00642

Class:  Tracheostomy_tube
Average CWCE Ann:  0.17213 with standard deviation:  0.00792
Average CWCE Pad:  0.15868 with standard deviation:  0.00299

