In [78]:
import os
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

class ParticipantVisibleError(Exception):
    """Raised when the metric inputs fail a validation check.

    NOTE(review): the name suggests the message is meant to be shown to the
    person who produced the submission -- confirm before putting sensitive
    details into the message text.
    """

def normalize_probabilities_to_one(df: pd.DataFrame, group_columns: list) -> pd.DataFrame:
    """Rescale the given columns so that each row's values sum to 1.

    Examples (per row): 0.75, 0.75 => 0.5, 0.5 and 0.1, 0.1 => 0.5, 0.5.

    Args:
        df: Frame holding the probability columns. It is NOT modified; a
            normalized copy is returned instead.
        group_columns: Names of the columns that together form one
            probability group.

    Returns:
        A copy of ``df`` in which every column listed in ``group_columns`` has
        been divided by the row-wise sum of those columns. All other columns
        are carried over unchanged.

    Raises:
        ParticipantVisibleError: If any row's group total is exactly zero, as
            such a row cannot be normalized.
    """
    row_totals = df[group_columns].sum(axis=1)

    # Test every total rather than only the minimum: a negative total elsewhere
    # would otherwise hide a zero row and let a division by zero slip through.
    if (row_totals == 0).any():
        raise ParticipantVisibleError('All rows must contain at least one non-zero prediction')

    # Work on a copy so the caller's frame is left untouched.
    normalized = df.copy()
    for col in group_columns:
        normalized[col] = df[col] / row_totals

    return normalized


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    '''
    Compute the mean sample-weighted log loss over all label groups.

    Pseudocode:
    1. For every label group (liver, bowel, etc):
        - Normalize the sum of each row's probabilities to 100%.
        - Calculate the sample weighted log loss.
    2. Derive a new any_injury label by taking the max of 1 - p(healthy) for each label group
    3. Calculate the sample weighted log loss for the new label group
    4. Return the average of all of the label group log losses as the final score.

    Args:
        solution: Ground-truth frame. Must contain the row-id column, the label
            columns of every group, a `<category>_weight` column per category
            and an `any_injury_weight` column.
        submission: Prediction frame. Must contain the row-id column and the
            same label columns as `solution`. Every remaining column is
            subject to the numeric / finite / non-negative checks.
        row_id_column_name: Name of the row-identifier column present in both
            frames. It is used to verify alignment and then dropped.

    Returns:
        The mean of the five per-category log losses and the any_injury log loss.

    Raises:
        ParticipantVisibleError: If the two frames do not list the same row ids
            in the same order, if a submission value is non-numeric,
            non-finite or negative, if a label is negative, if a required
            submission column is missing, or if a row cannot be normalized.

    Side effects:
        Prints each per-category loss, the any_injury loss and the total.
    '''
    # Rows are compared positionally once the id column is dropped, so a
    # mismatch in row count or order would produce a silently wrong score.
    if not np.array_equal(
        solution[row_id_column_name].to_numpy(),
        submission[row_id_column_name].to_numpy(),
    ):
        raise ParticipantVisibleError(
            f'Solution and submission must contain the same {row_id_column_name} values in the same order'
        )

    # drop() returns new frames, so the caller's inputs are not affected by the
    # normalization performed below.
    solution = solution.drop(columns=row_id_column_name)
    submission = submission.drop(columns=row_id_column_name)

    # Run basic QC checks on the inputs
    if not pd.api.types.is_numeric_dtype(submission.values):
        raise ParticipantVisibleError('All submission values must be numeric')

    if not np.isfinite(submission.values).all():
        raise ParticipantVisibleError('All submission values must be finite')

    if solution.min().min() < 0:
        raise ParticipantVisibleError('All labels must be at least zero')
    if submission.min().min() < 0:
        raise ParticipantVisibleError('All predictions must be at least zero')

    # Calculate the label group log losses
    binary_targets = ['bowel', 'extravasation']
    triple_level_targets = ['kidney', 'liver', 'spleen']
    all_target_categories = binary_targets + triple_level_targets

    label_group_losses = []
    for category in all_target_categories:
        if category in binary_targets:
            col_group = [f'{category}_healthy', f'{category}_injury']
        else:
            col_group = [f'{category}_healthy', f'{category}_low', f'{category}_high']

        solution = normalize_probabilities_to_one(solution, col_group)

        for col in col_group:
            if col not in submission.columns:
                raise ParticipantVisibleError(f'Missing submission column {col}')

        submission = normalize_probabilities_to_one(submission, col_group)

        label_group_losses.append(
            log_loss(
                y_true=solution[col_group].values,
                y_pred=submission[col_group].values,
                sample_weight=solution[f'{category}_weight'].values
            )
        )

    # Derive a new any_injury label by taking the max of 1 - p(healthy) for each label group
    healthy_cols = [x + '_healthy' for x in all_target_categories]
    any_injury_labels = (1 - solution[healthy_cols]).max(axis=1)
    any_injury_predictions = (1 - submission[healthy_cols]).max(axis=1)
    any_injury_loss = log_loss(
        y_true=any_injury_labels.values,
        y_pred=any_injury_predictions.values,
        sample_weight=solution['any_injury_weight'].values
    )

    # Report the per-group breakdown before folding any_injury into the mean.
    for cat, loss in zip(all_target_categories, label_group_losses):
        print(cat, f"{loss:.6f}")
    print('any_injury', f"{any_injury_loss:.6f}")

    label_group_losses.append(any_injury_loss)
    final_score = np.mean(label_group_losses)
    print(f"total : {final_score:.6f}")
    return final_score


In [79]:
# Read the label table and the prediction table from the working directory.
sol, sub = (
    pd.read_csv(csv_name)
    for csv_name in ("solution.csv", "submission.csv")
)

In [80]:
# Collapse the label table to one row per patient: keep the first record of
# each patient_id, discard the psid column, and turn patient_id back into a
# regular column so it can serve as the row id when scoring.
solution = (
    sol.groupby('patient_id')
    .first()
    .drop(columns='psid')
    .reset_index()
)
solution

Unnamed: 0,patient_id,series_id,bowel_healthy,bowel_injury,extravasation_healthy,extravasation_injury,kidney_healthy,kidney_low,kidney_high,liver_healthy,...,spleen_low,spleen_high,any_injury,fold,liver_weight,spleen_weight,kidney_weight,bowel_weight,extravasation_weight,any_injury_weight
0,19,14374,1,0,1,0,1,0,0,1,...,0,0,0,4,1,1,1,1,1,1
1,26,18881,1,0,1,0,1,0,0,1,...,0,0,0,0,1,1,1,1,1,1
2,33,55570,0,1,0,1,1,0,0,0,...,0,0,1,2,2,1,1,2,6,6
3,43,24055,0,1,0,1,1,0,0,1,...,0,0,1,0,1,1,1,2,6,6
4,96,39874,1,0,1,0,1,0,0,1,...,0,0,0,2,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3142,65450,9948,1,0,1,0,1,0,0,1,...,0,0,0,4,1,1,1,1,1,1
3143,65456,40781,0,1,1,0,1,0,0,1,...,0,1,1,3,1,4,1,2,1,6
3144,65495,8371,1,0,1,0,1,0,0,1,...,0,0,0,1,1,1,1,1,1,1
3145,65504,50212,1,0,1,0,1,0,0,0,...,0,1,1,0,2,4,1,1,1,6


In [81]:
healthy_cols = [
    'liver_healthy',
    'spleen_healthy',
    'kidney_healthy',
    'bowel_healthy',
    'extravasation_healthy',
]

# Average all prediction rows of a patient into a single row, then derive
# any_injury as the largest (1 - p(healthy)) across the organ groups.
submission = (
    sub.groupby('patient_id')
    .mean()
    .assign(any_injury=lambda per_patient: (1 - per_patient[healthy_cols]).max(axis=1))
    .reset_index()
)
submission

Unnamed: 0,patient_id,fold,liver_healthy,liver_low,liver_high,spleen_healthy,spleen_low,spleen_high,kidney_healthy,kidney_low,kidney_high,bowel_injury,bowel_healthy,extravasation_injury,extravasation_healthy,any_injury
0,19,4.0,0.966388,0.026473,0.007139,0.976135,0.017820,0.006045,0.978393,0.015858,0.005749,0.019324,0.980676,0.176882,0.823118,0.176882
1,26,0.0,0.990979,0.007324,0.001697,0.968159,0.025637,0.006204,0.992674,0.005147,0.002179,0.057416,0.942584,0.283036,0.716964,0.283036
2,33,2.0,0.911501,0.062628,0.025871,0.355821,0.414410,0.229768,0.984599,0.013334,0.002066,0.304534,0.695466,0.496448,0.503552,0.644179
3,43,0.0,0.915427,0.072607,0.011966,0.822994,0.167545,0.009461,0.901119,0.083197,0.015684,0.122487,0.877513,0.165509,0.834491,0.177006
4,96,2.0,0.992411,0.006553,0.001035,0.933058,0.044775,0.022167,0.986850,0.010217,0.002934,0.011404,0.988596,0.120296,0.879704,0.120296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3142,65450,4.0,0.970890,0.024880,0.004230,0.911804,0.082456,0.005740,0.976484,0.018655,0.004861,0.175026,0.824974,0.240423,0.759577,0.240423
3143,65456,3.0,0.801631,0.193559,0.004809,0.029832,0.133929,0.836239,0.939663,0.056155,0.004182,0.036975,0.963025,0.360245,0.639755,0.970168
3144,65495,1.0,0.958963,0.038517,0.002520,0.961368,0.025083,0.013549,0.968504,0.022437,0.009058,0.014076,0.985924,0.165359,0.834641,0.165359
3145,65504,0.0,0.848144,0.136108,0.015748,0.025510,0.089796,0.884694,0.972877,0.017939,0.009184,0.021890,0.978110,0.496255,0.503745,0.974490


In [82]:
# Spot-check the aggregated predictions of a single patient.
submission.loc[submission['patient_id'].eq(10004)]

Unnamed: 0,patient_id,fold,liver_healthy,liver_low,liver_high,spleen_healthy,spleen_low,spleen_high,kidney_healthy,kidney_low,kidney_high,bowel_injury,bowel_healthy,extravasation_injury,extravasation_healthy,any_injury
459,10004,3.0,0.961049,0.03655,0.002401,0.696147,0.259332,0.044522,0.910546,0.068165,0.021289,0.007051,0.992949,0.502249,0.497751,0.502249


In [83]:
# Evaluate the per-patient predictions against the per-patient labels.
score(
    solution=solution,
    submission=submission,
    row_id_column_name='patient_id',
)

bowel 0.135517
extravasation 0.546069
kidney 0.238230
liver 0.403472
spleen 0.403839
any_injury 0.512198
total : 0.373221


0.37322077187029473

In [84]:
# Recorded score() output for efficientnetv2s (weighted log loss per target, lower is better):
# bowel 0.140992
# extravasation 0.546069
# kidney 0.249054
# liver 0.414224
# spleen 0.422724
# any_injury 0.508193
# total : 0.380209
# 0.3802091922992829

In [None]:
# Recorded score() output for the ensemble model (weighted log loss per target, lower is better):
# bowel 0.135517
# extravasation 0.546069
# kidney 0.238230
# liver 0.403472
# spleen 0.403839
# any_injury 0.512198
# total : 0.373221
# 0.37322077187029473