In [1]:
import numpy as np
import pandas as pd
import os
import yaml

#### Paths & Vars

In [2]:
# --- Result locations -------------------------------------------------------
# One directory per experiment family; the collection cells below read
# `<dir>/results/model_evaluation_*.csv` and `model_comparison_*.csv` from
# the baseline, domain_gen and domain_adapt directories.
base_fpath = '/hpf/projects/lsung/projects/mimic4ds/Experiments/baseline'
gen_fpath = '/hpf/projects/lsung/projects/mimic4ds/Experiments/domain_gen'
adapt_fpath = '/hpf/projects/lsung/projects/mimic4ds/Experiments/domain_adapt'
# NOTE: unlike the three directories above, this one ends with a slash.
tables_fpath = '/hpf/projects/lsung/projects/mimic4ds/Experiments/domain_adapt/tables/'

# --- Experiment dimensions --------------------------------------------------
# Values of the `metric` column kept when querying the result tables.
metrics = [
    'auc',
    'auprc',
    'ace_abs_logistic_log',
]
# Prediction tasks (same identifiers the tables relabel from `analysis_id`).
tasks = [
    'mortality',
    'longlos',
    'invasivevent',
    'sepsis',
]
# Training-method identifiers per framework (presumably values of
# `train_method` in the domain_gen / domain_adapt results -- confirm).
gen_methods = [
    'irm',
    'dro',
    'coral',
    'al_layer',
]
adapt_methods = [
    'coral',
    'al_layer',
]
# Sample counts; these match the `n_ood` values used as table columns below.
Ns = [
    100,
    500,
    1000,
    1500,
]

### Collect Out-of-Distribution (OOD) Results

In [3]:
# Build one formatted OOD results table per evaluation method.
#
# Reads the pre-computed evaluation / comparison CSVs of the baseline, domain
# generalization and domain adaptation experiments, keeps the rows for year
# group 1 (labelled "2017 - 2019 [OOD]" below), and pivots them into a
# Task x Metric table with one column per (framework, n_ood, method).
# Result: `df_results_all[eval_method]` for 'avg', 'ensemble' and 'best'.

# `alpha` is part of the result file names (presumably the level of the
# bootstrap confidence intervals -- confirm against the evaluation scripts).
alpha = 0.05

# Row filters applied to each result file as it is loaded.
base_rows = "phase=='test'"
ood_rows = "phase=='test' and `lambda`==-1 and group==1"

# results[experiment]['eval']    <- model_evaluation_{alpha}.csv
# results[experiment]['compare'] <- model_comparison_{alpha}.csv
results = {
    name: {
        kind: pd.read_csv(f"{fpath}/results/model_{stem}_{alpha}.csv").query(rows)
        for kind, stem in (("eval", "evaluation"), ("compare", "comparison"))
    }
    for name, fpath, rows in (
        ("base", base_fpath, base_rows),
        ("gen", gen_fpath, ood_rows),
        ("adapt", adapt_fpath, ood_rows),
    )
}

# Column groups shared by the selections below.
ci_cols = ['analysis_id', 'metric', 'ci_lower', 'ci_med', 'ci_upper']
ci_bounds = ['ci_lower', 'ci_upper']
gen_keys = ['analysis_id', 'metric', 'group', 'train_method']
adapt_keys = gen_keys + ['n_ood']

# Raw value -> display label, keyed by the (already renamed) column.
display_labels = {
    'Year Group': {
        0: "2008 - 2016 [ID]",
        1: "2017 - 2019 [OOD]",
    },
    'Metric': {
        'auc': "AUROC",
        'auprc': "AUPRC",
        'ace_abs_logistic_log': 'Calibration',
    },
    'Method': {
        'al_layer': "AL",
        'coral': "CORAL",
        'erm': "ERM",
        'irm': 'IRM',
        'dro': 'GroupDRO',
    },
    'Task': {
        'longlos': 'Long LOS',
        'sepsis': 'Sepsis',
        'mortality': 'Mortality',
        'invasivevent': 'Invasive Ventilation',
    },
}

# Final column order. ' ' is the placeholder that fillna(" ") puts into the
# Framework / n_ood levels of rows that have no framework or no n_ood value.
column_order = [
    (                    ' ',    ' ', 'Baseline [08-10]'),
    (                    ' ',    ' ',   'Oracle [17-19]'),
    (                    ' ',    ' ',              'ERM'),
    ('Domain Generalization',    ' ',              'IRM'),
    ('Domain Generalization',    ' ',         'GroupDRO'),
    ('Domain Generalization',    ' ',               'AL'),
    ('Domain Generalization',    ' ',            'CORAL'),
    (    'Domain Adaptation',  100.0,               'AL'),
    (    'Domain Adaptation',  100.0,            'CORAL'),
    (    'Domain Adaptation',  500.0,               'AL'),
    (    'Domain Adaptation',  500.0,            'CORAL'),
    (    'Domain Adaptation', 1000.0,               'AL'),
    (    'Domain Adaptation', 1000.0,            'CORAL'),
    (    'Domain Adaptation', 1500.0,               'AL'),
    (    'Domain Adaptation', 1500.0,            'CORAL'),
]

# Selection common to every query; `@eval_method` and `@metrics` are resolved
# by DataFrame.query from this cell's variables at call time.
selection = "evaluation_method==@eval_method and metric==@metrics"
fmt_ci = '{:.3f}'.format

df_results_all = {}
for eval_method in ['avg', 'ensemble', 'best']:
    # Baseline: rows with train_group 2008-2010 evaluated on 2017-2019.
    # .assign() returns a new frame, so the filtered slice is never mutated
    # (avoids SettingWithCopyWarning).
    df_base = (
        results['base']['eval']
        .query(f"train_group=='2008 - 2010' and eval_group=='2017 - 2019' and {selection}")
        .loc[:, ci_cols]
        .assign(group=1, train_method='Baseline [08-10]')
    )

    # Oracle: rows with train_group 2017-2019 evaluated on 2017-2019.
    df_oracle = (
        results['base']['eval']
        .query(f"train_group=='2017 - 2019' and eval_group=='2017 - 2019' and {selection}")
        .loc[:, ci_cols]
        .assign(group=1, train_method='Oracle [17-19]')
    )

    # ERM rows from the domain generalization results (no framework label).
    df_erm = (
        results['gen']['eval']
        .query(f"train_method=='erm' and {selection}")
        .loc[:, ci_cols + ['group', 'train_method']]
    )

    # Domain generalization methods (everything except ERM).
    df_gen = (
        results['gen']['eval']
        .query(f"train_method!='erm' and {selection}")
        .loc[:, ci_cols + ['group', 'train_method']]
        .assign(framework='Domain Generalization')
    )

    # sig is True when both comparison CI bounds share a sign, i.e. the
    # interval excludes zero. (The reference model of the comparison is
    # defined by the comparison file -- presumably ERM; confirm.)
    stats = (
        results['gen']['compare']
        .query(selection)
        .loc[:, gen_keys + ci_bounds]
        .assign(sig=lambda d: d['ci_lower'] * d['ci_upper'] > 0)
        .drop(columns=ci_bounds)
    )
    df_gen = df_gen.merge(stats, how='left', on=gen_keys)

    # Domain adaptation methods; these additionally vary by n_ood.
    df_adapt = (
        results['adapt']['eval']
        .query(selection)
        .loc[:, ci_cols + ['group', 'train_method', 'n_ood']]
        .assign(framework='Domain Adaptation')
    )
    stats = (
        results['adapt']['compare']
        .query(selection)
        .loc[:, adapt_keys + ci_bounds]
        .assign(sig=lambda d: d['ci_lower'] * d['ci_upper'] > 0)
        .drop(columns=ci_bounds)
    )
    df_adapt = df_adapt.merge(stats, how='left', on=adapt_keys)

    # Stack all rows. The original row labels are irrelevant (pivot rebuilds
    # the index), so a fresh RangeIndex avoids duplicate labels.
    df_results = pd.concat(
        (df_base, df_oracle, df_erm, df_gen, df_adapt),
        ignore_index=True,
    )

    # Significance marker: '*' where sig is True, '' where it is False or
    # missing. Assigned back to the column instead of
    # `df['sig'].replace(..., inplace=True)`, which is a chained in-place
    # update and silently does nothing under pandas Copy-on-Write.
    df_results['sig'] = df_results['sig'].eq(True).map({True: '*', False: ''})

    # "median[*] (lower,upper)" with three decimals.
    df_results['Performance'] = (
        df_results['ci_med'].map(fmt_ci)
        + df_results['sig']
        + " ("
        + df_results['ci_lower'].map(fmt_ci)
        + ','
        + df_results['ci_upper'].map(fmt_ci)
        + ')'
    )

    # Drop the raw CI columns, turn missing framework / n_ood into the ' '
    # placeholder used in `column_order`, and switch to display column names.
    df_results = (
        df_results
        .drop(columns=['ci_lower', 'ci_med', 'ci_upper'])
        .fillna(" ")
        .rename(columns={
            'metric': 'Metric',
            'group': 'Year Group',
            'analysis_id': 'Task',
            'n_ood': 'Unlabelled OOD Samples',
            'framework': 'Framework',
            'train_method': 'Method',
        })
    )

    # Replace raw codes with display labels, column by column.
    for column, labels in display_labels.items():
        df_results[column] = df_results[column].replace(labels)

    # One row per (Task, Year Group, Metric), one column per
    # (Framework, Unlabelled OOD Samples, Method).
    df_results = df_results.pivot(
        index=["Task", "Year Group", "Metric"],
        columns=["Framework", "Unlabelled OOD Samples", "Method"],
        values=["Performance"],
    ).fillna(" ")
    # Strip the leading 'Performance' level that pivot adds for list `values`.
    df_results.columns = pd.MultiIndex.from_tuples(
        [col[1:] for col in df_results.columns],
        names=['Framework', 'Unlabelled OOD Samples', 'Method'],
    )

    # Order columns and rows for presentation.
    df_results = df_results[column_order]
    df_results = df_results.reindex(
        labels=['Long LOS', 'Sepsis', 'Mortality', 'Invasive Ventilation'], level=0
    )
    df_results = df_results.reindex(labels=['AUROC', 'AUPRC', 'Calibration'], level=2)

    df_results_all[eval_method] = df_results

### Comparison of Average Model Performance

In [4]:
# Show the OOD table (year group "2017 - 2019 [OOD]") built by the cell above
# for evaluation_method == 'avg'; '*' marks comparison CIs that exclude zero.
df_results_all['avg']

Unnamed: 0_level_0,Unnamed: 1_level_0,Framework,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Domain Generalization,Domain Generalization,Domain Generalization,Domain Generalization,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation
Unnamed: 0_level_1,Unnamed: 1_level_1,Unlabelled OOD Samples,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,100.0,100.0,500.0,500.0,1000.0,1000.0,1500.0,1500.0
Unnamed: 0_level_2,Unnamed: 1_level_2,Method,Baseline [08-10],Oracle [17-19],ERM,IRM,GroupDRO,AL,CORAL,AL,CORAL,AL,CORAL,AL,CORAL,AL,CORAL
Task,Year Group,Metric,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3
Long LOS,2017 - 2019 [OOD],AUROC,"0.644 (0.637,0.650)","0.702 (0.696,0.708)","0.678 (0.671,0.684)","0.678 (0.671,0.684)","0.679 (0.673,0.685)","0.678 (0.672,0.685)","0.678 (0.671,0.684)","0.679 (0.672,0.685)","0.679 (0.672,0.685)","0.678 (0.672,0.684)","0.680* (0.674,0.687)","0.679 (0.672,0.685)","0.680* (0.673,0.686)","0.679 (0.672,0.685)","0.679 (0.672,0.685)"
Long LOS,2017 - 2019 [OOD],AUPRC,"0.512 (0.501,0.522)","0.542 (0.532,0.552)","0.533 (0.522,0.543)","0.536* (0.525,0.546)","0.534 (0.524,0.544)","0.535 (0.525,0.545)","0.535 (0.525,0.545)","0.535 (0.525,0.545)","0.535 (0.525,0.545)","0.534 (0.524,0.544)","0.536* (0.526,0.546)","0.535 (0.524,0.545)","0.536* (0.526,0.547)","0.534 (0.524,0.544)","0.534 (0.524,0.544)"
Long LOS,2017 - 2019 [OOD],Calibration,"0.089 (0.084,0.093)","0.032 (0.029,0.036)","0.061 (0.057,0.065)","0.054* (0.050,0.058)","0.076* (0.072,0.081)","0.060 (0.056,0.064)","0.061 (0.057,0.065)","0.057* (0.053,0.062)","0.059* (0.055,0.063)","0.058* (0.053,0.062)","0.057* (0.053,0.062)","0.059* (0.054,0.063)","0.060 (0.056,0.064)","0.060 (0.056,0.064)","0.064* (0.060,0.069)"
Sepsis,2017 - 2019 [OOD],AUROC,"0.645 (0.634,0.656)","0.735 (0.724,0.746)","0.694 (0.683,0.705)","0.690* (0.680,0.702)","0.664* (0.653,0.675)","0.694 (0.683,0.705)","0.691* (0.680,0.702)","0.692 (0.681,0.703)","0.693 (0.682,0.704)","0.692 (0.681,0.703)","0.692 (0.682,0.703)","0.694 (0.683,0.705)","0.692 (0.681,0.703)","0.694 (0.683,0.705)","0.691* (0.680,0.702)"
Sepsis,2017 - 2019 [OOD],AUPRC,"0.155 (0.148,0.164)","0.298 (0.280,0.317)","0.195 (0.184,0.206)","0.193 (0.182,0.204)","0.169* (0.160,0.178)","0.193 (0.182,0.204)","0.192 (0.181,0.203)","0.191* (0.181,0.202)","0.193 (0.182,0.204)","0.193 (0.182,0.204)","0.193 (0.183,0.205)","0.194 (0.183,0.205)","0.193 (0.182,0.203)","0.195 (0.184,0.206)","0.191* (0.180,0.202)"
Sepsis,2017 - 2019 [OOD],Calibration,"0.065 (0.061,0.068)","0.013 (0.011,0.016)","0.031 (0.028,0.034)","0.032 (0.028,0.035)","0.042* (0.039,0.046)","0.030 (0.027,0.034)","0.030* (0.026,0.033)","0.033* (0.029,0.036)","0.033* (0.030,0.036)","0.034* (0.030,0.037)","0.032 (0.029,0.035)","0.033* (0.030,0.036)","0.032 (0.028,0.035)","0.031 (0.028,0.035)","0.033* (0.030,0.037)"
Mortality,2017 - 2019 [OOD],AUROC,"0.877 (0.869,0.885)","0.903 (0.897,0.909)","0.894 (0.887,0.901)","0.895 (0.888,0.902)","0.845* (0.835,0.854)","0.894 (0.886,0.900)","0.893 (0.886,0.900)","0.893 (0.886,0.900)","0.893 (0.885,0.900)","0.893 (0.886,0.900)","0.894 (0.887,0.901)","0.893 (0.885,0.899)","0.896 (0.889,0.903)","0.893 (0.886,0.900)","0.893 (0.886,0.900)"
Mortality,2017 - 2019 [OOD],AUPRC,"0.464 (0.443,0.486)","0.494 (0.472,0.515)","0.541 (0.520,0.562)","0.543 (0.522,0.563)","0.499* (0.478,0.520)","0.542 (0.521,0.562)","0.541 (0.520,0.562)","0.539 (0.517,0.559)","0.538 (0.516,0.558)","0.543 (0.522,0.563)","0.542 (0.522,0.563)","0.539 (0.517,0.559)","0.545 (0.524,0.566)","0.541 (0.519,0.561)","0.543 (0.522,0.563)"
Mortality,2017 - 2019 [OOD],Calibration,"0.014 (0.013,0.016)","0.014 (0.012,0.015)","0.018 (0.016,0.020)","0.017* (0.015,0.019)","0.043* (0.041,0.046)","0.018 (0.016,0.020)","0.018 (0.016,0.020)","0.018* (0.016,0.020)","0.015* (0.013,0.017)","0.018 (0.016,0.020)","0.017* (0.015,0.019)","0.017 (0.016,0.020)","0.016* (0.014,0.018)","0.017 (0.015,0.019)","0.016* (0.014,0.018)"
Invasive Ventilation,2017 - 2019 [OOD],AUROC,"0.878 (0.870,0.886)","0.880 (0.872,0.888)","0.885 (0.877,0.893)","0.885 (0.877,0.893)","0.884 (0.876,0.892)","0.884 (0.876,0.892)","0.885 (0.877,0.893)","0.885 (0.876,0.892)","0.885 (0.877,0.893)","0.884 (0.876,0.892)","0.887 (0.878,0.894)","0.885 (0.877,0.893)","0.887 (0.879,0.895)","0.881* (0.873,0.889)","0.884 (0.875,0.892)"


In [6]:
# Same OOD table layout, for evaluation_method == 'best'.
df_results_all['best']

Unnamed: 0_level_0,Unnamed: 1_level_0,Framework,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Domain Generalization,Domain Generalization,Domain Generalization,Domain Generalization,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation
Unnamed: 0_level_1,Unnamed: 1_level_1,Unlabelled OOD Samples,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,100.0,100.0,500.0,500.0,1000.0,1000.0,1500.0,1500.0
Unnamed: 0_level_2,Unnamed: 1_level_2,Method,Baseline [08-10],Oracle [17-19],ERM,IRM,GroupDRO,AL,CORAL,AL,CORAL,AL,CORAL,AL,CORAL,AL,CORAL
Task,Year Group,Metric,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3
Long LOS,2017 - 2019 [OOD],AUROC,"0.650 (0.621,0.679)","0.701 (0.673,0.727)","0.676 (0.648,0.704)","0.681 (0.653,0.709)","0.677 (0.648,0.705)","0.677 (0.649,0.705)","0.684 (0.656,0.712)","0.675 (0.646,0.703)","0.675 (0.646,0.703)","0.685 (0.657,0.713)","0.681 (0.653,0.709)","0.679 (0.651,0.707)","0.674 (0.646,0.702)","0.682 (0.654,0.710)","0.680 (0.651,0.708)"
Long LOS,2017 - 2019 [OOD],AUPRC,"0.513 (0.467,0.559)","0.539 (0.494,0.584)","0.531 (0.486,0.577)","0.535 (0.490,0.581)","0.530 (0.484,0.576)","0.538 (0.494,0.584)","0.539 (0.495,0.585)","0.534 (0.489,0.579)","0.535 (0.492,0.581)","0.542 (0.497,0.587)","0.536 (0.491,0.582)","0.537 (0.492,0.584)","0.532 (0.487,0.578)","0.541 (0.496,0.586)","0.536 (0.491,0.582)"
Long LOS,2017 - 2019 [OOD],Calibration,"0.074 (0.054,0.093)","0.023 (0.017,0.039)","0.057 (0.038,0.077)","0.060 (0.041,0.079)","0.073* (0.054,0.093)","0.069 (0.050,0.089)","0.071* (0.052,0.091)","0.053 (0.034,0.072)","0.054 (0.034,0.073)","0.063 (0.043,0.083)","0.057 (0.037,0.076)","0.050 (0.031,0.069)","0.054 (0.034,0.073)","0.057 (0.037,0.076)","0.058 (0.038,0.077)"
Sepsis,2017 - 2019 [OOD],AUROC,"0.639 (0.588,0.689)","0.754 (0.705,0.800)","0.701 (0.650,0.749)","0.690 (0.639,0.738)","0.664 (0.613,0.714)","0.692 (0.640,0.742)","0.697 (0.646,0.745)","0.685 (0.635,0.733)","0.690 (0.639,0.739)","0.696 (0.646,0.744)","0.691 (0.639,0.740)","0.696 (0.645,0.745)","0.695 (0.644,0.745)","0.698 (0.647,0.746)","0.703 (0.654,0.751)"
Sepsis,2017 - 2019 [OOD],AUPRC,"0.154 (0.120,0.194)","0.308 (0.229,0.392)","0.201 (0.155,0.256)","0.192 (0.148,0.245)","0.171 (0.132,0.218)","0.200 (0.152,0.258)","0.203 (0.154,0.263)","0.179 (0.139,0.226)","0.188 (0.145,0.238)","0.196 (0.150,0.250)","0.194 (0.150,0.248)","0.192 (0.148,0.242)","0.197 (0.152,0.253)","0.189 (0.147,0.238)","0.194 (0.150,0.244)"
Sepsis,2017 - 2019 [OOD],Calibration,"0.063 (0.048,0.079)","0.012 (0.005,0.027)","0.034 (0.019,0.050)","0.030 (0.016,0.045)","0.040 (0.025,0.055)","0.030 (0.015,0.045)","0.029 (0.015,0.044)","0.039 (0.024,0.054)","0.030 (0.015,0.045)","0.034 (0.019,0.049)","0.037 (0.022,0.053)","0.031 (0.017,0.046)","0.027 (0.013,0.043)","0.033 (0.018,0.049)","0.032 (0.017,0.047)"
Mortality,2017 - 2019 [OOD],AUROC,"0.874 (0.835,0.907)","0.904 (0.874,0.929)","0.895 (0.861,0.924)","0.893 (0.859,0.923)","0.858* (0.816,0.895)","0.895 (0.861,0.925)","0.897 (0.865,0.926)","0.895 (0.862,0.925)","0.901 (0.869,0.929)","0.894 (0.861,0.923)","0.894 (0.860,0.923)","0.895 (0.863,0.923)","0.895 (0.862,0.924)","0.894 (0.861,0.924)","0.898 (0.866,0.926)"
Mortality,2017 - 2019 [OOD],AUPRC,"0.456 (0.359,0.548)","0.490 (0.391,0.584)","0.539 (0.440,0.631)","0.552 (0.454,0.640)","0.512 (0.416,0.600)","0.551 (0.453,0.640)","0.555 (0.455,0.645)","0.546 (0.446,0.636)","0.532 (0.431,0.628)","0.548 (0.450,0.638)","0.540 (0.441,0.630)","0.532 (0.430,0.623)","0.544 (0.443,0.637)","0.549 (0.450,0.639)","0.530 (0.431,0.621)"
Mortality,2017 - 2019 [OOD],Calibration,"0.014 (0.008,0.024)","0.015 (0.009,0.024)","0.018 (0.010,0.028)","0.016 (0.009,0.025)","0.035* (0.024,0.046)","0.020 (0.011,0.030)","0.016 (0.009,0.025)","0.018 (0.010,0.028)","0.013 (0.008,0.022)","0.015 (0.008,0.025)","0.019 (0.011,0.029)","0.013 (0.008,0.022)","0.014 (0.008,0.023)","0.018 (0.010,0.028)","0.015 (0.008,0.024)"
Invasive Ventilation,2017 - 2019 [OOD],AUROC,"0.890 (0.854,0.922)","0.878 (0.840,0.911)","0.885 (0.847,0.918)","0.891 (0.854,0.923)","0.889 (0.849,0.923)","0.886 (0.848,0.919)","0.886 (0.849,0.920)","0.882 (0.844,0.916)","0.881 (0.842,0.916)","0.884 (0.845,0.919)","0.883 (0.844,0.918)","0.895 (0.859,0.927)","0.882 (0.842,0.917)","0.887 (0.849,0.921)","0.885 (0.848,0.919)"


## Collect In-Distribution (ID) Results

In [8]:
# Build one formatted ID results table per evaluation method.
#
# Same pipeline as the OOD collection cell, but on year group 0 (labelled
# "2008 - 2016 [ID]" below) and without the Baseline / Oracle columns.
# NOTE: this cell re-binds `results` and `df_results_all`, so after it runs
# `df_results_all` holds the ID tables, not the OOD ones. It also relies on
# `alpha`, which is defined in the OOD collection cell.

# Row filter applied to each result file as it is loaded.
id_rows = "phase=='test' and `lambda`==-1 and group==0"

# results[experiment]['eval']    <- model_evaluation_{alpha}.csv
# results[experiment]['compare'] <- model_comparison_{alpha}.csv
results = {
    name: {
        kind: pd.read_csv(f"{fpath}/results/model_{stem}_{alpha}.csv").query(id_rows)
        for kind, stem in (("eval", "evaluation"), ("compare", "comparison"))
    }
    for name, fpath in (("gen", gen_fpath), ("adapt", adapt_fpath))
}

# Column groups shared by the selections below.
ci_cols = ['analysis_id', 'metric', 'ci_lower', 'ci_med', 'ci_upper']
ci_bounds = ['ci_lower', 'ci_upper']
gen_keys = ['analysis_id', 'metric', 'group', 'train_method']
adapt_keys = gen_keys + ['n_ood']

# Raw value -> display label, keyed by the (already renamed) column.
display_labels = {
    'Year Group': {
        0: "2008 - 2016 [ID]",
        1: "2017 - 2019 [OOD]",
    },
    'Metric': {
        'auc': "AUROC",
        'auprc': "AUPRC",
        'ace_abs_logistic_log': 'Calibration',
    },
    'Method': {
        'al_layer': "AL",
        'coral': "CORAL",
        'erm': "ERM",
        'irm': 'IRM',
        'dro': 'GroupDRO',
    },
    'Task': {
        'longlos': 'Long LOS',
        'sepsis': 'Sepsis',
        'mortality': 'Mortality',
        'invasivevent': 'Invasive Ventilation',
    },
}

# Final column order. ' ' is the placeholder that fillna(" ") puts into the
# Framework / n_ood levels of rows that have no framework or no n_ood value.
column_order = [
    (                    ' ',    ' ',              'ERM'),
    ('Domain Generalization',    ' ',              'IRM'),
    ('Domain Generalization',    ' ',         'GroupDRO'),
    ('Domain Generalization',    ' ',               'AL'),
    ('Domain Generalization',    ' ',            'CORAL'),
    (    'Domain Adaptation',  100.0,               'AL'),
    (    'Domain Adaptation',  100.0,            'CORAL'),
    (    'Domain Adaptation',  500.0,               'AL'),
    (    'Domain Adaptation',  500.0,            'CORAL'),
    (    'Domain Adaptation', 1000.0,               'AL'),
    (    'Domain Adaptation', 1000.0,            'CORAL'),
    (    'Domain Adaptation', 1500.0,               'AL'),
    (    'Domain Adaptation', 1500.0,            'CORAL'),
]

# Selection common to every query; `@eval_method` and `@metrics` are resolved
# by DataFrame.query from this cell's variables at call time.
selection = "evaluation_method==@eval_method and metric==@metrics"
fmt_ci = '{:.3f}'.format

df_results_all = {}
for eval_method in ['avg', 'ensemble', 'best']:
    # ERM rows from the domain generalization results (no framework label).
    df_erm = (
        results['gen']['eval']
        .query(f"train_method=='erm' and {selection}")
        .loc[:, ci_cols + ['group', 'train_method']]
    )

    # Domain generalization methods (everything except ERM). .assign()
    # returns a new frame, so the filtered slice is never mutated
    # (avoids SettingWithCopyWarning).
    df_gen = (
        results['gen']['eval']
        .query(f"train_method!='erm' and {selection}")
        .loc[:, ci_cols + ['group', 'train_method']]
        .assign(framework='Domain Generalization')
    )

    # sig is True when both comparison CI bounds share a sign, i.e. the
    # interval excludes zero. (The reference model of the comparison is
    # defined by the comparison file -- presumably ERM; confirm.)
    stats = (
        results['gen']['compare']
        .query(selection)
        .loc[:, gen_keys + ci_bounds]
        .assign(sig=lambda d: d['ci_lower'] * d['ci_upper'] > 0)
        .drop(columns=ci_bounds)
    )
    df_gen = df_gen.merge(stats, how='left', on=gen_keys)

    # Domain adaptation methods; these additionally vary by n_ood.
    df_adapt = (
        results['adapt']['eval']
        .query(selection)
        .loc[:, ci_cols + ['group', 'train_method', 'n_ood']]
        .assign(framework='Domain Adaptation')
    )
    stats = (
        results['adapt']['compare']
        .query(selection)
        .loc[:, adapt_keys + ci_bounds]
        .assign(sig=lambda d: d['ci_lower'] * d['ci_upper'] > 0)
        .drop(columns=ci_bounds)
    )
    df_adapt = df_adapt.merge(stats, how='left', on=adapt_keys)

    # Stack all rows. The original row labels are irrelevant (pivot rebuilds
    # the index), so a fresh RangeIndex avoids duplicate labels.
    df_results = pd.concat((df_erm, df_gen, df_adapt), ignore_index=True)

    # Significance marker: '*' where sig is True, '' where it is False or
    # missing. Assigned back to the column instead of
    # `df['sig'].replace(..., inplace=True)`, which is a chained in-place
    # update and silently does nothing under pandas Copy-on-Write.
    df_results['sig'] = df_results['sig'].eq(True).map({True: '*', False: ''})

    # "median[*] (lower,upper)" with three decimals.
    df_results['Performance'] = (
        df_results['ci_med'].map(fmt_ci)
        + df_results['sig']
        + " ("
        + df_results['ci_lower'].map(fmt_ci)
        + ','
        + df_results['ci_upper'].map(fmt_ci)
        + ')'
    )

    # Drop the raw CI columns, turn missing framework / n_ood into the ' '
    # placeholder used in `column_order`, and switch to display column names.
    df_results = (
        df_results
        .drop(columns=['ci_lower', 'ci_med', 'ci_upper'])
        .fillna(" ")
        .rename(columns={
            'metric': 'Metric',
            'group': 'Year Group',
            'analysis_id': 'Task',
            'n_ood': 'Unlabelled OOD Samples',
            'framework': 'Framework',
            'train_method': 'Method',
        })
    )

    # Replace raw codes with display labels, column by column.
    for column, labels in display_labels.items():
        df_results[column] = df_results[column].replace(labels)

    # One row per (Task, Year Group, Metric), one column per
    # (Framework, Unlabelled OOD Samples, Method).
    df_results = df_results.pivot(
        index=["Task", "Year Group", "Metric"],
        columns=["Framework", "Unlabelled OOD Samples", "Method"],
        values=["Performance"],
    ).fillna(" ")
    # Strip the leading 'Performance' level that pivot adds for list `values`.
    df_results.columns = pd.MultiIndex.from_tuples(
        [col[1:] for col in df_results.columns],
        names=['Framework', 'Unlabelled OOD Samples', 'Method'],
    )

    # Order columns and rows for presentation.
    df_results = df_results[column_order]
    df_results = df_results.reindex(
        labels=['Long LOS', 'Sepsis', 'Mortality', 'Invasive Ventilation'], level=0
    )
    df_results = df_results.reindex(labels=['AUROC', 'AUPRC', 'Calibration'], level=2)

    df_results_all[eval_method] = df_results

In [9]:
# Show the ID table (year group "2008 - 2016 [ID]") for evaluation_method == 'avg'.
# `df_results_all` was rebuilt by the cell above, so it no longer holds the OOD tables.
df_results_all['avg']

Unnamed: 0_level_0,Unnamed: 1_level_0,Framework,Unnamed: 3_level_0,Domain Generalization,Domain Generalization,Domain Generalization,Domain Generalization,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation
Unnamed: 0_level_1,Unnamed: 1_level_1,Unlabelled OOD Samples,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,100.0,100.0,500.0,500.0,1000.0,1000.0,1500.0,1500.0
Unnamed: 0_level_2,Unnamed: 1_level_2,Method,ERM,IRM,GroupDRO,AL,CORAL,AL,CORAL,AL,CORAL,AL,CORAL,AL,CORAL
Task,Year Group,Metric,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
Long LOS,2008 - 2016 [ID],AUROC,"0.744 (0.741,0.748)","0.743* (0.740,0.747)","0.741* (0.738,0.745)","0.744 (0.740,0.747)","0.744 (0.740,0.747)","0.744 (0.741,0.748)","0.744 (0.740,0.747)","0.744 (0.740,0.747)","0.745 (0.741,0.748)","0.744 (0.741,0.748)","0.744 (0.740,0.748)","0.744 (0.740,0.747)","0.744 (0.740,0.748)"
Long LOS,2008 - 2016 [ID],AUPRC,"0.557 (0.550,0.563)","0.554* (0.548,0.561)","0.554* (0.548,0.561)","0.554* (0.548,0.561)","0.555 (0.549,0.562)","0.556 (0.550,0.563)","0.555 (0.549,0.561)","0.557 (0.550,0.563)","0.558 (0.551,0.564)","0.557 (0.550,0.563)","0.556 (0.550,0.563)","0.557 (0.551,0.564)","0.556 (0.550,0.563)"
Long LOS,2008 - 2016 [ID],Calibration,"0.022 (0.021,0.023)","0.022 (0.021,0.024)","0.029* (0.028,0.031)","0.023 (0.022,0.025)","0.022 (0.021,0.024)","0.021 (0.020,0.022)","0.021 (0.020,0.023)","0.022 (0.021,0.023)","0.022 (0.021,0.024)","0.022 (0.021,0.023)","0.022 (0.021,0.023)","0.022 (0.021,0.024)","0.022 (0.020,0.023)"
Sepsis,2008 - 2016 [ID],AUROC,"0.815 (0.810,0.821)","0.815 (0.810,0.821)","0.803* (0.797,0.809)","0.817 (0.811,0.823)","0.812* (0.806,0.818)","0.815 (0.809,0.821)","0.816 (0.810,0.822)","0.816 (0.810,0.822)","0.815 (0.809,0.821)","0.815 (0.809,0.821)","0.815 (0.810,0.821)","0.816 (0.810,0.822)","0.816 (0.810,0.822)"
Sepsis,2008 - 2016 [ID],AUPRC,"0.423 (0.410,0.436)","0.426 (0.412,0.439)","0.391* (0.378,0.404)","0.425 (0.411,0.438)","0.423 (0.409,0.436)","0.422 (0.409,0.436)","0.427 (0.414,0.441)","0.425 (0.412,0.439)","0.426 (0.412,0.439)","0.430* (0.416,0.443)","0.422 (0.408,0.435)","0.423 (0.409,0.436)","0.425 (0.412,0.439)"
Sepsis,2008 - 2016 [ID],Calibration,"0.013 (0.011,0.014)","0.013 (0.012,0.015)","0.019* (0.017,0.021)","0.013 (0.011,0.015)","0.014 (0.012,0.016)","0.013 (0.012,0.015)","0.013 (0.012,0.015)","0.013 (0.011,0.014)","0.012 (0.011,0.014)","0.011* (0.010,0.013)","0.013 (0.011,0.015)","0.013 (0.011,0.014)","0.012 (0.011,0.014)"
Mortality,2008 - 2016 [ID],AUROC,"0.883 (0.879,0.887)","0.883 (0.879,0.887)","0.852* (0.846,0.857)","0.883 (0.879,0.887)","0.883 (0.879,0.888)","0.883 (0.878,0.887)","0.883 (0.878,0.887)","0.882 (0.878,0.887)","0.883 (0.879,0.887)","0.883 (0.879,0.887)","0.884 (0.879,0.888)","0.883 (0.878,0.887)","0.883 (0.879,0.887)"
Mortality,2008 - 2016 [ID],AUPRC,"0.425 (0.412,0.438)","0.425 (0.412,0.438)","0.415* (0.402,0.428)","0.427 (0.414,0.440)","0.429* (0.416,0.442)","0.427 (0.414,0.439)","0.427 (0.414,0.440)","0.425 (0.412,0.438)","0.427 (0.414,0.440)","0.426 (0.414,0.439)","0.426 (0.413,0.438)","0.425 (0.412,0.438)","0.428 (0.415,0.441)"
Mortality,2008 - 2016 [ID],Calibration,"0.016 (0.015,0.018)","0.015* (0.014,0.017)","0.044* (0.042,0.045)","0.016 (0.015,0.018)","0.017* (0.016,0.018)","0.017* (0.016,0.018)","0.015* (0.014,0.016)","0.017 (0.015,0.018)","0.017* (0.016,0.018)","0.016 (0.015,0.017)","0.016* (0.015,0.017)","0.016 (0.015,0.017)","0.017 (0.015,0.018)"
Invasive Ventilation,2008 - 2016 [ID],AUROC,"0.844 (0.839,0.850)","0.844 (0.838,0.849)","0.844 (0.838,0.849)","0.845 (0.839,0.850)","0.845 (0.840,0.851)","0.844 (0.838,0.849)","0.844 (0.839,0.850)","0.844 (0.838,0.849)","0.845 (0.839,0.850)","0.845 (0.840,0.851)","0.844 (0.839,0.850)","0.832* (0.826,0.838)","0.845 (0.840,0.851)"


In [10]:
# Same ID table layout, for evaluation_method == 'best'.
df_results_all['best']

Unnamed: 0_level_0,Unnamed: 1_level_0,Framework,Unnamed: 3_level_0,Domain Generalization,Domain Generalization,Domain Generalization,Domain Generalization,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation,Domain Adaptation
Unnamed: 0_level_1,Unnamed: 1_level_1,Unlabelled OOD Samples,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,100.0,100.0,500.0,500.0,1000.0,1000.0,1500.0,1500.0
Unnamed: 0_level_2,Unnamed: 1_level_2,Method,ERM,IRM,GroupDRO,AL,CORAL,AL,CORAL,AL,CORAL,AL,CORAL,AL,CORAL
Task,Year Group,Metric,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
Long LOS,2008 - 2016 [ID],AUROC,"0.746 (0.730,0.762)","0.742 (0.726,0.758)","0.741 (0.725,0.757)","0.741 (0.725,0.758)","0.745 (0.729,0.761)","0.744 (0.728,0.760)","0.741 (0.725,0.757)","0.744 (0.727,0.760)","0.745 (0.729,0.761)","0.743 (0.727,0.759)","0.743 (0.727,0.759)","0.746 (0.729,0.762)","0.744 (0.728,0.760)"
Long LOS,2008 - 2016 [ID],AUPRC,"0.560 (0.531,0.589)","0.554 (0.524,0.583)","0.557 (0.527,0.585)","0.556 (0.527,0.585)","0.558 (0.529,0.587)","0.556 (0.526,0.584)","0.555 (0.525,0.583)","0.555 (0.525,0.584)","0.559 (0.530,0.588)","0.553 (0.524,0.582)","0.557 (0.528,0.586)","0.558 (0.529,0.587)","0.559 (0.529,0.588)"
Long LOS,2008 - 2016 [ID],Calibration,"0.020 (0.017,0.025)","0.024 (0.020,0.036)","0.026 (0.021,0.035)","0.023 (0.019,0.031)","0.021 (0.019,0.026)","0.022 (0.019,0.034)","0.019 (0.017,0.026)","0.021 (0.018,0.028)","0.020 (0.018,0.028)","0.022 (0.018,0.034)","0.020 (0.018,0.029)","0.019 (0.018,0.026)","0.021 (0.018,0.033)"
Sepsis,2008 - 2016 [ID],AUROC,"0.814 (0.787,0.840)","0.819 (0.793,0.845)","0.804 (0.775,0.831)","0.817 (0.790,0.842)","0.811 (0.783,0.837)","0.812 (0.785,0.839)","0.821 (0.794,0.846)","0.817 (0.790,0.843)","0.808 (0.780,0.835)","0.814 (0.787,0.840)","0.818 (0.791,0.844)","0.812 (0.785,0.838)","0.814 (0.786,0.840)"
Sepsis,2008 - 2016 [ID],AUPRC,"0.432 (0.372,0.492)","0.427 (0.367,0.489)","0.399 (0.341,0.458)","0.421 (0.362,0.483)","0.433 (0.373,0.491)","0.438 (0.378,0.496)","0.430 (0.371,0.490)","0.435 (0.375,0.495)","0.421 (0.361,0.482)","0.421 (0.361,0.482)","0.410 (0.351,0.471)","0.438 (0.379,0.496)","0.420 (0.360,0.481)"
Sepsis,2008 - 2016 [ID],Calibration,"0.008 (0.006,0.016)","0.014 (0.006,0.023)","0.010 (0.006,0.019)","0.013 (0.007,0.023)","0.014 (0.006,0.024)","0.008 (0.006,0.016)","0.014 (0.007,0.023)","0.010 (0.006,0.019)","0.008 (0.006,0.015)","0.009 (0.006,0.019)","0.017* (0.009,0.026)","0.008 (0.006,0.017)","0.011 (0.006,0.021)"
Mortality,2008 - 2016 [ID],AUROC,"0.883 (0.863,0.902)","0.883 (0.863,0.902)","0.860* (0.836,0.882)","0.884 (0.865,0.903)","0.884 (0.865,0.902)","0.885 (0.865,0.904)","0.886 (0.867,0.905)","0.880 (0.859,0.899)","0.883 (0.863,0.902)","0.884 (0.864,0.902)","0.884 (0.864,0.902)","0.884 (0.864,0.902)","0.883 (0.863,0.901)"
Mortality,2008 - 2016 [ID],AUPRC,"0.420 (0.363,0.477)","0.428 (0.370,0.486)","0.425 (0.366,0.482)","0.429 (0.372,0.487)","0.429 (0.372,0.486)","0.430 (0.373,0.487)","0.422 (0.366,0.480)","0.423 (0.366,0.480)","0.428 (0.370,0.486)","0.423 (0.366,0.480)","0.435 (0.377,0.493)","0.427 (0.370,0.485)","0.424 (0.367,0.481)"
Mortality,2008 - 2016 [ID],Calibration,"0.017 (0.011,0.022)","0.016 (0.010,0.022)","0.037* (0.030,0.044)","0.017 (0.012,0.023)","0.017 (0.011,0.023)","0.016 (0.010,0.022)","0.013 (0.009,0.019)","0.016 (0.011,0.022)","0.019 (0.014,0.025)","0.013 (0.009,0.019)","0.015 (0.009,0.020)","0.018 (0.012,0.023)","0.016 (0.010,0.021)"
Invasive Ventilation,2008 - 2016 [ID],AUROC,"0.846 (0.822,0.870)","0.845 (0.820,0.869)","0.847 (0.822,0.871)","0.846 (0.821,0.869)","0.848 (0.823,0.871)","0.844 (0.819,0.868)","0.843 (0.817,0.867)","0.845 (0.820,0.869)","0.845 (0.819,0.869)","0.848 (0.823,0.872)","0.847 (0.822,0.870)","0.845 (0.819,0.868)","0.846 (0.821,0.870)"
