# Balanced Accuracy and Confidence Intervals

## Balanced Accuracy
The balanced accuracy can be calculated at the individual label level, but also at the various levels of grouping. However, the balanced accuracy score may only be accurate on the total dataset, as stratification influences the False Negative and False Positive rates of labeling. 

Furthermore, the dataset is imbalanced in data availability: some labels are sparsely available across hospitals and are not well predictable. Therefore, we also calculate the balanced accuracy when only evaluating classes which are available in at least the training hospitals and 10 test hospitals.

In [1]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, top_k_accuracy_score

from scipy.stats import bootstrap

### Balanced Accuracy on full dataset top-1 prediction for the 4-hospital model excluding overlapping parameter names

In [2]:
from pathlib import Path

In [3]:
ROOT = Path("..")

DIR = os.path.join(ROOT, Path("reports/output/base/non_overlapping"))
RESULT_DIR = os.path.join(DIR, "balanced_accuracy")
os.makedirs(RESULT_DIR, exist_ok=True)

HOSP_TO_ID_DICT = joblib.load(os.path.join(ROOT, Path("reports/HOSP_TO_ID_DICT.pkl")))

RESULTS = {
    "accuracy": {},
    "balanced_accuracy": {},
}

files = {
    "df_test": os.path.join(DIR, "test_data.csv"),
    "df_train": os.path.join(DIR, "train_data.csv"),
    "predicted_labels": os.path.join(DIR, "predicted_labels.csv"),
    "concepts": os.path.join(DIR, Path("../../../../data/input/concepts.csv")),
}

concepts = pd.read_csv(files["concepts"])
df_test = pd.read_csv(files["df_test"])
df_train = pd.read_csv(files["df_train"])
predicted_labels = pd.read_csv(files["predicted_labels"])

df_test["hospital_name"] = df_test["hospital_name"].map(HOSP_TO_ID_DICT)
df_train["hospital_name"] = df_train["hospital_name"].map(HOSP_TO_ID_DICT)

df = predicted_labels.merge(df_test[['id', 'hospital_name', 'ehr_name']],
                       on="id"
                      ).sort_values(['id', 'value'], ascending=[True, False]).drop(columns=['Unnamed: 0']).copy()
df['rank'] = df.groupby(['id'])['value'].cumcount() + 1
df['relevance'] = df['concept_label_original'].isin(['unmapped']).map({True: "irrelevant", False: "relevant"})


data = df.loc[(df['rank'] == 1)]

acc = accuracy_score(
    y_true=data['concept_label_original'],
    y_pred=data['label'],
)

bacc = balanced_accuracy_score(
    y_true=data['concept_label_original'],
    y_pred=data['label'],
)

print(f"accuracy = {acc}")
print(f"balanced accuracy = {bacc}")

print(data.groupby(["relevance"]).apply(lambda x: accuracy_score(x["concept_label_original"], x["label"])))
print(data.groupby(["relevance"]).apply(lambda x: balanced_accuracy_score(x["concept_label_original"], x["label"])))


accuracy = 0.7321547055927742
balanced accuracy = 0.34994427569582476
relevance
irrelevant    0.919830
relevant      0.504031
dtype: float64
relevance
irrelevant    0.919830
relevant      0.349553
dtype: float64




In [4]:
def top_k_accuracy_score(data, y_true, y_pred, parameter_id="id", proba="value", k=5):
    _data = data.sort_values([parameter_id, proba], ascending=[True, False])
    _data_k = data.groupby(parameter_id).head(k)
    _correct = _data_k[y_true] == _data_k[y_pred]
    _sum = _correct.groupby(_data_k[parameter_id]).max().sum()
    _count = _data_k[parameter_id].nunique()
    _top_k_accuracy= _sum/_count
    return _top_k_accuracy


def calculate_confidence_intervals(data, label_true, label_pred, group_by, label_rank="rank"):
    
    data_grouped = data.loc[data[label_rank] == 1].groupby(group_by)
    data_grouped_full = data.groupby(group_by)
    
    data_results = pd.DataFrame(data_grouped.apply(
        lambda x: (
            precision_score(y_true=x[label_true], y_pred=x[label_pred], average="weighted", zero_division=0),
            recall_score(y_true=x[label_true], y_pred=x[label_pred], average="weighted", zero_division=0),
            f1_score(y_true=x[label_true], y_pred=x[label_pred], average="weighted"),
            balanced_accuracy_score(y_true=x[label_true], y_pred=x[label_pred]),
        )
    ).to_list(), columns=["precision", "recall", "f1_score", "balanced_accuracy"])

    data_top_5_acc = pd.DataFrame(data_grouped_full.apply(
        lambda x: (
            top_k_accuracy_score(x, y_true=label_true, y_pred=label_pred, k=5),
        )
    ).to_list(), columns=["top_5_accuracy"])

    data_results = pd.concat([data_results, data_top_5_acc], axis=1)
    
    data_results_dict = data_results.apply(
        lambda x: (
            bootstrap((x.values,), np.mean, random_state=42),
            np.mean(x),
        ), axis=0
    ).to_dict()
    data_to_return = dict()
    for i, t in data_results_dict.items():
        m = t[1]
        l = t[0].confidence_interval.low
        h = t[0].confidence_interval.high
        print_string = f"[{round(l, 3)}, {round(m, 3)}, {round(h, 3)}]"
        print(f"{i}: {print_string}")
        data_to_return[i] = {
            "mean": m,
            "lower": l,
            "upper": h,
            "text": print_string
        }
    return data_to_return

    

In [5]:
top_k_accuracy_score(df, parameter_id="id", y_true="concept_label_original", y_pred="label", k=1)

0.7321547055927742

In [6]:
ci_dict = {}
ci_dict[(4, "non_overlap", "all")] = calculate_confidence_intervals(df, "concept_label_original", "label", ["hospital_name"])



precision: [0.648, 0.684, 0.722]
recall: [0.665, 0.697, 0.73]
f1_score: [0.627, 0.662, 0.701]
balanced_accuracy: [0.43, 0.473, 0.509]
top_5_accuracy: [0.834, 0.857, 0.877]


In [7]:
ci_dict[(4, "non_overlap", "relevant")] = calculate_confidence_intervals(df.loc[df["relevance"] == "relevant"], "concept_label_original", "label", ["hospital_name"])



precision: [0.526, 0.598, 0.664]
recall: [0.428, 0.487, 0.54]
f1_score: [0.445, 0.507, 0.561]
balanced_accuracy: [0.428, 0.472, 0.508]
top_5_accuracy: [0.669, 0.724, 0.764]


### Balanced Accuracy on the full dataset top-1 prediction for the 4-hospital model

In [8]:
ROOT = ".."

DIR = os.path.join(ROOT, "reports/output/base/overlapping")
RESULT_DIR = os.path.join(DIR, "balanced_accuracy")
os.makedirs(RESULT_DIR, exist_ok=True)

HOSP_TO_ID_DICT = joblib.load(os.path.join(ROOT, "reports/HOSP_TO_ID_DICT.pkl"))

RESULTS = {
    "accuracy": {},
    "balanced_accuracy": {},
}

files = {
    "df_test": os.path.join(DIR, "test_data.csv"),
    "df_train": os.path.join(DIR, "train_data.csv"),
    "predicted_labels": os.path.join(DIR, "predicted_labels.csv"),
    "concepts": os.path.join(DIR, "../../../../data/input/concepts.csv"),
}

concepts = pd.read_csv(files["concepts"])
df_test = pd.read_csv(files["df_test"])
df_train = pd.read_csv(files["df_train"])
predicted_labels = pd.read_csv(files["predicted_labels"])

df_test["hospital_name"] = df_test["hospital_name"].map(HOSP_TO_ID_DICT)
df_train["hospital_name"] = df_train["hospital_name"].map(HOSP_TO_ID_DICT)

In [9]:
df = predicted_labels.merge(df_test[['id', 'hospital_name', 'ehr_name']],
                       on="id"
                      ).sort_values(['id', 'value'], ascending=[True, False]).drop(columns=['Unnamed: 0']).copy()
df['rank'] = df.groupby(['id'])['value'].cumcount() + 1
df['relevance'] = df['concept_label_original'].isin(['unmapped']).map({True: "irrelevant", False: "relevant"})

In [10]:
data = df.loc[df['rank'] == 1]

acc = accuracy_score(
    y_true=data['concept_label_original'],
    y_pred=data['label'],
)

bacc = balanced_accuracy_score(
    y_true=data['concept_label_original'],
    y_pred=data['label'],
)

print(f"accuracy = {acc}")
print(f"balanced accuracy = {bacc}")

accuracy = 0.7706958710291817
balanced accuracy = 0.42371196811386697




In [11]:
ci_dict[(4, "overlap", "all")] = calculate_confidence_intervals(df, "concept_label_original", "label", ["hospital_name"])



precision: [0.693, 0.733, 0.766]
recall: [0.72, 0.751, 0.778]
f1_score: [0.684, 0.72, 0.751]
balanced_accuracy: [0.534, 0.597, 0.65]
top_5_accuracy: [0.864, 0.889, 0.906]


In [12]:
ci_dict[(4, "overlap", "relevant")] = calculate_confidence_intervals(df.loc[df["relevance"] == "relevant"], "concept_label_original", "label", ["hospital_name"])



precision: [0.592, 0.673, 0.742]
recall: [0.512, 0.581, 0.643]
f1_score: [0.525, 0.597, 0.66]
balanced_accuracy: [0.533, 0.596, 0.649]
top_5_accuracy: [0.727, 0.783, 0.827]


### Balanced Accuracy for concepts available in the training dataset AND at least N training hospitals
- Parameter table, for training hospitals, retrieve all used concept labels
- Parameter table, for testing hospitals, retrieve only parameters with concept labels used in the training set
- Parameter table, for testing hospitals, count the number of hospitals per concept label
- Parameter table, for testing hospitals, filter on counts >= N

- Left join with predicted labels at rank 1
- Calculate accuracy and balanced accuracy

In [13]:
train_labels = list(df_train.concept_label.unique())
filtered_data = df.loc[df["concept_label_original"].isin(train_labels)]
print(f"data reduced from {df.shape[0]} to {filtered_data.shape[0]} based on train label overlap dropping {df.shape[0] - filtered_data.shape[0]} records.")
print(f"concepts reduced from {df.concept_label_original.nunique()} to {filtered_data.concept_label_original.nunique()}, dropping {df.concept_label_original.nunique() - filtered_data.concept_label_original.nunique()} labels")

filtered_data_rank_1 = filtered_data.loc[filtered_data["rank"] == 1]

acc = accuracy_score(
    y_true=filtered_data_rank_1['concept_label_original'],
    y_pred=filtered_data_rank_1['label'],
)

bacc = balanced_accuracy_score(
    y_true=filtered_data_rank_1['concept_label_original'],
    y_pred=filtered_data_rank_1['label'],
)

print(f"accuracy = {acc}")
print(f"balanced accuracy = {bacc}")

data reduced from 743327 to 718217 based on train label overlap dropping 25110 records.
concepts reduced from 1521 to 1127, dropping 394 labels
accuracy = 0.7846260387811634
balanced accuracy = 0.5718419729380582




In [14]:
calculate_confidence_intervals(filtered_data, "concept_label_original", "label", ["hospital_name"])



precision: [0.723, 0.76, 0.79]
recall: [0.738, 0.768, 0.793]
f1_score: [0.708, 0.742, 0.772]
balanced_accuracy: [0.58, 0.644, 0.696]
top_5_accuracy: [0.886, 0.91, 0.924]


{'precision': {'mean': 0.7604638340604223,
  'lower': 0.7229193083301892,
  'upper': 0.7904489233075386,
  'text': '[0.723, 0.76, 0.79]'},
 'recall': {'mean': 0.7681703997759685,
  'lower': 0.7378221232589843,
  'upper': 0.7934479510169009,
  'text': '[0.738, 0.768, 0.793]'},
 'f1_score': {'mean': 0.7422754436109927,
  'lower': 0.7079914861943318,
  'upper': 0.7716367149030854,
  'text': '[0.708, 0.742, 0.772]'},
 'balanced_accuracy': {'mean': 0.6442317668259305,
  'lower': 0.5803420852248524,
  'upper': 0.6956324892867055,
  'text': '[0.58, 0.644, 0.696]'},
 'top_5_accuracy': {'mean': 0.9098325486097502,
  'lower': 0.8864789409836786,
  'upper': 0.9243105798720765,
  'text': '[0.886, 0.91, 0.924]'}}

In [15]:
filtered_data_concept_labels_num = filtered_data.concept_label_original.nunique()
N = 5
filtered_data_concept_labels_above_N = list(filtered_data.groupby(["concept_label_original"])["hospital_name"].nunique()[filtered_data.groupby(["concept_label_original"])["hospital_name"].nunique() >= N].index)
print(f"Reduced the number of concepts used from {filtered_data_concept_labels_num} to {len(filtered_data_concept_labels_above_N)}")

double_filtered_data = filtered_data.loc[filtered_data["concept_label_original"].isin(filtered_data_concept_labels_above_N)]

double_filtered_data_rank_1 = double_filtered_data.loc[double_filtered_data["rank"] == 1]

acc = accuracy_score(
    y_true=double_filtered_data_rank_1['concept_label_original'],
    y_pred=double_filtered_data_rank_1['label'],
)

bacc = balanced_accuracy_score(
    y_true=double_filtered_data_rank_1['concept_label_original'],
    y_pred=double_filtered_data_rank_1['label'],
)

print(f"accuracy = {acc}")
print(f"balanced accuracy = {bacc}")

Reduced the number of concepts used from 1127 to 708
accuracy = 0.7970516495451065
balanced accuracy = 0.6461270720243716




In [16]:
ci_dict[(4, "overlap", "all-filtered")] = calculate_confidence_intervals(double_filtered_data, "concept_label_original", "label", ["hospital_name"])



precision: [0.75, 0.779, 0.805]
recall: [0.751, 0.779, 0.803]
f1_score: [0.726, 0.757, 0.784]
balanced_accuracy: [0.599, 0.662, 0.711]
top_5_accuracy: [0.9, 0.918, 0.93]


In [17]:
ci_dict[(4, "overlap", "all-filtered-relevant")] = calculate_confidence_intervals(double_filtered_data.loc[double_filtered_data["relevance"] == "relevant"], "concept_label_original", "label", ["hospital_name"])



precision: [0.661, 0.733, 0.792]
recall: [0.56, 0.623, 0.678]
f1_score: [0.58, 0.645, 0.701]
balanced_accuracy: [0.598, 0.661, 0.71]
top_5_accuracy: [0.793, 0.835, 0.869]


### Balanced Accuracy stratified for:
- Relevance
- Concept Category
- Hospital/EHR
- Hospital/EHR/Relevance

For this, we need select the highest predicted label for each parameter, join with parameter table to get the hospital names, join with concept table to get the concept groups, and label relevance.

#### Relevance

In [18]:
group_by = ["relevance"]
RESULTS["accuracy"]["_".join(group_by)] = data.groupby(group_by).apply(lambda x: accuracy_score(y_true=x["concept_label_original"], y_pred=x["label"]))
RESULTS["balanced_accuracy"]["_".join(group_by)] = data.groupby(group_by).apply(lambda x: balanced_accuracy_score(y_true=x["concept_label_original"], y_pred=x["label"]))

print("ACCURACY")
print(RESULTS["accuracy"]["_".join(group_by)])
print("BALANCED ACCURACY")
print(RESULTS["balanced_accuracy"]["_".join(group_by)])

ACCURACY
relevance
irrelevant    0.922521
relevant      0.626286
dtype: float64
BALANCED ACCURACY
relevance
irrelevant    0.922521
relevant      0.423384
dtype: float64




#### Concept Category

In [19]:
concept_data = df.merge(concepts, left_on=["concept_label_original"], right_on=["concept_label"])

concept_data_rank_1 = concept_data.loc[concept_data["rank"] == 1]

group_by = ["category"]
RESULTS["accuracy"]["_".join(group_by)] = concept_data_rank_1.groupby(group_by).apply(lambda x: accuracy_score(y_true=x["concept_label_original"], y_pred=x["label"]))
RESULTS["balanced_accuracy"]["_".join(group_by)] = concept_data_rank_1.groupby(group_by).apply(lambda x: balanced_accuracy_score(y_true=x["concept_label_original"], y_pred=x["label"]))

print("ACCURACY")
print(RESULTS["accuracy"]["_".join(group_by)])
print("BALANCED ACCURACY")
print(RESULTS["balanced_accuracy"]["_".join(group_by)])

ACCURACY
category
admission information        0.065574
clinical score               0.637275
demographics                 0.531599
fluid balance                0.470245
hemodynamics                 0.783564
infectiology                 0.608000
laboratory value             0.358995
lda                          0.627451
medication                   0.853923
neurology                    0.542683
nice data                    0.126354
position                     0.595611
renal replacement therapy    0.351421
respiratory                  0.634578
sofa score                   0.500000
unmapped                     0.922521
dtype: float64
BALANCED ACCURACY
category
admission information        0.082011
clinical score               0.504094
demographics                 0.453205
fluid balance                0.216301
hemodynamics                 0.472056
infectiology                 0.314985
laboratory value             0.248257
lda                          0.263581
medication                  



In [20]:
concept_data.head()

Unnamed: 0,id,parameter_name,label,value,concept_label_original,hospital_name,ehr_name,rank,relevance,concept_label,concept_label_super,category
0,0,Kalium,potassium_unspecified,0.375522,potassium_unspecified,0,MV,1,relevant,potassium_unspecified,potassium,laboratory value
1,0,Kalium,potassium_urine,0.152596,potassium_unspecified,0,MV,2,relevant,potassium_unspecified,potassium,laboratory value
2,0,Kalium,potassium_blood,0.054164,potassium_unspecified,0,MV,3,relevant,potassium_unspecified,potassium,laboratory value
3,0,Kalium,fluid_in_oral,0.048629,potassium_unspecified,0,MV,4,relevant,potassium_unspecified,potassium,laboratory value
4,0,Kalium,unmapped,0.047877,potassium_unspecified,0,MV,5,relevant,potassium_unspecified,potassium,laboratory value


In [21]:
d = concept_data.groupby(["category", "hospital_name"])
d_rank = concept_data.loc[concept_data["rank"] == 1].groupby(["category", "hospital_name"])

e = pd.DataFrame(d_rank.apply(
    lambda x: (
            precision_score(y_true=x["concept_label_original"], y_pred=x["label"], average="weighted", zero_division=0),
            recall_score(y_true=x["concept_label_original"], y_pred=x["label"], average="weighted", zero_division=0),
            f1_score(y_true=x["concept_label_original"], y_pred=x["label"], average="weighted"),
            balanced_accuracy_score(y_true=x["concept_label_original"], y_pred=x["label"]),
)).to_list(), columns=["precision", "recall", "f1_score", "balanced_accuracy"], index=d.groups.keys())

e2 = pd.DataFrame(d.apply(
    lambda x: (
        top_k_accuracy_score(data=x, y_true="concept_label_original", y_pred="label", k=5),
    )
).to_list(), columns=["top_5_accuracy"], index=d.groups.keys())

e = pd.concat([e, e2], axis=1)
e



Unnamed: 0,Unnamed: 1,precision,recall,f1_score,balanced_accuracy,top_5_accuracy
admission information,0,0.0,0.000000,0.000000,0.000000,0.000000
admission information,2,0.0,0.000000,0.000000,0.000000,0.000000
admission information,3,0.4,0.200000,0.266667,0.125000,0.400000
admission information,8,1.0,1.000000,1.000000,1.000000,1.000000
admission information,10,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...
unmapped,24,1.0,0.963262,0.981287,0.963262,0.992832
unmapped,25,1.0,0.889860,0.941721,0.889860,0.984266
unmapped,26,1.0,0.931699,0.964642,0.931699,0.991243
unmapped,28,1.0,0.882870,0.937792,0.882870,0.961933


In [22]:
f = e.reset_index().drop(columns="level_1").groupby(["level_0"])
concept_cat_results = {}
CC_RESULTS = {}
for i, _df in f:
    concept_cat_results[i] = _df[["precision", "recall", "f1_score", "balanced_accuracy", "top_5_accuracy"]].apply(
        lambda x: (
            bootstrap((x.values,), np.mean, random_state=42),
            np.mean(x),
        ),
        axis=0,
    )
    ddf = concept_cat_results[i]
    

    for col in ddf.columns:
        ci = ddf[col][0]
        l = ci.confidence_interval.low
        h = ci.confidence_interval.high
        m = ddf[col][1]
        print_string = f"[{round(l, 3)}, {round(m, 3)}, {round(h, 3)}]"
        print(f"{i[0], col}: {print_string}")
        CC_RESULTS[(i[0], col)] = {
            "mean": m,
            "lower": l,
            "upper": h,
            "text": print_string,
        }
    
    

('admission information', 'precision'): [0.033, 0.167, 0.417]
('admission information', 'recall'): [0.017, 0.108, 0.431]
('admission information', 'f1_score'): [0.022, 0.12, 0.431]
('admission information', 'balanced_accuracy'): [0.01, 0.097, 0.431]
('admission information', 'top_5_accuracy'): [0.017, 0.125, 0.425]
('clinical score', 'precision'): [0.51, 0.641, 0.749]
('clinical score', 'recall'): [0.424, 0.53, 0.632]
('clinical score', 'f1_score'): [0.451, 0.564, 0.665]
('clinical score', 'balanced_accuracy'): [0.452, 0.569, 0.675]
('clinical score', 'top_5_accuracy'): [0.654, 0.775, 0.852]
('demographics', 'precision'): [0.527, 0.642, 0.741]
('demographics', 'recall'): [0.438, 0.535, 0.63]
('demographics', 'f1_score'): [0.454, 0.555, 0.65]
('demographics', 'balanced_accuracy'): [0.496, 0.589, 0.674]
('demographics', 'top_5_accuracy'): [0.767, 0.853, 0.915]
('fluid balance', 'precision'): [0.607, 0.697, 0.761]
('fluid balance', 'recall'): [0.364, 0.432, 0.493]
('fluid balance', 'f1_sc

  a_hat = 1/6 * sum(nums) / sum(dens)**(3/2)


In [23]:
pd.DataFrame(CC_RESULTS).T.reset_index().pivot(
    columns=["level_1"],
    index=["level_0"],
    values=["mean", "lower", "upper", "text"],
)

Unnamed: 0_level_0,mean,mean,mean,mean,mean,lower,lower,lower,lower,lower,upper,upper,upper,upper,upper,text,text,text,text,text
level_1,balanced_accuracy,f1_score,precision,recall,top_5_accuracy,balanced_accuracy,f1_score,precision,recall,top_5_accuracy,balanced_accuracy,f1_score,precision,recall,top_5_accuracy,balanced_accuracy,f1_score,precision,recall,top_5_accuracy
level_0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
admission information,0.097222,0.119841,0.166667,0.108333,0.125,0.010417,0.022222,0.033333,0.016667,0.016667,0.430556,0.430952,0.416667,0.430519,0.425,"[0.01, 0.097, 0.431]","[0.022, 0.12, 0.431]","[0.033, 0.167, 0.417]","[0.017, 0.108, 0.431]","[0.017, 0.125, 0.425]"
clinical score,0.568766,0.564168,0.640747,0.530465,0.774701,0.452447,0.450572,0.51023,0.424248,0.654018,0.675087,0.665176,0.74854,0.632452,0.851968,"[0.452, 0.569, 0.675]","[0.451, 0.564, 0.665]","[0.51, 0.641, 0.749]","[0.424, 0.53, 0.632]","[0.654, 0.775, 0.852]"
demographics,0.588888,0.55533,0.642301,0.53512,0.853351,0.495925,0.453636,0.526688,0.437917,0.767294,0.673998,0.649562,0.740973,0.630398,0.914633,"[0.496, 0.589, 0.674]","[0.454, 0.555, 0.65]","[0.527, 0.642, 0.741]","[0.438, 0.535, 0.63]","[0.767, 0.853, 0.915]"
fluid balance,0.302134,0.502947,0.696598,0.431638,0.736297,0.254265,0.431039,0.607068,0.363862,0.670667,0.352753,0.565204,0.76127,0.493477,0.782736,"[0.254, 0.302, 0.353]","[0.431, 0.503, 0.565]","[0.607, 0.697, 0.761]","[0.364, 0.432, 0.493]","[0.671, 0.736, 0.783]"
hemodynamics,0.552072,0.584011,0.628771,0.573002,0.761243,0.472637,0.476004,0.515863,0.46937,0.687229,0.626166,0.695058,0.738838,0.680413,0.833783,"[0.473, 0.552, 0.626]","[0.476, 0.584, 0.695]","[0.516, 0.629, 0.739]","[0.469, 0.573, 0.68]","[0.687, 0.761, 0.834]"
infectiology,0.42799,0.477091,0.52173,0.466509,0.791883,0.32578,0.358277,0.391141,0.351804,0.694313,0.525054,0.589191,0.639848,0.575778,0.865581,"[0.326, 0.428, 0.525]","[0.358, 0.477, 0.589]","[0.391, 0.522, 0.64]","[0.352, 0.467, 0.576]","[0.694, 0.792, 0.866]"
laboratory value,0.483506,0.47387,0.569765,0.465166,0.750487,0.418312,0.401054,0.500559,0.391122,0.676618,0.528114,0.532658,0.635289,0.523688,0.798991,"[0.418, 0.484, 0.528]","[0.401, 0.474, 0.533]","[0.501, 0.57, 0.635]","[0.391, 0.465, 0.524]","[0.677, 0.75, 0.799]"
lda,0.716383,0.749434,0.848151,0.706264,0.904308,0.587415,0.603455,0.68391,0.569955,0.763867,0.831788,0.859524,0.961199,0.826453,0.969388,"[0.587, 0.716, 0.832]","[0.603, 0.749, 0.86]","[0.684, 0.848, 0.961]","[0.57, 0.706, 0.826]","[0.764, 0.904, 0.969]"
medication,0.822235,0.835073,0.854308,0.834412,0.912011,0.785518,0.787651,0.809471,0.784251,0.897611,0.852826,0.868876,0.884598,0.869032,0.926228,"[0.786, 0.822, 0.853]","[0.788, 0.835, 0.869]","[0.809, 0.854, 0.885]","[0.784, 0.834, 0.869]","[0.898, 0.912, 0.926]"
neurology,0.504266,0.515832,0.620879,0.483243,0.860528,0.407032,0.418989,0.500111,0.391424,0.762235,0.590821,0.60609,0.726511,0.565689,0.920075,"[0.407, 0.504, 0.591]","[0.419, 0.516, 0.606]","[0.5, 0.621, 0.727]","[0.391, 0.483, 0.566]","[0.762, 0.861, 0.92]"


#### Hospital/EHR

In [24]:
group_by = ["hospital_name", "ehr_name"]
RESULTS["accuracy"]["_".join(group_by)] = data.groupby(group_by).apply(lambda x: accuracy_score(y_true=x["concept_label_original"], y_pred=x["label"]))
RESULTS["balanced_accuracy"]["_".join(group_by)] = data.groupby(group_by).apply(lambda x: balanced_accuracy_score(y_true=x["concept_label_original"], y_pred=x["label"]))

print("ACCURACY")
print(RESULTS["accuracy"]["_".join(group_by)])
print("BALANCED ACCURACY")
print(RESULTS["balanced_accuracy"]["_".join(group_by)])



ACCURACY
hospital_name  ehr_name
0              MV          0.781533
2              MV          0.757557
3              MV          0.555332
4              HIX         0.854478
5              HIX         0.755814
6              HIX         0.771852
8              EPIC        0.880705
9              HIX         0.834523
10             MV          0.694969
11             MV          0.730435
12             HIX         0.805164
13             MV          0.670792
14             HIX         0.805634
15             MV          0.687135
16             HIX         0.756978
17             MV          0.789308
18             HIX         0.779011
20             EPIC        0.873636
21             HIX         0.772651
22             HIX         0.621960
23             EPIC        0.834157
24             EPIC        0.712435
25             MV          0.665919
26             HIX         0.682292
28             HIX         0.710383
29             MV          0.742216
dtype: float64
BALANCED ACCURAC



In [25]:
RESULTS["balanced_accuracy"]["_".join(["hospital_name", "ehr_name"])].describe()

count    26.000000
mean      0.597157
std       0.153449
min       0.262034
25%       0.506066
50%       0.620528
75%       0.728340
max       0.793904
dtype: float64

In [26]:
raise

RuntimeError: No active exception to reraise

In [27]:
calculate_confidence_intervals(df, "concept_label_original", "label", group_by)



precision: [0.693, 0.733, 0.766]
recall: [0.72, 0.751, 0.778]
f1_score: [0.684, 0.72, 0.751]
balanced_accuracy: [0.534, 0.597, 0.65]
top_5_accuracy: [0.864, 0.889, 0.906]


{'precision': {'mean': 0.7331554843328731,
  'lower': 0.6934499438868211,
  'upper': 0.7664444978541297,
  'text': '[0.693, 0.733, 0.766]'},
 'recall': {'mean': 0.7510333398303577,
  'lower': 0.7202197661461759,
  'upper': 0.7778373253918943,
  'text': '[0.72, 0.751, 0.778]'},
 'f1_score': {'mean': 0.7196262878378403,
  'lower': 0.6842639523378266,
  'upper': 0.7508501183393211,
  'text': '[0.684, 0.72, 0.751]'},
 'balanced_accuracy': {'mean': 0.5971572510522323,
  'lower': 0.5342090784442876,
  'upper': 0.6501357362327018,
  'text': '[0.534, 0.597, 0.65]'},
 'top_5_accuracy': {'mean': 0.8894481189975841,
  'lower': 0.8638967951747535,
  'upper': 0.9059387369092805,
  'text': '[0.864, 0.889, 0.906]'}}

#### Hospital/EHR/Relevance

In [None]:
group_by = ["hospital_name", "ehr_name", "relevance"]
RESULTS["accuracy"]["_".join(group_by)] = data.groupby(group_by).apply(lambda x: accuracy_score(y_true=x["concept_label_original"], y_pred=x["label"]))
RESULTS["balanced_accuracy"]["_".join(group_by)] = data.groupby(group_by).apply(lambda x: balanced_accuracy_score(y_true=x["concept_label_original"], y_pred=x["label"]))

print("ACCURACY")
print(RESULTS["accuracy"]["_".join(group_by)])
print("BALANCED ACCURACY")
print(RESULTS["balanced_accuracy"]["_".join(group_by)])

In [None]:
calculate_confidence_intervals(data, "concept_label_original", "label", ["hospital_name"])

In [None]:
for path, result_dict in RESULTS.items():
    os.makedirs(os.path.join(RESULT_DIR, path), exist_ok=True)
    for filename, result in result_dict.items():
        result.to_csv(os.path.join(RESULT_DIR, path, f"{filename}.csv"))

In [None]:
RESULTS.keys()

## Balanced Accuracy on the full dataset top-1 prediction for the LOO model


In [34]:
dfs = [pd.read_csv(f"./output/all/results/{key}.csv") for key in HOSP_TO_ID_DICT.keys()]
for i in range(len(dfs)):
    dfs[i]["key"] = list(HOSP_TO_ID_DICT.values())[i]
df = pd.concat(dfs, axis=0)

df = df.sort_values(["key", "id", "value"], ascending=[True, True, False])
df["rank"] = df.groupby(['id'])['value'].cumcount() + 1

combined_data = pd.concat([df_test, df_train], axis=0)
dft = df.merge(combined_data[["id", "concept_label", "hospital_name", "ehr_name"]], left_on=['id'], right_on=['id'])
dft = dft.merge(concepts, on=["concept_label"])
dft['relevance'] = dft['concept_label'].isin(['unmapped']).map({True: "irrelevant", False: "relevant"})

data_rank_1 = dft.loc[dft['rank'] == 1]
acc = accuracy_score(
    y_true=data_rank_1['concept_label'],
    y_pred=data_rank_1['label'],
)

bacc = balanced_accuracy_score(
    y_true=data_rank_1['concept_label'],
    y_pred=data_rank_1['label'],
)

print(f"accuracy = {acc}")
print(f"balanced accuracy = {bacc}")

print(data_rank_1.groupby(["relevance"]).apply(lambda x: accuracy_score(x["concept_label"], x["label"])))
print(data_rank_1.groupby(["relevance"]).apply(lambda x: balanced_accuracy_score(x["concept_label"], x["label"])))

accuracy = 0.809891667642048
balanced accuracy = 0.39126161771706663
relevance
irrelevant    0.942342
relevant      0.680018
dtype: float64
relevance
irrelevant    0.942342
relevant      0.390933
dtype: float64




In [35]:
N = 5
print(f"{N = }")
print(f"{dft.concept_label.nunique() = }")
concepts_more_than_N_hospitals = list(dft.groupby(["concept_label"])["hospital_name"].nunique()[dft.groupby(["concept_label"])["hospital_name"].nunique() > N].keys())
print(f"{len(concepts_more_than_N_hospitals) = }")
data = data_rank_1.loc[dft['concept_label'].isin(concepts_more_than_N_hospitals)]
acc = accuracy_score(
    y_true=data['concept_label'],
    y_pred=data['label'],
)

bacc = balanced_accuracy_score(
    y_true=data['concept_label'],
    y_pred=data['label'],
)

print(f"accuracy = {acc}")
print(f"balanced accuracy = {bacc}")

N = 5
dft.concept_label.nunique() = 1680
len(concepts_more_than_N_hospitals) = 792
accuracy = 0.8341699024528838
balanced accuracy = 0.6531581978790781




In [55]:
ci_dict[("loo", "overlap", "all")] = calculate_confidence_intervals(dft, "concept_label", "label", ["hospital_name"])
ci_dict[("loo", "overlap", "all-relevant")] = calculate_confidence_intervals(dft.loc[dft["relevance"] == "relevant"], "concept_label", "label", ["hospital_name"])

ci_dict[("loo", "overlap", "all-filtered")] = calculate_confidence_intervals(dft.loc[dft['concept_label'].isin(concepts_more_than_N_hospitals)], "concept_label", "label", ["hospital_name"])





precision: [0.735, 0.769, 0.796]
recall: [0.773, 0.797, 0.817]
f1_score: [0.735, 0.764, 0.789]
balanced_accuracy: [0.584, 0.641, 0.692]
top_5_accuracy: [0.924, 0.941, 0.951]




precision: [0.646, 0.716, 0.777]
recall: [0.572, 0.634, 0.688]
f1_score: [0.585, 0.649, 0.705]
balanced_accuracy: [0.583, 0.64, 0.692]
top_5_accuracy: [0.838, 0.879, 0.905]




precision: [0.775, 0.807, 0.828]
recall: [0.796, 0.82, 0.839]
f1_score: [0.767, 0.795, 0.818]
balanced_accuracy: [0.651, 0.705, 0.75]
top_5_accuracy: [0.939, 0.956, 0.964]


In [37]:
ci_dict[("loo", "overlap", "all-filtered-relevant")] = calculate_confidence_intervals(dft.loc[dft["relevance"] == "relevant"], "concept_label", "label", ["hospital_name"])




precision: [0.646, 0.716, 0.777]
recall: [0.572, 0.634, 0.688]
f1_score: [0.585, 0.649, 0.705]
balanced_accuracy: [0.583, 0.64, 0.692]
top_5_accuracy: [0.838, 0.879, 0.905]


In [44]:
d = pd.DataFrame(dft.loc[(dft["relevance"] == "relevant") & (dft["rank"] == 1)].groupby(["hospital_name"]).apply(
    lambda x: (
        precision_score(x["concept_label"], x["label"], average="weighted"),
        recall_score(x["concept_label"], x["label"], average="weighted"),
        f1_score(x["concept_label"], x["label"], average="weighted"),
        balanced_accuracy_score(x["concept_label"], x["label"]),
    )
).to_list(), columns=["precision", "recall", "f1_score", "balanced_accuracy"])


d2 = pd.DataFrame(dft.loc[dft["relevance"] == "relevant"].groupby(["hospital_name"]).apply(
    lambda x: (
        top_k_accuracy_score(data=x, y_true="concept_label", y_pred="label", k=5),
    )
).to_list(), columns=["top_5_accuracy"])

d = pd.concat([d, d2], axis=1)
d

d.describe()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Unnamed: 0,precision,recall,f1_score,balanced_accuracy,top_5_accuracy
count,30.0,30.0,30.0,30.0,30.0
mean,0.715533,0.634024,0.649156,0.639827,0.879285
std,0.183288,0.164426,0.169429,0.154424,0.091028
min,0.307292,0.278125,0.283437,0.27668,0.59447
25%,0.593739,0.546438,0.549814,0.5132,0.865222
50%,0.753661,0.654952,0.664961,0.639769,0.907336
75%,0.882383,0.76421,0.799051,0.784563,0.938029
max,0.934326,0.856817,0.866304,0.842221,0.971871


In [45]:
print(dft.loc[dft["rank"] == 1].groupby(["hospital_name"]).apply(lambda x: precision_score(x["concept_label"], x["label"], average="weighted")).describe())
print(dft.loc[dft["rank"] == 1].groupby(["hospital_name"]).apply(lambda x: recall_score(x["concept_label"], x["label"], average="weighted")).describe())
print(dft.loc[dft["rank"] == 1].groupby(["hospital_name"]).apply(lambda x: f1_score(x["concept_label"], x["label"], average="weighted")).describe())

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

count    30.000000
mean      0.769177
std       0.084743
min       0.542012
25%       0.743123
50%       0.786169
75%       0.826024
max       0.890272
dtype: float64


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

count    30.000000
mean      0.796583
std       0.063016
min       0.649899
25%       0.750455
50%       0.811931
75%       0.829823
max       0.897666
dtype: float64
count    30.000000
mean      0.764415
std       0.076570
min       0.581925
25%       0.718270
50%       0.784669
75%       0.810085
max       0.890621
dtype: float64


In [46]:
print(dft.loc[dft["rank"] == 1].groupby(["hospital_name"]).apply(lambda x: balanced_accuracy_score(x["concept_label"], x["label"])).describe())



count    30.000000
mean      0.640630
std       0.153753
min       0.279307
25%       0.513788
50%       0.640353
75%       0.784784
max       0.842523
dtype: float64


In [51]:
dft.groupby(
    ["hospital_name"]
).apply(
    lambda x: top_k_accuracy_score(
        x,
        "concept_label",
        "label",
        k=5,
    )
).describe()

count    30.000000
mean      0.941145
std       0.036170
min       0.804829
25%       0.931887
50%       0.950484
75%       0.962138
max       0.981494
dtype: float64

## Show CI dict

In [56]:
ci_dict_2 = {}
for i, values in ci_dict.items():
    ci_dict_2[i] = {}
    for j, v in values.items():
        for k, l in v.items():
            ci_dict_2[i][(j, k)] = l

In [57]:
pd.DataFrame(ci_dict_2).xs("text", level=1, drop_level=False).T

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,precision,recall,f1_score,balanced_accuracy,top_5_accuracy
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,text,text,text,text,text
4,non_overlap,all,"[0.648, 0.684, 0.722]","[0.665, 0.697, 0.73]","[0.627, 0.662, 0.701]","[0.43, 0.473, 0.509]","[0.834, 0.857, 0.877]"
4,non_overlap,relevant,"[0.526, 0.598, 0.664]","[0.428, 0.487, 0.54]","[0.445, 0.507, 0.561]","[0.428, 0.472, 0.508]","[0.669, 0.724, 0.764]"
4,overlap,all,"[0.693, 0.733, 0.766]","[0.72, 0.751, 0.778]","[0.684, 0.72, 0.751]","[0.534, 0.597, 0.65]","[0.864, 0.889, 0.906]"
4,overlap,relevant,"[0.592, 0.673, 0.742]","[0.512, 0.581, 0.643]","[0.525, 0.597, 0.66]","[0.533, 0.596, 0.649]","[0.727, 0.783, 0.827]"
4,overlap,all-filtered,"[0.75, 0.779, 0.805]","[0.751, 0.779, 0.803]","[0.726, 0.757, 0.784]","[0.599, 0.662, 0.711]","[0.9, 0.918, 0.93]"
4,overlap,all-filtered-relevant,"[0.661, 0.733, 0.792]","[0.56, 0.623, 0.678]","[0.58, 0.645, 0.701]","[0.598, 0.661, 0.71]","[0.793, 0.835, 0.869]"
loo,overlap,all-filtered,"[0.775, 0.807, 0.828]","[0.796, 0.82, 0.839]","[0.767, 0.795, 0.818]","[0.651, 0.705, 0.75]","[0.939, 0.956, 0.964]"
loo,overlap,all-filtered-relevant,"[0.646, 0.716, 0.777]","[0.572, 0.634, 0.688]","[0.585, 0.649, 0.705]","[0.583, 0.64, 0.692]","[0.838, 0.879, 0.905]"
loo,overlap,all,"[0.735, 0.769, 0.796]","[0.773, 0.797, 0.817]","[0.735, 0.764, 0.789]","[0.584, 0.641, 0.692]","[0.924, 0.941, 0.951]"
loo,overlap,all-relevant,"[0.646, 0.716, 0.777]","[0.572, 0.634, 0.688]","[0.585, 0.649, 0.705]","[0.583, 0.64, 0.692]","[0.838, 0.879, 0.905]"
