In [1]:
import os
import pandas as pd

from src.utils import list_dir
from src.default_paths import path_root

In [2]:
# Data root for the project, and the sub-directory holding one folder per label.
path = os.path.join(path_root, "data")
path_labels = os.path.join(path_root, "data", "labels")

In [3]:
# Load the inpatient cohort and keep only admissions of patients at least 28 days
# old whose discharge timestamp compares greater than the index timestamp.
cohort_csv = os.path.join(path, "cohort/ip_cohort/cohort.csv")
cohort_filter = "age_at_admission_days>=28 and discharge_datetime > index_datetime"

# Read and filter in one chain so `df_cohort` is bound exactly once per run.
df_cohort = pd.read_csv(cohort_csv).query(cohort_filter)

In [4]:
# Display the (rows, columns) shape of the filtered cohort frame.
df_cohort.shape

(37960, 6)

In [5]:
# Age at admission converted from days to years (365-day year) and rounded to
# whole years *before* the quartiles are taken.
age_years = (df_cohort["age_at_admission_days"] / 365).round()

# 25th / 50th / 75th percentiles of the rounded ages.
q1, q2, q3 = age_years.quantile([0.25, 0.5, 0.75]).to_numpy()
print(f"Age median = {q2}, IQR = [{q1}, {q3}]")

Age median = 7.0, IQR = [2.0, 13.0]


In [6]:
# Number of cohort rows per recorded sex, most frequent first.
sex_frequency = df_cohort["sex"].value_counts()

# Plain dict with Python-int values; reused by the percentage cell.
sex_counts = sex_frequency.to_dict()
print(f"Sex: {sex_counts}")

Sex: {'M': 20507, 'F': 17449, 'OTHER': 4}


In [10]:
# Share of the cohort (in percent, one decimal) for each sex category.
n_cohort_rows = df_cohort.shape[0]
{
    sex: round(count / n_cohort_rows * 100, 1)
    for sex, count in sex_counts.items()
}

{'M': 54.0, 'F': 46.0, 'OTHER': 0.0}

In [25]:
# Print a one-line summary for every label table found under `path_labels`:
#   - num_patients : number of rows in labeled_patients.csv
#   - num_pos      : sum of the `value` column, with its percentage of rows
#   - for tables that carry a `min_max_value` column: the number of non-null
#     measurements, that number as a fraction of rows (0-1, not a percentage),
#     and the 25th / 50th / 75th percentiles of the measurement (printed as `IQR`).
labels = list_dir(path_labels)
labels.sort()

for label in labels:
    try:
        df = pd.read_csv(os.path.join(path_labels, label, "labeled_patients.csv"))

        num_patients = df.shape[0]
        num_pos = df["value"].sum()
        perc_pos = round(num_pos / num_patients * 100, 1)

        if "min_max_value" in df.columns:
            IQR = df["min_max_value"].quantile([0.25, 0.5, 0.75]).values.round(1)
            num_measurements = df["min_max_value"].notna().sum()
            # Fraction of rows with a measurement (e.g. 0.656), not a percentage.
            frac_measurements = round(num_measurements / num_patients, 3)
        else:
            IQR = "N/A"
            num_measurements = "N/A"
            frac_measurements = "N/A"

        print(f"{label}: {num_patients=}, {num_pos=} ({perc_pos}%), {num_measurements=}, {frac_measurements}, {IQR=}")

    except (FileNotFoundError, NotADirectoryError):
        # Entry has no labeled_patients.csv -- presumably not a label folder; skip it.
        continue
    except Exception as err:
        # Stay best-effort, but never hide a real problem (missing `value` column,
        # empty table, malformed CSV, ...). KeyboardInterrupt / SystemExit are not
        # caught here, so the loop can still be interrupted.
        print(f"{label}: skipped ({type(err).__name__}: {err})")
        
    

anemia_lab_sql: num_patients=37591, num_pos=1073 (2.9%), num_measurements=24669, 0.656, IQR=array([ 95., 111., 125.])
hyperkalemia_lab_sql: num_patients=37564, num_pos=352 (0.9%), num_measurements=23503, 0.626, IQR=array([4.1, 4.5, 5. ])
hypoglycemia_lab_sql: num_patients=37722, num_pos=459 (1.2%), num_measurements=20484, 0.543, IQR=array([4.5, 5.1, 5.8])
hyponatremia_lab_sql: num_patients=37880, num_pos=92 (0.2%), num_measurements=24052, 0.635, IQR=array([136., 138., 140.])
long_los_sql: num_patients=37960, num_pos=6115 (16.1%), num_measurements='N/A', N/A, IQR='N/A'
mortality_sql: num_patients=37960, num_pos=216 (0.6%), num_measurements='N/A', N/A, IQR='N/A'
readmission_sql: num_patients=37914, num_pos=2275 (6.0%), num_measurements='N/A', N/A, IQR='N/A'
thrombocytopenia_lab_sql: num_patients=37731, num_pos=726 (1.9%), num_measurements=24541, 0.65, IQR=array([190., 262., 346.])
