In [1]:
import os

import dask.dataframe as dd
import pandas as pd

from src.utils import list_dir
from src.default_paths import path_root

In [2]:
# Data directory under the project root, and its "labels" sub-directory.
path = os.path.join(path_root, "data")
path_labels = os.path.join(path_root, "data", "labels")

In [3]:
# Load the cohort table. Both timestamp columns are parsed on read so that the
# filter below compares datetimes rather than the raw CSV strings (a string
# comparison is only chronologically correct for zero-padded ISO timestamps).
df_cohort = pd.read_csv(
    os.path.join(path, "cohort", "ip_cohort", "cohort.csv"),
    parse_dates=["index_datetime", "discharge_datetime"],
)
# Keep only rows whose discharge time is strictly after the index time.
df_cohort = df_cohort.query("discharge_datetime > index_datetime")

In [4]:
# Number of rows left in the cohort after the filter above.
len(df_cohort)

44055

In [5]:
# Count cohort rows per value of the `sex` column; kept as a plain dict
# because the next cell turns these counts into percentages.
sex_tally = df_cohort["sex"].value_counts()
sex_counts = sex_tally.to_dict()
print("Sex:", sex_counts)

Sex: {'F': 26726, 'M': 17329}


In [6]:
# Share of the cohort (in percent, one decimal) for each `sex` value.
n_total = df_cohort.shape[0]
{sex: round(count / n_total * 100, 1) for sex, count in sex_counts.items()}

{'F': 60.7, 'M': 39.3}

In [7]:
# Summarise every label set found under `path_labels`: number of labelled
# patients, positive count and rate, and -- when the file carries a
# `min_max_value` column -- how many rows have a measurement plus the
# quartiles of that value.
labels = sorted(list_dir(path_labels))

for label in labels:
    try:
        df = pd.read_csv(os.path.join(path_labels, label, "labeled_patients.csv"))

        num_patients = df.shape[0]
        num_pos = df["value"].sum()
        perc_pos = round(num_pos/num_patients*100,1)

        if "min_max_value" in df.columns:
            # 25th / 50th / 75th percentiles of the recorded value.
            IQR = df["min_max_value"].quantile([0.25, 0.5, 0.75]).values.round(1)
            num_measurements = df["min_max_value"].notna().sum()
            # NOTE: despite the name this is a fraction in [0, 1], not a percentage.
            perc_measurements = round(num_measurements/num_patients, 3)
        else:
            IQR = "N/A"
            num_measurements = "N/A"
            perc_measurements = "N/A"

        print(f"{label}: {num_patients=}, {num_pos=} ({perc_pos}%), {num_measurements=}, {perc_measurements}, {IQR=}")

    except Exception as exc:
        # Stay best-effort (one unreadable label must not abort the summary),
        # but say which label was skipped and why instead of dropping it
        # silently. Catching `Exception` rather than using a bare `except`
        # lets KeyboardInterrupt / SystemExit propagate.
        print(f"{label}: skipped ({type(exc).__name__}: {exc})")
        
    

anemia_lab_sql: num_patients=43682, num_pos=2954 (6.8%), num_measurements=36381, 0.833, IQR=array([ 84., 104., 121.])
hyperkalemia_lab_sql: num_patients=43891, num_pos=375 (0.9%), num_measurements=29843, 0.68, IQR=array([4.2, 4.5, 5. ])
hypoglycemia_lab_sql: num_patients=43983, num_pos=719 (1.6%), num_measurements=29695, 0.675, IQR=array([4.5, 5.1, 5.8])
hyponatremia_lab_sql: num_patients=43950, num_pos=385 (0.9%), num_measurements=29727, 0.676, IQR=array([133., 136., 139.])
long_los_sql: num_patients=44055, num_pos=12215 (27.7%), num_measurements='N/A', N/A, IQR='N/A'
mortality_sql: num_patients=44055, num_pos=1599 (3.6%), num_measurements='N/A', N/A, IQR='N/A'
readmission_sql: num_patients=44042, num_pos=259 (0.6%), num_measurements='N/A', N/A, IQR='N/A'
thrombocytopenia_lab_sql: num_patients=43912, num_pos=1342 (3.1%), num_measurements=36789, 0.838, IQR=array([135., 189., 245.])


In [23]:
# Directory holding the OMOP parquet tables; defined once so both reads below
# stay in sync if the location changes.
path_omop = "/hpf/projects/lsung/phi/data/mimic_omop_parquet"

# Read only the columns used later; `.compute()` materialises each dask
# collection in memory.
person = dd.read_parquet(os.path.join(path_omop, "person"))[
    ["person_id", "year_of_birth", "race_concept_id"]
].compute()

concept = dd.read_parquet(os.path.join(path_omop, "concept"))[
    ["concept_id", "concept_name"]
].compute()

In [24]:
# Look up each person's `race_concept_id` in the concept table and expose the
# matched concept name as `race`. Left join: persons without a matching
# concept row are kept.
# NOTE: `person` is rebound here, so re-running this cell without re-running
# the load cell merges the concept table a second time.
person_with_concept = person.merge(
    concept,
    left_on="race_concept_id",
    right_on="concept_id",
    how="left",
)
person = person_with_concept.rename(columns={"concept_name": "race"})

In [26]:
# Add birth year and race to the cohort. Inner join on `person_id` (the pandas
# default), so cohort rows with no matching person row would be dropped.
person_demographics = person[["person_id", "year_of_birth", "race"]]
df_cohort_m = df_cohort.merge(
    person_demographics,
    how="inner",
    left_on="person_id",
    right_on="person_id",
)

In [27]:
# Age is approximated as (calendar year of index_datetime) - year_of_birth;
# p1 / p2 / p3 are its 25th, 50th and 75th percentiles.
index_year = pd.to_datetime(df_cohort_m["index_datetime"]).dt.year
age_at_index = index_year - df_cohort_m["year_of_birth"]
p1, p2, p3 = age_at_index.quantile([0.25, 0.5, 0.75])

In [28]:
# Show the three age quartiles on one space-separated line.
print(" ".join(map(str, (p1, p2, p3))))

35.0 56.0 71.0


In [29]:
# Cohort rows per race label, most frequent first (missing values excluded,
# which is the `value_counts` default).
race_column = df_cohort_m["race"]
race_column.value_counts()

race
White                               27402
Black or African American            5338
UNKNOWN                              3658
OTHER                                2630
No matching concept                  2328
Asian                                1799
UNABLE TO OBTAIN                      792
American Indian or Alaska Native      108
Name: count, dtype: int64