In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the microbiology events table and the D_ITEMS dictionary;
# D_ITEMS (ITEMID -> LABEL) is used further down to resolve item ids to readable names.
mibi_df = pd.read_csv("../V1.1.0/MICROBIOLOGYEVENTS.csv")
d_items_df = pd.read_csv("../V1.1.0/D_ITEMS.csv")

In [3]:
# 11681 out of 13449 HADMs have at least one mibi event (defined as at least one org id)
# Columns that are not needed here; ORG_NAME / AB_NAME are re-derived from D_ITEMS in a later cell.
unused_columns = [
    "ROW_ID",
    "ORG_NAME",
    "AB_NAME",
    "DILUTION_TEXT",
    "DILUTION_COMPARISON",
    "DILUTION_VALUE",
    "SPEC_TYPE_DESC",
    "SUBJECT_ID",
]
mibi_df.drop(columns=unused_columns, inplace=True)

In [4]:
# Lookup table ITEMID -> LABEL built from the item dictionary.
item_id_to_item_name = dict(zip(d_items_df["ITEMID"], d_items_df["LABEL"]))

# Resolve organism and antibiotic item ids to their labels (ids without a label become NaN).
for id_col, name_col in (("ORG_ITEMID", "ORG_NAME"), ("AB_ITEMID", "AB_NAME")):
    mibi_df[name_col] = mibi_df[id_col].map(item_id_to_item_name)

In [5]:
# Some HADMs (although listed) do not provide any info on organisms --> delete those rows.
# .copy() detaches the filtered frame from the original one, so that the column
# assignments made in later cells operate on an independent DataFrame and do not
# trigger pandas' SettingWithCopyWarning.
mibi_df = mibi_df[mibi_df["ORG_ITEMID"].notnull()].copy()

In [6]:
# Keep only the part of CHARTTIME before the first space (the date), dropping the time of day.
charttime_parts = mibi_df['CHARTTIME'].str.split(' ')
mibi_df['CHARTTIME'] = charttime_parts.str[0]

Get an overview of the specimens and the different pathogens found.

In [9]:
# Number of event rows (non-null HADM_ID) per organism label, most frequent first.
count_grp = (
    mibi_df
    .groupby("ORG_NAME")["HADM_ID"]
    .count()
    .sort_values(ascending=False)
)

In [10]:
# Fragments that mark a label as a negative / normal finding rather than an organism
# (checked beforehand so that no important organism is removed).
neg_probes = ['No ','no ','Normal', 'without']

# One alternation regex out of all fragments.
pattern = '|'.join(neg_probes)

# Case-insensitive match against the organism labels; keep the non-matching ones.
is_negative_label = count_grp.index.str.contains(pattern, case=False)
neg_removed = count_grp.loc[~is_negative_label]

In [11]:
# Include the 40 most frequent organisms (the others were found in less than 200 hadms).
top_org_names = neg_removed.index[:40].tolist()

In [12]:
# Show the 20 most frequent labels (negative findings still included here).
count_grp.head(20)

ORG_NAME
Klebsiella pneumoniae                                       22366
Acinetobacter baumannii complex                             16386
Cultured for 5 days without bacteria, fungus growth         15282
Escherichia coli                                            12005
No fungus cultured                                          11421
Pseudomonas aeruginosa                                       9876
No pathogenic haemophilus cultured?                          8433
Normal flora growth (H. haemophilus, fungi not detected)     7895
Staphylococcus epidermidis                                   7817
Staphylococcus aureus                                        6237
No bacterial growth                                          5961
Streptococcus pneumoniae                                     3997
No bacteria, fungus growth                                   3664
Enterococcus faecium group D                                 3446
Stenotrophomonas maltophilia (Xanthomonas)                   3204
S

In [13]:
# Restrict the events to the organisms selected in top_org_names.
is_top_organism = mibi_df["ORG_NAME"].isin(top_org_names)
mibi_df = mibi_df[is_top_organism]

In [14]:
# Note: AB_NAME denotes the antibiotic an organism was tested against
# (is organism XY S/R/I for that antibiotic?), not the treatment itself.
# The bare expression below displays the (truncated) frame.
mibi_df

Unnamed: 0,HADM_ID,CHARTTIME,SPEC_ITEMID,ORG_ITEMID,AB_ITEMID,INTERPRETATION,ORG_NAME,AB_NAME
0,104977,2066-10-09,LIS031294+LIS0204+LIS0320,MIC310,MIC473,R,Staphylococcus hominis,Moxifloxacin
1,104977,2066-10-09,LIS031294+LIS0204+LIS0320,MIC310,MIC470,R,Staphylococcus hominis,CLINDAMYCIN
2,104977,2066-10-09,LIS031294+LIS0204+LIS0320,MIC310,MIC1015,S,Staphylococcus hominis,Quinupristin/dalofopine
3,104977,2066-10-09,LIS031294+LIS0204+LIS0320,MIC310,MIC1215,S,Staphylococcus hominis,Tigecycline
4,104977,2066-10-09,LIS031294+LIS0204+LIS0320,MIC1703,,,Gram positive cocci (G+C),
...,...,...,...,...,...,...,...,...
183859,102751,2075-11-17,LIS031294+LIS0204+LIS035456+LIS0202,MIC108,MIC483,I,Escherichia coli,Amoxicillin/clavulanic acid
183860,102751,2075-11-17,LIS031294+LIS0204+LIS035456+LIS0202,MIC108,MIC480,R,Escherichia coli,CIPROFLOXACIN
183861,102751,2075-11-17,LIS031294+LIS0204+LIS035456+LIS0202,MIC108,MIC466,R,Escherichia coli,AMPICILLIN
183862,102751,2075-11-17,LIS031294+LIS0204+LIS035456+LIS0202,MIC108,MIC1215,S,Escherichia coli,Tigecycline


Add a flag column indicating whether a HADM tested positive for a specific organism.

In [8]:
# For every (HADM_ID, ORG_NAME) pair, collect the distinct chart dates.
unique_species = (
    mibi_df
    .groupby(["HADM_ID", "ORG_NAME"])["CHARTTIME"]
    .agg("unique")
)

In [9]:
# Display the distinct chart dates per (HADM_ID, ORG_NAME) pair.
unique_species

HADM_ID  ORG_NAME                                                
100559   Burkholderia cepacia (onion pseudomonas)                                [2065-07-27, 2065-08-03]
         Culture without pathogenic bacteria, fungus growth                                  [2065-08-25]
         No bacteria, fungus growth                                                          [2065-08-21]
         Normal flora growth (H. haemophilus, fungi not detected)                            [2065-08-12]
         Pseudomonas aeruginosa                                      [2065-08-24, 2065-08-26, 2065-08-30]
                                                                                     ...                 
114199   Normal flora growth (H. haemophilus, fungi not detected)                            [2100-02-22]
114200   No bacterial growth                                                                 [2090-02-21]
         No fungus cultured                                                           

In [10]:
# One row per (ORG_NAME, HADM_ID): the first entry of the distinct CHARTTIME values
# (order of appearance in mibi_df, not necessarily the earliest date).
first_charttime = (
    mibi_df
    .groupby(["ORG_NAME", "HADM_ID"])["CHARTTIME"]
    .agg("unique")
    .str[0]
)
# Single-column frame, indexed by (ORG_NAME, HADM_ID).
grp_ = first_charttime.to_frame()

In [11]:
# Display the (ORG_NAME, HADM_ID) -> CHARTTIME table.
grp_

Unnamed: 0_level_0,Unnamed: 1_level_0,CHARTTIME
ORG_NAME,HADM_ID,Unnamed: 2_level_1
Enterococcus hirae,107304,2098-08-10
"Vibrio cholerae group O1, O139 not detected",101369,2094-06-11
"Vibrio cholerae group O1, O139 not detected",101401,2112-08-26
"Vibrio cholerae group O1, O139 not detected",101543,2081-07-05
"Vibrio cholerae group O1, O139 not detected",101947,2084-09-19
...,...,...
uncultured vibrio parahaemolyticus,108731,2116-10-19
uncultured vibrio parahaemolyticus,109447,2084-09-21
undefined bacteria,109721,2089-04-07
undefined bacteria,113035,2111-07-09


In [12]:
# Pivot to one row per HADM_ID and one column per organism;
# cells hold the chart date, or 0 where the pair does not occur.
result_df = (
    grp_
    .unstack()      # innermost index level (HADM_ID) moves to the columns
    .transpose()    # -> rows: (CHARTTIME, HADM_ID), columns: ORG_NAME
    .fillna(0)
)

In [13]:
# Turn the row index levels into ordinary columns.
result_df = result_df.reset_index(drop=False)

In [14]:
# Remove the helper column "level_0" created by reset_index.
result_df = result_df.drop(columns="level_0")

In [37]:
# Case-insensitive regex selecting HADM_ID plus the organism columns of interest.
pathogen_column_regex = '(?i)baum|aeruginos|agalacti|aureu|HADM|entero'
pneumonia_pathogens = result_df.filter(regex=pathogen_column_regex, axis=1)

In [28]:
# NOTE(review): ehrapy is imported mid-notebook rather than with the imports at the top.
import ehrapy as ep
# Load the h5ad file; `adata.obs["HADM_ID"]` is read in the following cell.
adata = ep.io.read_h5ad("./adata_pneumonia_unspecified_rest_annotated.h5ad")

In [38]:
# Keep only the admissions that are also present in adata.obs.
cohort_hadm_ids = adata.obs["HADM_ID"]
in_cohort = pneumonia_pathogens['HADM_ID'].isin(cohort_hadm_ids)
pneumonia_pathogens = pneumonia_pathogens[in_cohort]

In [39]:
# Count the non-zero entries per column (the HADM_ID column is counted as well).
as_flags = pneumonia_pathogens.astype(bool)
non_zero_counts = as_flags.sum()
print(non_zero_counts)

ORG_NAME
HADM_ID                                          239
 Enterococcus hirae                                0
ACINETOBACTER BAUMANNII                            0
Acinetobacter baumannii complex                   15
Acinetobacter baumannii-calcoaceticus complex      0
Enterobacter aerogenes                             0
Enterobacter asburiae                              0
Enterobacter cloacae complex                       1
Enterobacter kobei                                 0
Enterococcus                                       0
Enterococcus avium Group D                         0
Enterococcus durans (Group D)                      0
Enterococcus faecalis group D                      1
Enterococcus faecium group D                       6
Enterococcus gallinarum                            0
Enterococcus hirae (Group D)                       0
Enterococcus raffinosus                            0
Enterococcus: dominant growth                      8
Pseudomonas aeruginosa               

In [17]:
# Load the table that receives the per-organism "<organism>_positive" columns below.
icu_stay_df = pd.read_csv("./temp_pp_data_files/icustay_chart_events.csv")

In [18]:
# All organism columns of result_df, i.e. every column except the HADM_ID key.
# An ordered list is used instead of a set: string sets iterate in a hash-randomised
# order, which would make the order of the "<organism>_positive" columns added in the
# next cell differ from one kernel session to the next.
org_col_only = [col for col in result_df.columns if col != "HADM_ID"]

In [19]:
# For every organism, look up the value stored in result_df for each stay's HADM_ID
# and attach it as "<organism>_positive" (NaN where the HADM_ID is not in result_df).
result_hadm_ids = result_df["HADM_ID"]
for org in org_col_only:
    value_by_hadm = dict(zip(result_hadm_ids, result_df[org]))
    icu_stay_df[f"{org}_positive"] = icu_stay_df['HADM_ID'].map(value_by_hadm)

In [21]:
# HADMs that do not exist in the microbiology events were not tested positive -> fill with 0.
# Note: org_col_only is rebound to the suffixed column names here.
org_col_only = [f"{org_col}_positive" for org_col in org_col_only]
icu_stay_df[org_col_only] = icu_stay_df[org_col_only].fillna(0)

# Positive sputum cultures

In [1]:
import pandas as pd
import numpy as np
# Start again from the raw tables for the sputum analysis.
mibi_df = pd.read_csv("../V1.1.0/MICROBIOLOGYEVENTS.csv")
d_items_df = pd.read_csv("../V1.1.0/D_ITEMS.csv")

# 11681 out of 13449 HADMs have at least one mibi event (defined as at least one org id)
# Drop the columns that are unused for now.
unused_columns = [
    "ROW_ID",
    "ORG_NAME",
    "AB_NAME",
    "DILUTION_TEXT",
    "DILUTION_COMPARISON",
    "DILUTION_VALUE",
    "SPEC_TYPE_DESC",
    "SUBJECT_ID",
]
mibi_df.drop(columns=unused_columns, inplace=True)

# Some HADMs (although listed) carry no organism information --> delete those rows.
has_organism = mibi_df["ORG_ITEMID"].notna()
mibi_df = mibi_df[has_organism]

In [2]:
# Sputum culture: keep the rows whose SPEC_ITEMID contains one of the two codes below.
# na=False treats a missing SPEC_ITEMID as "no match"; without it the mask contains NaN
# and boolean indexing raises "Cannot mask with non-boolean array containing NA / NaN values".
# .copy() detaches the result so the in-place drops / column assignments in the following
# cells act on an independent frame (no SettingWithCopyWarning).
# NOTE(review): this is a substring match, so a longer code that merely contains
# "LIS0166" would match as well — confirm no such specimen code exists.
is_sputum = mibi_df['SPEC_ITEMID'].str.contains("LIS0166|LIS039112", na=False)
mibi_df = mibi_df[is_sputum].copy()

In [3]:
# Antibiotic test columns are not needed for the sputum flags.
mibi_df.drop(columns=["AB_ITEMID", "INTERPRETATION"], inplace=True)

In [4]:
# Lookup table ITEMID -> LABEL, used to resolve the organism id to its label.
item_id_to_item_name = dict(zip(d_items_df["ITEMID"], d_items_df["LABEL"]))
org_item_ids = mibi_df['ORG_ITEMID']
mibi_df["ORG_NAME"] = org_item_ids.map(item_id_to_item_name)

In [5]:
# Fragments that mark a label as a negative / normal finding rather than an organism
# (checked beforehand so that no important organism is removed).
neg_probes = ['No ','no ','Normal', 'without']

# Single alternation regex built from the fragments; it is now actually used below
# instead of repeating the same expression as a second string literal.
pattern = '|'.join(neg_probes)

# Case-sensitive match, as before. na=True marks rows whose ORG_ITEMID has no label in
# D_ITEMS (ORG_NAME is NaN) for removal: previously such a row made `~` fail on the
# NaN-containing object mask, and an unnamed organism cannot be classified as bacterium
# or fungus further down anyway.
is_negative = mibi_df['ORG_NAME'].str.contains(pattern, na=True)
# .copy() so the in-place drops / column assignments in later cells act on an independent frame.
mibi_df = mibi_df[~is_negative].copy()

In [6]:
# The item ids are no longer needed once names are resolved and the specimen filter is applied.
mibi_df.drop(columns=["ORG_ITEMID", "SPEC_ITEMID"], inplace=True)

In [7]:
# Distinct organism labels present in the data. Missing labels are excluded because
# they cannot be classified (and NaN has no .lower()).
pathogens = set(pd.unique(mibi_df["ORG_NAME"].dropna()))

# Fungi that are recognised by their exact label ...
known_fungi = ["Trichosporon","Saccharomyces cerevisiae", "Lodderomyces elongisporus","Pichia norvegensis"]
# ... and fungi recognised by a case-insensitive keyword inside the label.
fungi_matches = ["candida", "aspergillus", "fungus"]

fungi_pathogens = list(known_fungi)
bacterial_pathogens = []
# sorted() only makes the resulting list order reproducible; membership is what matters.
for pathogen in sorted(pathogens):
    if pathogen in known_fungi:
        # Already listed as a fungus. Previously these labels matched none of the
        # keywords and were therefore ALSO appended to bacterial_pathogens, so a stay
        # with e.g. "Saccharomyces cerevisiae" was flagged positive for bacteria too.
        continue
    label = pathogen.lower()
    if any(keyword in label for keyword in fungi_matches):
        fungi_pathogens.append(pathogen)
    else:
        bacterial_pathogens.append(pathogen)


In [8]:
# The chart date is not needed for the per-admission flags.
mibi_df.drop(columns=["CHARTTIME"], inplace=True)

In [9]:
# One row per (HADM_ID, ORG_NAME) combination; the first occurrence is kept.
mibi_df = mibi_df.drop_duplicates(subset=["HADM_ID", "ORG_NAME"])

In [10]:
# Row-level booleans: does this row's organism belong to the bacterial / fungal list?
org_names = mibi_df["ORG_NAME"]
mibi_df["sputum_positive_bacteria"] = org_names.isin(bacterial_pathogens)
mibi_df["sputum_positive_fungi"] = org_names.isin(fungi_pathogens)

In [11]:
# Only HADM_ID and the two flag columns are needed from here on.
mibi_df.drop(columns=["ORG_NAME"], inplace=True)

In [12]:
# Summing booleans counts the True rows (True -> 1, False -> 0); a sum > 0 means this
# pathogen type has been found in sputum for this HADM_ID.
flag_columns = ["sputum_positive_bacteria", "sputum_positive_fungi"]
pathogens_flg_df = mibi_df.groupby("HADM_ID")[flag_columns].sum().reset_index()

# Collapse the counts to 0/1 indicators.
for flag_column in flag_columns:
    pathogens_flg_df[flag_column] = np.where(pathogens_flg_df[flag_column] > 0, 1, 0)

In [21]:
# Reload the ICU-stay table and strip every column whose name contains "_positive".
icu_stay_df = pd.read_csv("./temp_pp_data_files/icu_stay_mibi.csv")
remove_pathogens = [column for column in icu_stay_df.columns if "_positive" in column]
icu_stay_df = icu_stay_df.drop(columns=remove_pathogens)

In [22]:
# Left join keeps every ICU stay; stays without a row in pathogens_flg_df get NaN flags,
# which are then set to 0.
final_df = icu_stay_df.merge(pathogens_flg_df, how='left', on='HADM_ID')
for flag_column in ("sputum_positive_bacteria", "sputum_positive_fungi"):
    final_df[flag_column] = final_df[flag_column].fillna(0)

In [24]:
# Write the merged table to CSV without the DataFrame index.
final_df.to_csv("./temp_pp_data_files/icu_stay_mibi_NEW.csv", index=False)