In [1]:
import os, sys
import pickle
from collections import defaultdict

import pandas as pd
from tqdm import tqdm
os.chdir('/data1/xuzhang/project/GKI-ICD')
sys.path[0]='../'
!pwd
from src.utils import write_pickle, read_pickle

/data1/xuzhang/project/GKI-ICD


Get Raw Data from MIMIC-IV Dataset

In [3]:
# Load the MIMIC-IV discharge notes table. Later cells read its subject_id,
# hadm_id, note_type, note_seq and text columns.
note_events = pd.read_csv("data/raw_data/mimic4/discharge.csv")

In [4]:
# Map each admission (hadm_id) to its patient (subject_id).
# drop_duplicates(keep="first") keeps the first note row of every admission,
# i.e. the same "first occurrence wins" rule as a row-by-row scan, but without
# materialising a Series object for each of the note rows.
first_note_per_hadm = note_events.drop_duplicates(subset="hadm_id", keep="first")
hadm2subject = dict(
    zip(first_note_per_hadm["hadm_id"], first_note_per_hadm["subject_id"])
)

Get Official Splits

HADM_ID

In [5]:
# Collect the admission ids of the ICD-9 train/dev/test splits.
# The id files have no header row; the ids are in the first column.
hadm_ids = []
for split in ("train", "dev", "test"):
    ids_path = f"data/splits/mimic4_icd9/{split}_full_hadm_ids.csv"
    split_frame = pd.read_csv(ids_path, header=None)
    hadm_ids += list(split_frame[0])
    print(split, len(split_frame))

train 188533
dev 7110
test 13709


In [6]:
# Number of admissions in the splits vs. number of rows in note_events.
len(hadm_ids), len(note_events)

(209352, 331793)

Report for each hadm_id

In [7]:
# Gather the discharge-summary ("DS") texts of every admission in the splits,
# in the order the notes appear in note_events.
# Membership is tested against the dict keys (hash lookup) instead of the
# hadm_ids list, which would be scanned linearly for every note row.
idx2report = {hadm_id: [] for hadm_id in hadm_ids}
note_columns = zip(
    note_events["hadm_id"], note_events["note_type"], note_events["text"]
)
for hadm_id, note_type, text in tqdm(note_columns, total=len(note_events)):
    if note_type == "DS" and hadm_id in idx2report:
        idx2report[hadm_id].append(text)

331793it [05:04, 1090.79it/s]


ICD Code

In [8]:
def reformat(code, is_diag):
    """
    Insert the decimal point into an ICD-9 code that is stored without one.

    Any period already present in ``code`` is removed first, so the function
    is safe to call on codes that are already dotted.

    The period goes:
      * after the first three characters of a diagnosis code,
      * after the first four characters of a diagnosis code starting with "E",
      * after the first two characters of a procedure code.

    A code that has nothing after that position is returned without a period
    (e.g. "401" stays "401", and a two-character procedure code stays as is
    instead of getting a dangling trailing ".").

    Args:
        code (str): ICD-9 code, with or without a period.
        is_diag (bool): True for a diagnosis code, False for a procedure code.

    Returns:
        str: the code with the period in its conventional place.
    """
    code = code.replace(".", "")
    if is_diag:
        prefix_len = 4 if code.startswith("E") else 3
    else:
        prefix_len = 2
    # Only add the period when there is something to put after it.
    if len(code) > prefix_len:
        code = code[:prefix_len] + "." + code[prefix_len:]
    return code

In [10]:
# Per-admission billed procedure and diagnosis codes.
# icd_code is forced to text so the codes are not parsed as numbers.
icd_code_as_text = {"icd_code": str}
dfproc = pd.read_csv(
    "data/raw_data/mimic4/procedures_icd.csv", dtype=icd_code_as_text
)
dfdiag = pd.read_csv(
    "data/raw_data/mimic4/diagnoses_icd.csv", dtype=icd_code_as_text
)

ICD-9

In [11]:
# Keep only the ICD-9 coded rows.
# .copy() makes each subset an independent frame, so adding a column to it
# later neither raises pandas' SettingWithCopyWarning nor depends on whether
# the subset is a view of dfproc / dfdiag.
dfproc9 = dfproc[dfproc["icd_version"] == 9].copy()
dfdiag9 = dfdiag[dfdiag["icd_version"] == 9].copy()

In [17]:
# Add the dotted form of every ICD-9 code as the "absolute_code" column.
# assign() returns a new frame, so nothing is written into a slice of
# dfdiag / dfproc (no SettingWithCopyWarning), and mapping over the single
# icd_code column avoids building a row object per record.
dfdiag9 = dfdiag9.assign(
    absolute_code=dfdiag9["icd_code"].map(lambda code: reformat(str(code), True))
)
dfproc9 = dfproc9.assign(
    absolute_code=dfproc9["icd_code"].map(lambda code: reformat(str(code), False))
)

In [18]:
# Stack ICD-9 diagnoses and procedures into one table (diagnosis rows first).
dfcodes9 = pd.concat([dfdiag9, dfproc9])

In [19]:
# Group the dotted ICD-9 codes by admission. Only admissions listed in
# hadm_ids get an entry; code rows of any other admission are ignored.
hadm2icd = {hadm_id: [] for hadm_id in hadm_ids}
code_rows = zip(dfcodes9["hadm_id"], dfcodes9["absolute_code"])
for hadm_id, icd_code in tqdm(code_rows):
    codes_for_hadm = hadm2icd.get(hadm_id)
    if codes_for_hadm is not None:
        codes_for_hadm.append(icd_code)

3377950it [01:38, 34193.92it/s]


MIMIC-IV ICD9 Full

In [20]:
# Write one pickle per split for the full ICD-9 label space.
# Each sample is a dict with the keys subject_id, hadm_id, text and label;
# text is the admission's discharge summaries joined by newlines.
# (The DataFrame that used to be created per split was never used and is gone.)
for split in ["train", "dev", "test"]:
    split_df = pd.read_csv(
        f"data/splits/mimic4_icd9/{split}_full_hadm_ids.csv", header=None
    )
    samples = [
        {
            "subject_id": hadm2subject[hadm_id],
            "hadm_id": hadm_id,
            "text": "\n".join(idx2report[hadm_id]),
            "label": hadm2icd[hadm_id],
        }
        for hadm_id in split_df[0].tolist()
    ]
    write_pickle(samples, f"data/mimic4_icd9/{split}.pkl")

MIMIC-IV ICD9_50

In [21]:
# Load the top-50 ICD-9 code list; the codes are in the first column.
# dtype=str keeps every code as text. Without it, a list made only of
# numeric-looking codes (e.g. "250.00") would be parsed as floats and would
# never compare equal to the string labels it is matched against.
top50_codes = pd.read_csv(
    "data/splits/mimic4_icd9/top50_icd9_code_list.txt", header=None, dtype=str
)

In [22]:
# Write one pickle per split for the top-50 ICD-9 label space.
# The code list is turned into a set once, instead of rebuilding a Python
# list from the DataFrame column for every single label that is tested.
top50_code_set = set(top50_codes[0].tolist())
for split in ["train", "dev", "test"]:
    split_df = pd.read_csv(
        f"data/splits/mimic4_icd9/{split}_50_hadm_ids.csv", header=None
    )
    samples = []
    for hadm_id in split_df[0].tolist():
        samples.append(
            {
                "subject_id": hadm2subject[hadm_id],
                "hadm_id": hadm_id,
                "text": "\n".join(idx2report[hadm_id]),
                # Keep only the labels that are in the top-50 list,
                # preserving their original order.
                "label": [
                    code for code in hadm2icd[hadm_id] if code in top50_code_set
                ],
            }
        )
    write_pickle(samples, f"data/mimic4_icd9_50/{split}.pkl")

ICD-10

In [23]:
# ICD-10 coded rows, diagnoses stacked before procedures.
# The codes are kept exactly as stored; no period is inserted.
is_icd10_proc = dfproc["icd_version"] == 10
is_icd10_diag = dfdiag["icd_version"] == 10
dfproc10 = dfproc.loc[is_icd10_proc]
dfdiag10 = dfdiag.loc[is_icd10_diag]
dfcodes10 = pd.concat([dfdiag10, dfproc10])

In [32]:
# Inspect the combined ICD-10 table.
dfcodes10

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,chartdate
40,10000084,23052089,1,G3183,10,
41,10000084,23052089,2,F0280,10,
42,10000084,23052089,3,R441,10,
43,10000084,23052089,4,R296,10,
44,10000084,23052089,5,E785,10,
...,...,...,...,...,...,...
859641,19999828,29734428,1,0HR7X74,10,2147-07-27
859642,19999828,29734428,2,0HBJXZZ,10,2147-07-27
859643,19999828,29734428,3,0HBHXZZ,10,2147-07-27
859644,19999828,29734428,4,02HV33Z,10,2147-07-18


In [34]:
# Group the ICD-10 codes by admission, for every admission in dfcodes10.
# The mapping is deliberately not seeded from hadm_ids: when the notebook is
# run top to bottom, hadm_ids still holds the ICD-9 split at this point (the
# ICD-10 ids are loaded further down), so seeding from it only worked when the
# cells were executed out of order.
# defaultdict(list) still gives an empty label list for an admission that has
# no ICD-10 code rows.
hadm2icd10 = defaultdict(list)
icd10_rows = zip(dfcodes10["hadm_id"], dfcodes10["icd_code"])
for hadm_id, icd_code in tqdm(icd10_rows, total=len(dfcodes10)):
    hadm2icd10[hadm_id].append(icd_code)

3846225it [01:53, 33843.85it/s]


MIMIC-IV ICD-10 Full

In [25]:
# Replace hadm_ids with the admission ids of the ICD-10 train/dev/test splits.
# The id files have no header row; the ids are in the first column.
hadm_ids = []
for split in ("train", "dev", "test"):
    ids_path = f"data/splits/mimic4_icd10/{split}_full_hadm_ids.csv"
    split_df = pd.read_csv(ids_path, header=None)
    hadm_ids += list(split_df[0])
    print(split, len(split_df))

train 110442
dev 4017
test 7851


In [26]:
# Rebuild the report and patient mappings for the ICD-10 split admissions.
# Membership is tested against the dict keys (hash lookup) instead of the
# hadm_ids list, which would be scanned linearly for every note row.
# hadm2subject only receives admissions that have at least one "DS" note;
# a split admission without such a note has no entry in it.
idx2report = {hadm_id: [] for hadm_id in hadm_ids}
hadm2subject = {}
note_columns = zip(
    note_events["subject_id"],
    note_events["hadm_id"],
    note_events["note_type"],
    note_events["text"],
)
for subject_id, hadm_id, note_type, text in tqdm(note_columns, total=len(note_events)):
    if note_type == "DS" and hadm_id in idx2report:
        idx2report[hadm_id].append(text)
        # First matching note wins, as before.
        hadm2subject.setdefault(hadm_id, subject_id)

331793it [03:39, 1514.11it/s]


In [36]:

# Write one pickle per split for the full ICD-10 label space.
# Each sample is a dict with the keys subject_id, hadm_id, text and label.
for split in ["train", "dev", "test"]:
    split_df = pd.read_csv(
        f"data/splits/mimic4_icd10/{split}_full_hadm_ids.csv", header=None
    )
    samples = []
    for hadm_id in split_df[0].tolist():
        # NOTE(review): admission 23024122 is excluded by hand. Presumably it
        # has no "DS" note and therefore no hadm2subject entry - TODO confirm.
        if hadm_id != 23024122:
            samples.append(
                {
                    "subject_id": hadm2subject[hadm_id],
                    "hadm_id": hadm_id,
                    "text": "\n".join(idx2report[hadm_id]),
                    "label": hadm2icd10[hadm_id],
                }
            )
    write_pickle(samples, f"data/mimic4_icd10/{split}.pkl")

MIMIC-IV ICD10 Top50

In [37]:
# Load the top-50 ICD-10 code list; the codes are in the first column.
# dtype=str guarantees the codes stay text regardless of what the file
# contains, because they are compared against string labels afterwards.
top50_codes = pd.read_csv(
    "data/splits/mimic4_icd10/top50_icd10_code_list.txt", header=None, dtype=str
)

In [38]:
# Write one pickle and one CSV per split for the top-50 ICD-10 label space.
# The code list is turned into a set once, instead of rebuilding a Python
# list from the DataFrame column for every single label that is tested.
top50_code_set = set(top50_codes[0].tolist())
sample_columns = ["subject_id", "hadm_id", "text", "label"]
for split in ["train", "dev", "test"]:
    split_df = pd.read_csv(
        f"data/splits/mimic4_icd10/{split}_50_hadm_ids.csv", header=None
    )
    samples = []
    for hadm_id in split_df[0].tolist():
        # NOTE(review): admission 23024122 is excluded by hand. Presumably it
        # has no "DS" note and therefore no hadm2subject entry - TODO confirm.
        if hadm_id == 23024122:
            continue
        samples.append(
            {
                "subject_id": hadm2subject[hadm_id],
                "hadm_id": hadm_id,
                "text": "\n".join(idx2report[hadm_id]),
                # Keep only the labels that are in the top-50 list,
                # preserving their original order.
                "label": [
                    code for code in hadm2icd10[hadm_id] if code in top50_code_set
                ],
            }
        )
    # The CSV used to be written from a DataFrame that was never filled, so it
    # contained nothing but the header row. Write the actual samples instead.
    pd.DataFrame(samples, columns=sample_columns).to_csv(
        f"data/mimic4_icd10_50/{split}.csv", index=False
    )
    write_pickle(samples, f"data/mimic4_icd10_50/{split}.pkl")

Prepare Descriptions for ICD-9 Codes & ICD-10 Codes in MIMIC-IV

ICD-10

In [39]:
# Collect every ICD-10 code used as a label in any split, sorted.
icd10_codes = set()
for split in ("train", "dev", "test"):
    with open(f"data/mimic4_icd10/{split}.pkl", "rb") as pkl_file:
        split_samples = pickle.load(pkl_file)
    icd10_codes.update(
        code for sample in split_samples for code in sample["label"]
    )
icd10_codes = sorted(icd10_codes)
len(icd10_codes)

26096

In [40]:
# ICD diagnosis dictionary (code -> title), restricted to ICD-10.
# icd_code is read as text, like the per-admission code tables, so the codes
# cannot be turned into numbers by type inference.
icd_diag = pd.read_csv("data/icd/d_icd_diagnoses.csv", dtype={"icd_code": str})
icd10_cm = icd_diag[icd_diag["icd_version"] == 10]

In [41]:
# ICD procedure dictionary (code -> title), restricted to ICD-10.
# icd_code is read as text, like the per-admission code tables, so the codes
# cannot be turned into numbers by type inference.
icd_proc = pd.read_csv("data/icd/d_icd_procedures.csv", dtype={"icd_code": str})
icd10_pcs = icd_proc[icd_proc["icd_version"] == 10]

In [42]:
# One lookup from ICD-10 code to its long title, covering diagnoses and
# procedures. If a code occurs more than once, the last title wins.
icd10 = pd.concat([icd10_cm, icd10_pcs])
icd10_desc = icd10.set_index("icd_code")["long_title"].to_dict()
len(icd10_desc)

179918

In [43]:
# Titles of the ICD-10 codes that occur in the dataset, saved tab-separated.
# A code without an entry in icd10_desc raises KeyError here.
icd10_desc_rows = [
    {"icd_code": code, "long_title": icd10_desc[code]} for code in icd10_codes
]
mimic4_icd10_desc = pd.DataFrame(
    icd10_desc_rows, columns=["icd_code", "long_title"]
)
mimic4_icd10_desc.to_csv("data/mimic4_icd10/codes.csv", sep='\t', index=False)
mimic4_icd10_desc

Unnamed: 0,icd_code,long_title
0,00160J2,Bypass Cerebral Ventricle to Atrium with Synth...
1,00160J6,Bypass Cerebral Ventricle to Peritoneal Cavity...
2,00163J4,Bypass Cerebral Ventricle to Pleural Cavity wi...
3,00163J6,Bypass Cerebral Ventricle to Peritoneal Cavity...
4,00164J6,Bypass Cerebral Ventricle to Peritoneal Cavity...
...,...,...
26091,Z9911,Dependence on respirator [ventilator] status
26092,Z992,Dependence on renal dialysis
26093,Z993,Dependence on wheelchair
26094,Z9981,Dependence on supplemental oxygen


ICD-9

In [44]:
# Collect every ICD-9 code used as a label in any split, sorted.
icd9_codes = set()
for split in ("train", "dev", "test"):
    with open(f"data/mimic4_icd9/{split}.pkl", "rb") as pkl_file:
        split_samples = pickle.load(pkl_file)
    for sample in split_samples:
        icd9_codes |= set(sample["label"])
icd9_codes = sorted(icd9_codes)
len(icd9_codes)

11331

In [45]:
# ICD-9 diagnosis dictionary with the codes converted to their dotted form,
# so they match the labels stored in the dataset.
# icd_code is read as text so type inference cannot turn codes into numbers
# (which would drop leading zeros), and assign() builds a new frame instead of
# writing into a slice of icd_diag (no SettingWithCopyWarning).
icd_diag = pd.read_csv("data/icd/d_icd_diagnoses.csv", dtype={"icd_code": str})
icd9_diag = icd_diag[icd_diag["icd_version"] == 9]
icd9_diag = icd9_diag.assign(
    icd_code=icd9_diag["icd_code"].map(lambda code: reformat(str(code), True))
)

In [46]:
# ICD-9 procedure dictionary with dotted codes, then one lookup from ICD-9
# code to long title covering diagnoses and procedures.
# icd_code is read as text so type inference cannot turn codes into numbers
# (which would drop leading zeros), and assign() builds a new frame instead of
# writing into a slice of icd_proc (no SettingWithCopyWarning).
icd_proc = pd.read_csv("data/icd/d_icd_procedures.csv", dtype={"icd_code": str})
icd9_proc = icd_proc[icd_proc["icd_version"] == 9]
icd9_proc = icd9_proc.assign(
    icd_code=icd9_proc["icd_code"].map(lambda code: reformat(str(code), False))
)
icd9 = pd.concat([icd9_diag, icd9_proc])
# If a dotted code occurs more than once, the last title wins.
icd9_desc = dict(zip(icd9["icd_code"], icd9["long_title"]))
len(icd9_desc)

18554

In [47]:
# Titles of the ICD-9 codes that occur in the dataset, saved tab-separated.
# A code without an entry in icd9_desc raises KeyError here.
icd9_desc_rows = [
    {"icd_code": code, "long_title": icd9_desc[code]} for code in icd9_codes
]
mimic4_icd9_desc = pd.DataFrame(
    icd9_desc_rows, columns=["icd_code", "long_title"]
)
mimic4_icd9_desc.to_csv("data/mimic4_icd9/codes.csv", sep='\t', index=False)
mimic4_icd9_desc

Unnamed: 0,icd_code,long_title
0,00.09,Other therapeutic ultrasound
1,00.10,Implantation of chemotherapeutic agent
2,00.11,Infusion of drotrecogin alfa (activated)
3,00.12,Administration of inhaled nitric oxide
4,00.13,Injection or infusion of nesiritide
...,...,...
11326,V90.9,"Retained foreign body, unspecified material"
11327,V91.01,"Twin gestation, monochorionic/monoamniotic (on..."
11328,V91.02,"Twin gestation, monochorionic/diamniotic (one ..."
11329,V91.03,"Twin gestation, dichorionic/diamniotic (two pl..."


ICD-10-50

In [48]:
# Collect the ICD-10 codes that actually occur as labels in the top-50 splits.
icd10_top50_codes = set()
for split in ("train", "dev", "test"):
    with open(f"data/mimic4_icd10_50/{split}.pkl", "rb") as pkl_file:
        split_samples = pickle.load(pkl_file)
    icd10_top50_codes.update(
        code for sample in split_samples for code in sample["label"]
    )
icd10_top50_codes = sorted(icd10_top50_codes)
len(icd10_top50_codes)

50

In [49]:
# Titles of the top-50 ICD-10 codes, saved tab-separated.
icd10_top50_rows = [
    {"icd_code": code, "long_title": icd10_desc[code]}
    for code in icd10_top50_codes
]
mimic4_icd10_top50_desc = pd.DataFrame(
    icd10_top50_rows, columns=["icd_code", "long_title"]
)
mimic4_icd10_top50_desc.to_csv(
    "data/mimic4_icd10_50/codes.csv", sep='\t', index=False
)
mimic4_icd10_top50_desc

Unnamed: 0,icd_code,long_title
0,02HV33Z,Insertion of Infusion Device into Superior Ven...
1,D62,Acute posthemorrhagic anemia
2,D649,"Anemia, unspecified"
3,D696,"Thrombocytopenia, unspecified"
4,E039,"Hypothyroidism, unspecified"
5,E1122,Type 2 diabetes mellitus with diabetic chronic...
6,E119,Type 2 diabetes mellitus without complications
7,E669,"Obesity, unspecified"
8,E785,"Hyperlipidemia, unspecified"
9,E871,Hypo-osmolality and hyponatremia


ICD-9-50

In [50]:
# Collect the ICD-9 codes that actually occur as labels in the top-50 splits.
icd9_top50_codes = set()
for split in ("train", "dev", "test"):
    with open(f"data/mimic4_icd9_50/{split}.pkl", "rb") as pkl_file:
        split_samples = pickle.load(pkl_file)
    for sample in split_samples:
        icd9_top50_codes |= set(sample["label"])
icd9_top50_codes = sorted(icd9_top50_codes)
len(icd9_top50_codes)

50

In [51]:
# Titles of the top-50 ICD-9 codes, saved tab-separated.
icd9_top50_rows = [
    {"icd_code": code, "long_title": icd9_desc[code]}
    for code in icd9_top50_codes
]
mimic4_icd9_top50_desc = pd.DataFrame(
    icd9_top50_rows, columns=["icd_code", "long_title"]
)
mimic4_icd9_top50_desc.to_csv(
    "data/mimic4_icd9_50/codes.csv", sep='\t', index=False
)
mimic4_icd9_top50_desc

Unnamed: 0,icd_code,long_title
0,244.9,Unspecified acquired hypothyroidism
1,250.00,Diabetes mellitus without mention of complicat...
2,272.0,Pure hypercholesterolemia
3,272.4,Other and unspecified hyperlipidemia
4,274.9,"Gout, unspecified"
5,276.1,Hyposmolality and/or hyponatremia
6,276.2,Acidosis
7,276.51,Dehydration
8,278.00,"Obesity, unspecified"
9,285.1,Acute posthemorrhagic anemia
