# MIMIC 4 data - dataset construction admissions

Code taken from the GRU-ODE-Bayes preprocessing; simplified and adapted for MIMIC-IV (originally v1.0; the file paths below point to a v3.0 download).

In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
import numpy as np

In [3]:
# Load the hospital admissions table (gzip-compressed CSV) and preview it.
fn = r'D:\Dataset\mimic-iv-3.0\mimic-iv-3.0\hosp\admissions.csv.gz'
adm = pd.read_csv(filepath_or_buffer=fn, compression='gzip')
adm.head(n=5)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,,URGENT,P49AFC,TRANSFER FROM HOSPITAL,HOME,Medicaid,English,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,,EW EMER.,P784FA,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,,EW EMER.,P19UTS,EMERGENCY ROOM,HOSPICE,Medicaid,English,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,,EW EMER.,P06OTX,EMERGENCY ROOM,HOME,Medicaid,English,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,,EU OBSERVATION,P39NWO,EMERGENCY ROOM,,,English,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0


In [None]:
# Attach each patient's anchor_age to their admissions. The (default) inner
# merge also drops admissions whose subject_id is absent from the patients table.
patients_df=pd.read_csv(r'D:\Dataset\mimic-iv-3.0\mimic-iv-3.0\hosp\patients.csv.gz')
adm_dob=pd.merge(patients_df[["subject_id","anchor_age"]],adm,on="subject_id")

# Keep only patients with exactly one distinct hospital admission (hadm_id).
# The count is taken on the raw admissions table; the filter is applied to
# the merged frame so that anchor_age stays available downstream.
df=adm.groupby("subject_id")["hadm_id"].nunique()
subj_ids=list(df[df==1].index)
adm_1=adm_dob.loc[adm_dob["subject_id"].isin(subj_ids)]
print("Number of patients remaining in the dataframe: ")
print(len(adm_1.index))

Number of patients remaining in the dataframe: 
123289


In [6]:
# Length of the hospital stay: discharge time minus admission time.
# NOTE: admittime/dischtime are columns of the hosp admissions table, so this
# measures the whole hospital admission, not an ICU stay.
adm_1=adm_1.copy()  # detach from adm_dob so the column assignments below act on an independent frame
adm_1['admittime']=pd.to_datetime(adm_1["admittime"], format='%Y-%m-%d %H:%M:%S')
adm_1['dischtime']=pd.to_datetime(adm_1["dischtime"], format='%Y-%m-%d %H:%M:%S')

adm_1["elapsed_time"]=adm_1["dischtime"]-adm_1["admittime"]
# .dt.days is the whole-day component of the timedelta (hours/minutes are discarded).
adm_1["elapsed_days"]=adm_1["elapsed_time"].dt.days

# Keep stays whose whole-day length is between 3 and 29 (inclusive):
# because elapsed_days is truncated, ">2" means "at least 3 full days".
adm_2=adm_1.loc[(adm_1["elapsed_days"]<30) & (adm_1["elapsed_days"]>2)]
print("Number of patients remaining in the dataframe: ")
print(len(adm_2.index))

Number of patients remaining in the dataframe: 
50431


In [7]:
# only patients older than 15
adm_2_15=adm_2.loc[adm_2["anchor_age"]>15].copy()
print("Number of patients remaining in the dataframe: ")
print(len(adm_2_15.index))

Number of patients remaining in the dataframe: 
50431


In [9]:
# Load the ICU item dictionary and keep only the itemid / label columns.
item_id = pd.read_csv(r"D:\Dataset\mimic-iv-3.0\mimic-iv-3.0\icu\d_items.csv.gz")
item_id_1 = item_id.loc[:, ["itemid", "label"]]
item_id_1.head()

Unnamed: 0,itemid,label
0,220001,Problem List
1,220003,ICU Admission date
2,220045,Heart Rate
3,220046,Heart rate Alarm - High
4,220047,Heart Rate Alarm - Low


In [8]:
fn = r'D:\Dataset\mimic-iv-3.0\mimic-iv-3.0\icu/chartevents.csv.gz'
# chartevents is too large to load in one go, so it is streamed in chunks and
# only the distinct hadm_id values are retained.
# usecols restricts parsing to the single column that is needed; parsing every
# column of every chunk is what makes a full pass over this file so slow.
hadm_id_set = set()
for chunk in pd.read_csv(fn, compression='gzip', usecols=['hadm_id'], chunksize=1000000):
    # Drop missing ids before collecting: a NaN can never match an admission.
    hadm_id_set.update(chunk['hadm_id'].dropna().unique().tolist())

# Sorted, de-duplicated integer array of every hadm_id seen in chartevents
# (kept as an ndarray so it can be passed straight to Series.isin).
ids = np.array(sorted(hadm_id_set), dtype=np.int64)

KeyboardInterrupt: 

In [None]:
# Keep only admissions whose hadm_id is contained in `ids`.
has_chart_data = adm_2_15["hadm_id"].isin(ids)
adm_2_15_chart = adm_2_15[has_chart_data].copy()
print("Number of patients remaining in the dataframe: ")
print(adm_2_15_chart.shape[0])

In [None]:
# Write the filtered admissions frame to CSV (default options, so the
# DataFrame index is written as the first column).
adm_2_15_chart.to_csv(path_or_buf="/path/processed/admissions_processed.csv")