In [1]:
# !pip install psycopg2-binary

In [46]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np

In [None]:
# Build the SQLAlchemy engine used by every query in this notebook.
# The connection URL embeds credentials, so it can be supplied through the
# MIMIC_DB_URL environment variable instead of being edited (and committed) here.
# The previous literal is kept as the fallback so existing setups keep working.
import os

sql_engine = create_engine(
    os.environ.get(
        'MIMIC_DB_URL',
        'postgresql://postgres:postgres@10.211.116.164:5432/mimic',
    )
)

# 1. MIMIC-III Readmission, based on [Health system-scale language models are all-purpose prediction engines](https://www.nature.com/articles/s41586-023-06160-y), Nature 2023

This is an open dataset for an intensive care unit (ICU) EHR released by MIT and Boston Beth Israel Medical Center [29]. We collected a set of 52,726 discharge notes and created a 30-day all-cause readmission label by checking whether there was any subsequent encounter within 30 days. The readmission rate was 6%. We split the data into training, validation and test sets in an 8:1:1 ratio.

In [38]:
# Load every column of the mimiciii.admissions table into a DataFrame.
query = (
"""
SELECT *
FROM mimiciii.admissions ;
"""
)

mimic_admissions = pd.read_sql_query(sql=query, con=sql_engine)

In [40]:
# (rows, columns) of the admissions table as loaded from the database.
mimic_admissions.shape

(58976, 19)

In [67]:
# Lookup table hadm_id -> admission_type; if an hadm_id repeats, the last row wins.
types = dict(zip(mimic_admissions.hadm_id, mimic_admissions.admission_type))

In [26]:
# Number of admissions per admission_type value.
mimic_admissions['admission_type'].value_counts()

admission_type
EMERGENCY    42071
NEWBORN       7863
ELECTIVE      7706
URGENT        1336
Name: count, dtype: int64

In [72]:
# mimic_admissions = mimic_admissions[mimic_admissions['admission_type']!='NEWBORN']

In [28]:
# NOTE(review): the NEWBORN filter in the preceding cell is commented out, so on a
# fresh Restart & Run All this presumably shows the unfiltered shape, not the
# recorded one (which matches the table minus the NEWBORN rows) - confirm.
mimic_admissions.shape

(51113, 19)

In [70]:
# Load all rows of mimiciii.noteevents whose category is 'Discharge summary'.
query = (
"""
SELECT *
FROM mimiciii.noteevents where category = 'Discharge summary';
"""
)

notes_df = pd.read_sql_query(sql=query, con=sql_engine)

In [74]:
# Lookup table hadm_id -> discharge summary text. When several notes share an
# hadm_id, only the last one in notes_df is kept.
dischs = dict(zip(notes_df.hadm_id, notes_df.text))

In [75]:
# notes_df.head()

# Use data from https://github.com/YaronBlinder/MIMIC-III_readmission

In [81]:
# Load the full mimiciii.readmissions_data table.
# The query has no ORDER BY, so the row order is whatever the database returns.
query = (
"""
SELECT *
FROM mimiciii.readmissions_data;
"""
)

data = pd.read_sql_query(sql=query, con=sql_engine)

In [82]:
# list(data)

In [83]:
# Calculate the time delta (in hours) between consecutive admissions of the same patient.
#
# Columns added:
#   readmit_dt            - hours from the patient's previous discharge to this admission;
#                           0.0 when the previous row belongs to a different patient
#   next_readmit_dt       - the same gap seen from the earlier stay, i.e. the following
#                           row's readmit_dt; 0.0 when no stay of the same patient follows
#   readmit_last_careunit - last_careunit of the patient's previous row, otherwise None
#
# Comparing each row with its predecessor is only meaningful when every patient's rows
# are contiguous and chronological. The source query has no ORDER BY, so the order is
# enforced here. The sort is stable: already-ordered input and ties keep their order.
# Resetting the index keeps row labels aligned with row positions.
data = (
    data
    .sort_values(['subject_id', 'admittime'], kind='mergesort')
    .reset_index(drop=True)
)

# True where the previous row is the same patient (always False for the first row).
same_patient_as_prev = data['subject_id'].eq(data['subject_id'].shift(1))

# Hours between the previous row's discharge and this row's admission, rounded to
# 2 decimals. A missing timestamp propagates as NaN.
hours_since_prev_disch = (
    (data['admittime'] - data['dischtime'].shift(1))
    .dt.total_seconds()
    .div(3600.0)
    .round(2)
)

data['readmit_dt'] = hours_since_prev_disch.where(same_patient_as_prev, 0.0)
# The gap to the next stay is the next row's readmit_dt, which is already 0.0 when
# that row starts a different patient; the last row has no successor, hence the fill.
data['next_readmit_dt'] = data['readmit_dt'].shift(-1, fill_value=0.0)
# Object array so that rows without a previous stay hold None, as before.
data['readmit_last_careunit'] = np.where(
    same_patient_as_prev.to_numpy(),
    data['last_careunit'].shift(1).to_numpy(dtype=object),
    None,
)

In [84]:
# list(data)

In [85]:
# Clean the readmission frame before labelling.
data = data.drop(columns=['urine_min', 'urine_mean', 'urine_max'])  # Too noisy
# Ignore cases where readmit_dt < 0, which result from duplicate records.
data = data[data.readmit_dt >= 0]
# Remove cases where the patient died during the stay.
data = data[data.deathtime.isnull()]
# Important to drop deathtime before dropna, otherwise most of the data is lost
# (it is null for every remaining row).
data = data.drop(columns=['deathtime'])
# Drop rows with a NaN in any column except readmit_last_careunit, which is empty
# for first admissions. The column is excluded by name rather than by position so
# the result does not depend on it being the last column.
required_columns = [col for col in data.columns if col != 'readmit_last_careunit']
data = data.dropna(subset=required_columns).reset_index(drop=True)


In [86]:
# Readmission window in hours: 30 days * 24 hours/day.
threshold = 30*24

In [87]:
# Label each stay 'Yes' when the gap to the patient's next stay is non-zero and at
# most `threshold` hours, otherwise 'No'.
# A gap of exactly 0.0 is treated as "no next stay".
# NOTE(review): a negative gap also satisfies `<= threshold` and is labelled 'Yes' -
# confirm this is intended.
data['future_readmit'] = [
    'Yes' if (gap_hrs != 0.0 and gap_hrs <= threshold) else 'No'
    for gap_hrs in data.next_readmit_dt
]

In [88]:
# Show the class balance of the label, first as counts, then as a share of all rows.
label_counts = data.future_readmit.value_counts()
print('Value counts:')
print(label_counts)
print('\nValue proportions:')
print(label_counts / len(data))

Value counts:
future_readmit
No     17988
Yes     2934
Name: count, dtype: int64

Value proportions:
future_readmit
No     0.859765
Yes    0.140235
Name: count, dtype: float64


In [89]:
# data.head()

In [90]:
# hadm_ids present in `data` but absent from the admission-type lookup (expected: none).
set(data['hadm_id']).difference(types)

set()

In [91]:
# some admissions do not have discharge summary
# set(data['hadm_id']) - set(dischs.keys())

In [92]:
# Attach the admission type by looking up each hadm_id in the `types` dict;
# an hadm_id missing from the dict would map to NaN.
data['admission_type'] = data['hadm_id'].map(types)

In [93]:
# Attach the discharge summary text by hadm_id. Admissions without an entry in
# `dischs` get NaN here.
data['discharge_summary'] = data['hadm_id'].map(dischs)

In [94]:
# (rows, columns) after adding the label, admission type and discharge summary columns.
data.shape

(20922, 45)

In [95]:
# Write the labelled frame to mimic_readmits_clean.csv (relative path), without the index column.
data.to_csv('mimic_readmits_clean.csv', index=False)

# 2. Fix bugs for [Clinical Outcome Prediction from Admission Notes using Self-Supervised Knowledge Integration](https://www.aclweb.org/anthology/2021.eacl-main.75/), EACL 2021

In [191]:
import re

# Sample (lower-cased) admission note used to exercise the section-extraction regex.
text = """service: cardiothoracic

allergies:
amlodipine

attending:[**last name (namepattern1) 1561**]
chief complaint:
81 yo f smoker w/ copd, severe tbm, s/p tracheobronchoplasty [**5-5**]
s/p perc trach [**5-13**]

major surgical or invasive procedure:
bronchoscopy 3/31,4/2,3,[**6-12**], [**5-17**], [**5-19**]
s/p trachealplasty [**5-5**]
percutaneous tracheostomy [**5-13**] after failed extubation
down size trach on [**5-25**] to size 6 cuffless"""

# An earlier pattern ended the section at the first blank line. This one also requires
# that blank line to be followed by a header-like run ending in ':'.
# Inside the character class '(', '|' and ')' are literal, so the run may not contain
# parentheses, pipes, backslashes, digits or dots.
pattern = re.compile(r"(?i)chief complaint:(.+?)\n\n[^(\\|\d|\.)]+?:", re.DOTALL)

match = pattern.search(text)
if match is None:
    print("Chief complaint not found.")
else:
    # Group 1 is the section body; trim the newline left after the header.
    chief_complaint = match.group(1).strip()
    print(chief_complaint)


81 yo f smoker w/ copd, severe tbm, s/p tracheobronchoplasty [**5-5**]
s/p perc trach [**5-13**]


In [189]:
python tasks/los/los.py --mimic_dir /home/jovyan/work/prj_ppn_roberts_grp/mimic-iii/mimic-iii-clinical-database-1.4 
--save_dir /home/jovyan/work/prj_ppn_roberts_grp/clinical-outcome-prediction/mimic-iii-outcome-data --admission_only True