In [2]:
import pandas as pd
import json
import os
from collections import Counter
import re
from dateutil.parser import parse

import yaml
import sys
import glob
import ast

from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Locations of the MIMIC-IV source files.
# `hosp_dir` keeps its trailing slash because other cells build paths from it
# by plain string concatenation.
hosp_dir = '/HDD16TB/Datasets/physionet.org/files/mimiciv/2.2/hosp/'
note_path = '/HDD16TB/Datasets/physionet.org/files/mimic-iv-note/2.2/note/discharge.csv'
admission_path = os.path.join(hosp_dir, 'admissions.csv')
patients_path = os.path.join(hosp_dir, 'patients.csv')

In [3]:
tok = RegexpTokenizer(r'\w+')  # alphanumeric tokenization


def preprocess(doc, stopwords=None):
    """Normalize a note's text and split it into lowercase word tokens.

    Args:
        doc: Raw note text (a string).
        stopwords: Optional collection of tokens to drop. Membership is tested
            against the lowercased tokens. Defaults to an empty set.

    Returns:
        A list of lowercase tokens produced by the module-level ``tok``
        tokenizer. Tokens of length one are discarded.
    """
    if stopwords is None:
        stopwords = set()

    # Replace URLs with the placeholder word "url".
    doc = re.sub(r"https?:\S+", "url", doc)
    # Flatten newlines and tabs into spaces.
    doc = doc.replace('\n', ' ').replace('\t', ' ')

    # The three numeric patterns below used nested quantifiers such as
    # `(\d+)+`, which can backtrack exponentially on a long digit run that
    # ultimately fails to match. The flat forms match exactly the same spans.
    # Dash-separated numeric triples (e.g. 2180-05-07) become the word "date".
    doc = re.sub(r"\d+-+\d+-+\d+", "date", doc)
    # Drop list numbering / dotted numbers such as "1.1", then "1." or "1)".
    doc = re.sub(r"\d+[.)]+\d+", "", doc)
    doc = re.sub(r"\d+[.)]", "", doc)

    # Collapse a word repeated consecutively (separated by single spaces).
    doc = re.sub(r"\b(\w+)( \1\b)+", r"\1", doc)
    # Remove the "Dr" / "Dr." abbreviation in any letter case.
    doc = re.sub(r"\b[dD]\.?[rR]\.?\b", " ", doc)
    # Collapse runs of two or more "punctuation + whitespace" pairs.
    doc = re.sub(r"([^A-Za-z0-9\s](\s)){2,}", " ", doc)

    doc = re.sub(r'\.+', '..', doc)  # any run of dots (even a single one) becomes ".."
    doc = re.sub(r'!+', '!', doc)    # any run of "!" becomes a single "!"
    doc = re.sub(r'\*+', ' ', doc)   # any run of "*" becomes a space
    doc = re.sub(r'_+', ' ', doc)    # any run of "_" becomes a space
    doc = re.sub(r',+', ',', doc)    # any run of "," becomes a single ","

    doc = doc.lower()

    # Tokenize, then drop one-character tokens and stopwords.
    return [
        item.strip() for item in tok.tokenize(doc)
        if len(item.strip()) > 1 and item not in stopwords
    ]

In [3]:
# Read only the columns needed downstream; every column is loaded as a string.
note_columns = ['note_id', 'subject_id', 'hadm_id', 'charttime', 'storetime', 'text']
notes = pd.read_csv(note_path, dtype=str, usecols=note_columns)
notes.head()

Unnamed: 0,note_id,subject_id,hadm_id,charttime,storetime,text
0,10000032-DS-21,10000032,22595853,2180-05-07 00:00:00,2180-05-09 15:26:00,\nName: ___ Unit No: _...
1,10000032-DS-22,10000032,22841357,2180-06-27 00:00:00,2180-07-01 10:15:00,\nName: ___ Unit No: _...
2,10000032-DS-23,10000032,29079034,2180-07-25 00:00:00,2180-07-25 21:42:00,\nName: ___ Unit No: _...
3,10000032-DS-24,10000032,25742920,2180-08-07 00:00:00,2180-08-10 05:43:00,\nName: ___ Unit No: _...
4,10000084-DS-17,10000084,23052089,2160-11-25 00:00:00,2160-11-25 15:09:00,\nName: ___ Unit No: __...


In [4]:
# Minimum number of tokens a note must have after preprocessing to be kept.
MIN_NOTE_TOKENS = 30

notes.charttime = pd.to_datetime(notes.charttime)
notes.subject_id = notes.subject_id.str.strip()
notes = notes.fillna('x')  # missing values are marked with the sentinel 'x'

tqdm.pandas()  # registers `progress_apply` so the tokenization shows a progress bar
notes.text = notes.text.progress_apply(preprocess)  # each text becomes a list of tokens
# Drop short documents: `text` is a token list here, so `len` counts tokens,
# not characters. `.copy()` detaches the filtered frame so the column
# assignment below does not write to a slice of the original frame.
notes = notes[notes.text.map(len) >= MIN_NOTE_TOKENS].copy()
# Join the tokens back into a single space-separated string.
notes.text = notes.text.map(' '.join)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 331794/331794 [11:10<00:00, 495.07it/s]


patients.csv

In [8]:
# Load the patient table and keep only patients that appear in `notes`.
patient_set = set(notes.subject_id)
patients = pd.read_csv(patients_path, dtype=str)
patients.subject_id = patients.subject_id.str.strip()
# `.copy()` makes the filtered frame independent of the full table, so later
# in-place edits of `patients` do not operate on a slice.
patients = patients[patients.subject_id.isin(patient_set)].copy()
patients.head()


Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000032,F,52,2180,2014 - 2016,2180-09-09
3,10000084,M,72,2160,2017 - 2019,2161-02-13
7,10000117,F,48,2174,2008 - 2010,
9,10000248,M,34,2192,2014 - 2016,
17,10000560,F,53,2189,2008 - 2010,


In [9]:
# Mark every missing value (e.g. an empty `dod`) with the sentinel string 'x'.
patients = patients.fillna('x')

In [10]:
# Display the first rows again to check that missing values now show as 'x'.
patients.head()

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10000032,F,52,2180,2014 - 2016,2180-09-09
3,10000084,M,72,2160,2017 - 2019,2161-02-13
7,10000117,F,48,2174,2008 - 2010,x
9,10000248,M,34,2192,2014 - 2016,x
17,10000560,F,53,2189,2008 - 2010,x


In [11]:
# List the distinct values of `anchor_year_group`.
patients['anchor_year_group'].unique()


array(['2014 - 2016', '2017 - 2019', '2008 - 2010', '2011 - 2013',
       '2020 - 2022'], dtype=object)

In [28]:
# Build a per-patient lookup keyed by subject_id. Each value is a list laid out as:
#   [gender, anchor_age, anchor_year_group, dod, anchor_year,
#    in-hospital mortality, out-of-hospital mortality, overall mortality]
# The three mortality entries are 0/1 flags; a `dod` of 'x' means no recorded death.
# NOTE(review): "in-hospital" vs "out-of-hospital" is decided only by comparing
# the death year with `anchor_year` — confirm this heuristic is intended.
patients_dict = {}
for subject, gender, anchor_age, year_group, dod, anchor_year in zip(
    patients.subject_id, patients.gender, patients['anchor_age'],
    patients['anchor_year_group'], patients.dod, patients['anchor_year']
):
    in_hospital = out_hospital = died = 0
    if dod != 'x':
        died = 1
        if int(dod.split('-')[0]) > int(anchor_year):
            out_hospital = 1
        else:
            in_hospital = 1
    patients_dict[subject] = [
        gender, anchor_age, year_group, dod, anchor_year,
        in_hospital, out_hospital, died,
    ]


admissions.csv: for the ethnicity (race), marital status, and insurance information

In [14]:
# Load admissions and build a subject_id -> [race, marital_status, insurance] lookup.
admits = pd.read_csv(admission_path, dtype=str)
admits['race'] = admits['race'].fillna('OTHER')  # missing race is labelled 'OTHER'
admits = admits.fillna('x')  # every other missing value gets the sentinel 'x'
# NOTE(review): a subject can have several admission rows; because the key is
# subject_id, the values of the last row for that subject win.
admits_dict = {
    subject: [race, marital, insurance]
    for subject, race, marital, insurance in zip(
        admits.subject_id, admits.race, admits['marital_status'], admits.insurance
    )
}

In [15]:
# Preview the admissions table after missing values were filled.
admits.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,x,URGENT,P874LG,TRANSFER FROM HOSPITAL,HOME,Other,ENGLISH,WIDOWED,WHITE,2180-05-06 19:17:00,2180-05-06 23:30:00,0
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,x,EW EMER.,P09Q6Y,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-06-26 15:54:00,2180-06-26 21:31:00,0
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,x,EW EMER.,P60CC5,EMERGENCY ROOM,HOSPICE,Medicaid,ENGLISH,WIDOWED,WHITE,2180-08-05 20:58:00,2180-08-06 01:44:00,0
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,x,EW EMER.,P30KEH,EMERGENCY ROOM,HOME,Medicaid,ENGLISH,WIDOWED,WHITE,2180-07-23 05:54:00,2180-07-23 14:00:00,0
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,x,EU OBSERVATION,P51VDL,EMERGENCY ROOM,x,Other,ENGLISH,SINGLE,WHITE,2160-03-03 21:55:00,2160-03-04 06:26:00,0


diagnoses_icd.csv

diagnoses_icd.csv: for the ICD-9 / ICD-10 diagnosis codes

In [4]:
# Load the diagnosis table with every column as a string (keeps leading zeros in codes).
diagnoses_path = hosp_dir + 'diagnoses_icd.csv'
diagnoses = pd.read_csv(diagnoses_path, dtype=str)
diagnoses.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
0,10000032,22595853,1,5723,9
1,10000032,22595853,2,78959,9
2,10000032,22595853,3,5715,9
3,10000032,22595853,4,7070,9
4,10000032,22595853,5,496,9


In [7]:
# Show the distinct ICD codes and how many there are.
unique_icd_codes = diagnoses['icd_code'].unique()
print(unique_icd_codes)
print(len(unique_icd_codes))

['5723' '78959' '5715' ... 'H353131' 'K8036' 'O359XX2']
25809


In [25]:
# Collect the ICD codes of every admission that has a retained note.
# `dfcodes` maps "<subject_id>-<hadm_id>" to {'icd9': [...], 'icd10': [...]};
# `icd9_count` / `icd10_count` count how often each code occurs.
dfcodes = {}
icd9_count = Counter()
icd10_count = Counter()
hadm_set = set(notes.hadm_id)

with open(hosp_dir + 'diagnoses_icd.csv') as dfile:
    # Header line: strip surrounding quotes and whitespace, then split on commas.
    header = [name.replace('"', '').strip() for name in dfile.readline().strip().split(',')]
    n_columns = len(header)
    col_subject, col_hadm, col_code, col_version = (
        header.index(name) for name in ('subject_id', 'hadm_id', 'icd_code', 'icd_version')
    )

    for raw_line in dfile:
        fields = [value.replace('"', '').strip() for value in raw_line.strip().split(',')]
        # Keep only well-formed rows whose patient and admission both have a note.
        keep_row = (
            len(fields) == n_columns
            and fields[col_subject] in patient_set
            and fields[col_hadm] in hadm_set
        )
        if not keep_row:
            continue

        code_id = '{0}-{1}'.format(fields[col_subject], fields[col_hadm])
        tags = dfcodes.setdefault(code_id, {'icd9': [], 'icd10': []})
        icd_code = fields[col_code]
        # Version 9 goes to the ICD-9 bucket; any other version is treated as ICD-10.
        if int(fields[col_version]) == 9:
            tags['icd9'].append(icd_code)
            icd9_count[icd_code] += 1
        else:
            tags['icd10'].append(icd_code)
            icd10_count[icd_code] += 1

print('Total icd 9:', sum(icd9_count.values()))
print('Total icd 10:', sum(icd10_count.values()))

Total icd 9: 2441648
Total icd 10: 1765430


In [26]:
# Print the first admission (if any) that carries both ICD-9 and ICD-10 codes.
mixed_example = next(
    ((key, tags) for key, tags in dfcodes.items() if tags['icd10'] and tags['icd9']),
    None,
)
if mixed_example is not None:
    print(mixed_example[0], mixed_example[1])

10360824-28333632 {'icd9': ['5409', 'V0481'], 'icd10': ['K3580', 'Z23']}


In [27]:
# Group the notes by patient. Each `results[uid]` holds the patient's
# demographics and mortality flags plus a list of their notes with ICD tags.
results = {}
total_entries = 0
for note in tqdm(notes.itertuples(index=False), total=len(notes)):
    uid = note.subject_id.strip()
    hadm_id = note.hadm_id.strip()
    code_id = '-'.join((uid, hadm_id))
    tags = dfcodes.get(code_id)
    if tags is None:
        continue  # no ICD codes were recorded for this admission

    if uid not in results:
        # patients_dict layout: [gender, anchor_age, anchor_year_group, dod,
        #                        anchor_year, in-hospital, out-of-hospital, overall]
        patient_info = patients_dict[uid]
        gender, anchor_age, year_group = patient_info[:3]
        in_hospital, out_hospital, died = patient_info[-3:]
        u_age = int(anchor_age)
        # admits_dict layout: [race, marital_status, insurance]
        race, marital, insurance = admits_dict[uid]
        results[uid] = {
            'uid': uid,
            'time': year_group,  # anchor year group, for time effect analysis
            'age': u_age,  # the patient's anchor_age
            'gender': gender,
            'ethnicity': race,
            'maritial_status': marital,
            'insurance': insurance,
            'mortality': died,
            'in-mortality': in_hospital,
            'out-mortality': out_hospital,
            'docs': [],  # all notes of this patient
        }

    results[uid]['docs'].append({
        'doc_id': note.note_id,
        'date': note.charttime.strftime('%Y-%m-%d'),
        'hadm_id': hadm_id,
        'text': note.text,
        'tags': tags,
    })
    total_entries += 1

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 331794/331794 [00:11<00:00, 28737.95it/s]


In [29]:
# Write one JSON object per note (JSON Lines) to mimic.json, repeating the
# patient-level fields on every note of that patient.
with open('mimic.json', 'w') as wfile:
    for uid, patient in tqdm(results.items(), total=len(results)):
        for doc in patient['docs']:
            entity = {
                'uid': uid, 'did': doc['doc_id'], 'text': doc['text'], 'time': patient['time'],
                'gender': patient['gender'], 'age': patient['age'], 'insurance': patient['insurance'],
                'ethnicity': patient['ethnicity'], 'maritial_status': patient['maritial_status'],
                'doc-date': doc['date'], 'mortality': patient['mortality'],
                'in-mortality': patient['in-mortality'], 'out-mortality': patient['out-mortality'],
                'icd9': doc['tags']['icd9'], 'icd10': doc['tags']['icd10'], 'hadm_id': doc['hadm_id']
            }
            # `entity` is a fixed 16-key literal, so the former emptiness check
            # could never trigger and has been removed.
            wfile.write(json.dumps(entity) + '\n')

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 145845/145845 [00:08<00:00, 16627.70it/s]
