In [1]:
from datetime import datetime
import pickle
import pandas as pd
import os
import sys
sys.path.append("../")
from tqdm import tqdm

In [2]:
! ls ../../../../

bin   data  etc   lib	 media	mnt  opt   root  sbin	   srv	tmp  var
boot  dev   home  lib64  misc	net  proc  run	 software  sys	usr


In [3]:
from pyhealth.datasets import MIMIC3BaseDataset, MIMIC4BaseDataset, eICUBaseDataset, OMOPBaseDataset
base_dataset = MIMIC3BaseDataset(root="../../../../srv/local/data/physionet.org/files/mimiciii/1.4")

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Data Preparation

df_diagnosis = base_dataset.raw_diagnosis()
df_admission = base_dataset.raw_admissions()
REMOVE_DIAGNOSIS = ~((df_admission['DIAGNOSIS'] == 'ORGAN DONOR ACCOUNT') | (df_admission['DIAGNOSIS'] == 'ORGAN DONOR') | \
                   (df_admission['DIAGNOSIS'] == 'DONOR ACCOUNT'))
df = df_admission[REMOVE_DIAGNOSIS]

patient_data = {}
patient_id = set(df['SUBJECT_ID'])

data = df_diagnosis['ICD9_CODE'].values
    
def code2idx(data):
    data_set = set()
    for i in range(len(data)):
        data_set.add(data[i])
    voc_size = len(data_set)
    data_map = {}
    for i in range(voc_size):
        data_map[data_set.pop()] = i
    
    return voc_size, data_map

def get_idx(list_, datamap):
    res = []
    for i in range(len(list_)):
        res.append(datamap[list_[i]])
    return res

def convert_to_med2vec(patient_data):
    data = []
    for k, vv in patient_data.items():
        for v in vv:
            data.append(v[0])
        data.append([-1])
    return data

voc_size, datamap = code2idx(data)

for pid in tqdm(patient_id):
    pid_df = df[df['SUBJECT_ID'] == pid]
    if (len(pid_df) < 2):
        continue
    adm_list = pid_df[['HADM_ID', 'ADMITTIME', 'DEATHTIME']] # add DISCHATIME ?
    patient_data[pid] = []
    for i, r in adm_list.iterrows():
        admid = r['HADM_ID']
        admitime = r['ADMITTIME']
        icd9_raw = df_diagnosis[df_diagnosis['HADM_ID'] == admid]['ICD9_CODE'].values
        icd9_raw = list(set(icd9_raw))
        icd9 = get_idx(icd9_raw, datamap)
        mortality = r['DEATHTIME'] == r['DEATHTIME'] # check not nan
        admtime = datetime.strptime(r['ADMITTIME'], '%Y-%m-%d %H:%M:%S')
        tup = (icd9, admtime, mortality)
        patient_data[pid].append(tup)

patient_data_ = convert_to_med2vec(patient_data)

100%|███████████████████████████████████████████████████████████████████████| 46518/46518 [00:21<00:00, 2211.79it/s]


In [16]:
patient_data_

[[6426, 5834, 2589, 5874],
 [4808, 6933, 76, 5874, 1207, 4749, 695, 1413],
 [-1],
 [4436,
  1744,
  6450,
  5688,
  1906,
  6933,
  6670,
  1417,
  5690,
  1664,
  1319,
  1609,
  6202,
  3732,
  1929,
  5375,
  6972,
  3778],
 [5988,
  1417,
  2437,
  6037,
  6450,
  5910,
  6389,
  5375,
  6972,
  5888,
  2204,
  2382,
  2178,
  5530,
  4151,
  1887,
  76,
  6670,
  5759,
  4559,
  1929],
 [-1],
 [1200, 3174, 5874, 6594, 2498, 1929, 3240, 6428],
 [6356, 5688, 5874, 5507, 1163, 2022, 5178, 6951, 3240, 6428],
 [-1],
 [1870, 6450, 6620, 1906, 1319, 953, 3820, 1929],
 [1135, 5910, 1906, 6434, 1929, 1551, 2974, 5007],
 [-1],
 [5570, 4701, 6594, 4665, 5467, 1929, 1009, 3240, 2498],
 [3240, 5570, 3738, 3845, 5352, 4665, 6594, 5467, 3482, 2695, 1929, 1163],
 [3240, 5965, 5545, 2637, 4665, 6594, 5467, 1951, 2561, 2695, 1071, 3289],
 [-1],
 [5195, 2577, 6609, 6623, 5152, 3740, 3020, 3778, 2076],
 [5640,
  5552,
  3055,
  6234,
  6450,
  605,
  5948,
  3303,
  212,
  1995,
  6836,
  3389,
  446

In [5]:
print(icd9_raw)

['56400', '2875', '53081', '4280', '78551', '7994', 'V422', '78720', '59689', '5849', '4254', '42823', '78820', '60001', 'V5861', '42731']
