# Notebook to Convert AmsterdamUMCdb to MIMIC-III Format

Here we convert the AmsterdamUMCdb data files to the MIMIC-III data file format as generated by MIMIC-Code. We do this so that exactly the same preprocessing can be applied to both MIMIC and AmsterdamUMCdb.

In [1]:
# Dependencies
import os
import re
import random
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from fancyimpute import KNN    
from sklearn.neighbors import KNeighborsClassifier
from datetime import timedelta
from tqdm import tqdm

# Directory where the AmsterdamUMCdb source files are kept
DATA_DIR = '../data/amsterdam-umc-db/'

if not os.path.exists(DATA_DIR):
    raise Exception('Dataset directory %s does not exist!' % DATA_DIR)

# Output directory for the converted files
OUTPUT_DIR = '../data/amsterdam-umc-db/final/'

# Create the output directory when it is missing. The previous guard tested
# DATA_DIR, which is known to exist at this point, so OUTPUT_DIR was never made.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Batch size for large files
BATCH_SIZE = 500

---
## Patient Cohort


In [None]:
patient_df = pd.read_csv(DATA_DIR + 'admissions.csv')
print('Num admissions:', len(patient_df.index))

# Check for DUPLICATES! keep=False flags every row of a repeated patientid
# (not only the second and later occurrences), so the sum counts all of them.
num_duplicated = patient_df.duplicated('patientid', keep=False).sum()
print("Number of duplicated Metavision admissions: %d" % num_duplicated)

# List of all patient IDs (one entry per admission row); this has to be
# assigned before it is sliced into batches below.
patient_list = patient_df['patientid']

# Batch patients for large files
patient_batches = [patient_list[i: i + BATCH_SIZE] for i in range(0, len(patient_list), BATCH_SIZE)]

patient_list

## Window_start / window_end times

In [None]:
# The admission identifier is used as the ICU stay ID
icustay_id = patient_df.loc[:, 'admissionid']

# TODO: Infection start/end?

#### Save as `cohort.csv`

In [None]:
# cohort.csv -> icustay_id, suspected_infection_time_poe (?), window_start, window_end, hospital_expire_flag

# In-hospital mortality is derived here so that this cell does not rely on a
# variable that is only assigned further down the notebook.
hospital_expire_flag = patient_df['destination'] == 'Overleden'

# The time columns are placeholders (None) until infection/window times exist.
pd.DataFrame({'icustay_id': patient_df['admissionid'],
              'suspected_infection_time_poe': None, # remove
              'window_start': None,
              'window_end': None,
              'hospital_expire_flag': hospital_expire_flag}).to_csv('cohort.csv')

## Demographics

In [None]:
# One ICU stay ID per admission row
icustay_id = patient_df['admissionid']

# Age, weight and height are taken straight from their *group columns
age, weight, height = (patient_df[column]
                       for column in ('agegroup', 'weightgroup', 'heightgroup'))

# Gender flag: True where the gender column equals 'Man'
is_male = patient_df['gender'].eq('Man')

# In-hospital mortality: True where the destination column equals 'Overleden'
hospital_expire_flag = patient_df['destination'].eq('Overleden')

#### Ventilator

In [None]:
# Read from DATA_DIR like the other source files (the bare filename resolved
# against the working directory instead).
# NOTE(review): the concept notebook linked below appears to take these
# itemid/valueid pairs from the `listitems` table; confirm that numericitems.csv
# really has a `valueid` column, otherwise this read fails.
numericitems_df = pd.read_csv(DATA_DIR + 'numericitems.csv',
                              usecols=['admissionid', 'itemid', 'valueid'])

# See https://github.com/AmsterdamUMC/AmsterdamUMCdb/blob/master/concepts/lifesupport/mechanical_ventilation.ipynb
# itemid: valueids
list_items = {
              9534: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],                    # Type beademing Evita 1
              6685: [1, 3, 5, 6, 8, 9, 10, 11, 12, 13, 14, 20, 22],                 # Type Beademing Evita 4
              8189: [16],                                                           # Toedieningsweg O2
              12290: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],  # Ventilatie Mode (Set) - Servo-I and Servo-U ventilators
              12347: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],  # Ventilatie Mode (Set) (2) Servo-I and Servo-U ventilators
              12376: [1, 2],                                                        # Mode (Bipap Vision)
             }

# A row matches when its (itemid, valueid) pair is listed above; OR the
# per-item masks together so every row is tested once.
vent_mask = pd.Series(False, index=numericitems_df.index)
for itemid, value_ids in list_items.items():
    vent_mask |= (numericitems_df['itemid'] == itemid) & numericitems_df['valueid'].isin(value_ids)

# Keep each admission ID once, so the printed count is a number of admissions
# rather than a number of matching measurement rows.
patients_with_vent = numericitems_df.loc[vent_mask, 'admissionid'].unique().tolist()

print('Num patients on ventilator:', len(patients_with_vent))

In [None]:
# Ventilator flag: True for admissions listed in patients_with_vent
vent = patient_df.loc[:, 'admissionid'].isin(patients_with_vent)

#### SIRS

In [None]:
# todo

#### SOFA (on-admission)

In [None]:
# todo

#### Save as `demographics_cohort.csv`

In [None]:
# demographics_cohort.csv -> icustay_id, age, is_male, height, weight, vent, sofa, sirs
# sofa and sirs are not computed yet, so they are written as empty placeholder
# columns. The last column was previously named 'window_end', which did not
# match the layout documented above.
pd.DataFrame({'icustay_id': icustay_id,
              'age': age,
              'is_male': is_male,
              'height': height,
              'weight': weight,
              'vent': vent,
              'sofa': None,
              'sirs': None}).to_csv('demographics_cohort.csv')

## Vitals


In [None]:
# Read from DATA_DIR like the other source files
numericitems_df = pd.read_csv(DATA_DIR + 'numericitems.csv',
                              usecols=['admissionid', 'measuredat', 'item', 'value'])


def _select_vital(item_name, vital_id):
    """Return the measurements of one item, tagged with a vital_id column.

    Rows of `numericitems_df` whose `item` equals `item_name` exactly are
    selected; the result has the columns admissionid, measuredat, value and
    vital_id. `.assign` returns a new frame, so adding the tag does not write
    into a slice of `numericitems_df`.
    """
    mask = numericitems_df['item'] == item_name
    selected = numericitems_df.loc[mask, ['admissionid', 'measuredat', 'value']]
    return selected.assign(vital_id=vital_id)


# NOTE(review): item names are matched exactly (case-sensitive); verify the
# spellings below against dictionary.csv.

# Heart rate
heart_rate = _select_vital('Hartfrequentie', 'HeartRate')

# SysBP
sys_bp = _select_vital('ABP systolisch', 'SysBP')

# DiasBP
dias_bp = _select_vital('ABP diastolisch', 'DiasBP')

# MeanBP
mean_bp = _select_vital('ABP gemiddeld', 'MeanBP')

# Glucose
glucose = _select_vital('Glucose bloed', 'Glucose')

# SpO2
spo2 = _select_vital('O2-Saturatie (bloed)', 'SpO2')

# TempC
temp_c = _select_vital('Temp bloed', 'TempC')

# RespRate
resp_rate = _select_vital('Adem Frequentie Monitor', 'RespRate')

#### Save as `vitals_cohort.csv`

In [None]:
# vitals_cohort.csv -> icustay_id, charttime, vital_id, valuenum
# Stack the per-vital frames and switch to the MIMIC column names in one go.
vitals_df = pd.concat(
    [heart_rate, sys_bp, dias_bp, mean_bp, glucose, spo2, temp_c, resp_rate]
).rename(
    columns={
        'admissionid': 'icustay_id',
        'measuredat': 'charttime',
        'value': 'valuenum',
    }
)
vitals_df.to_csv('vitals_cohort.csv')

# Drop the combined frame once it has been written out
del vitals_df

## Lab Results

['CALCIUM', 'ION_CALCIUM', 'ASAT', 'PTT', 'POTASSIUM', 'PT', 'PLATELET', 'ANION GAP', 'PAO2', 'ALAT', 'WBC', 'BILIRUBIN', 'SODIUM', 'CHLORIDE', 'MAGNESIUM', 'LACTATE', 'PACO2', 'GLUCOSE', 'CREATININE', 'BICARBONATE', 'BUN', 'PH', 'ALBUMIN', 'BANDS', 'HEMOGLOBIN', 'BaseExcess']

In [None]:
def _select_lab(item_names, lab_id):
    """Return the measurements of the given items, tagged with a lab_id column.

    Rows of `numericitems_df` whose `item` is one of `item_names` are selected;
    the result has the columns admissionid, measuredat, value and lab_id.
    `.assign` returns a new frame, so later edits to the result do not write
    into a slice of `numericitems_df`.
    """
    mask = numericitems_df['item'].isin(item_names)
    selected = numericitems_df.loc[mask, ['admissionid', 'measuredat', 'value']]
    return selected.assign(lab_id=lab_id)


# Calcium
calcium = _select_lab(['Calcium', 'Calcium totaal (bloed)'], 'Calcium')

# Ion Calcium
# Todo: Missing?

# ASAT
asat = _select_lab(['ASAT', 'ASAT (bloed)'], 'ASAT')

# PTT (AmsterdamUMCdb records only APTT)
ptt = _select_lab(['APTT (bloed)'], 'PTT')
# NOTE(review): the factor 2 is a rough rule of thumb taken from the page
# below; confirm it is acceptable for downstream use.
ptt['value'] = ptt['value'] * 2  # approximate conversion rate: https://www.webmd.com/a-to-z-guides/partial-thromboplastin-time-test

# Potassium (was tagged 'ASAT' by a copy-paste slip)
potassium = _select_lab(['Kalium', 'Kalium (bloed)'], 'Potassium')

#### Save as `labs_cohort.csv`

In [None]:
# labs_cohort.csv -> icustay_id, charttime, lab_id, valuenum
# potassium is included with the other lab frames; it was computed but never
# written before.
lab_df = pd.concat([calcium, asat, ptt, potassium])

# Rename the lab frame itself. The previous code renamed vitals_df, which has
# already been deleted by the vitals save cell and holds different data.
lab_df = lab_df.rename(columns={'admissionid': 'icustay_id',
                                'measuredat': 'charttime', # to MIMIC namespace
                                'value': 'valuenum'})
lab_df.to_csv('labs_cohort.csv')
del lab_df

In [10]:
dct = pd.read_csv('../data/amsterdam-umc-db/dictionary.csv')

# Show dictionary entries whose item name contains 'kalium' (any casing),
# leaving out those in the 'Medicatie' category.
mentions_kalium = dct['item'].str.contains('kalium', case=False)
not_medication = dct['category'] != 'Medicatie'
dct[mentions_kalium & not_medication]

Unnamed: 0,itemid,item,item_en,vocabulary_id,vocabulary_concept_code,vocabulary_concept_name,abbreviation,categoryid,category,category_en,...,unitid,unit,ucum_code,low_normal_value,high_normal_value,expected_min_value,expected_max_value,table,count,count_validated
159,6835,Kalium,serum potassium,LOINC,2823-3,Potassium [Moles/volume] in Serum or Plasma,K,371,LAB CHEMIE,,...,97.0,mmol/l,mmol/L,36.0,52.0,,,numericitems,8612,8612.0
297,7101,Kalium in urine,,,,,Kalium-Urine,275,LAB URINE,,...,97.0,mmol/l,mmol/L,25.0,165.0,,,numericitems,60,60.0
1530,9556,Kalium Astrup,potassium - direct ion specific electrode meas...,LOINC,6298-4,Potassium [Moles/volume] in Blood,K+ Astrup,372,LAB ASTRUPS,,...,97.0,mmol/l,mmol/L,35.0,53.0,,,numericitems,20082,20082.0
1678,9927,Kalium (bloed),potassium,LOINC,2823-3,Potassium [Moles/volume] in Serum or Plasma,Kalium (bloed),490,LAB-Chem-bloed,,...,97.0,mmol/l,mmol/L,35.0,45.0,,,numericitems,220787,220787.0
1679,9929,Kalium (overig),,,,,Kalium (overig),479,LAB-Chem-diversen,,...,97.0,mmol/l,mmol/L,,,,,numericitems,23,23.0
1833,10271,Kalium (urine),,,,,Kalium (urine),480,LAB-Chem-urine,,...,97.0,mmol/l,mmol/L,,,,,numericitems,1993,1993.0
1834,10272,Kalium (verz. urine),,,,,Kalium (verz. urine),481,LAB-Chem-verz. urine,,...,97.0,mmol/l,mmol/L,,,,,numericitems,96,96.0
1869,10337,Kalium (faeces),,,,,Kalium (faeces),510,LAB-Chem-faeces,,...,97.0,mmol/l,mmol/L,,,,,numericitems,23,23.0
2864,12228,Kalium (verz. urine),,,,,Kalium (verz. urine),481,LAB-Chem-verz. urine,,...,316.0,mmol/24uur,mmol/(24.h),,,,,numericitems,127,127.0
3190,12619,TPV A - zonder Kalium,,,,,TPV A - zonder Kalium,381,INFUUS-TPV,,...,,,,,,,,drugitems,215,


## Fluid RangeSignals

## Vasopressor RangeSignals

## UrineOutput Signals