# Notebook to Convert AmsterdamUMCdb to MIMIC-III Format

Here we convert the [AmsterdamUMCdb](https://github.com/AmsterdamUMC/AmsterdamUMCdb) data files to the MIMIC-III data file format as generated by [MIMIC-Code](https://github.com/MIT-LCP/mimic-code). We do this so that the exact same preprocessing pipeline can be applied to both MIMIC-III and AmsterdamUMCdb.

In [3]:
# Dependencies
import os
import pandas as pd
import numpy as np
from datetime import timedelta
from tqdm import tqdm

# Directory where AmsterdamUMCdb source files are kept
DATA_DIR = '../data/amsterdam-umc-db/'

# Fail early: every cell below reads its input from this directory
if not os.path.exists(DATA_DIR):
    raise Exception('Dataset directory %s does not exist!' % DATA_DIR)

# Directory where the converted (MIMIC-III formatted) files are written
OUTPUT_DIR = '../data/amsterdam-umc-db/final/'

# Create the output directory if needed. The check must be on OUTPUT_DIR itself:
# DATA_DIR is guaranteed to exist at this point, so testing it would never create anything.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Batch size for large files
BATCH_SIZE = 500

---
## Admissions


In [None]:
# Load the admissions table
patient_df = pd.read_csv(DATA_DIR + 'admissions.csv')
print('Num admissions:', len(patient_df.index))

# Check for DUPLICATES! Counts every row whose patientid occurs more than once (keep=False)
num_duplicated = patient_df.duplicated('patientid', keep=False).sum()
print("Number of duplicated Metavision admissions: %d" % num_duplicated)

# List of all patient IDs (must be defined before it is split into batches)
patient_list = patient_df['patientid']

# Assign patients to batches for large files
patient_batches = [patient_list[i: i + BATCH_SIZE] for i in range(0, len(patient_list), BATCH_SIZE)]

# Display the patient IDs as the cell output
patient_list

---
## Cohort: `suspected_infection_time_poe`, `window_start`, `window_end` times

In [None]:
# ICU stay IDs
# The AmsterdamUMCdb `admissionid` column plays the role of MIMIC's `icustay_id`
icustay_id = patient_df['admissionid']

# TODO: Infection start/end?

#### Save as `cohort.csv`

In [None]:
# cohort.csv -> icustay_id, suspected_infection_time_poe (?), window_start, window_end, hospital_expire_flag

# In-hospital mortality: destination 'Overleden' marks a deceased patient.
# Derived here so this cell does not depend on the Demographics section having been run first.
hospital_expire_flag = patient_df['destination'] == 'Overleden'

# Infection time and window bounds are not derived yet (see TODO above), so they are written as empty columns
pd.DataFrame({'icustay_id': patient_df['admissionid'],
              'suspected_infection_time_poe': None, # remove
              'window_start': None,
              'window_end': None,
              'hospital_expire_flag': hospital_expire_flag}).to_csv(OUTPUT_DIR + 'cohort.csv')

---
## Demographics

In [None]:
# AmsterdamUMCdb only reports age, weight and height as groups (bins).
# Each lookup below replaces a group label by a single representative value (group centroid).

# Age group -> age
age_map = {
    '18-39': 30,
    '40-49': 45,
    '50-59': 55,
    '60-69': 65,
    '70-79': 75,
    '80+': 85,  # Arbitrary
}

# Weight group -> weight
weight_map = {
    '59-': 55,
    '60-69': 65,
    '70-79': 75,
    '80-89': 85,
    '90-99': 95,
    '100-109': 105,
    '110+': 115,
    'N/A': 77.3,  # Dutch average weight
}

# Height group -> height
height_map = {
    '159-': 155,
    '160-169': 165,
    '170-179': 175,
    '180-189': 185,
    '190+': 195,
    'N/A': 176.1,  # Average height of men and women in NL
}

In [None]:
# icustay_ids
icustay_id = patient_df['admissionid']

# Age (group label -> group centroid)
age = patient_df['agegroup'].map(age_map)

# Gender ('Man' is Dutch for male; any other or missing value yields False)
is_male = patient_df['gender'] == 'Man'

# Weight: Series.map performs the dictionary lookup (Series.apply expects a callable).
# Missing groups are read by pandas as NaN and therefore never hit the 'N/A' key,
# so the population default is filled in explicitly.
weight = patient_df['weightgroup'].map(weight_map).fillna(weight_map['N/A'])

# Height: same lookup and default handling as weight
height = patient_df['heightgroup'].map(height_map).fillna(height_map['N/A'])

# In-hospital mortality ('Overleden' is Dutch for deceased)
hospital_expire_flag = patient_df['destination'] == 'Overleden'

#### Ventilator

In [None]:
# Ventilator modes are categorical observations: they live in the listitems table, which is the
# table that carries `valueid` (numericitems has no such column).
listitems_df = pd.read_csv(DATA_DIR + 'listitems.csv', usecols=['admissionid', 'itemid', 'valueid'])

# See https://github.com/AmsterdamUMC/AmsterdamUMCdb/blob/master/concepts/lifesupport/mechanical_ventilation.ipynb
# itemid: valueids
list_items = {                                                                      ## Device + settings ##
              9534: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],                    # Type beademing Evita 1
              6685: [1, 3, 5, 6, 8, 9, 10, 11, 12, 13, 14, 20, 22],                 # Type Beademing Evita 4
              8189: [16],                                                           # Toedieningsweg O2
              12290: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],  # Ventilatie Mode (Set) - Servo-I and Servo-U ventilators
              12347: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],  # Ventilatie Mode (Set) (2) Servo-I and Servo-U ventilators
              12376: [1, 2],                                                        # Mode (Bipap Vision)
             }

# Collect the admissions that have at least one matching (itemid, valueid) record.
# A set is used so an admission with many ventilator records is counted only once.
vent_admissions = set()
for itemid, value_ids in list_items.items():
    condition = (listitems_df['itemid'] == itemid) & (listitems_df['valueid'].isin(value_ids))
    vent_admissions.update(listitems_df.loc[condition, 'admissionid'])

# Sorted list of unique admission IDs
patients_with_vent = sorted(vent_admissions)

print('Num patients on ventilator:', len(patients_with_vent))

In [None]:
# Ventilator
# Boolean flag per admission: True when its admissionid appears in patients_with_vent
vent = patient_df['admissionid'].isin(patients_with_vent)

#### SIRS

In [None]:
# todo

#### SOFA (on-admission)

In [None]:
# todo

#### Save as `demographics_cohort.csv`

In [None]:
# demographics_cohort.csv -> icustay_id, age, is_male, height, weight, vent, sofa, sirs
# SOFA and SIRS are not computed yet (their cells are still marked todo), so both are written as empty columns
pd.DataFrame({'icustay_id': icustay_id,
              'age': age,
              'is_male': is_male,
              'height': height,
              'weight': weight,
              'vent': vent,
              'sofa': None,
              'sirs': None}).to_csv(OUTPUT_DIR + 'demographics_cohort.csv')

---
## Vitals


In [46]:
# MIMIC vital_id -> AmsterdamUMCdb itemids; the Dutch item names are listed above each entry
vital_results = {
    # 'Hartfrequentie'
    'HeartRate': [6640],
    # 'ABP systolisch', 'Niet invasieve bloeddruk systolisch', 'ABP systolisch II'
    'SysBP': [6641, 6678, 8841],
    # 'ABP diastolisch', 'Niet invasieve bloeddruk diastolisch', 'ABP diastolisch II'
    'DiasBP': [6643, 6680, 8842],
    # 'ABP gemiddeld', 'Niet invasieve bloeddruk gemiddeld', 'ABP gemiddeld II'
    'MeanBP': [6642, 6679, 8843],
    # 'Glucose Bloed'
    'Glucose': [6833],
    # 'O2-Saturatie (bloed)'
    'SpO2': [12311],
    # 'Temp Bloed', 'Temperatuur Perifeer 2', 'Temperatuur Perifeer 1', 'Temp Rectaal', 'Temp Lies',
    # 'Temp Axillair', 'Temp Oraal', 'Temp Oor', 'Temp Huid', 'Temp Blaas', 'Temp Oesophagus'
    'TempC': [
        8658, 8659, 8662, 13058, 13059,
        13060, 13061, 13062, 13063, 13952, 16110,
    ],
    # 'Ademfrequentie Monitor', 'Ademfrequentie Evita', 'Ademfreq.'
    'RespRate': [8874, 8873, 9654, 7726, 12266],
}

In [None]:
# The gist of the measurements: Numeric items!
numericitems_df = pd.read_csv(DATA_DIR + 'numericitems.csv', usecols=['admissionid', 'measuredat', 'itemid', 'value'])

# Process vitals one-by-one: select the rows of each vital and tag them with its MIMIC vital_id
vital_dfs = []
for vital_id, item_ids in vital_results.items():
    # Boolean row mask (indexing with a DataFrame instead would not select rows)
    is_vital = numericitems_df['itemid'].isin(item_ids)

    # .copy() so the new column is added to an independent frame, not a view of numericitems_df
    single_vital_df = numericitems_df.loc[is_vital, ['admissionid', 'measuredat', 'value']].copy()
    single_vital_df['vital_id'] = vital_id

    vital_dfs.append(single_vital_df)

# Named `vitals_df` because that is the name the save cell below reads and deletes
vitals_df = pd.concat(vital_dfs, axis=0)

#### Save as `vitals_cohort.csv`

In [None]:
# vitals_cohort.csv -> icustay_id, charttime, vital_id, valuenum
# AmsterdamUMCdb column names -> MIMIC namespace
vitals_to_mimic = {
    'admissionid': 'icustay_id',
    'measuredat': 'charttime',
    'value': 'valuenum',
}
vitals_df.rename(columns=vitals_to_mimic).to_csv(OUTPUT_DIR + 'vitals_cohort.csv')

# Drop the (large) frame once it has been written
del vitals_df

---
## Lab Results

In [None]:
# Exploration helper: load the AmsterdamUMCdb item dictionary and list every item whose
# name contains a search term, to find the itemids for the lab mapping below
dct = pd.read_csv('../data/amsterdam-umc-db/dictionary.csv')
dct[ (dct['item'].str.contains('Magnesium'))]

Unnamed: 0,itemid,item,item_en,vocabulary_id,vocabulary_concept_code,vocabulary_concept_name,abbreviation,categoryid,category,category_en,...,unitid,unit,ucum_code,low_normal_value,high_normal_value,expected_min_value,expected_max_value,table,count,count_validated
163,6839,Magnesium,,,,,Mg,371,LAB CHEMIE,,...,97.0,mmol/l,mmol/L,75.0,102.0,,,numericitems,4633,4633.0
315,7148,Magnesiumsulfaat (MgSO4),magnesium sulfate,ATC,B05XA05,magnesium sulfate,MgSO4 (Magnesiumsulfaat ),268,Medicatie,,...,,,,,,,,drugitems,8866,
316,7148,Magnesiumsulfaat (MgSO4),magnesium sulfate,ATC,B05XA05,magnesium sulfate,MgSO4 (Magnesiumsulfaat ),268,Medicatie,,...,,,,,,,,drugitems,32602,
1018,9002,Magnesiumsulfaat,magnesium sulfate syringe pump infusion,ATC,B05XA05,magnesium sulfate,Magnesiumsulfaat,268,Medicatie,,...,,,,,,,,drugitems,497,
1518,9543,Magnesiumoxide,magnesium oxide,ATC,A06AD02,magnesium oxide,Magnesiumoxide,268,Medicatie,,...,,,,,,,,drugitems,14429,
1699,9952,Magnesium (bloed),,,,,Magnesium (bloed),490,LAB-Chem-bloed,,...,97.0,mmol/l,mmol/L,7.0,1.0,,,numericitems,127345,127345.0
1700,9953,Magnesium (overig),,,,,Magnesium (overig),479,LAB-Chem-diversen,,...,97.0,mmol/l,mmol/L,,,,,numericitems,24,24.0
1856,10294,Magnesium (urine),,,,,Magnesium (urine),480,LAB-Chem-urine,,...,97.0,mmol/l,mmol/L,,,,,numericitems,62,62.0
1857,10295,Magnesium (verz. urine),,,,,Magnesium (verz. urine),481,LAB-Chem-verz. urine,,...,97.0,mmol/l,mmol/L,,,,,numericitems,7,7.0
2866,12232,Magnesium (verz. urine),,,,,Magnesium (verz. urine),481,LAB-Chem-verz. urine,,...,316.0,mmol/24uur,mmol/(24.h),,,,,numericitems,7,7.0


In [None]:
# MIMIC lab_id -> AmsterdamUMCdb itemids. An empty list means no source item has been mapped yet;
# such labs simply contribute no rows.
# NOTE(review): 'PaO2' lists items that are labelled PCO2 (and 21213 is reused for 'PaCO2' below);
# it probably needs the PO2 items instead - verify against dictionary.csv.
# NOTE(review): the key 'Bicarbinate' looks like a misspelling of 'Bicarbonate'; confirm the label the
# downstream pipeline expects before mapping any items to it.
lab_results = {
    'Calcium':     [9933],                           # 'Calcium', 'Calcium totaal (bloed)'
    'IonCalcium':  [],
    'ASAT':        [11990],                          # 'ASAT (bloed)'
    'PTT':         [17982],                          # 'APTT (bloed)' --- TODO: Apply approx. conversion rate: https://www.webmd.com/a-to-z-guides/partial-thromboplastin-time-test
    'Potassium':   [9927],                           # 'Kalium (bloed)'
    'PT':          [],
    'Platelet':    [6797, 9964, 10409, 14252, 7369], # 'Thrombocyten', "Thrombo's (bloed)", "Thrombo's citr. bloed (bloed)", 'Thrombo CD61 (bloed)'
    'AnionGap':    [9559],                           # 'Anion-Gap (bloed)'
    'PaO2':        [6846, 21213],                    # 'PCO2', 'PCO2 (bloed) - kPa # TODO: NEED CONVERSION
    'ALAT':        [6800, 11978],                    # 'ALAT', 'ALAT (bloed)'
    'WBC':         [11678, 18553, 18554,             # 'Tot.WBC*10^8 (overig)', 'Tot. WBC*10^8 (HPC-A Allogeen)', 'Tot. WBC*10^8 (HPC-A Autoloog)'
                    18557, 6779, 9965],              # 'Tot. WBC*10^8 (Tcellen concentraat)', 'Leucocyten 10^9/l', "Leuco's (bloed) 10^9/l"
    'Bilirubin':   [9945, 6813],                     # 'Bilirubine (bloed)', 'Bili Totaal'
    'Sodium':      [12233, 9555, 9924, 10284],       # 'Natrium (overig)', 'Natrium Astrup', 'Natrium (bloed)', 'Na (onv.ISE) (bloed)'
    'Chloride':    [],
    'Magnesium':   [9952],                           # 'Magnesium (bloed)'
    'Lactate':     [10053],                          # 'Lactaat (bloed)'
    'PaCO2':       [21213],                          # 'PCO2 (bloed) - kPa' TODO: CONVERSION
    'Glucose':     [6833, 9947],                     # 'Glucose Bloed', 'Glucose (bloed)'
    'Creatinine':  [9941, 14216],                    # 'Kreatinine (bloed)', 'KREAT enzym. (bloed)'
    'Bicarbinate': [],
    'BUN':         [],
    'pH':          [6848],                           # 'pH (bloed)'
    'Albumin':     [9975],                           # 'Albumine (imm.) (bloed)'
    'Bands':       [],
    'Hemoglobin':  [9960],                           # 'Hb (bloed)'
    'BaseExcess':  []
}

# Find lab results one-by-one. A loop (rather than an itemid -> lab_id lookup) is required because
# an itemid may be listed under more than one lab.
lab_dfs = []
for lab_id, item_ids in lab_results.items():
    # Boolean row mask (indexing with a DataFrame instead would not select rows)
    is_lab = numericitems_df['itemid'].isin(item_ids)

    # .copy() so the new column is added to an independent frame, not a view of numericitems_df
    single_lab_df = numericitems_df.loc[is_lab, ['admissionid', 'measuredat', 'value']].copy()
    single_lab_df['lab_id'] = lab_id

    lab_dfs.append(single_lab_df)

# All labs stacked into one long table
lab_df = pd.concat(lab_dfs, axis=0)

#### Save as `labs_cohort.csv`

In [None]:
# labs_cohort.csv -> icustay_id, charttime, lab_id, valuenum
# AmsterdamUMCdb column names -> MIMIC namespace
labs_to_mimic = {
    'admissionid': 'icustay_id',
    'measuredat': 'charttime',
    'value': 'valuenum',
}
lab_df.rename(columns=labs_to_mimic).to_csv(OUTPUT_DIR + 'labs_cohort.csv')

# Drop the frame once it has been written
del lab_df

In [None]:
# dct = pd.read_csv('../data/amsterdam-umc-db/dictionary.csv')
# dct[ (dct['item'].str.contains('Dopamine'))]

---
## Urine Output

In [1]:
# AmsterdamUMCdb itemids that are counted as urine output (item name in the trailing comment)
urine_output_ids = [
    8794,  # UrineCAD
    8796,  # UrineSupraPubis
    8798,  # UrineSpontaan
    8800,  # UrineIncontinentie
    8803,  # UrineUP
    10743, # Nefrodrain li Uit
    10745, # Nefrodrain re Uit
    19921, # UrineSplint Li
    19922  # UrineSplint Re
]

In [None]:
# Limit measurements to Urine Output IDs
is_urine = numericitems_df['itemid'].isin(urine_output_ids)

# Keep only the columns that end up in urineoutput_cohort.csv (icustay_id, charttime, valuenum),
# in line with how the vitals and labs tables are built; `itemid` is only needed for the filter
urine_df = numericitems_df.loc[is_urine, ['admissionid', 'measuredat', 'value']]

#### Save as `urineoutput_cohort.csv`

In [None]:
# urineoutput_cohort.csv -> icustay_id, charttime, valuenum
# AmsterdamUMCdb column names -> MIMIC namespace
urine_to_mimic = {
    'admissionid': 'icustay_id',
    'measuredat': 'charttime',
    'value': 'valuenum',
}
urine_df.rename(columns=urine_to_mimic).to_csv(OUTPUT_DIR + 'urineoutput_cohort.csv')

# Drop the frame once it has been written
del urine_df

---
## FiO2

In [None]:
# TODO

#### Save as `fio2_cohort.csv`

---
## IV Fluids

In [None]:
# Drug items contains all we need
# `dose` is loaded as well: the vasopressor unit conversion further down reads it for every row
drugitems_df = pd.read_csv(DATA_DIR + 'drugitems.csv',
                           usecols=['admissionid', 'item', 'ordercategory', 'orderid', 'rate', 'rateunit', 'administered',
                                    'administeredunit', 'start', 'stop', 'fluidin', 'dose', 'doserateperkg', 'doseunitid',
                                    'doserateunitid'])

In [None]:
# Which medications to consider IV fluids?
iv_fluid_categories = ['2. Spuitpompen',
                       'Infuus - Colloid',
                       'Infuus - Crystalloid',
                       'Injecties Circulatie/Diuretica']

# Limit order categories to IV fluids.
# The list holds category *names*, so it is matched against `ordercategory` (the name column that
# was loaded), not against the numeric `ordercategoryid`.
iv_fluid_df = drugitems_df[drugitems_df['ordercategory'].isin(iv_fluid_categories)]

#### Save as `inputevents_mv_cohort.csv`

In [None]:
# inputevents_mv_cohort.csv: AmsterdamUMCdb drug item columns renamed to the MIMIC namespace
# (the administration end time is stored in the `stop` column; there is no `end` column)
iv_fluid_df.rename(columns={'admissionid': 'icustay_id',
                            'start': 'starttime',
                            'stop': 'endtime',
                            'rateunit': 'rateuom',
                            'administered': 'amount',
                            'administeredunit': 'amountuom'}).to_csv(OUTPUT_DIR + 'inputevents_mv_cohort.csv')
del iv_fluid_df

---
## Vasopressors

In [None]:
# Drug item names used to select the vasopressor rows from drugitems_df
vasopressors = ['Dopamine (Inotropin)', 
                'Dobutamine (Dobutrex)', 
                'Adrenaline (Epinefrine)', 
                'Noradrenaline (Norepinefrine)']

In [None]:
# Limit drugitems to vasopressors given through a continuous i.v. perfusor (syringe pump).
# `orderid` identifies a single order, so it cannot be used to select a category; the category name is
# matched instead. NOTE(review): confirm in the dictionary that '2. Spuitpompen' is ordercategoryid 65.
is_perfusor = drugitems_df['ordercategory'] == '2. Spuitpompen'

# `vasopressors` holds item *names*, so match on `item` rather than the numeric `itemid`
is_vasopressor = drugitems_df['item'].isin(vasopressors)
vaso_df = drugitems_df[is_perfusor & is_vasopressor]

# Add patient weight to vasopressor DataFrame.
# The lookup is indexed by admission ID so the join really matches on `admissionid`, and the column is
# named 'weight' because that is the field the unit conversion reads.
weight_by_admission = pd.Series(weight.values, index=icustay_id.values, name='weight')
vaso_df = vaso_df.join(weight_by_admission, on='admissionid')

# Convert mcg/min to mcg/kg/min
def convert_vaso_units(row):
    """Return the dose of one drug record, converted to µg/kg/min where the units are recognised.

    `row` is any mapping-like record (e.g. a DataFrame row) providing 'dose', 'doserateperkg',
    'doseunitid' and 'doserateunitid', plus 'weight' for doses that are not yet per kg.
    Unit combinations that are not listed below are returned unchanged.
    """
    dose = row['dose']
    units = (row['doseunitid'], row['doserateunitid'])

    if row['doserateperkg']:
        # Dose is already expressed per kg of body weight
        if units == (11, 5):        # µg/kg/uur -> µg/kg/min
            return dose / 60
        return dose                 # µg/kg/min (11, 4) needs no conversion; others pass through

    # Dose is an absolute amount: normalise by the patient's weight
    if units == (11, 4):            # µg/min -> µg/kg/min
        return dose / row['weight']
    if units == (10, 4):            # mg/min -> µg/kg/min
        return 1000 * dose / row['weight']
    if units == (10, 5):            # mg/uur -> µg/kg/min
        return 1000 * dose / row['weight'] / 60
    return dose
    
# Convert every record: convert_vaso_units expects one row at a time, so it has to be applied
# row-wise (axis=1). DataFrame.transform would hand it whole columns instead.
vaso_df['rate'] = vaso_df.apply(convert_vaso_units, axis=1)

#### Save as `vasopressors_mv_cohort.csv`

In [None]:
# vasopressors_mv_cohort.csv: AmsterdamUMCdb drug item columns renamed to the MIMIC namespace
# (the administration end time is stored in the `stop` column; there is no `end` column)
vaso_df.rename(columns={'admissionid': 'icustay_id',
                        'start': 'starttime',
                        'stop': 'endtime',
                        'rateunit': 'rateuom',
                        'administered': 'amount',
                        'administeredunit': 'amountuom'}).to_csv(OUTPUT_DIR + 'vasopressors_mv_cohort.csv')
del vaso_df