In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
# The input and output paths configured further down all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pickle
import regex as re
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split

In [None]:
def convert_to_icd9(dx_str):
    """
    Insert the decimal point into an undotted ICD9 diagnosis code.

    Codes beginning with "E" keep four characters in front of the dot, all
    other codes keep three. A code that is not longer than that prefix is
    returned unchanged (no trailing dot is added).
    """

    prefix_len = 4 if dx_str.startswith("E") else 3
    if len(dx_str) <= prefix_len:
        return dx_str
    return dx_str[:prefix_len] + "." + dx_str[prefix_len:]

In [None]:
def convert_to_3digit_icd9(dx_str):
    """
    Truncate an undotted ICD9 diagnosis code to its category prefix.

    Codes beginning with "E" keep their first four characters, all other
    codes keep their first three. Shorter codes come back unchanged.
    """

    keep = 4 if dx_str.startswith("E") else 3
    # Slicing past the end of a short string simply yields the whole string.
    return dx_str[:keep]

In [None]:
# Input/output locations, all inside the "MIMIC" folder of the mounted Google Drive.
# The three *_file paths are opened as plain text and parsed line by line by later cells.
# Admissions table: later cells read patient id, admission id and admission time from it.
admission_file = "/content/drive/MyDrive/MIMIC/ADMISSIONS.csv"
# Diagnoses table: later cells read admission id and diagnosis code from it.
diagnosis_file = "/content/drive/MyDrive/MIMIC/DIAGNOSES_ICD.csv"
# Patients table: later cells read patient id and the field used as the mortality flag from it.
patients_file = "/content/drive/MyDrive/MIMIC/PATIENTS.csv"
# Directory that receives the pickled train/test frames and code dictionaries.
out_directory = "/content/drive/MyDrive/MIMIC/output"
# Passed as train_size to train_test_split; the remainder becomes the test set.
train_proportion = 0.8  # Modify the train proportion as needed

In [None]:
# Read mortality data: build pid_dod_map, mapping each patient id to 1 when
# field 5 of that patient's row is non-empty and to 0 otherwise.
print("Collecting mortality information...")
pid_dod_map = {}
# The context manager closes the handle even if a row fails to parse.
with open(patients_file, "r") as infd:
    infd.readline()  # skip the header row
    for line in infd:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # NOTE(review): a plain comma split assumes no quoted commas occur
        # before the fields read here - confirm against the file.
        tokens = line.split(",")
        pid = int(tokens[1])
        # presumably the in-hospital date-of-death column; verify against the CSV header
        dod_hosp = tokens[5]
        pid_dod_map[pid] = 1 if len(dod_hosp) > 0 else 0

Collecting mortality information...


In [None]:
# Read and create admission records:
#   pid_adm_map  : patient id   -> list of admission ids, in file order
#   adm_date_map : admission id -> admission datetime
print("Building pid-admission mapping, admission-date mapping...")
pid_adm_map = {}
adm_date_map = {}
# The context manager closes the handle even if a row fails to parse.
with open(admission_file, "r") as infd:
    infd.readline()  # skip the header row
    for line in infd:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # NOTE(review): a plain comma split is only safe because the fields
        # read here (1..3) are assumed to precede any quoted free-text field.
        tokens = line.split(",")
        pid = int(tokens[1])
        adm_id = int(tokens[2])
        adm_time = datetime.strptime(tokens[3], "%Y-%m-%d %H:%M:%S")
        adm_date_map[adm_id] = adm_time
        pid_adm_map.setdefault(pid, []).append(adm_id)

Building pid-admission mapping, admission-date mapping...


In [None]:
# Create admission dx code mapping:
#   adm_dx_map        : admission id -> list of "D_"-prefixed dotted ICD9 codes
#   adm_dx_map_3digit : admission id -> list of "D_"-prefixed rolled-up codes
print("Building admission-dxList mapping...")
adm_dx_map = {}
adm_dx_map_3digit = {}
# The context manager closes the handle even if a row fails to parse.
with open(diagnosis_file, "r") as infd:
    infd.readline()  # skip the header row
    for line in infd:
        # Strip double quotes and all whitespace (newline included) up front.
        cleaned = re.sub(r'"|\s|\n', "", line)
        if not cleaned:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        tokens = cleaned.split(",")
        adm_id = int(tokens[2])
        # The quotes were already removed above, so the field is the bare code.
        # Slicing it with [1:-1] as well would cut the first and last character
        # off the code itself (e.g. "40301" -> "030").
        raw_code = tokens[4]
        dx_str = "D_" + convert_to_icd9(raw_code)
        dx_str_3digit = "D_" + convert_to_3digit_icd9(raw_code)
        adm_dx_map.setdefault(adm_id, []).append(dx_str)
        adm_dx_map_3digit.setdefault(adm_id, []).append(dx_str_3digit)

Building admission-dxList mapping...


In [None]:
# Create ordered visit mapping: for every patient with at least two
# admissions, store the (admission time, code list) pairs in ascending order.
print("Building pid-sortedVisits mapping...")
pid_seq_map = {}
pid_seq_map_3digit = {}
for pid, adm_id_list in pid_adm_map.items():
    if len(adm_id_list) >= 2:
        # Full-resolution codes.
        visits_full = [
            (adm_date_map[adm_id], adm_dx_map[adm_id]) for adm_id in adm_id_list
        ]
        visits_full.sort()
        pid_seq_map[pid] = visits_full
        # Rolled-up codes, ordered the same way.
        visits_rolled = [
            (adm_date_map[adm_id], adm_dx_map_3digit[adm_id])
            for adm_id in adm_id_list
        ]
        visits_rolled.sort()
        pid_seq_map_3digit[pid] = visits_rolled

Building pid-sortedVisits mapping...


In [None]:
# Flatten pid_seq_map into parallel lists, one entry per patient:
# patient ids, mortality flags, per-visit dates and per-visit code lists.
print("Building pids, dates, mortality_labels, strSeqs...")
pids = []
dates = []
seqs = []
morts = []
for pid, visits in pid_seq_map.items():
    pids.append(pid)
    morts.append(pid_dod_map[pid])
    # Each visit is a (admission time, code list) pair.
    dates.append([when for when, _ in visits])
    seqs.append([codes for _, codes in visits])

Building pids, dates, mortality_labels, strSeqs...


In [None]:
# Create 3 digit ICD sequences: per patient, the list of per-visit code lists,
# in the iteration order of pid_seq_map_3digit.
print("Building pids, dates, strSeqs for 3digit ICD9 code...")
seqs_3digit = [
    [codes for _, codes in visits] for visits in pid_seq_map_3digit.values()
]

Building pids, dates, strSeqs for 3digit ICD9 code...


In [None]:
# Collect code types: give every distinct code string the next free integer
# (in first-seen order) and re-express each visit as a list of those integers.
# Codes repeated within a visit are kept as repeated integers.
print("Converting strSeqs to intSeqs, and making types...")
types = {}
new_seqs = []
for patient in seqs:
    encoded_patient = []
    for visit in patient:
        # setdefault returns the existing index or registers len(types) as the new one.
        encoded_visit = [types.setdefault(code, len(types)) for code in visit]
        encoded_patient.append(encoded_visit)
    new_seqs.append(encoded_patient)

Converting strSeqs to intSeqs, and making types...


In [None]:
# Map 3-digit code strings to integers: every distinct code gets the next free
# index in first-seen order, and each visit becomes a list of unique indices.
print("Converting strSeqs to intSeqs, and making types for 3digit ICD9 code...")
types_3digit = {}
new_seqs_3digit = []
for patient in seqs_3digit:
    new_patient = []
    for visit in patient:
        new_visit = []
        # De-duplicate the visit's codes while keeping first-seen order.
        # Iterating a set of strings would follow hash order, which changes
        # between interpreter sessions under string hash randomization and so
        # makes the code -> index assignment irreproducible across runs.
        for code in dict.fromkeys(visit):
            if code not in types_3digit:
                types_3digit[code] = len(types_3digit)
            new_visit.append(types_3digit[code])
        new_patient.append(new_visit)
    new_seqs_3digit.append(new_patient)

Converting strSeqs to intSeqs, and making types for 3digit ICD9 code...


In [None]:
# Compute the to_event column: for every visit, the whole number of days
# between the fixed reference date below and the visit's admission time.
print("Making additional modifications to the data...")
today = datetime.strptime("2025-01-01", "%Y-%m-%d")
to_event = []
for patient in dates:
    to_event.append([(today - date).days for date in patient])

Making additional modifications to the data...


In [None]:
# Numeric feature of size 1 per visit: admission time of day expressed as
# minutes since midnight, shifted by 720 so that noon maps to 0.
numerics = []
for patient in dates:
    numerics.append([[60 * date.hour + date.minute - 720] for date in patient])

In [None]:
# Register the numeric feature in both dictionaries. Code indices occupy
# 0..len-1, so index len is left unused (reserved for PADDING) and the
# feature takes the index after it.
types["Time of visit"] = 1 + len(types)
types_3digit["Time of visit"] = 1 + len(types_3digit)

In [None]:
# Compute sorting indices: patient positions ordered by ascending number of visits
sort_indicies = np.argsort([len(patient_events) for patient_events in to_event])

In [None]:
# Create the dataframes of data and order their rows by number of visits per
# patient. reset_index() keeps the pre-sort row position as an "index" column.
print("Building sorted dataframes...")
all_data = pd.DataFrame(
    {"codes": new_seqs, "to_event": to_event, "numerics": numerics},
    columns=["codes", "to_event", "numerics"],
)
all_data = all_data.iloc[sort_indicies].reset_index()

all_data_3digit = pd.DataFrame(
    {"codes": new_seqs_3digit, "to_event": to_event, "numerics": numerics},
    columns=["codes", "to_event", "numerics"],
)
all_data_3digit = all_data_3digit.iloc[sort_indicies].reset_index()

all_targets = pd.DataFrame({"target": morts}, columns=["target"])
all_targets = all_targets.iloc[sort_indicies].reset_index()

Building sorted dataframes...


In [None]:
# Create train test split
print("Creating train/test splits...")
# Split all three frames in a single call: train_test_split applies the same
# shuffled row positions to every array it is given and raises if their
# lengths differ, so features, 3-digit features and targets stay row-aligned
# by construction rather than by repeating the same seed three times.
(
    data_train,
    data_test,
    data_train_3digit,
    data_test_3digit,
    target_train,
    target_test,
) = train_test_split(
    all_data,
    all_data_3digit,
    all_targets,
    train_size=train_proportion,
    random_state=12345,
)

Creating train/test splits...


In [None]:
# Create reverse dictionaries in index:code format.
# Note: this rebinds the same names, so running the cell twice flips them back.
types = {index: code for code, index in types.items()}
types_3digit = {index: code for code, index in types_3digit.items()}

In [None]:
# Write out the data: the six train/test frames (rows ordered by their index)
# and the two index -> code dictionaries, all as pickles in out_directory.
print("Saving data...")
# exist_ok avoids the separate exists() check and the race it implies.
os.makedirs(out_directory, exist_ok=True)
data_train.sort_index().to_pickle(out_directory + "/data_train.pkl")
data_test.sort_index().to_pickle(out_directory + "/data_test.pkl")
data_train_3digit.sort_index().to_pickle(out_directory + "/data_train_3digit.pkl")
data_test_3digit.sort_index().to_pickle(out_directory + "/data_test_3digit.pkl")
target_train.sort_index().to_pickle(out_directory + "/target_train.pkl")
target_test.sort_index().to_pickle(out_directory + "/target_test.pkl")
# Close the dictionary files explicitly so their contents are flushed to disk;
# passing a bare open(...) to pickle.dump leaves the handle open indefinitely.
with open(out_directory + "/dictionary.pkl", "wb") as dict_file:
    pickle.dump(types, dict_file, -1)  # -1 selects the highest pickle protocol
with open(out_directory + "/dictionary_3digit.pkl", "wb") as dict_file:
    pickle.dump(types_3digit, dict_file, -1)

Saving data...
