In [2]:
import sys
sys.path.append("../") # go to parent dir

In [3]:
import pandas as pd
import numpy as np
import pyarrow as pa
import sys
import spacy
import re
import time
import scispacy
import glob
import os
from pandarallel import pandarallel
from tqdm import tqdm
tqdm.pandas()
from note_processing.heuristic_tokenize import sent_tokenize_rules 
from mimic3models.preprocessing import Discretizer_Notes

In [6]:
# Root of the length-of-stay benchmark; the listfiles below are read from here.
LOS_PATH = "/mnt/data01/mimic-3/benchmark-notes/length-of-stay"
LISTFILES = [
    "test_listfile.csv",
    "train_listfile.csv",
    "val_listfile.csv",
]
# LISTFILES = ["val_listfile.csv"]
NOTEABR = "bert"   # tag embedded in the note parquet file names
EMBEDDIM = 768     # presumably the embedding width; not referenced in the cells shown here
WORKERS = 2        # worker count handed to pandarallel
TIMESTEP = 5       # timestep handed to the discretizer; also appears in output file names

discretizer = Discretizer_Notes(
    timestep=TIMESTEP,
    store_masks=False,  # Not currently supported
    impute_strategy='previous',
    start_time='zero',
    sent_dim=80,
)

pandarallel.initialize(progress_bar=True, nb_workers=WORKERS)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [174]:
def get_episode_embeddings(tup):
    """Load the note-embedding parquet file for one stay episode.

    Args:
        tup: Mapping-like row (e.g. a pandas Series) with the keys ``"stay"``
            (a timeseries file name such as
            ``"10004_episode2_timeseries.csv"``) and ``"set"`` (the
            sub-directory the episode is read from).

    Returns:
        A ``(frame, filename)`` tuple. ``frame`` is the loaded DataFrame with
        its index copied into a leading ``"Hours"`` column, or ``None`` when
        loading failed. ``filename`` is the path that was attempted.

    Raises:
        IndexError: If ``tup["stay"]`` does not contain the expected
            ``<digits>_`` / ``episode<digits>_`` fragments.
    """
    ts_filename = tup["stay"]
    test_train = tup["set"]

    # First "<digits>_" fragment is the patient id; the last
    # "episode<digits>_" fragment carries the episode number.
    patient_id = re.findall(r'[0-9]+_', ts_filename)[0][:-1]
    episode = re.findall(r'episode[0-9]+_', ts_filename)[-1][7:-1]

    # Episode files are looked up under <parent of LOS_PATH>/<set>/<patient_id>/.
    par_dir = os.path.abspath(os.path.join(LOS_PATH, os.pardir))
    filename = os.path.join(
        par_dir,
        test_train,
        patient_id,
        f"episode{episode}_notes_{NOTEABR}.parquet",
    )

    try:
        df = pd.read_parquet(filename)
        columns = list(df.columns)
        df["Hours"] = df.index
        columns.insert(0, "Hours")
        return df[columns], filename
    except Exception as e:
        # Best effort: an episode whose file cannot be loaded is reported and
        # skipped. Only Exception is caught so that KeyboardInterrupt and
        # SystemExit still stop a long-running batch.
        print(f"Fail for patient: {patient_id} with error: {str(e)}")
        # TODO Remove hack
        return None, filename

In [175]:
def process_episode(tup):
    """Discretize one episode's note embeddings and write them to parquet.

    The episode's notes are loaded with ``get_episode_embeddings``, passed
    through the module-level ``discretizer``, and the resulting tensor is
    stored as a single-row parquet file (column ``"TEXT_BIN_EMBEDDING"``) in
    the same directory as the source notes file.

    Args:
        tup: Mapping-like row with the keys ``"stay"``, ``"set"`` and
            ``"period_length"``.

    Returns:
        The base name of the parquet file that was written, or ``None`` when
        the episode's notes could not be loaded.
    """
    df, filename = get_episode_embeddings(tup)
    if df is None:
        return None

    # Only the episode number is needed to name the output file.
    episode = re.findall(r'episode[0-9]+_', tup["stay"])[-1][7:-1]

    # Create tensor with imputation; the header returned alongside it is unused.
    tensor, _ = discretizer.transform(
        df.to_numpy(), header=None, end=int(tup["period_length"])
    )

    outfile = f"episode{episode}_notes_{NOTEABR}_bin{TIMESTEP}_tensor.parquet"
    out_df = pd.DataFrame([{"TEXT_BIN_EMBEDDING": tensor.tolist()}])
    out_df.to_parquet(os.path.join(os.path.dirname(filename), outfile))
    return outfile
    

In [176]:

for listfile in LISTFILES:
    # Load the listfile for this split.
    filename = os.path.join(LOS_PATH, listfile)
    df = pd.read_csv(filename)

    # Label every row with its split; "val" rows are labelled "train"
    # because the label is later used as the episode directory name.
    split_name = re.findall(r'(?:test|train|val)', listfile)[0]
    df["set"] = "train" if split_name == "val" else split_name

    # One row per stay, keeping the largest period_length seen for it.
    group_df = df.groupby(["stay", "set"], as_index=False)["period_length"].max()

    # For each stay, build and write the imputed note tensor.
    tensor_df = group_df.copy().reset_index()
    tensor_df["tensor"] = tensor_df.progress_apply(process_episode, axis=1)



100%|██████████| 2/2 [00:00<00:00,  6.68it/s]
 13%|█▎        | 5/39 [00:00<00:00, 34.23it/s]Fail for patient: 107 with error: /mnt/data01/mimic-3/benchmark-small/train/107/episode2_notes_bert.parquet
 77%|███████▋  | 30/39 [00:02<00:01,  8.40it/s]Fail for patient: 165 with error: /mnt/data01/mimic-3/benchmark-small/train/165/episode1_notes_bert.parquet
100%|██████████| 39/39 [00:03<00:00, 10.81it/s]
100%|██████████| 2/2 [00:00<00:00,  3.45it/s]


In [8]:
for listfile in LISTFILES:
    # Load the listfile for this split.
    filename = os.path.join(LOS_PATH, listfile)
    df = pd.read_csv(filename)

    # Label every row with its split; "val" rows are labelled "train".
    split_name = re.findall(r'(?:test|train|val)', listfile)[0]
    df["set"] = "train" if split_name == "val" else split_name

    # One row per stay, keeping the largest period_length seen for it.
    group_df = df.groupby(["stay", "set"], as_index=False)["period_length"].max()

    # Preview the grouped rows and their summary statistics.
    print(group_df.head())
    print(group_df.describe())


                            stay   set  period_length
0  10000_episode1_timeseries.csv  test           31.0
1  10011_episode1_timeseries.csv  test          332.0
2  10012_episode1_timeseries.csv  test           33.0
3  10019_episode1_timeseries.csv  test           31.0
4   1001_episode1_timeseries.csv  test           21.0
       period_length
count    6265.000000
mean       87.970790
std       127.748688
min         5.000000
25%        28.000000
50%        49.000000
75%        91.000000
max      1992.000000
                            stay    set  period_length
0  10003_episode1_timeseries.csv  train           35.0
1  10004_episode1_timeseries.csv  train          251.0
2  10004_episode2_timeseries.csv  train           17.0
3  10006_episode1_timeseries.csv  train           39.0
4  10007_episode1_timeseries.csv  train          198.0
       period_length
count   29168.000000
mean       86.050295
std       123.965907
min         5.000000
25%        27.000000
50%        48.000000
75%       