In [1]:
 

import sys
sys.path.append('../')
import datasets
import log_reg
from dataproc import extract_wvs
from dataproc import get_discharge_summaries
from dataproc import concat_and_split
from dataproc import build_vocab
from dataproc import vocab_index_descriptions
from dataproc import word_embeddings
from constants import MIMIC_3_DIR, DATA_DIR

import numpy as np
import pandas as pd

from collections import Counter, defaultdict
import csv
import math
import operator

import os



In [2]:
MIMIC_3_DIR, PROJECT_DIR

('/media/lixc/TOSHIBA EXT/data/mimic/mimicd/data',
 '/home/lixc/Downloads/data/caml-mimic/mimicdata/mimic3')

Let's do some data processing in a much better way, with a notebook.

First, let's define some stuff.

In [3]:
Y = 'full' #use all available labels in the dataset for prediction

# MIMIC_3_DIR = MIMIC_3_DIR.replace(" ", "\\ ").replace("?", "\\?").replace("&", "\\&").replace("(", "\\(").replace(")", "\\)").replace("*", "\\*").replace("<", "\\<").replace(">", "\\>")
# path = os.path.join(filepath,file)
notes_file = os.path.join(MIMIC_3_DIR ,"NOTEEVENTS.csv") # '%sNOTEEVENTS.csv' % MIMIC_3_DIR # raw note events downloaded from MIMIC-III
vocab_size = 'full' #don't limit the vocab size to a specific number
vocab_min = 3 #discard tokens appearing in fewer than this many documents

## Combine diagnosis and procedure codes and reformat them

The codes in MIMIC-III are given in separate files for procedures and diagnoses, and the codes are given without periods, which might lead to collisions if we naively combine them. So we have to add the periods back in the right place.

In [4]:
dfproc = pd.read_csv(os.path.join(MIMIC_3_DIR ,"PROCEDURES_ICD.csv"))
dfdiag = pd.read_csv(os.path.join(MIMIC_3_DIR ,"DIAGNOSES_ICD.csv"))

In [10]:
dfproc.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,absolute_code
0,944,62641,154460,3,3404,34.04
1,945,2592,130856,1,9671,96.71
2,946,2592,130856,2,3893,38.93
3,947,55357,119355,1,9672,96.72
4,948,55357,119355,2,331,33.1


In [9]:
dfdiag.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,absolute_code
0,1297,109,172335,1.0,40301,403.01
1,1298,109,172335,2.0,486,486.0
2,1299,109,172335,3.0,58281,582.81
3,1300,109,172335,4.0,5855,585.5
4,1301,109,172335,5.0,4254,425.4


In [5]:
dfdiag['absolute_code'] = dfdiag.apply(lambda row: str(datasets.reformat(str(row[4]), True)), axis=1)
dfproc['absolute_code'] = dfproc.apply(lambda row: str(datasets.reformat(str(row[4]), False)), axis=1)

In [6]:
dfcodes = pd.concat([dfdiag, dfproc])

In [12]:
dfcodes.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,absolute_code
0,1297,109,172335,1.0,40301,403.01
1,1298,109,172335,2.0,486,486.0
2,1299,109,172335,3.0,58281,582.81
3,1300,109,172335,4.0,5855,585.5
4,1301,109,172335,5.0,4254,425.4


In [13]:
dfcodes.to_csv('%s/ALL_CODES.csv' % MIMIC_3_DIR, index=False,
               columns=['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'absolute_code'],
               header=['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE'])

## How many codes are there?

In [7]:
#In the full dataset (not just discharge summaries)
df = pd.read_csv('%s/ALL_CODES.csv' % MIMIC_3_DIR, dtype={"ICD9_CODE": str})
len(df['ICD9_CODE'].unique())

8994

## Tokenize and preprocess raw text

Preprocessing time!

This will:
- Select only discharge summaries and their addenda
- remove punctuation and numeric-only tokens, removing 500 but keeping 250mg
- lowercase all tokens

In [15]:
#This reads all notes, selects only the discharge summaries, and tokenizes them, returning the output filename
disch_full_file = get_discharge_summaries.write_discharge_summaries(out_file="%s/disch_full.csv" % MIMIC_3_DIR)

processing notes file
writing to /media/lixc/TOSHIBA EXT/data/mimic/mimicd/data/disch_full.csv


2083180it [00:59, 35253.66it/s] 


Let's read this in and see what kind of data we're working with

In [8]:
df = pd.read_csv('%s/disch_full.csv' % MIMIC_3_DIR)

In [9]:
#How many admissions?
len(df['HADM_ID'].unique())

52726

In [10]:
df.columns

Index(['SUBJECT_ID', 'HADM_ID', 'CHARTTIME', 'TEXT'], dtype='object')

In [11]:
df.TEXT.values[1000:1020]

array(['admission date discharge date service medicine allergies patient recorded as having no known allergies to drugs attending first name3 lf chief complaint chest discomfort dizziness blask stool major surgical or invasive procedure egd history of present illness yo m with cad s p cabg on coumadin for afib post surgical presents with to ed from rehab with evidence of ugib pt was discharged from hospital1 to rehab and was doing well it appears that he was treated for of incisional infection with diclox on the evening of pt c o diaphoresis nausous and a chest discomfort that felt different than his anginal equivalent and transferred to hospital1 for further eval on arrival pt reports black stools for days he denied hematemsis however in the ed witnessed coffee ground emesis ngt was place and hematemsis did not clear to 500cc ng lavage in ed vs hr bp initial hct and inr two ga ivs placed in right arm given anzimet protonix 2uprbc 3uffp vitamin k 5mgsubq after 2uprbc hct remained so pt

In [12]:
#Tokens and types
types = set()
num_tok = 0
for row in df.itertuples():
    for w in row[4].split():
        types.add(w)
        num_tok += 1

In [13]:
print("Num types", len(types))
print("Num tokens", str(num_tok))

Num types 150854
Num tokens 79801387


In [14]:
#Let's sort by SUBJECT_ID and HADM_ID to make a correspondence with the MIMIC-3 label file
df = df.sort_values(['SUBJECT_ID', 'HADM_ID'])

In [15]:
#Sort the label file by the same
dfl = pd.read_csv('%s/ALL_CODES.csv' % MIMIC_3_DIR)
dfl = dfl.sort_values(['SUBJECT_ID', 'HADM_ID'])

  dfl = pd.read_csv('%s/ALL_CODES.csv' % MIMIC_3_DIR)


In [16]:
len(df['HADM_ID'].unique()), len(dfl['HADM_ID'].unique())

(52726, 58976)

## Consolidate labels with set of discharge summaries

Looks like there were some HADM_ID's that didn't have discharge summaries, so they weren't included with our notes

In [20]:
#Let's filter out these HADM_ID's
hadm_ids = set(df['HADM_ID'])
with open('%s/ALL_CODES.csv' % MIMIC_3_DIR, 'r') as lf:
    with open('%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR, 'w') as of:
        w = csv.writer(of)
        w.writerow(['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE', 'ADMITTIME', 'DISCHTIME'])
        r = csv.reader(lf)
        #header
        next(r)
        for i,row in enumerate(r):
            hadm_id = int(row[2])
            #print(hadm_id)
            #break
            if hadm_id in hadm_ids:
                w.writerow(row[1:3] + [row[-1], '', ''])

In [17]:
dfl = pd.read_csv('%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR, index_col=None)

In [26]:
dfl.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE,ADMITTIME,DISCHTIME
0,109,172335,403.01,,
1,109,172335,486.0,,
2,109,172335,582.81,,
3,109,172335,585.5,,
4,109,172335,425.4,,


In [18]:
len(dfl['HADM_ID'].unique())

52726

In [29]:
#we still need to sort it by HADM_ID
dfl = dfl.sort_values(['SUBJECT_ID', 'HADM_ID'])
dfl.to_csv('%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR, index=False)

## Append labels to notes in a single file

In [30]:
#Now let's append each instance with all of its codes
#this is pretty non-trivial so let's use this script I wrote, which requires the notes to be written to file
sorted_file = '%s/disch_full.csv' % MIMIC_3_DIR
df.to_csv(sorted_file, index=False)

In [31]:
labeled = concat_and_split.concat_data('%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR, sorted_file)

CONCATENATING
0 done
10000 done
20000 done
30000 done
40000 done
50000 done


In [32]:
#name of the file we just made
print(labeled)

/media/lixc/TOSHIBA EXT/data/mimic/mimicd/data/notes_labeled.csv


Let's sanity check the combined data we just made. Do we have all hadm id's accounted for, and the same vocab stats?

In [2]:
# ***************************************************
# rerun can start from here

labeled = "/media/lixc/TOSHIBA EXT/data/mimic/mimicd/data/notes_labeled.csv"
dfnl = pd.read_csv(labeled)
#Tokens and types
types = set()
num_tok = 0
for row in dfnl.itertuples():
    for w in row[3].split():
        types.add(w)
        num_tok += 1

In [20]:
print("num types", len(types), "num tokens", num_tok)

num types 150854 num tokens 79801387


In [21]:
len(dfnl['HADM_ID'].unique())

52726

## Create train/dev/test splits

In [22]:
 

fname = '%s/notes_labeled.csv' % MIMIC_3_DIR
base_name = "%s/disch" % PROJECT_DIR #for output
tr, dv, te = concat_and_split.split_data(fname, base_name=base_name)

SPLITTING
0 read
10000 read
20000 read
30000 read
40000 read
50000 read


## Build vocabulary from training data

In [23]:
vocab_min = 3
vname = '%s/vocab.csv' % MIMIC_3_DIR
build_vocab.build_vocab(vocab_min, tr, vname)

reading in data...
removing rare terms
51917 terms qualify out of 140795 total
writing output


## Sort each data split by length for batching

In [25]:
for splt in ['train', 'dev', 'test']:
    filename = '%s/disch_%s_split.csv' % (PROJECT_DIR, splt)
    df = pd.read_csv(filename)
    df['length'] = df.apply(lambda row: len(str(row['TEXT']).split()), axis=1)
    df = df.sort_values(['length'])
    df.to_csv('%s/%s_full.csv' % (PROJECT_DIR, splt), index=False)

In [35]:
import gensim.models.word2vec as w2v
import gensim
gensim.__version__



'4.1.2'

## Pre-train word embeddings

Let's train word embeddings on all words

In [43]:
# reload a module in Python,

import importlib
importlib.reload(word_embeddings)

<module 'dataproc.word_embeddings' from '/home/lixc/Downloads/data/caml-mimic/notebooks/../dataproc/word_embeddings.py'>

In [3]:
w2v_file = word_embeddings.word_embeddings('full', '%s/disch_full.csv' % MIMIC_3_DIR, 100, 0, 5)

building word2vec vocab on /media/lixc/TOSHIBA EXT/data/mimic/mimicd/data/disch_full.csv...
training...
writing embeddings to /media/lixc/TOSHIBA EXT/data/mimic/mimicd/data/processed_full.w2v


## Write pre-trained word embeddings with new vocab

In [46]:
 
importlib.reload(extract_wvs)

<module 'dataproc.extract_wvs' from '/home/lixc/Downloads/data/caml-mimic/notebooks/../dataproc/extract_wvs.py'>

In [47]:
extract_wvs.gensim_to_embeddings('%s/processed_full.w2v' % MIMIC_3_DIR, '%s/vocab.csv' % MIMIC_3_DIR, Y)

100%|██████████| 51917/51917 [00:00<00:00, 228593.77it/s]


## Pre-process code descriptions using the vocab

In [64]:
importlib.reload(vocab_index_descriptions)
importlib.reload(datasets)



DATA_DIR = '/media/lixc/TOSHIBA EXT/data/mimic/mimicd/data'

create vocabulory vector

In [4]:
vocab_index_descriptions.vocab_index_descriptions('%s/vocab.csv' % MIMIC_3_DIR,
                                                  '%s/description_vectors.vocab' % MIMIC_3_DIR)

100%|██████████| 22267/22267 [00:00<00:00, 162276.89it/s]


## Filter each split to the top 50 diagnosis/procedure codes

In [5]:
Y = 50

In [6]:
#first calculate the top k
counts = Counter()
dfnl = pd.read_csv('%s/notes_labeled.csv' % MIMIC_3_DIR)
for row in dfnl.itertuples():
    for label in str(row[4]).split(';'):
        counts[label] += 1

In [7]:
codes_50 = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

In [10]:
codes_50 = [code[0] for code in codes_50[:Y]]

In [11]:
with open('%s/TOP_%s_CODES.csv' % (MIMIC_3_DIR, str(Y)), 'w') as of:
    w = csv.writer(of)
    for code in codes_50:
        w.writerow([code])

In [12]:
for splt in ['train', 'dev', 'test']:
    print(splt)
    hadm_ids = set()
    with open('%s/%s_50_hadm_ids.csv' % (PROJECT_DIR, splt), 'r') as f:
        for line in f:
            hadm_ids.add(line.rstrip())
    with open('%s/notes_labeled.csv' % MIMIC_3_DIR, 'r') as f:
        with open('%s/%s_%s.csv' % (PROJECT_DIR, splt, str(Y)), 'w') as of:
            r = csv.reader(f)
            w = csv.writer(of)
            #header
            w.writerow(next(r))
            i = 0
            for row in r:
                hadm_id = row[1]
                if hadm_id not in hadm_ids:
                    continue
                codes = set(str(row[3]).split(';'))
                filtered_codes = codes.intersection(set(codes_50))
                if len(filtered_codes) > 0:
                    w.writerow(row[:3] + [';'.join(filtered_codes)])
                    i += 1

train
dev
test


In [39]:
for splt in ['train', 'dev', 'test']:
    filename = '%s/%s_%s.csv' % (PROJECT_DIR, splt, str(Y))
    df = pd.read_csv(filename)
    df['length'] = df.apply(lambda row: len(str(row['TEXT']).split()), axis=1)
    df = df.sort_values(['length'])
    df.to_csv('%s/%s_%s.csv' % (PROJECT_DIR, splt, str(Y)), index=False)