# MIMIC NOTEEVENTS PRE-PROCESSING

In [None]:
import pandas as pd
import os

In [None]:
# Load the ICD code tables and a 10-row sample of NOTEEVENTS for a quick look.
# NOTE(review): diagnoses, d_icd_diagnoses and d_icd_procedures are not
# referenced again in the visible cells -- confirm they are still needed.
diagnoses = pd.read_csv('mimicdata/mimic3/DIAGNOSES_ICD.csv')
d_icd_diagnoses = pd.read_csv('mimicdata/D_ICD_DIAGNOSES.csv')
d_icd_procedures = pd.read_csv('mimicdata/D_ICD_PROCEDURES.csv')
noteevents = pd.read_csv('mimicdata/mimic3/NOTEEVENTS.csv', nrows=10)
# Show the first sampled note: the whole record, then only its TEXT field.
print(noteevents.iloc[0])
print(noteevents.iloc[0]['TEXT'])

In [None]:
import sys
print(sys.path)
import datasets
import log_reg
from dataproc import extract_wvs
from dataproc import get_discharge_summaries
from dataproc import concat_and_split
from dataproc import build_vocab
from dataproc import vocab_index_descriptions
from dataproc import word_embeddings
from constants import MIMIC_3_DIR, DATA_DIR

import numpy as np
import pandas as pd

from collections import Counter, defaultdict
import csv
import math
import operator

In [None]:
# Run-wide settings for the pre-processing cells below.
# Y is later handed to extract_wvs.gensim_to_embeddings and re-assigned to 50
# for the top-50 label subset. vocab_size is not referenced again in the
# visible cells, and vocab_min is assigned again right before build_vocab.
Y = 'full' #use all available labels in the dataset for prediction
print(MIMIC_3_DIR)
notes_file = '%s/NOTEEVENTS.csv' % MIMIC_3_DIR # raw note events downloaded from MIMIC-III
vocab_size = 'full' #don't limit the vocab size to a specific number
vocab_min = 3 #discard tokens appearing in fewer than this many documents

In [None]:
# Sanity check: show the resolved path of the raw notes file.
print(notes_file)

In [None]:
# Load the procedure and diagnosis code tables from MIMIC_3_DIR.
# NOTE(review): dtypes are inferred here; if a code column is inferred as
# numeric, leading zeros in codes would be lost -- confirm against the data.
dfproc = pd.read_csv('%s/PROCEDURES_ICD.csv' % MIMIC_3_DIR)
dfdiag = pd.read_csv('%s/DIAGNOSES_ICD.csv' % MIMIC_3_DIR)

In [None]:
# Add an 'absolute_code' column holding the output of datasets.reformat for
# each row's code; the second argument is True for diagnoses and False for
# procedures.
# The code is read from column position 4 of each row. `.iloc` is used because
# plain integer indexing (`row[4]`) on a label-indexed Series is deprecated
# positional access in pandas.
# NOTE(review): position 4 is presumably the ICD9_CODE column -- confirm
# against the CSV headers.
dfdiag['absolute_code'] = dfdiag.apply(lambda row: str(datasets.reformat(str(row.iloc[4]), True)), axis=1)
dfproc['absolute_code'] = dfproc.apply(lambda row: str(datasets.reformat(str(row.iloc[4]), False)), axis=1)

In [None]:
# Stack the diagnosis rows on top of the procedure rows. The original row
# indices are kept (no ignore_index), so index values may repeat.
dfcodes = pd.concat([dfdiag, dfproc])

In [None]:
# Write the combined code table to ALL_CODES.csv. Only the listed columns are
# written, and the header renames 'absolute_code' to 'ICD9_CODE'.
dfcodes.to_csv('%s/ALL_CODES.csv' % MIMIC_3_DIR, index=False,
               columns=['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'absolute_code'],
               header=['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'SEQ_NUM', 'ICD9_CODE'])

In [None]:
# Number of distinct codes in the full dataset (not just discharge summaries).
# ICD9_CODE is read as str so the codes are not re-interpreted as numbers.
df = pd.read_csv('%s/ALL_CODES.csv' % MIMIC_3_DIR, dtype={"ICD9_CODE": str})
len(df['ICD9_CODE'].unique())

In [None]:
# Reads all notes, selects only the discharge summaries, and tokenizes them,
# returning the output filename (behaviour of the project helper, not visible here).
# NOTE(review): disch_full_file is not referenced again in the visible cells;
# later cells rebuild the same path from MIMIC_3_DIR.
disch_full_file = get_discharge_summaries.write_discharge_summaries(out_file="%s/disch_full.csv" % MIMIC_3_DIR)

In [None]:
# Load the discharge summaries written by the previous step; this rebinds `df`.
df = pd.read_csv('%s/disch_full.csv' % MIMIC_3_DIR)
# How many distinct admissions (HADM_ID values) have a discharge summary?
len(df['HADM_ID'].unique())

In [None]:
# Corpus statistics over the discharge summaries:
#   types   -> set of distinct whitespace-separated words
#   num_tok -> total number of words
# Field 4 of each itertuples() record is used (field 0 is the index);
# presumably this is the note text column -- confirm against the CSV header.
types = set()
num_tok = 0
for row in df.itertuples():
    words = row[4].split()
    types.update(words)
    num_tok += len(words)

In [None]:
# Report the vocabulary size (distinct words) and the total word count.
print("Num types", len(types))
print("Num tokens", str(num_tok))

In [None]:
# Sort the notes by SUBJECT_ID, then HADM_ID, so that their order corresponds
# to the MIMIC-3 label file, which is sorted the same way below.
df = df.sort_values(['SUBJECT_ID', 'HADM_ID'])

In [None]:
# Load the label file and sort it by the same keys as the notes.
dfl = pd.read_csv('%s/ALL_CODES.csv' % MIMIC_3_DIR)
dfl = dfl.sort_values(['SUBJECT_ID', 'HADM_ID'])

In [None]:
# Distinct admissions in the notes vs. in the label file.
len(df['HADM_ID'].unique()), len(dfl['HADM_ID'].unique())

In [None]:
# Keep only the code rows whose HADM_ID also appears in the notes (`df`), and
# write them to ALL_CODES_filtered.csv.
# Output columns: SUBJECT_ID, HADM_ID, ICD9_CODE, ADMITTIME, DISCHTIME; the two
# time columns are written as empty strings.
hadm_ids = set(df['HADM_ID'])
# newline='' is what the csv module expects of the files it is given: it stops
# the platform from rewriting the writer's line endings and lets the reader
# handle newlines embedded in quoted fields.
with open('%s/ALL_CODES.csv' % MIMIC_3_DIR, 'r', newline='') as lf:
    with open('%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR, 'w', newline='') as of:
        w = csv.writer(of)
        w.writerow(['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE', 'ADMITTIME', 'DISCHTIME'])
        r = csv.reader(lf)
        #header
        next(r)
        for row in r:
            # Input columns are ROW_ID, SUBJECT_ID, HADM_ID, SEQ_NUM, ICD9_CODE
            hadm_id = int(row[2])
            if hadm_id in hadm_ids:
                # SUBJECT_ID, HADM_ID, then the code (last field) and blank times
                w.writerow(row[1:3] + [row[-1], '', ''])

In [None]:
# Reload the filtered label file; this rebinds `dfl`.
dfl = pd.read_csv('%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR, index_col=None)

In [None]:
# Distinct admissions remaining in the filtered label file.
len(dfl['HADM_ID'].unique())

In [None]:
# The filtered label file still needs sorting by SUBJECT_ID and HADM_ID;
# the sorted frame is written back over the same file.
dfl = dfl.sort_values(['SUBJECT_ID', 'HADM_ID'])
dfl.to_csv('%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR, index=False)

In [None]:
# Next step: attach to each note all of its codes. That is done by a project
# script (concat_and_split.concat_data), which needs the notes on disk.
# sorted_file is the same path disch_full.csv was read from, so the file is
# overwritten with the current contents of `df`.
sorted_file = '%s/disch_full.csv' % MIMIC_3_DIR
df.to_csv(sorted_file, index=False)

In [None]:
# Combine the filtered label file with the notes file via the project helper.
# The return value is later printed and passed to pd.read_csv, i.e. it is
# used as the path of the labeled-notes file.
labeled = concat_and_split.concat_data('%s/ALL_CODES_filtered.csv' % MIMIC_3_DIR, sorted_file)

In [None]:
# Show what concat_data returned.
print(labeled)

In [None]:
# Recompute the corpus statistics on the labeled notes:
#   types   -> set of distinct whitespace-separated words
#   num_tok -> total number of words
# Field 3 of each itertuples() record is used (field 0 is the index);
# presumably this is the note text column -- confirm against the CSV header.
dfnl = pd.read_csv(labeled)
types = set()
num_tok = 0
for row in dfnl.itertuples():
    words = row[3].split()
    types.update(words)
    num_tok += len(words)

In [None]:
# Report distinct-word and total-word counts for the labeled notes.
print("num types", len(types), "num tokens", num_tok)

In [None]:
# Distinct admissions in the labeled notes.
len(dfnl['HADM_ID'].unique())

In [None]:
# Split the labeled notes into train/dev/test via the project helper.
# NOTE(review): fname is presumably the same file concat_data returned in
# `labeled` -- confirm. Of the three results, only `tr` is used again in the
# visible cells (by build_vocab).
fname = '%s/notes_labeled.csv' % MIMIC_3_DIR
base_name = "%s/disch" % MIMIC_3_DIR #for output
tr, dv, te = concat_and_split.split_data(fname, base_name=base_name)

In [None]:
# Build the vocabulary file from the training split. vocab_min repeats the
# value assigned in the settings cell above.
vocab_min = 3
vname = '%s/vocab.csv' % MIMIC_3_DIR
build_vocab.build_vocab(vocab_min, tr, vname)

In [None]:
# For each split, add a 'length' column (number of whitespace-separated words
# in TEXT), sort the rows by it in ascending order, and write the result to
# <split>_full.csv.
# `df` is rebound on every pass and holds the sorted test split afterwards.
for splt in ['train', 'dev', 'test']:
    filename = '%s/disch_%s_split.csv' % (MIMIC_3_DIR, splt)
    df = pd.read_csv(filename)
    # Computed straight from the TEXT column: no per-row Series construction,
    # and an empty split simply yields an empty column.
    df['length'] = [len(str(text).split()) for text in df['TEXT']]
    df = df.sort_values(['length'])
    df.to_csv('%s/%s_full.csv' % (MIMIC_3_DIR, splt), index=False)

In [None]:
# Train word embeddings on the discharge summaries via the project helper.
# NOTE(review): the positional arguments 100, 0, 5 are presumably embedding
# size, minimum count and iterations -- confirm against word_embeddings.
# w2v_file is not referenced again in the visible cells.
w2v_file = word_embeddings.word_embeddings('full', '%s/disch_full.csv' % MIMIC_3_DIR, 100, 0, 5)

In [None]:
# Convert the trained vectors for the words in vocab.csv via the project helper.
# NOTE(review): the hard-coded processed_full.w2v path is presumably the file
# produced by the previous cell -- confirm. Y is 'full' here when the cells
# are run in order.
extract_wvs.gensim_to_embeddings('%s/processed_full.w2v' % MIMIC_3_DIR, '%s/vocab.csv' % MIMIC_3_DIR, Y)

In [None]:
# Run the project helper on vocab.csv with description_vectors.vocab as its
# second path argument (presumably the output file -- confirm in the helper).
print(MIMIC_3_DIR)
vocab_index_descriptions.vocab_index_descriptions('%s/vocab.csv' % MIMIC_3_DIR,
                                                  '%s/description_vectors.vocab' % MIMIC_3_DIR)

In [None]:
# Switch to the top-Y label setting and tally how often each label occurs.
Y = 50
dfnl = pd.read_csv('%s/notes_labeled.csv' % MIMIC_3_DIR)
# Field 4 of each itertuples() record holds the ';'-separated labels of one
# note (field 0 is the index); every label occurrence adds one to its count.
counts = Counter()
for row in dfnl.itertuples():
    counts.update(str(row[4]).split(';'))

In [None]:
# The Y most frequent labels, most frequent first; labels with equal counts
# keep the order in which they were first counted.
codes_50 = [label for label, _ in counts.most_common(Y)]

In [None]:
# Write the selected top codes to TOP_<Y>_CODES.csv, one code per row.
# newline='' is what the csv module expects of its output file; without it the
# writer's line endings can be rewritten by the platform (doubled on Windows).
with open('%s/TOP_%s_CODES.csv' % (MIMIC_3_DIR, str(Y)), 'w', newline='') as of:
    w = csv.writer(of)
    w.writerows([code] for code in codes_50)

In [None]:
# Build the top-code version of each split: keep the notes whose HADM_ID is
# listed in <split>_50_hadm_ids.csv, reduce their labels to the codes in
# codes_50, and drop notes left without any label.
# NOTE(review): the id-list filename hard-codes 50 while the output filename
# uses Y -- confirm that this is intended.
top_codes = set(codes_50)  # built once; it is the same for every row
for splt in ['train', 'dev', 'test']:
    print(splt)
    with open('%s/%s_50_hadm_ids.csv' % (MIMIC_3_DIR, splt), 'r') as f:
        # one id per line, compared as strings against the csv field below
        hadm_ids = {line.rstrip() for line in f}
    # newline='' is what the csv module expects of the files it is given
    with open('%s/notes_labeled.csv' % MIMIC_3_DIR, 'r', newline='') as f:
        with open('%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y)), 'w', newline='') as of:
            r = csv.reader(f)
            w = csv.writer(of)
            #header
            w.writerow(next(r))
            for row in r:
                hadm_id = row[1]
                if hadm_id not in hadm_ids:
                    continue
                filtered_codes = top_codes.intersection(str(row[3]).split(';'))
                if filtered_codes:
                    # sorted so the written label order is the same on every
                    # run (set iteration order is not)
                    w.writerow(row[:3] + [';'.join(sorted(filtered_codes))])

In [None]:
# For each top-Y split, add a 'length' column (number of whitespace-separated
# words in TEXT), sort the rows by it in ascending order, and write the result
# back over the same file.
# `df` is rebound on every pass and holds the sorted test split afterwards.
for splt in ['train', 'dev', 'test']:
    filename = '%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y))
    df = pd.read_csv(filename)
    # Computed straight from the TEXT column: no per-row Series construction,
    # and an empty split simply yields an empty column.
    df['length'] = [len(str(text).split()) for text in df['TEXT']]
    df = df.sort_values(['length'])
    df.to_csv('%s/%s_%s.csv' % (MIMIC_3_DIR, splt, str(Y)), index=False)

# STATIC FEATURE GENERATION
## DEMOGRAPHIC DATA

In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import json


In [None]:
## Static features planned per admission:
##   length of stay, admission type, admission dx, admission location,
##   discharge location, ethnicity, gender, age (at admission), insurance,
##   marital status

## Placeholder location of the MIMIC-III CSV files; it is prefixed to the file names as-is
path_to_files = '<<CS598-DLHC/MIMIC-III/>>'

## The date columns are parsed on load so they can be subtracted later
patients = pd.read_csv(path_to_files + 'PATIENTS.csv', parse_dates=['DOB'])
admissions = pd.read_csv(path_to_files + 'ADMISSIONS.csv', parse_dates=['ADMITTIME', 'DISCHTIME'])

## Each patient must appear exactly once before being used as the join index
assert patients['SUBJECT_ID'].is_unique
patients = patients.set_index("SUBJECT_ID")

## Left join: one output row per admission; overlapping patient columns get the 'pat' suffix
static_df = admissions.join(patients, on='SUBJECT_ID', how='left', rsuffix='pat')
assert static_df.shape[0] == admissions.shape[0]

def bin_age(age):
    """Return the age-bracket label for an age given in years.

    Each bracket's upper bound is exclusive. Any value that is not below 89,
    including one that compares False against every bound (such as NaN),
    is labelled '89+'.
    """
    brackets = ((25, '18-25'), (45, '25-45'), (65, '45-65'), (89, '65-89'))
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return '89+'
    
def bin_los(los):
    """Return the length-of-stay bracket label for a stay given in days.

    Each bracket's upper bound is exclusive. Any value that is not below 15,
    including one that compares False against every bound (such as NaN),
    is labelled '15+'.
    """
    brackets = ((2, '1-2'), (4, '3-4'), (7, '5-7'), (10, '8-10'), (15, '10-15'))
    for upper_bound, label in brackets:
        if los < upper_bound:
            return label
    return '15+'

## Length of stay: discharge minus admission time in days, then binned
static_df['LENGTH_OF_STAY'] = (
    (static_df['DISCHTIME'] - static_df['ADMITTIME']) / np.timedelta64(1, 'D')
).apply(bin_los)

## Age at admission: whole days since DOB divided by 365.242, then binned
## NOTE(review): very large ADMITTIME - DOB gaps may exceed the range of a
## nanosecond timedelta -- confirm this subtraction is safe for the data
static_df['AGE'] = (
    (static_df['ADMITTIME'] - static_df['DOB']).dt.days / 365.242
).apply(bin_age)

## Output DF: keep only the id columns and the static feature columns
static_df = static_df.filter(items=['HADM_ID',
                                    'SUBJECT_ID',
                                    'LENGTH_OF_STAY',
                                    'ADMISSION_TYPE',
                                    'DIAGNOSIS',
                                    'ADMISSION_LOCATION',
                                    'DISCHARGE_LOCATION',
                                    'ETHNICITY',
                                    'GENDER',
                                    'AGE',
                                    'INSURANCE',
                                    'MARITAL_STATUS'])

static_df.to_csv('output_df.csv')
## Number of distinct values per column (a missing value counts as one value)
for col in static_df.columns:
    print(static_df[col].nunique(dropna=False))

In [None]:
def generate_static_features(input_df, out_path = ''):
    """Encode each row's categorical values as integer indices and save them as JSON.

    Two files are written into ``out_path``:

    * ``static_feature_dict.json`` maps the value in the first column of each
      row (the admission id) to the list of value indices for that row.
    * ``static_feature_index_dict.json`` maps each distinct value to its index.

    Indices are assigned in order of first appearance, starting at 0, and are
    shared by all columns. The DIAGNOSIS, HADM_ID and SUBJECT_ID columns are
    not encoded.

    Args:
        input_df: DataFrame whose first column holds the admission id.
        out_path: Directory to write the JSON files to (default: current directory).
    """
    skipped = ('DIAGNOSIS', 'HADM_ID', 'SUBJECT_ID')
    # Column positions to encode. Access stays positional, but goes through
    # .iloc because plain integer indexing on a label-indexed row is deprecated.
    feature_positions = [pos for pos, col in enumerate(input_df.columns)
                         if col not in skipped]

    feature_dict = dict()
    feature_val_dict = dict()
    for _, row in input_df.iterrows():
        encoded = []
        for pos in feature_positions:
            val = row.iloc[pos]
            # NOTE(review): missing values (NaN) are used as dictionary keys
            # as-is -- confirm that is the intended encoding.
            if val not in feature_val_dict:
                feature_val_dict[val] = len(feature_val_dict)
            encoded.append(feature_val_dict[val])
        feature_dict[row.iloc[0]] = encoded

    # Context managers make sure both files are flushed and closed.
    with open(os.path.join(out_path, 'static_feature_dict.json'), 'w') as out_file:
        json.dump(feature_dict, out_file)
    with open(os.path.join(out_path, 'static_feature_index_dict.json'), 'w') as out_file:
        json.dump(feature_val_dict, out_file)

# Write the static-feature JSON files for static_df; out_path keeps its default ''.
generate_static_features(static_df)

## MEDICATION DATA

In [None]:
# Load the prescriptions and keep only rows for the admissions found in `df`.
meds = pd.read_csv(path_to_files + 'PRESCRIPTIONS.csv')


# NOTE(review): `df` is whatever an earlier cell last bound to that name
# (the length-sorting loops rebind it per split) -- confirm it holds the
# intended set of admissions at this point.
hadm_id = list(df['HADM_ID'].unique())
meds = meds[meds['HADM_ID'].isin(hadm_id)]
# The next expression is evaluated and discarded (it is not the last line of
# the cell); presumably a leftover spot check of one admission.
meds[meds['HADM_ID'] == 134157]
print(len(hadm_id))

# Keep only the two columns generate_med_features reads, in this order.
meds_trimmed = meds.filter(items=['HADM_ID',
                                  'NDC'])

def generate_med_features(hadm_ids, med_df, out_path = ''):
    """Encode each admission's medications as integer indices and save them as JSON.

    Two files are written into ``out_path``:

    * ``meds_feature_dict.json`` maps each admission id (as a string) to the
      list of medication indices found for it, in row order. Admissions with
      no rows in ``med_df`` map to an empty list.
    * ``meds_feature_index_dict.json`` maps each medication code (as a string)
      to its index.

    Indices are assigned in order of first appearance, starting at 1.

    Args:
        hadm_ids: Iterable of admission ids; each must be convertible with int().
        med_df: DataFrame whose first column is the admission id and whose
            second column is the medication code.
        out_path: Directory to write the JSON files to (default: current directory).

    Raises:
        KeyError: If ``med_df`` contains an admission id missing from ``hadm_ids``.
    """
    # tqdm only draws a progress bar; fall back to plain iteration without it.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable):
            return iterable

    med_feature_dict = {str(int(hadm_id)): [] for hadm_id in hadm_ids}
    meds_dict = dict()

    for _, row in tqdm(med_df.iterrows()):
        # .iloc keeps the access positional; plain integer indexing on a
        # label-indexed row is deprecated.
        hadm_id = str(int(row.iloc[0]))
        ndc = str(row.iloc[1])
        if ndc not in meds_dict:
            meds_dict[ndc] = len(meds_dict) + 1
        med_feature_dict[hadm_id].append(meds_dict[ndc])

    # Context managers make sure both files are flushed and closed.
    with open(os.path.join(out_path, 'meds_feature_dict.json'), 'w') as out_file:
        json.dump(med_feature_dict, out_file)
    with open(os.path.join(out_path, 'meds_feature_index_dict.json'), 'w') as out_file:
        json.dump(meds_dict, out_file)

# Write the medication-feature JSON files; out_path keeps its default ''.
generate_med_features(hadm_id, meds_trimmed)

# MODEL TRAINING
## TRAIN NEW ALPACA MODEL - 50 Labels

In [None]:
!python learn/training.py mimicdata/mimic3/train_50.csv mimicdata/mimic3/vocab.csv 50 alpaca 200 --filter-size 10 --num-filter-maps 50 --dropout 0.2 --patience 10 --criterion prec_at_5 --lr 0.0001 --embed-file mimicdata/mimic3/processed_full.embed --gpu

## TRAIN NEW ALPACA MODEL - FULL Labels

In [None]:
!python learn/training.py mimicdata/mimic3/train_full.csv mimicdata/mimic3/vocab.csv full alpaca 150 --filter-size 10 --num-filter-maps 50 --dropout 0.2 --patience 10 --lr 0.0001 --criterion prec_at_8 --embed-file mimicdata/mimic3/processed_full.embed --gpu ## --meds=1 --med-embed-size=100 --med-pool-size=5