In [1]:
import pandas as pd
from dateutil import parser
import matplotlib.pyplot as plt
from nltk.tokenize.treebank import TreebankWordTokenizer
import re
import string
import identification_transformer 
from data_cleaning import word_tokenize_by_string
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle

### Build or load the vocabulary

In [11]:
def tf_idf(notes, k):
    """Return the k highest-weighted TF-IDF terms of every document.

    Args:
        notes: list of note strings, one entry per patient
            ([notes from patient1, notes from patient2, ...]).
        k: the number of terms to keep per document. When k is larger than
            the fitted vocabulary, every vocabulary term is returned.

    Returns:
        A list with one entry per document; each entry is the list of that
        document's top terms ordered from highest to lowest TF-IDF weight.
    """
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize_by_string)
    weights = vectorizer.fit_transform(notes)  # kept sparse: (n_docs, n_terms)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = list(vectorizer.get_feature_names_out())
    else:
        terms = vectorizer.get_feature_names()

    top_terms = []
    for row in range(weights.shape[0]):
        # Densify a single row at a time so memory stays proportional to the
        # vocabulary size instead of n_docs * n_terms.
        row_weights = weights[row].toarray().ravel()
        best_columns = row_weights.argsort()[-k:][::-1]
        top_terms.append([terms[col] for col in best_columns])
    return top_terms

In [None]:
#for tf-idf
# Build the vocabulary as the union of every document's top-500 TF-IDF terms.
# The notes are processed in chunks of 5000 documents; the chunk boundaries
# are derived from len(notes) so no trailing notes are skipped and no empty
# chunk is handed to tf_idf (an empty chunk cannot be vectorized).
# NOTE(review): `notes` is not defined above this cell in the visible
# notebook - confirm which cell is expected to provide it.
vocab = set()
for chunk_no, start in enumerate(range(0, len(notes), 5000)):
    print(chunk_no)
    for top_terms in tf_idf(notes[start:start + 5000], 500):
        # In-place update avoids rebuilding the whole set for every document.
        vocab.update(top_terms)

In [228]:
# Load the vocabulary array stored in voc_100.npy (this rebinds `vocab`).
vocab = np.load("voc_100.npy")
# Set version of the vocabulary, used for membership tests when encoding notes.
vocab_set = set(vocab)

In [230]:
# Number of entries in the loaded vocabulary.
len(vocab)

181443

In [229]:
# Assign every vocabulary word a 1-based integer id and keep the reverse
# table alongside it; position 0 of the reverse table holds the empty string.
ind_to_word = ['']
word_to_ix = {}
for position, voc in enumerate(vocab, start=1):
    word_to_ix[voc] = position
    ind_to_word.append(voc)

### cleaned_notes

In [12]:
import pickle

# Load the cleaned-notes table. The context manager closes the file handle
# as soon as unpickling finishes instead of leaving it open.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open('cleaned_notes.pickle', 'rb') as cleaned_notes_file:
    reader = pickle.load(cleaned_notes_file)

  


In [47]:
# Show which columns the unpickled notes table provides.
reader.keys()

dict_keys(['CHARTTIME', 'DOCUMENTS', 'SUBJECT_ID'])

In [48]:
# Number of CHARTTIME entries in the notes table.
len(reader['CHARTTIME'])

1280007

### Sort cleaned notes by chart time

In [15]:
# Positions of the notes ordered by ascending CHARTTIME (stable sort, so
# notes sharing a timestamp keep their original relative order).
sort_index = sorted(range(len(reader['CHARTTIME'])), key=reader['CHARTTIME'].__getitem__)

In [17]:
# Read the cleaned note texts from CSV; the DOCUMENTS column is used below.
cleaned_notes = pd.read_csv("cleaned_notes.csv")

In [18]:
# Re-order the three parallel columns by chart time using sort_index.
# NOTE(review): the note text comes from cleaned_notes.csv while times and ids
# come from the pickle - this assumes both share the same row order; confirm.
charttime = [reader['CHARTTIME'][pos] for pos in sort_index]
subjectid = [reader['SUBJECT_ID'][pos] for pos in sort_index]
cleanednotes = [cleaned_notes['DOCUMENTS'][pos] for pos in sort_index]

### patient

In [51]:
# Number of distinct patients (unique SUBJECT_ID values) in the notes table.
patient_number = len(set(reader['SUBJECT_ID']))
patient_number

34872

In [130]:
# Total number of notes; passed to the get_notes_vec_* functions as the
# upper bound of their scan over the sorted note lists.
note_number = len(reader['SUBJECT_ID'])
note_number

1280007

In [52]:
# Distinct patient ids as a list. The order comes from iterating a set, so
# it is not a meaningful ordering of the patients.
all_patient_list = list(set(reader['SUBJECT_ID']))

In [53]:
# Sanity check: should equal patient_number computed above.
len(all_patient_list)

34872

In [170]:
# Load the per-patient timesheet records (indexed by patient id below).
# The context manager closes the file handle once unpickling finishes.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open('patient_timesheet_final.pickle', 'rb') as patient_file:
    patient = pickle.load(patient_file)

In [149]:
# Inspect the record stored under key 4 to see the available fields.
patient[4]

{'ADMITTIME': array(['2191-03-16T00:28:00.000000000'], dtype='datetime64[ns]'),
 'CHARTTIME': array(['2191-03-19T16:53:00.000000000', '2191-03-22T10:13:00.000000000',
        '2191-03-18T08:10:00.000000000', '2191-03-21T18:12:00.000000000',
        '2191-03-15T19:55:00.000000000', '2191-03-15T16:20:00.000000000',
        '2191-03-16T12:57:00.000000000', '2191-03-16T15:27:00.000000000',
        '2191-03-16T18:52:00.000000000', '2191-03-17T06:41:00.000000000',
        '2191-03-16T05:01:00.000000000'], dtype='datetime64[ns]'),
 'CHARTTIME_interval': array([5305, 9225, 3342, 8264, -273, -488,  749,  899, 1104, 1813,  273], dtype='timedelta64[m]'),
 'CHARTTIME_todeath': array([ 5868,  1948,  7831,  2909, 11446, 11661, 10424, 10274, 10069,
         9360, 10900], dtype='timedelta64[m]'),
 'CHARTTIME_valid': [numpy.datetime64('2191-03-19T16:53:00.000000000'),
  numpy.datetime64('2191-03-22T10:13:00.000000000'),
  numpy.datetime64('2191-03-18T08:10:00.000000000'),
  numpy.datetime64('2191-03-21

In [190]:
# Load the precomputed patient-id splits, keyed by time window.
# The context manager closes the file handle once unpickling finishes.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open('patient_id_splited_data_Nov28.p', 'rb') as split_file:
    split = pickle.load(split_file)

In [168]:
# Load the per-window note selections; the export loop below reads column 0
# as the patient id and column 1 as the note time.
# The context manager closes the file handle once unpickling finishes.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open('subID_notes_selected_Nov28.p', 'rb') as selected_file:
    notes_selected = pickle.load(selected_file)

### get data

In [63]:
def get_label(idx_dict, patient_records=None):
    """Build the mortality label and patient-id arrays for one data split.

    Args:
        idx_dict: mapping patient id -> row position in the output arrays.
        patient_records: optional mapping patient id -> record holding a
            'DEATHTIME' sequence. Defaults to the notebook-level `patient`.

    Returns:
        (mortality_label, corresponding_id): two arrays of length
        len(idx_dict). mortality_label[row] is 1.0 when the patient's first
        DEATHTIME entry is a real timestamp and 0.0 when it is missing (NaT);
        corresponding_id[row] is the patient id placed on that row.
    """
    records = patient if patient_records is None else patient_records
    size = len(idx_dict)
    mortality_label = np.zeros(size)
    corresponding_id = np.zeros(size, dtype=int)
    for key, row in idx_dict.items():
        # `value == np.datetime64('NaT')` is always False on current NumPy
        # (NaT compares like NaN), which would label every patient as dead.
        # pd.isnull detects NaT reliably.
        died = not pd.isnull(records[key]['DEATHTIME'][0])
        mortality_label[row] = 1 if died else 0
        corresponding_id[row] = key
    return mortality_label, corresponding_id
    

In [77]:
def get_idx_dict(split):
    """Map each patient id in `split` to a dense, 0-based row position.

    Args:
        split: iterable of patient ids; repeated ids are allowed.

    Returns:
        dict id -> position. Positions follow first-occurrence order and are
        always exactly 0 .. len(result) - 1, so they can index arrays of
        length len(result) even when `split` contains duplicates.
    """
    idx_dict = {}
    for idx in split:
        # Only the first occurrence gets a position; counting duplicates
        # would leave gaps and produce positions >= len(idx_dict).
        if idx not in idx_dict:
            idx_dict[idx] = len(idx_dict)
    return idx_dict


In [81]:
from datetime import datetime, timedelta

# Unix epoch as a naive datetime (same value utcfromtimestamp(0) produced;
# utcfromtimestamp is deprecated since Python 3.12).
epoch = datetime(1970, 1, 1)

def dt_from_ms(ms):
    """Convert milliseconds since the Unix epoch to a naive datetime."""
    # timedelta arithmetic avoids the float division by 1000 and the platform
    # limits of utcfromtimestamp.
    return epoch + timedelta(milliseconds=ms)

def dt_to_ms(dt):
    """Convert a naive datetime to whole milliseconds since the Unix epoch."""
    delta = dt - epoch
    return int(delta.total_seconds() * 1000)

def dt64_to_ms(dt):
    """Convert a numpy datetime64 to whole milliseconds since the Unix epoch.

    Works for any datetime64 unit: the value is first normalised to
    nanoseconds. (Subtracting a microsecond-unit epoch from a coarser-unit
    value yields a datetime.timedelta from .item(), which cannot be turned
    into an int.)
    """
    nanoseconds = int(np.datetime64(dt, 'ns').astype(np.int64))
    return int(nanoseconds / 1e6)


In [210]:
def get_notes_vec_partial(note_number, time_set, idx_dict):
    """Encode the selected notes of a patient split, sentence by sentence.

    Scans the notebook-level lists `subjectid`, `charttime` and `cleanednotes`
    for positions 0 .. note_number - 1. A note is kept when its patient is in
    `idx_dict` and the pair (patient id, chart time in ms) is in `time_set`.

    Args:
        note_number: how many leading notes of the global lists to scan.
        time_set: set of (patient id, chart time in ms) pairs to keep.
        idx_dict: mapping patient id -> row position in the outputs.

    Returns:
        (notes_vec, note_index, timetodeath_vec): object arrays with one list
        per patient row. notes_vec holds, per kept note, a list of sentences
        (split on "."), each a list of word ids from the global `word_to_ix`
        for words found in the global `vocab_set`. note_index holds the kept
        note positions. timetodeath_vec holds the minutes between the note's
        chart time and the patient's first DISCHTIME entry.
    """
    n_rows = len(idx_dict)
    notes_vec = np.empty((n_rows,), dtype=object)
    note_index = np.empty((n_rows,), dtype=object)
    timetodeath_vec = np.empty((n_rows,), dtype=object)
    # Every row needs its own list object.
    for slot in range(n_rows):
        notes_vec[slot] = []
        note_index[slot] = []
        timetodeath_vec[slot] = []

    for i in range(note_number):
        subject = subjectid[i]
        if subject not in idx_dict:
            continue
        if (subject, dt_to_ms(charttime[i])) not in time_set:
            continue
        try:
            sentences = cleanednotes[i].split(".")
        except AttributeError:
            # The entry has no .split (it is not a string): skip the note.
            continue
        idxx = idx_dict[subject]
        note_index[idxx].append(i)
        # NOTE(review): named "time to death" but measured against DISCHTIME.
        minutes_left = (patient[subject]['DISCHTIME'][0] - np.datetime64(charttime[i])).astype('timedelta64[m]').astype(int)
        timetodeath_vec[idxx].append(minutes_left)
        encoded_note = [
            [word_to_ix[word] for word in sentence.strip(" ").split(" ") if word in vocab_set]
            for sentence in sentences
        ]
        notes_vec[idxx].append(encoded_note)
    return notes_vec, note_index, timetodeath_vec

In [256]:
def get_notes_vec_concat(note_number, time_set, idx_dict):
    """Encode the selected notes of a patient split as flat word-id lists.

    Same selection as get_notes_vec_partial: a note at position i of the
    notebook-level lists is kept when its patient is in `idx_dict` and
    (patient id, chart time in ms) is in `time_set`. Sentences are still
    split on ".", but their word ids are concatenated into one list per note.

    Args:
        note_number: how many leading notes of the global lists to scan.
        time_set: set of (patient id, chart time in ms) pairs to keep.
        idx_dict: mapping patient id -> row position in the outputs.

    Returns:
        (notes_vec, note_index, timetodeath_vec): object arrays with one list
        per patient row, holding per kept note the flat list of word ids
        (global `word_to_ix`, filtered by global `vocab_set`), the note
        position, and the minutes from chart time to the first DISCHTIME.
    """
    n_rows = len(idx_dict)
    notes_vec = np.empty((n_rows,), dtype=object)
    note_index = np.empty((n_rows,), dtype=object)
    timetodeath_vec = np.empty((n_rows,), dtype=object)
    # Every row needs its own list object.
    for slot in range(n_rows):
        notes_vec[slot] = []
        note_index[slot] = []
        timetodeath_vec[slot] = []

    for i in range(note_number):
        subject = subjectid[i]
        if subject not in idx_dict:
            continue
        if (subject, dt_to_ms(charttime[i])) not in time_set:
            continue
        try:
            sentences = cleanednotes[i].split(".")
        except AttributeError:
            # The entry has no .split (it is not a string): skip the note.
            continue
        idxx = idx_dict[subject]
        note_index[idxx].append(i)
        # NOTE(review): named "time to death" but measured against DISCHTIME.
        minutes_left = (patient[subject]['DISCHTIME'][0] - np.datetime64(charttime[i])).astype('timedelta64[m]').astype(int)
        timetodeath_vec[idxx].append(minutes_left)
        flat_ids = [
            word_to_ix[word]
            for sentence in sentences
            for word in sentence.strip(" ").split(" ")
            if word in vocab_set
        ]
        notes_vec[idxx].append(flat_ids)
    return notes_vec, note_index, timetodeath_vec

In [257]:
# List the time windows for which a patient split is available.
split.keys()

dict_keys(['24h', '6h', '12h', '15m'])

In [372]:
#get data for 15m, 6h, 12h, 24h
# For every time window: build the patient -> row mappings, collect the
# (patient id, note time) pairs selected for that window, encode the notes of
# each split, attach the labels, and save one dictionary per split.
for key in split.keys():
    # Drop the trailing unit character ('24h' -> '24', '15m' -> '15') to form
    # the names used inside split[key].
    key1 = key[:-1]
    train_st = 'train_' + key1
    val_st = 'val_'+ key1
    test_st = 'test_' + key1
    train_idx_dict = get_idx_dict(split[key][train_st])
    val_idx_dict = get_idx_dict(split[key][val_st])
    test_idx_dict = get_idx_dict(split[key][test_st])
    # notes_selected names the 15-minute window '15min' instead of '15m'.
    if key == '15m':
        note_key = '15min'
    else:
        note_key = key
    # Column 1 is read as the note time and column 0 as the patient id.
    notetime= np.array(notes_selected[note_key])[:,1]
    idlist = np.array(notes_selected[note_key])[:,0] 
    # (patient id, time in ms) pairs; get_notes_vec_partial tests membership
    # in this set to decide which notes to keep.
    notetime_ms_list = [(idlist[i],dt64_to_ms(notetime[i])) for i in range(len(notetime))]
    notetime_set = set(notetime_ms_list)
    # Each call scans the first note_number notes again, once per split.
    train_note_vec, train_note_index, train_timetodeath_vec = get_notes_vec_partial(note_number, notetime_set, train_idx_dict)
    val_note_vec, val_note_index, val_timetodeath_vec = get_notes_vec_partial(note_number, notetime_set, val_idx_dict)
    test_note_vec, test_note_index, test_timetodeath_vec = get_notes_vec_partial(note_number, notetime_set, test_idx_dict)
    #train_note_vec, train_note_index, train_timetodeath_vec= get_notes_vec_concat(note_number, notetime_set, train_idx_dict)
    #val_note_vec, val_note_index, val_timetodeath_vec = get_notes_vec_concat(note_number, notetime_set, val_idx_dict)
    #test_note_vec, test_note_index, test_timetodeath_vec = get_notes_vec_concat(note_number, notetime_set, test_idx_dict)
    train_mortality_label, train_corresponding_id = get_label(train_idx_dict)
    val_mortality_label, val_corresponding_id = get_label(val_idx_dict)
    test_mortality_label, test_corresponding_id = get_label(test_idx_dict)   
    # Bundle the parallel arrays of each split into one dictionary.
    train = {}
    train["DATA"] = train_note_vec
    train["MORTALITY_LABEL"] = train_mortality_label
    train["SUBJECT_ID"] = train_corresponding_id
    train["NOTE_ID"] = train_note_index
    train["TIME_TO_DEATH"] = train_timetodeath_vec
    val = {}
    val["DATA"] = val_note_vec
    val["MORTALITY_LABEL"] = val_mortality_label
    val["SUBJECT_ID"] = val_corresponding_id
    val["NOTE_ID"] = val_note_index
    val["TIME_TO_DEATH"] = val_timetodeath_vec
    test = {}
    test["DATA"] = test_note_vec
    test["MORTALITY_LABEL"] = test_mortality_label
    test["SUBJECT_ID"] = test_corresponding_id
    test["NOTE_ID"] = test_note_index
    test["TIME_TO_DEATH"] = test_timetodeath_vec
    # The saved file names use the full window key (e.g. "train_24h").
    np.save("train_" + key, train)
    np.save("val_" + key, val)
    np.save("test_" + key, test)
    

  


### Datasets for the entire stay

In [244]:
#for all
def get_notes_vec_all(note_number, idx_dict, word_to_ix):
    """Encode every note of a patient split, sentence by sentence.

    Scans positions 0 .. note_number - 1 of the notebook-level lists
    `subjectid`, `charttime` and `cleanednotes`, keeping every note whose
    patient is in `idx_dict` (no time-window filter). The scan position is
    printed every 100000 notes as a progress indicator.

    Args:
        note_number: how many leading notes of the global lists to scan.
        idx_dict: mapping patient id -> row position in the outputs.
        word_to_ix: mapping word -> integer id. Words are filtered with the
            global `vocab_set`, so every word of that set must be a key here.

    Returns:
        (notes_vec, note_index, timetodeath_vec): object arrays with one list
        per patient row, holding per kept note the list of sentences (each a
        list of word ids), the note position, and the minutes from chart time
        to the patient's first DISCHTIME entry.
    """
    n_rows = len(idx_dict)
    notes_vec = np.empty((n_rows,), dtype=object)
    note_index = np.empty((n_rows,), dtype=object)
    timetodeath_vec = np.empty((n_rows,), dtype=object)
    # Every row needs its own list object.
    for slot in range(n_rows):
        notes_vec[slot] = []
        note_index[slot] = []
        timetodeath_vec[slot] = []

    for i in range(note_number):
        if i % 100000 == 0:
            print(i)
        subject = subjectid[i]
        if subject not in idx_dict:
            continue
        try:
            sentences = cleanednotes[i].split(".")
        except AttributeError:
            # The entry has no .split (it is not a string): skip the note.
            continue
        idxx = idx_dict[subject]
        note_index[idxx].append(i)
        # NOTE(review): named "time to death" but measured against DISCHTIME.
        minutes_left = (patient[subject]['DISCHTIME'][0] - np.datetime64(charttime[i])).astype('timedelta64[m]').astype(int)
        timetodeath_vec[idxx].append(minutes_left)
        encoded_note = [
            [word_to_ix[word] for word in sentence.strip(" ").split(" ") if word in vocab_set]
            for sentence in sentences
        ]
        notes_vec[idxx].append(encoded_note)
    return notes_vec, note_index, timetodeath_vec

In [269]:
#for all
def get_notes_vec_all_concat(note_number, idx_dict, word_to_ix):
    """Encode every note of a patient split as one flat word-id list.

    Scans positions 0 .. note_number - 1 of the notebook-level lists
    `subjectid`, `charttime` and `cleanednotes`, keeping every note whose
    patient is in `idx_dict` (no time-window filter). Sentences are split on
    "." but their word ids are concatenated into a single list per note. The
    scan position is printed every 100000 notes as a progress indicator.

    Args:
        note_number: how many leading notes of the global lists to scan.
        idx_dict: mapping patient id -> row position in the outputs.
        word_to_ix: mapping word -> integer id. Words are filtered with the
            global `vocab_set`, so every word of that set must be a key here.

    Returns:
        (notes_vec, note_index, timetodeath_vec): object arrays with one list
        per patient row, holding per kept note the flat list of word ids, the
        note position, and the minutes from chart time to the patient's first
        DISCHTIME entry.
    """
    n_rows = len(idx_dict)
    notes_vec = np.empty((n_rows,), dtype=object)
    note_index = np.empty((n_rows,), dtype=object)
    timetodeath_vec = np.empty((n_rows,), dtype=object)
    # Every row needs its own list object.
    for slot in range(n_rows):
        notes_vec[slot] = []
        note_index[slot] = []
        timetodeath_vec[slot] = []

    for i in range(note_number):
        if i % 100000 == 0:
            print(i)
        subject = subjectid[i]
        if subject not in idx_dict:
            continue
        try:
            sentences = cleanednotes[i].split(".")
        except AttributeError:
            # The entry has no .split (it is not a string): skip the note.
            continue
        idxx = idx_dict[subject]
        note_index[idxx].append(i)
        # NOTE(review): named "time to death" but measured against DISCHTIME.
        minutes_left = (patient[subject]['DISCHTIME'][0] - np.datetime64(charttime[i])).astype('timedelta64[m]').astype(int)
        timetodeath_vec[idxx].append(minutes_left)
        flat_ids = [
            word_to_ix[word]
            for sentence in sentences
            for word in sentence.strip(" ").split(" ")
            if word in vocab_set
        ]
        notes_vec[idxx].append(flat_ids)
    return notes_vec, note_index, timetodeath_vec

In [234]:
def data_split(patient_list, downsampling_rate = 0.3, seed = None):
    """Shuffle patients into 70/10/20 train/val/test and rebalance training.

    Args:
        patient_list: 2-D array of rows [patient_id, label], where label is 1
            for a death and 0 otherwise. The array is not modified.
        downsampling_rate: target fraction of label-1 rows in the training
            data. When the shuffled training part has at most this fraction
            of deaths, label-0 rows are dropped until the fraction is
            (approximately) reached; otherwise the training part is kept.
        seed: optional integer seed for a reproducible shuffle. With None the
            global numpy random state is used, as before.

    Returns:
        (train_ids, val_ids, test_ids): arrays of patient ids.

    Raises:
        ValueError: if the data cannot yield more than 10 deaths in the
            training part (the shuffle would otherwise retry forever).
    """
    patient_list = np.asarray(patient_list)
    num_patients = patient_list.shape[0]
    train_end = int(0.7 * num_patients)
    val_end = int(0.8 * num_patients)
    min_train_deaths = 10  # a shuffle is accepted only above this count

    if patient_list[:, 1].sum() <= min_train_deaths or train_end <= min_train_deaths:
        raise ValueError(
            'data_split needs more than {} deaths and more than {} training '
            'patients'.format(min_train_deaths, min_train_deaths))

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Reshuffle until the training part contains enough deaths. Indexing with
    # a permutation creates a copy, so the caller's array stays untouched.
    while True:
        shuffled = patient_list[rng.permutation(num_patients)]
        train_data = shuffled[:train_end]
        val_data = shuffled[train_end:val_end]
        test_data = shuffled[val_end:]
        if train_data[:, 1].sum() > min_train_deaths:
            break

    train_deaths = train_data[:, 1].sum()
    if train_deaths / len(train_data) <= downsampling_rate:
        # Keep only as many label-0 rows as needed for the target fraction.
        downsampling_size = int(train_deaths * (1 - downsampling_rate) / downsampling_rate)
        train_data_survive = train_data[train_data[:, 1] != 1][:downsampling_size]
        train_data_dead = train_data[train_data[:, 1] == 1]
        train_data = np.vstack((train_data_survive, train_data_dead))
        # Mix the two groups again after stacking them.
        train_data = train_data[rng.permutation(train_data.shape[0])]
        # The reported figure is the fraction of label-1 rows.
        print('The percentage of negative sample after downsampling is {:.1%}'.format(train_data[:, 1].sum() / len(train_data)))
    else:
        print('The percentage of negative sample is {:.1%}'.format(train_deaths / len(train_data)))

    return train_data[:, 0], val_data[:, 0], test_data[:, 0]

In [352]:
# Label every patient 1 when a death time is recorded and 0 otherwise.
# pd.isnull is used because `value != np.datetime64('NaT')` is always True on
# current NumPy (NaT compares like NaN), which would mark every patient dead.
patient_list_for_split = [[key, int(not pd.isnull(patient[key]['DEATHTIME'][0]))] for key in all_patient_list]
train_all, val_all, test_all = data_split(np.array(patient_list_for_split))

  """Entry point for launching an IPython kernel.


The percentage of negative sample after downsampling is 30.0%


In [354]:
# Number of training patient ids returned by data_split.
len(train_all)

11263

In [355]:
#for all
# Give every patient id of each split a dense 0-based row position, in
# first-occurrence order; repeated ids keep the position of their first hit.
train_idx_dict = {}
for idx in train_all:
    train_idx_dict.setdefault(idx, len(train_idx_dict))
val_idx_dict = {}
for idx in val_all:
    val_idx_dict.setdefault(idx, len(val_idx_dict))
test_idx_dict = {}
for idx in test_all:
    test_idx_dict.setdefault(idx, len(test_idx_dict))

In [356]:
# Encode all notes of the training patients as flat word-id lists
# (prints the scan position every 100000 notes).
train_notes_vec, train_note_index, train_timetodeath_vec = get_notes_vec_all_concat(note_number, train_idx_dict, word_to_ix)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000


In [357]:
# Same encoding for the validation patients.
val_notes_vec, val_note_index, val_timetodeath_vec = get_notes_vec_all_concat(note_number, val_idx_dict, word_to_ix)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000


In [358]:
# Same encoding for the test patients.
test_notes_vec, test_note_index, test_timetodeath_vec = get_notes_vec_all_concat(note_number, test_idx_dict, word_to_ix)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000


In [359]:
# Mortality labels and patient ids, aligned with the row positions of each
# split's idx dictionary.
train_mortality_label, train_corresponding_id = get_label(train_idx_dict)
val_mortality_label, val_corresponding_id = get_label(val_idx_dict)
test_mortality_label, test_corresponding_id = get_label(test_idx_dict)

  


In [360]:
# Bundle the training arrays (all aligned by patient row) into one dictionary.
train = {
    "DATA": train_notes_vec,
    "MORTALITY_LABEL": train_mortality_label,
    "SUBJECT_ID": train_corresponding_id,
    "NOTE_ID": train_note_index,
    "TIME_TO_DEATH": train_timetodeath_vec,
}

In [361]:
# Bundle the validation arrays (all aligned by patient row) into one dictionary.
val = {
    "DATA": val_notes_vec,
    "MORTALITY_LABEL": val_mortality_label,
    "SUBJECT_ID": val_corresponding_id,
    "NOTE_ID": val_note_index,
    "TIME_TO_DEATH": val_timetodeath_vec,
}

In [362]:
# Bundle the test arrays (all aligned by patient row) into one dictionary.
test = {
    "DATA": test_notes_vec,
    "MORTALITY_LABEL": test_mortality_label,
    "SUBJECT_ID": test_corresponding_id,
    "NOTE_ID": test_note_index,
    "TIME_TO_DEATH": test_timetodeath_vec,
}

In [363]:
# Persist the three whole-stay datasets. Each value is a Python dict, so
# np.save stores it as a pickled object.
np.save("test_all", test)
np.save("train_all", train)
np.save("val_all", val)

In [364]:
# Inspect the encoded notes stored on the first training row.
train['DATA'][0]

[[2810,
  52575,
  157049,
  52105,
  57420,
  178376,
  22927,
  106876,
  21430,
  49904,
  43315,
  6317,
  130635,
  51138,
  161757,
  89834,
  157049,
  6147,
  37224,
  44637,
  14514,
  77426,
  122038,
  179007,
  95858,
  25440,
  77426,
  34000,
  157049,
  31432,
  77426,
  10138,
  3138,
  179007,
  6161,
  172308,
  44119,
  77426,
  156037,
  141287,
  179007,
  103806,
  15199,
  102498,
  125638,
  55593,
  77426,
  82278,
  36466,
  65679,
  33867,
  40268,
  14514,
  10795,
  173721,
  177663,
  169845,
  179007,
  48871,
  33867,
  40268,
  14514,
  101811,
  131243,
  14514,
  77426,
  30312,
  179007,
  130577,
  142304,
  77426,
  122038,
  179007,
  95858,
  25440,
  77426,
  34000,
  157049,
  31432,
  77426,
  58750,
  14514,
  77426,
  122490,
  179007,
  172308,
  78202,
  94529,
  106360,
  133922,
  4610,
  156853,
  36466,
  12958,
  143978,
  40268,
  14514,
  101811,
  131243,
  158070,
  179007,
  180175,
  28519,
  135777,
  163236,
  178376,
  41767,

### tf-idf test

In [72]:
# Toy corpus of four short "notes" for checking the TF-IDF helper by hand.
string1 = "ad bd cd dd bd ad"
string2 = "bd cd dd ed ad"
string3 = "cd dd ed fd ad ad bd ad "
string4 = "cd ab ac ae ad ad ad"
# NOTE(review): `number` is not used by any later visible cell.
number = 3
notes = [string1, string2, string3, string4]
# Default vectorizer (no custom tokenizer), fitted on the toy corpus.
vectorizer = TfidfVectorizer()
# Dense TF-IDF matrix: one row per note, one column per vocabulary term.
vec = vectorizer.fit_transform(notes).toarray()
vec

array([[ 0.        ,  0.        ,  0.56612943,  0.        ,  0.69245756,
         0.28306471,  0.34622878,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.37075826,  0.        ,  0.45349057,
         0.37075826,  0.45349057,  0.56015108,  0.        ],
       [ 0.        ,  0.        ,  0.68921051,  0.        ,  0.28100113,
         0.22973684,  0.28100113,  0.3470923 ,  0.44024257],
       [ 0.41800463,  0.41800463,  0.65439647,  0.41800463,  0.        ,
         0.21813216,  0.        ,  0.        ,  0.        ]])

In [73]:
# Show the toy corpus.
notes

['ad bd cd dd bd ad',
 'bd cd dd ed ad',
 'cd dd ed fd ad ad bd ad ',
 'cd ab ac ae ad ad ad']

In [74]:
# Top-3 TF-IDF terms of each toy note, highest weight first.
tf_idf(notes, 3)

[['bd', 'ad', 'dd'],
 ['ed', 'dd', 'bd'],
 ['ad', 'fd', 'ed'],
 ['ad', 'ae', 'ac']]

In [70]:
# Inverse document frequency of each vocabulary term, in column order.
vectorizer.idf_

array([ 1.91629073,  1.91629073,  1.        ,  1.91629073,  1.22314355,
        1.        ,  1.22314355,  1.51082562,  1.91629073])

In [66]:
# Mapping from term to its column index in the TF-IDF matrix.
vectorizer.vocabulary_

{'ab': 0,
 'ac': 1,
 'ad': 2,
 'ae': 3,
 'bd': 4,
 'cd': 5,
 'dd': 6,
 'ed': 7,
 'fd': 8}