In [1]:
from gensim.models import Word2Vec

import pandas as pd
from sklearn.externals import joblib
import re
import numpy as np

In [None]:
# Load the MIMIC-III note events table from the CSV export in the working directory
noteevents_csv = "./NOTEEVENTS.csv"
dfall = pd.read_csv(noteevents_csv)

In [None]:
# Positional split: the first 10000 rows are used for training,
# the following 10000 rows for testing
df1 = dfall.iloc[:10000]
df2 = dfall.iloc[10000:20000]

In [None]:
# Cache both splits on disk so later sessions do not have to re-read the whole
# NOTEEVENTS file: training split first, then the test split
joblib.dump(df1, filename='data10.pkl')
joblib.dump(df2, filename='data10_2.pkl')

In [2]:
# Restore the cached splits (training frame first, test frame second).
# These files are pickles: only load copies that were written by this notebook.
df, df_test = (joblib.load(cached_path) for cached_path in ('data10.pkl', 'data10_2.pkl'))

In [3]:
# Keep the same four columns (chart date, category, description, note text)
# from both the training and the test split
note_columns = ['CHARTDATE','CATEGORY','DESCRIPTION','TEXT']
df_text = df[note_columns]
df_text_test = df_test[note_columns]

In [4]:
# Tokenise the training notes: keep letters only, lowercase, drop English stopwords
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
newline_pattern = re.compile('[\n]')
non_letter_pattern = re.compile('[^A-Za-z ]+')

sentences = []
for note in df_text.TEXT:
    # Newlines are turned into spaces first so that words on adjacent lines stay apart.
    # Every other non-letter character is deleted outright (not replaced by a space),
    # so words separated only by punctuation or digits end up joined together.
    letters_only = non_letter_pattern.sub('', newline_pattern.sub(" ", note))
    sentences.append([
        token
        for token in (raw.lower() for raw in letters_only.split())
        if token not in stop_words
    ])

In [5]:
# Fit Word2Vec on the tokenised training notes.
# min_count=1 keeps every token in the vocabulary, even ones that occur only once.
model = Word2Vec(sentences=sentences, min_count=1)

In [6]:
# Print the one-line textual summary of the trained model
model_summary = str(model)
print(model_summary)

Word2Vec(vocab=80296, size=100, alpha=0.025)


In [8]:
# Access the embedding vector for one word.
# The lookup goes through the model's keyed vectors (`.wv`), the same accessor the
# similarity query further down uses. Indexing the model object itself
# (`model['diabetes']`) is deprecated in gensim 3.x and no longer supported in gensim 4.
print(model.wv['diabetes'])

[ 1.2509873e+00 -9.6305281e-01  1.3721237e+00  2.8473589e+00
  2.8971467e+00  2.2011979e+00  1.1478058e+00 -3.0994160e+00
  2.3493876e+00 -3.2954478e+00 -3.5761768e-01 -3.9191098e+00
  5.2294856e-01 -1.8996678e-04 -2.8025994e+00 -2.8875210e+00
 -1.2023911e+00 -4.1957664e+00  1.8169845e+00  3.7110291e+00
 -2.4828880e+00  8.8331610e-01  1.1222864e+00 -1.3904045e+00
  8.4261829e-01  8.5373676e-01  2.3549817e+00 -4.1269212e+00
 -1.3950155e+00 -1.7305713e+00  2.8717198e+00 -2.0229577e-01
 -9.6777368e-01 -1.5844293e+00  4.7527674e-01 -2.6853926e+00
  3.3484895e+00  1.4967873e+00  1.9704129e+00 -1.4775780e+00
 -3.7606511e-02 -3.3936349e-01 -3.5885308e+00 -1.3591757e+00
 -8.8212341e-01  6.4079103e+00 -7.3442745e-01  2.0794433e-01
 -2.1207695e+00  2.8285511e+00  2.9344249e-01  2.1233919e+00
  1.0667479e+00 -1.5349110e+00 -9.2842424e-01 -1.2911180e+00
 -9.2309093e-01 -2.7894027e+00 -7.9069728e-01  1.3244954e+00
  4.0246825e+00  1.1276380e+00  3.0100033e-01 -1.9238749e+00
 -4.7430444e+00 -2.30121

  


In [9]:
# Persist the trained model so later sessions can reload it instead of retraining
model_file = 'model.bin'
model.save(model_file)

In [10]:
# Reload the persisted model and print its summary as a quick sanity check
new_model = Word2Vec.load('model.bin')
reloaded_summary = str(new_model)
print(reloaded_summary)

Word2Vec(vocab=80296, size=100, alpha=0.025)


In [11]:
# Use the single context word "disease" and ask the model for its one top prediction
disease_context = ['disease']
top_prediction = new_model.predict_output_word(disease_context, topn=1)
print(top_prediction)

[('infectious', 0.9999976)]


In [12]:
# Query the ten words the model considers most similar to "diabetes"
diabetes_neighbours = new_model.wv.most_similar(positive=['diabetes'], topn=10)
print(diabetes_neighbours)

[('dibetes', 0.7324691414833069), ('dependentdiabetes', 0.7132846117019653), ('diabtes', 0.6703452467918396), ('dm', 0.668785035610199), ('mellitus', 0.6328900456428528), ('diabete', 0.6298532485961914), ('noninsulin', 0.6298023462295532), ('insulindependent', 0.6017777919769287), ('noninsulindependent', 0.5867738127708435), ('triopathy', 0.5733495950698853)]


In [13]:
# Display the text from the first medical note in the test set.
# The lookup is positional (.iloc[0]) rather than by row label: the test frame keeps
# the row labels of the slice it was cut from, so a hard-coded label such as 10000
# only matches the first row for that particular split.
print(df_text_test.TEXT.iloc[0])

Admission Date:  [**2116-11-16**]              Discharge Date:   [**2116-11-20**]

Date of Birth:  [**2035-12-10**]             Sex:   M

Service: MEDICINE

Allergies:
Sulfonamides / Iodine; Iodine Containing / Influenza Virus
Vaccine

Attending:[**First Name3 (LF) 6195**]
Chief Complaint:
Found down, hypotensive in the ED


Major Surgical or Invasive Procedure:
None


History of Present Illness:
80 yo man with history of neuroendocrine tumor s/p colostomy and
chemotherapy, paroximal atrial fibrillation on coumadin who
presents being found down by wife. [**Name (NI) **] had been feeling weak for
4 days. Then today, he was taking his sock off when he slipped
off the bed and fell to floor. His wife reports that he did not
hit his head.  He was unable to crawl to phone to call 911. His
wife arrived after approximately 3 hours and called EMS. BP
initially 140/90.
.
In the ED: VS 102.4, 150/88, HR 94, RR16, 98% RA, EKG no change
per ED (not avail). Labs: lactate 1.0, Creatinine, Hct at
base

In [14]:
# Context words come from the line
# "1. Prostate cancer status post definitive radiation treatment"
# found in the first test text above, with the word "cancer" left out.

# The model returns "cancer", the exact missing word, as its top prediction.
held_out_context = ['prostate', 'status', 'post', 'definitive', 'radiation']
print(new_model.predict_output_word(held_out_context, topn=1))

[('cancer', 0.029171577)]


In [16]:
# Read the disease names and their synonyms from the first worksheet of the workbook.
# NOTE: this rebinds `df`, which until now held the training notes frame; from here on
# `df` is the disease table.
disease_workbook = "disease_names_synonyms_AAA.xlsx"
xl = pd.ExcelFile(disease_workbook)
df = xl.parse("Sheet1")

In [17]:
# Keep the disease name column together with its four synonym columns
disease_columns = ['Name','Synonym_1','Synonym_2','Synonym_3','Synonym_4']
df_diseases = df[disease_columns]

In [18]:
# Build the disease dictionary from every non-missing entry of the name column and
# the four synonym columns.
#
# NaN never compares equal to anything, not even to itself, so a test of the form
# `value == np.nan` is always False and cannot filter missing cells. pd.notna() is
# the reliable missing-value check.
#
# NOTE(review): keys are stored exactly as they appear in the spreadsheet, while the
# note tokens looked up against this dictionary are lowercased single words. Confirm
# the spreadsheet entries are in that form.
all_diseases = {}

for column in ['Name', 'Synonym_1', 'Synonym_2', 'Synonym_3', 'Synonym_4']:
    for disease in df[column]:
        if pd.notna(disease):
            # The value 1 is only a placeholder; the dictionary is used for membership tests
            all_diseases[disease] = 1

In [19]:
# Tokenise every test note the same way as the training notes:
# letters only, lowercase, English stopwords removed
testsentences = []

for note in df_text_test.TEXT:
    # newlines become spaces; every other non-letter character is deleted
    flattened = re.sub('[\n]', " ", note)
    letters_only = re.sub('[^A-Za-z ]+', '', flattened)
    lowered_tokens = map(str.lower, letters_only.split())
    testsentences.append([word for word in lowered_tokens if word not in stop_words])

In [20]:
# Produce the actual testing data: select one word from each medical note and store
# its surrounding context words, together with a label saying whether the selected
# word is in the disease dictionary.
#
# Notes with fewer than 8 tokens are skipped. They have no word with 3 words before
# and enough words after it, and np.random.randint(3, slen - 4) raises ValueError
# when its upper bound is not greater than 3.
#
# The random draw is not seeded in this cell, so the randomly selected words can
# differ between runs.
test_examples = []
test_correct_outputs = []

min_note_length = 8

cutoffindex = 0
for testsentence in testsentences:
    # Only the first 5000 medical notes are scanned for a dictionary word
    scan_for_disease = cutoffindex < 5000
    cutoffindex += 1

    slen = len(testsentence)
    if slen < min_note_length:
        # too short to yield a word with a full context window
        continue

    diseasefound = False
    wordindex = 0

    if scan_for_disease:
        # Take the first dictionary word that has at least 3 words before it
        wordindex = 3
        while wordindex < (slen - 4):
            if testsentence[wordindex] in all_diseases:
                diseasefound = True
                break
            wordindex += 1

    # Otherwise pick a random word with at least 3 words before and after it in the text
    if not diseasefound:
        wordindex = np.random.randint(3, slen - 4)

    # Label: 1 when the selected word is a dictionary entry, else 0.
    # A randomly drawn word can also happen to be one.
    test_correct_outputs.append(1 if testsentence[wordindex] in all_diseases else 0)

    # The context is the 3 words before and the 3 words after the selected word;
    # the selected word itself is left out
    test_examples.append(testsentence[(wordindex - 3):wordindex] + testsentence[(wordindex + 1):(wordindex + 4)])

In [21]:
# Sanity check: show the context words stored for the first test example
first_example = test_examples[0]
print(first_example)
# Number of positive examples (examples for which the selected word was in the
# disease dictionary); it should be close to 5000
positive_count = sum(test_correct_outputs)
print(positive_count)

['iodine', 'iodine', 'containing', 'virus', 'vaccine', 'attendingfirst']
4914


In [22]:
# Run the model on the created test data. An example counts as a disease prediction
# (result 1) when any of the model's top predictions is in the disease dictionary,
# otherwise the result is 0.
test_results = []

for myexample in test_examples:
    # topn is the number of predictions the model outputs; both values of 1 and 10
    # were tried
    mypred = new_model.predict_output_word(myexample, topn = 1)
    # predict_output_word returns None instead of a list when every context word is
    # outside the model's vocabulary; treat that as "no prediction" so the example
    # gets result 0 rather than crashing the loop
    if mypred is None:
        mypred = []
    # each prediction is a (word, probability) pair; only the word is checked
    myresult = 1 if any(pred[0] in all_diseases for pred in mypred) else 0
    test_results.append(myresult)

In [24]:
# Tally the four outcomes by comparing each prediction with its label
disease_correct = 0       # true positives:  predicted disease, label disease
disease_incorrect = 0     # false negatives: predicted non-disease, label disease
nondisease_correct = 0    # true negatives:  predicted non-disease, label non-disease
nondisease_incorrect = 0  # false positives: predicted disease, label non-disease

for position, actual in enumerate(test_correct_outputs):
    predicted = test_results[position]
    # the four cases are mutually exclusive, so at most one counter moves per example
    if predicted == 1 and actual == 1:
        disease_correct += 1
    elif predicted == 0 and actual == 0:
        nondisease_correct += 1
    elif predicted == 1 and actual == 0:
        nondisease_incorrect += 1
    elif predicted == 0 and actual == 1:
        disease_incorrect += 1

# Printed in the order: true positives, false negatives, true negatives, false positives
for tally in (disease_correct, disease_incorrect, nondisease_correct, nondisease_incorrect):
    print(tally)

687
4227
4979
107
