In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import dizzy_util as util
import pandas as pd
pd.set_option('display.max_rows', 100)

## data preparation

In [3]:
dataset = util.extractDataset("B00_ML4TrgPos_", {"WithRole", "AllMed", "Vital", "HF", "DispensedDrug"})



Unnamed: 0,TABLE_CATALOG,TABLE_SCHEMA,TABLE_NAME,TABLE_TYPE
0,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_01_04_cohort,BASE TABLE
1,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_05_04_Rad,BASE TABLE
2,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_30_Note_WithRole,BASE TABLE
3,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_30_Note,BASE TABLE
4,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_04_04_Lab,BASE TABLE
5,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_06_Med_04_RxOutpat,BASE TABLE
6,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_06_Med_07_NonVAMed,BASE TABLE
7,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_06_Med_05_BCMA_Dispense...,BASE TABLE
8,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_06_Med_08_AllMed,BASE TABLE
9,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_08_Consult,BASE TABLE




dict_keys(['cohort', 'Rad', 'Note', 'Lab', 'RxOutpat', 'NonVAMed', 'Consult', 'ICD', 'new', 'Demorgraphics'])




In [4]:
dizziness_df, label_map = util.retrieveLabels() # labeled
label_df = util.convertLabelMap(label_map) # PatientSSN | Label (100)
datasubset = util.extractAndStandarizeCohort(dataset, "Dizziness") # dataset for dizziness (All).
edstart = util.extractFirstVisitIndexDatetime(datasubset["cohort"]) # get first entry of each patient

In [5]:
datasubset["cohort"] = datasubset["cohort"].merge(label_df.rename(columns={"PatientSSN":"patientSSN"})) # concat label

In [6]:
def extractSubset(ds, subsetNames):
    pkeys = util.getPrimaryKeys(dataset)
    cohort_key = pkeys["cohort"]
    cohort_subset = ds["cohort"][ds["cohort"].Label.isin(subsetNames)]
    ids = pd.DataFrame({cohort_key: cohort_subset[cohort_key].unique()})
    dss = dict()
    for table, df in ds.items():
        dss[table] = ids.merge(df, how="inner", left_on=cohort_key, right_on=pkeys[table])
        if pkeys[table] != cohort_key:
            dss[table] = dss[table].drop([cohort_key], axis = 1)
        dss[table][pkeys[table]] = dss[table][pkeys[table]].astype(int)
    return dss

In [8]:
modsubset = extractSubset(datasubset, ["MOD"]) # dizziness MOD cases
nomodsubset = extractSubset(datasubset, ["NoMOD"]) # dizziness no-MOD cases

In [9]:
def getEDorPhysician(df):
    pattern = 'EMERGENCY DEPT NOTE|PHYSICIAN NOTE'
    return df[df.TIUStandardTitle.str.contains(pattern)]

In [10]:
mod_ed_or_physician_df = getEDorPhysician(modsubset['Note']).sort_values("EntryDateTime")
nomod_ed_or_physician_df = getEDorPhysician(nomodsubset['Note']).sort_values("EntryDateTime")

In [11]:
from collections import defaultdict
def getLastEntryOfNote(df):
    idnote_to_note = defaultdict(str)
    for index, row in df.iterrows():
        idnote_to_note[(row['PatientSSN'], row['TIUStandardTitle'])] = row['ReportText']
    return idnote_to_note

In [12]:
filtered_mod_dict = getLastEntryOfNote(mod_ed_or_physician_df)
filtered_nomod_dict = getLastEntryOfNote(nomod_ed_or_physician_df)
filtered_mod_df = pd.DataFrame(filtered_mod_dict.values(), columns =["ReportText"])
filtered_nomod_df = pd.DataFrame(filtered_nomod_dict.values(), columns =["ReportText"])

In [13]:
mod_texts = filtered_mod_df.reset_index(drop=True)
nomod_texts = filtered_nomod_df.reset_index(drop=True)

In [14]:
import re
def preprocessing(text):
    y = ''
    if text:
        y = text.lower()
        y = re.sub(r'\\[(.*?)\\]', '', y)
        y = re.sub(r'[0-9]+\.', '', y)
        y = re.sub(r'dr\.', 'doctor', y)
        y = re.sub(r'm\.d\.', 'md', y)
        y = re.sub(r'--|__|==', '', y) 
        y = re.sub(r'y\.o\.', 'year old', y)
        y = re.sub(r'fh', 'family history', y)
        y = re.sub(r'sh:', 'social history:', y)
        y = re.sub(r'\r\n', ' ', y)
        y = re.sub(r' :', ':', y)
        y = re.sub(r'physical examination', 'physical exam', y)
        y = re.sub(r'medications/iv:', 'medications:', y)
        ######### prevent catching as section
        y = re.sub(r'consult', 'consultation', y)
        y = re.sub(r'allergies', 'allergy', y) # allergies: stay the same but allergie -> allergy
        y = re.sub(r'allergy:', 'allergies:', y)
        y = re.sub(r'past history', 'past histories', y) # past history: stay the same but history -> histories, not ran for mod.
        y = re.sub(r'past histories:', 'past history:', y) # not ran for mods.
        y = re.sub(r'/ ', '/', y)
        y = re.sub(r'plan/disposition', 'plan', y)
        y = re.sub(r'=', '', y)
        ####### 11/5 added
        y = re.sub(r'\[\]', 'not ', y)
        y = re.sub(r'\[x\]', '', y)
        y = re.sub(r':', ': ', y)
        y = re.sub(r'\.', '. ', y)
        y = re.sub(r'assessment \& plan:', 'assessment:', y)
        y = re.sub(r'vitals:', 'vital signs', y)
        y = re.sub(r'active and recently expired inpatient medications \(including supplies\):', 'medications:', y)
        y = re.sub(r'active outpatient medications \(including supplies\):', 'medications:', y)
        y = re.sub(r'active inpatient medications \(including supplies\):', 'medications:',  y)
        y = re.sub(r'active outpatient medications \(excluding supplies\):', 'medications:', y)
        y = re.sub(r'reason for visit \(chief complaint\):', 'cc:', y)
        y = re.sub(r'lab results:', 'labs:', y)
        # very specific ones
        y = re.sub(r'med reconciliation  included in this list:', 'medications:', y)
        y = re.sub(r'51 y/o wm who', 'hpi: 51 y/o wm who', y)
        y = re.sub(r'reason for visit \(cc\):', 'cc:', y)
        y = re.sub(r'gen:', 'general:', y)
        y = re.sub(r'68 year old male appears', 'general: 68 year old male appears', y)
        y = re.sub(r'cc-', 'cc:', y)
        y = re.sub(r'hpi-', 'hpi:', y)
        y = re.sub('  +', ' ', y)  # whitespace
    return y

In [None]:
pd.set_option('display.max_colwidth', 150)
mod_texts['ReportText'] = mod_texts['ReportText'].map(preprocessing)
mod_texts

In [16]:
nomod_texts['ReportText'] = nomod_texts['ReportText'].map(preprocessing)

## Build Bag Of Ngrams Of Cuis Vector

In [18]:
import spacy
import medspacy
from medspacy.visualization import visualize_ent, visualize_dep
from medspacy.custom_tokenizer import create_medspacy_tokenizer
from medspacy.section_detection import Sectionizer
from medspacy.section_detection import SectionRule
from collections import defaultdict
import matplotlib.pyplot as plt
import pandas as pd

In [19]:
# load nlp model.
nlp = medspacy.load("en_core_sci_sm")
nlp.disable_pipe('parser')
nlp.disable_pipe('medspacy_target_matcher') # matcher: disable warning, ok doesn't help.

In [20]:
sectionizer = nlp.add_pipe("medspacy_sectionizer", config={"rules": "default"})
section_patterns = [
    SectionRule(category="history",literal="past medical/surgical history:"),
    SectionRule(category="history",literal="medical history:"),
    SectionRule(category="history",literal="surgical history:"),
]
sectionizer.add(section_patterns)

nlp.pipe_names

['tok2vec',
 'tagger',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'medspacy_pyrush',
 'medspacy_context',
 'medspacy_sectionizer']

Extract sections Other than Medications

In [21]:
def getSection(text):
    filteredtext = ""
    current_doc = nlp(text)  
    for title, body in zip(current_doc._.section_titles, current_doc._.section_bodies):
        if title.text != 'medications:':
            filteredtext += body.text
            filteredtext += " "
    return filteredtext

In [22]:
mod_texts['ReportText'] = mod_texts['ReportText'].map(getSection)
nomod_texts['ReportText'] = nomod_texts['ReportText'].map(getSection)

  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


In [42]:
def getAllCuis(df):
    note_to_cuis = defaultdict(list)
    for index, row in df.iterrows():
        current_doc = nlp(preprocessing(row['ReportText']))
        for entity in current_doc.ents:
            note_to_cuis[index] += [entity.lemma_] # text -> lemma
    return note_to_cuis

In [43]:
mod_to_cui_list = getAllCuis(mod_texts)

  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


In [44]:
nomod_to_cui_list = getAllCuis(nomod_texts)

  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)


In [46]:
print(mod_to_cui_list[0])

['acknowledge', 'armband', 'dizziness', 'nausea & vomiting', 'diagnosis', 'lacunar pontine cva', 'patient', 'answer', 'nka code status', 'chronic', 'delta-agent', 'dm', 'diabetes mellitus', 'diabete', 'diabetic neuropathy', 'inguinal hernia', 'hyperglycemia', 'hyperlipidemia', 'hyperlipidemia', 'retinoph', 'memory loss', 'diagnosis', 'condition', 'axis', 'v09', 'chronic pain', "peyronie's disease", 'sensorineural hearing loss', 'bilateral', 'adjustment reaction', 'patient care status', 'self-care', 'isolation', 'standard telemetry', 'cardiac rhythm', 'oxygen', 'imaging test', 'scan', 'mri critical n', 'fingerstick', 'fingerstick', 'patient', 'food', 'pende order', 'pertinent', 'intervention', 'pain score', 'pulse oximetry', 'peripheral', 'saline lock', 'rac', 'forearm', 'size', 'condition', 'clean', 'dry', 'intact', 'central line', 'limit', 'mental status alert', 'orient respiratory normal', 'gastrointestinal', 'nausea', 'genito', 'urinary void', 'skin dry', 'intact', 'musculoskeletal 

In [None]:
mod_texts['ReportText'][0]

In [50]:
def generate_ngrams(words, ngram_size, output_dict):
    for i in range(len(words) - ngram_size + 1):
        output_dict[' '.join(words[i:i+ngram_size])] += 1
    return output_dict

In [51]:
import collections

mod_2gram = collections.Counter()
mod_3gram = collections.Counter()

for index, cui_list in mod_to_cui_list.items():
    mod_2gram = generate_ngrams(cui_list, 2, mod_2gram)
    mod_3gram = generate_ngrams(cui_list, 3, mod_3gram)

In [52]:
nomod_2gram = collections.Counter()
nomod_3gram = collections.Counter()

for index, cui_list in nomod_to_cui_list.items():
    nomod_2gram = generate_ngrams(cui_list, 2, nomod_2gram)
    nomod_3gram = generate_ngrams(cui_list, 3, nomod_3gram)

In [53]:
all2grams = list(set(mod_2gram) | set(nomod_2gram))
all3grams = list(set(mod_3gram) | set(nomod_3gram))

In [None]:
print(all2grams[:10])

## 2grams

In [57]:
# make Cui to index dictionary.
cuiToIndex = {}
for index, cui in enumerate(all2grams):
    cuiToIndex[cui] = index

In [58]:
import numpy as np
bagOfCuisVector = np.zeros((len(mod_texts)+len(nomod_texts), len(all2grams)))

In [71]:
print(len(cuiToIndex))

22591


In [65]:
def populateBOC2(bagOfCuisVector, noteToCuis, shifter=0):
    for noteIndex, cuiList in noteToCuis.items():
        for i in range(len(cuiList)-1):
            bagOfCuisVector[noteIndex + shifter][cuiToIndex[cuiList[i]+" "+cuiList[i+1]]] = 1
    return bagOfCuisVector

In [72]:
bagOfCuisVector = populateBOC2(bagOfCuisVector, mod_to_cui_list)
bagOfCuisVector = populateBOC2(bagOfCuisVector, nomod_to_cui_list, len(mod_to_cui_list))

In [73]:
print("number of cuis extracted in each notes:\n", bagOfCuisVector.sum(1))

number of cuis extracted in each notes:
 [ 75.  37. 137. 275. 279. 322. 247.  80. 180. 163. 420. 133. 122.  12.
  72.  58. 267.  71.  64. 110.  45. 246.  45. 148.  98.  50. 152.  35.
 171.  80.   5.  12. 396.   6.  70.  87. 230.  37.  72.   6.  13.  64.
 133. 145. 192.   7. 106.  12. 412.  27.  26. 266.  26.  17.  47.   8.
  42. 376.  29.   7.  67. 173. 173.  39. 133. 116.  66.  20.  74.   4.
   1. 168.   8.  51. 252. 212. 600. 302. 124.  43. 226.  52. 216.  35.
 128.   5.  47.   7. 156.  12.  17. 138. 183.  82. 193. 127.   7.  27.
 365.  46. 417.  90.  48.  22. 306. 469.  79. 269. 498.  21. 413. 159.
 200.  88.  36. 237.  34. 712.  42.   7. 119.  64. 316. 700. 187.  57.
  41. 342. 122.  78.  21.  23. 331.  91. 208. 148.  26. 378.  62. 210.
 127.  38. 290. 141. 295. 113.   0. 167.  14. 369. 535. 208. 169.  12.
  25.   7.  51. 159.   8. 255. 221. 122. 257.  69. 256. 100. 271. 123.
  54. 631. 285. 138. 133. 114.  93.  25. 182.  16. 220. 102. 105.  58.
  57. 204.   9.]


In [74]:
modVector = bagOfCuisVector[:len(mod_texts)]
nomodVector = bagOfCuisVector[len(nomod_texts):]

## Hamming Distance Distribution

In [77]:
from scipy.spatial.distance import hamming

In [78]:
def getClosestNoteIndex(standardNoteIndex):
    standard = bagOfCuisVector[standardNoteIndex]
    minHamming = float('inf')
    closestVecIndex = 0
    for i in range(len(bagOfCuisVector)): # 185
        if i != standardNoteIndex:
            vector = bagOfCuisVector[i]
            curHamming = hamming(standard, vector) * len(standard) # percentage of correspoinding elements that differ * len
            if curHamming < minHamming:
                closestVecIndex = i
                minHamming = curHamming
    return closestVecIndex, minHamming

In [79]:
closestIndex = np.zeros(185)
HD = np.zeros(185)
for note in range(185):
    closest, minH = getClosestNoteIndex(note)
    closestIndex[note] = closest
    HD[note] = minH

In [80]:
modClosestToMod = sum([1 if i < 87 else 0 for i in closestIndex[:87]])
nomodClosestToNomod = sum([1 if i >= 87 else 0 for i in closestIndex[87:]])
print(modClosestToMod, "/ 87 of mod are close to mod,", nomodClosestToNomod, "/ 98 of nomod are close to nomod")

0 / 87 of mod are close to mod, 96 / 98 of nomod are close to nomod
