## extract needed data - Physician or ED notes, last entry per (person, note type) pair

In [1]:
%load_ext autoreload
%autoreload 2

In [15]:
import dizzy_util as util
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 150)

In [3]:
dataset = util.extractDataset("B00_ML4TrgPos_", {"WithRole", "AllMed", "Vital", "HF", "DispensedDrug"})



Unnamed: 0,TABLE_CATALOG,TABLE_SCHEMA,TABLE_NAME,TABLE_TYPE
0,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_01_04_cohort,BASE TABLE
1,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_05_04_Rad,BASE TABLE
2,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_30_Note_WithRole,BASE TABLE
3,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_30_Note,BASE TABLE
4,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_04_04_Lab,BASE TABLE
5,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_06_Med_04_RxOutpat,BASE TABLE
6,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_06_Med_07_NonVAMed,BASE TABLE
7,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_06_Med_05_BCMA_Dispense...,BASE TABLE
8,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_06_Med_08_AllMed,BASE TABLE
9,ORD_Singh_201911038D,Dflt,_B00_ML4TrgPos_Y201621_08_Consult,BASE TABLE




dict_keys(['cohort', 'Rad', 'Note', 'Lab', 'RxOutpat', 'NonVAMed', 'Consult', 'ICD', 'new', 'Demorgraphics'])




In [4]:
dizziness_df, label_map = util.retrieveLabels() # labeled
label_df = util.convertLabelMap(label_map) # PatientSSN | Label (100)
datasubset = util.extractAndStandarizeCohort(dataset, "Dizziness") # dataset for dizziness (All).
edstart = util.extractFirstVisitIndexDatetime(datasubset["cohort"]) # get first entry of each patient.

In [5]:
dizziness_df["DxErrorERCoded"].value_counts()

NoMOD       44
MOD         38
PMOD        12
CodingEr     6
Name: DxErrorERCoded, dtype: int64

In [6]:
datasubset["cohort"] = datasubset["cohort"].merge(label_df.rename(columns={"PatientSSN":"patientSSN"})) # concat label

In [7]:
def extractSubset(ds, subsetNames):
    pkeys = util.getPrimaryKeys(dataset)
    cohort_key = pkeys["cohort"]
    cohort_subset = ds["cohort"][ds["cohort"].Label.isin(subsetNames)]
    ids = pd.DataFrame({cohort_key: cohort_subset[cohort_key].unique()})
    dss = dict()
    for table, df in ds.items():
        dss[table] = ids.merge(df, how="inner", left_on=cohort_key, right_on=pkeys[table])
        if pkeys[table] != cohort_key:
            dss[table] = dss[table].drop([cohort_key], axis = 1)
        dss[table][pkeys[table]] = dss[table][pkeys[table]].astype(int)
    return dss

In [8]:
modsubset = extractSubset(datasubset, ["MOD"]) # dizziness MOD cases
nomodsubset = extractSubset(datasubset, ["NoMOD"]) # dizziness no-MOD cases

In [9]:
print(modsubset['Note'].columns)

Index(['PatientSSN', 'EntryDateTime', 'TIUDocumentSID', 'TIUStandardTitle',
       'ReportText', 'Sta3n', 'PatientSID', 'ProviderRole'],
      dtype='object')


In [10]:
def getEDorPhysician(df):
    pattern = 'EMERGENCY DEPT NOTE|PHYSICIAN NOTE'
    return df[df.TIUStandardTitle.str.contains(pattern)]

In [11]:
mod_ed_or_physician_df = getEDorPhysician(modsubset['Note']).sort_values("EntryDateTime")

In [12]:
nomod_ed_or_physician_df = getEDorPhysician(nomodsubset['Note']).sort_values("EntryDateTime")

In [13]:
print(mod_ed_or_physician_df['TIUStandardTitle'].unique())

['NURSING EMERGENCY DEPT NOTE' 'EMERGENCY DEPT NOTE'
 'PRIMARY CARE PHYSICIAN NOTE' 'PHYSICAL MEDICINE REHAB PHYSICIAN NOTE'
 'ATTENDING EMERGENCY DEPT NOTE' 'MENTAL HEALTH PHYSICIAN NOTE'
 'PHYSICIAN NOTE' 'NEUROLOGY PHYSICIAN NOTE'
 'PHYSICIAN EMERGENCY DEPT NOTE' 'UROLOGY PHYSICIAN NOTE'
 'SCANNED EMERGENCY DEPT NOTE' 'PULMONARY PHYSICIAN NOTE'
 'PHYSICAL THERAPY PHYSICIAN NOTE' 'ANESTHESIOLOGY PHYSICIAN NOTE'
 'DERMATOLOGY PHYSICIAN NOTE' 'PALLIATIVE CARE PHYSICIAN NOTE'
 'SOCIAL WORK EMERGENCY DEPT NOTE' 'DIALYSIS PHYSICIAN NOTE'
 'INTERNAL MEDICINE PHYSICIAN NOTE']


In [14]:
from collections import defaultdict
def getLastEntryOfNote(df):
    idnote_to_note = defaultdict(str)
    for index, row in df.iterrows():
        idnote_to_note[(row['PatientSSN'], row['TIUStandardTitle'])] = row['ReportText']
    return idnote_to_note

In [16]:
filtered_mod_dict = getLastEntryOfNote(mod_ed_or_physician_df)
filtered_nomod_dict = getLastEntryOfNote(nomod_ed_or_physician_df)

In [17]:
filtered_mod_df = pd.DataFrame(filtered_mod_dict.values(), columns =["ReportText"])
filtered_nomod_df = pd.DataFrame(filtered_nomod_dict.values(), columns =["ReportText"])

In [18]:
mod_texts = filtered_mod_df.reset_index(drop=True)
nomod_texts = filtered_nomod_df.reset_index(drop=True)

In [51]:
print(len(mod_texts))
print(len(nomod_texts))

87
98


In [19]:
import re
def preprocessing(text):
    y = ''
    if text:
        y = text.lower()
        y = re.sub(r'\\[(.*?)\\]', '', y)
        y = re.sub(r'[0-9]+\.', '', y)
        y = re.sub(r'dr\.', 'doctor', y)
        y = re.sub(r'm\.d\.', 'md', y)
        y = re.sub(r'admission date:', '', y)
        y = re.sub(r'discharge date:', '', y)
        y = re.sub(r'--|__|==', '', y) 
        y = re.sub(r'y\.o\.', 'year old', y)
        y = re.sub(r'fh', 'family history', y)
        y = re.sub(r'sh:', 'social history:', y)
        y = re.sub(r'\r\n', '', y)
        y = re.sub(r' :', ':', y)
        y = re.sub(r'physical examination', 'physical exam', y)
        ######### prevent catching as section
        y = re.sub(r'consult', 'consultation', y)
        y = re.sub(r'allergies', 'allergy', y) # allergies: stay the same but allergie -> allergy
        y = re.sub(r'allergy:', 'allergies:', y)
        y = re.sub(r'past history', 'past histories', y) # past history: stay the same but history -> histories, not ran for mod.
        y = re.sub(r'past histories:', 'past history:', y) # not ran for mods.
        y = re.sub(r'/ ', '/', y)
        y = re.sub(r'plan/disposition', 'plan', y)
    return y

In [None]:
pd.set_option('display.max_colwidth', 150)
mod_texts['ReportText'] = mod_texts['ReportText'].map(preprocessing)
mod_texts

In [None]:
nomod_texts['ReportText'] = nomod_texts['ReportText'].map(preprocessing)
nomod_texts

## CUI Extraction, on all notes printed above

In [22]:
import spacy
import medspacy
from medspacy.visualization import visualize_ent, visualize_dep
from medspacy.custom_tokenizer import create_medspacy_tokenizer
from medspacy.section_detection import Sectionizer
from medspacy.section_detection import SectionRule
from quickumls import QuickUMLS
from collections import defaultdict
import matplotlib.pyplot as plt
import pandas as pd
import re

In [23]:
# load nlp model.
nlp = medspacy.load("en_core_sci_sm")
nlp.disable_pipe('parser')
nlp.disable_pipe('medspacy_target_matcher') # matcher: disable warning, ok doesn't help.

In [24]:
sectionizer = nlp.add_pipe("medspacy_sectionizer", config={"rules": "default"})
section_patterns = [
    SectionRule(category="history",literal="past medical/surgical history:"),
    SectionRule(category="history",literal="medical history:"),
    SectionRule(category="history",literal="surgical history:"),
]
sectionizer.add(section_patterns)

nlp.pipe_names

['tok2vec',
 'tagger',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'medspacy_pyrush',
 'medspacy_context',
 'medspacy_sectionizer']

In [25]:
def countCUIs(df):
    cui_and_count = defaultdict(int)
    for index, row in df.iterrows():
        current_doc = nlp(preprocessing(row['ReportText']))
        for entity in current_doc.ents:
            cui_and_count[entity.text] += 1
    return cui_and_count

In [26]:
def printWordFreq(cui_and_count_dict):
    cui_and_count_df = pd.DataFrame(cui_and_count_dict.items(), columns = ['word', 'count'] )
    cui_and_count_df = cui_and_count_df.sort_values('count', ascending = False)
    return cui_and_count_df
    # plt.bar('word', 'count', data=cui_and_count_df)    

Without Sectionizer

In [27]:
mod_cuis = countCUIs(mod_texts)

  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


In [28]:
nomod_cuis = countCUIs(nomod_texts)

  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


In [29]:
pd.set_option('display.max_rows', 100)
printWordFreq(mod_cuis)[:100]

Unnamed: 0,word,count
7,patient,322
168,tab,316
164,day,297
176,tablet,290
163,mouth,253
385,days,160
170,active,134
1251,/,99
157,medications,92
435,supply,68


In [30]:
printWordFreq(nomod_cuis)[:100]

Unnamed: 0,word,count
77,patient,438
241,day,387
229,tablet,364
230,mouth,336
228,tab,331
472,days,183
807,medications,114
251,active,106
809,expiration,86
4005,supply,86


## With sectionizer

In [31]:
interestedSections = ['hpi:', 'history:', 'cc:', 'chief complaint:', 'clinical history:', 'history of present illness:', 
                      'clinical history:', 'vital signs', 'physical exam:', 'neuro:', 'pe:', 'impression:', 'assessment:', 
                      'impression and plan:', 'final diagnosis:', 'secondary diagnosis:', 'assessment/plan:', 
                      'clinical impression:']

In [32]:
def getSection(text):
    filteredtext = ""
    current_doc = nlp(text)             
    for title, body in zip(current_doc._.section_titles, current_doc._.section_bodies):
        if title.text in interestedSections:
            filteredtext += body.text
            filteredtext += " "
    return filteredtext

In [33]:
mod_texts['ReportText'] = mod_texts['ReportText'].map(getSection)
nomod_texts['ReportText'] = nomod_texts['ReportText'].map(getSection)

  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


In [34]:
mod_sec_cuis = countCUIs(mod_texts)
nomod_sec_cuis = countCUIs(nomod_texts)

  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches 

  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)
  matches = self.matcher(doc)


In [35]:
printWordFreq(mod_sec_cuis)[:100]

Unnamed: 0,word,count
121,tab,128
15,patient,124
129,tablet,124
116,mouth,113
117,day,111
284,days,102
123,active,54
338,expiration,52
336,released,47
337,supply,39


In [36]:
printWordFreq(nomod_sec_cuis)[:100]

Unnamed: 0,word,count
45,patient,183
233,day,77
940,tab,72
944,tablet,68
938,mouth,56
180,pain,41
592,medications,32
73,intact,30
270,denies,29
65,time,27
