In [1]:
import pandas as pd
from sklearn.externals import joblib
import re
from nltk.stem.snowball import SnowballStemmer
from collections import defaultdict
import operator
import numpy as np
import sklearn.feature_extraction.text as text
from sklearn import decomposition
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [2]:
# Directory holding the project data. This is a raw string, so the trailing
# `\\` contributes two literal backslash characters to the path.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
base_path = r'D:\ORGANIZATION\UCSD_Life\Work\4. Quarter-3\Subjects\MED 277\Project\DATA\\'
# Path of NOTEEVENTS.csv.gz, built by plain string concatenation
# (not referenced again in the cells visible here).
data_file = base_path+"NOTEEVENTS.csv.gz"

In [9]:
# Load the previously saved object from data10.pkl.
# joblib.load unpickles the file, so only load files from a trusted source.
# NOTE(review): `sklearn.externals.joblib` (imported at the top) was removed in
# scikit-learn 0.23; newer environments need `import joblib` instead.
df1 =  joblib.load(base_path+'data10.pkl')
# Keep only the first 50 entries for quick experimentation.
df = df1[:50]

In [18]:
# Keep only the rows whose CATEGORY is exactly 'Discharge summary'.
# Note that this rebinds `df`, replacing the 50-entry slice created earlier.
df = df.loc[df['CATEGORY'] == 'Discharge summary'] #Extracting only discharge summaries
# The TEXT column of the filtered frame; later cells index it as
# df_text[0] / df_text[4].
df_text = df['TEXT']

##### START rough work

In [114]:
# Inspect the note stored under key 4 (the same note main_bot queries).
df_text[4]



In [165]:
# Print the note stored under key 0 so its line breaks render as plain text.
print(df_text[0])

Admission Date:  [**2151-7-16**]       Discharge Date:  [**2151-8-4**]


Service:
ADDENDUM:

RADIOLOGIC STUDIES:  Radiologic studies also included a chest
CT, which confirmed cavitary lesions in the left lung apex
consistent with infectious process/tuberculosis.  This also
moderate-sized left pleural effusion.

HEAD CT:  Head CT showed no intracranial hemorrhage or mass
effect, but old infarction consistent with past medical
history.

ABDOMINAL CT:  Abdominal CT showed lesions of
T10 and sacrum most likely secondary to osteoporosis. These can
be followed by repeat imaging as an outpatient.



                            [**First Name8 (NamePattern2) **] [**First Name4 (NamePattern1) 1775**] [**Last Name (NamePattern1) **], M.D.  [**MD Number(1) 1776**]

Dictated By:[**Hospital 1807**]
MEDQUIST36

D:  [**2151-8-5**]  12:11
T:  [**2151-8-5**]  12:21
JOB#:  [**Job Number 1808**]



In [158]:
def get_regex_match(regex, str_arg):
    """Return the first match of ``regex`` in ``str_arg``, stripped of whitespace.

    The complete match (group 0) is returned, not only the captured groups,
    so any literal prefix in the pattern is part of the result.
    When the pattern does not occur, the string "Not found" is returned.
    """
    match = re.search(regex, str_arg)
    return "Not found" if match is None else match.group(0).strip()

In [168]:
def extract(key, str_arg):
    """Pull one header field out of a clinical note using a fixed regex.

    ``key`` selects the field: 'dob', 'a_date', 'd_date', 'sex', 'service',
    'allergy' or 'attdng'. The matching pattern is searched in ``str_arg``
    through ``get_regex_match``, whose result is returned unchanged.
    Any other key yields the string "I Don't know".
    """
    # (field key, pattern) pairs, checked in order with plain equality.
    field_patterns = (
        ('dob', 'Date of Birth:(.*)] '),
        ('a_date', 'Admission Date:(.*)] '),
        ('d_date', 'Discharge Date:(.*)]\n'),
        ('sex', 'Sex:(.*)\n'),
        ('service', 'Service:(.*)\n'),
        ('allergy', 'Allergies:(.*)\n(.*)\n'),
        ('attdng', 'Attending:(.*)]\n'),
    )
    for field, pattern in field_patterns:
        if key == field:
            return get_regex_match(pattern, str_arg)
    return "I Don't know"

## Topic Extraction

##### START rough work

In [170]:
# Sample question for the scratch experiment. The string contains no '.',
# so split(".") yields a single-element list.
st = "What is my admission date?".split(".")

In [172]:
# Bag-of-words counts for the sample question(s) in `st`, lower-cased and
# with English stop words removed.
vectorizer = text.CountVectorizer(input='content', analyzer='word', lowercase=True, stop_words='english')
dtm = vectorizer.fit_transform(st)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when the installed version provides it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = np.asarray(vectorizer.get_feature_names_out())
else:
    vocab = np.array(vectorizer.get_feature_names())

In [179]:
# Scratch experiment: fit LDA on the document-term matrix `dtm` and list the
# highest-weighted vocabulary words of each topic.
num_topics = 1
num_top_words = 3

# NMF was tried as an alternative decomposition and left disabled:
#clf = decomposition.NMF(n_components=num_topics, random_state=1) ## topic extraction
# No random_state is passed here, so nothing in this cell pins the random seed.
clf = decomposition.LatentDirichletAllocation(n_components=num_topics, learning_method='online')
doctopic = clf.fit_transform(dtm)

# One list of words per row of clf.components_.
topic_words = []
for topic in clf.components_:
    # argsort is ascending; [::-1] reverses it so the largest weights come first,
    # then the slice keeps at most num_top_words indices.
    word_idx = np.argsort(topic)[::-1][0:num_top_words] ##[::-1] reverses the list
    topic_words.append([vocab[i] for i in word_idx])

In [180]:
# Show the top words found for each topic.
topic_words

[['admission', 'date']]

In [192]:
# Check what the Porter stemmer does to 'date' (saved output: unchanged).
port = PorterStemmer()
port.stem('date')

'date'

In [193]:
# Same check with the WordNet lemmatizer (saved output: unchanged).
wnl = WordNetLemmatizer()
wnl.lemmatize('date')

'date'

# Scratch Code

Idea 1: Use LDA to find the topic of the question and of each candidate answer, then return the answer whose topic matches the question's topic most closely.

Assumption: The user will always ask a question.

In [367]:
def extract_topic(str_arg, num_topics = 1, num_top_words = 3, random_state = None):
    """Extract the top topic words of a sentence with LDA.

    Every whitespace-separated token of ``str_arg`` is passed to a
    ``CountVectorizer`` as its own document (lower-cased, English stop words
    removed), and a ``LatentDirichletAllocation`` model with ``num_topics``
    components is fitted on the resulting counts.

    Parameters
    ----------
    str_arg : str
        The sentence to analyse.
    num_topics : int
        Number of LDA components to fit.
    num_top_words : int
        Maximum number of words reported per topic.
    random_state : int or None
        Forwarded to ``LatentDirichletAllocation``. The default ``None``
        keeps the previous unseeded behaviour; pass an int for
        reproducible results.

    Returns
    -------
    list of list of str
        One inner list per topic, highest-weighted words first. When the
        sentence yields no vocabulary (blank input or only stop words),
        each inner list is empty.
    """
    vectorizer = text.CountVectorizer(input='content', analyzer='word', lowercase=True, stop_words='english')
    try:
        dtm = vectorizer.fit_transform(str_arg.split())
    except ValueError:
        # CountVectorizer raises ValueError on an empty vocabulary, e.g. for
        # "What is it?". Report "no topic words" in the usual shape instead
        # of crashing the caller.
        return [[] for _ in range(num_topics)]

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; fall back to it only on versions without the replacement.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocab = np.asarray(vectorizer.get_feature_names_out())
    else:
        vocab = np.array(vectorizer.get_feature_names())

    # (decomposition.NMF was tried as an alternative model here.)
    clf = decomposition.LatentDirichletAllocation(n_components=num_topics, learning_method='online',
                                                  random_state=random_state)
    clf.fit(dtm)

    topic_words = []
    for topic in clf.components_:
        # argsort is ascending, so reverse it to rank the largest weights first.
        word_idx = np.argsort(topic)[::-1][:num_top_words]
        topic_words.append([vocab[i] for i in word_idx])
    return topic_words

In [366]:
def extract_Q_topic(str_arg):
    """Extract the topics of a question.

    For now this simply delegates to ``extract_topic`` with its default
    settings and returns that result unchanged.
    TODO: fix later for more comprehensive results.
    """
    question_topics = extract_topic(str_arg)
    return question_topics

In [368]:
## Stemmed question word -> field key understood by extract(),
## used for the simple regex-based extractions.
extract_map = {
    # date of birth
    'birth': 'dob',
    'dob': 'dob',
    # admission / discharge dates
    'admiss': 'a_date',
    'discharg': 'd_date',
    # sex
    'sex': 'sex',
    'gender': 'sex',
    # remaining header fields
    'servic': 'service',
    'allergi': 'allergy',
    'attend': 'attdng',
}

In [369]:
def get_extract_map(key):
    """Look up the extraction field key for a stemmed word.

    Returns the value stored in ``extract_map`` for ``key``, or ``None``
    when the word is not mapped (or cannot be used as a dictionary key).
    """
    try:
        return extract_map[key]
    except (KeyError, TypeError):
        # KeyError: unmapped word. TypeError: unhashable key.
        # Anything else (e.g. extract_map not defined yet) is a real error
        # and is no longer silently swallowed.
        return None

In [376]:
def get_extracted_answer(topic_str, text):
    """Generate the answer for a text-extraction question.

    ``topic_str`` is a sequence of word lists (one list per topic). Each word
    is Porter-stemmed and looked up with ``get_extract_map``; the first word
    that maps to a field key decides the answer, which is obtained by calling
    ``extract`` on ``text``. Returns ``None`` when no word maps to a field.

    Note: inside this function the ``text`` parameter hides the module alias
    of the same name imported at the top of the notebook.
    """
    stemmer = PorterStemmer()
    for topic_words in topic_str:
        for word in topic_words:
            field_key = get_extract_map(stemmer.stem(word))
            if field_key is None:
                continue
            return extract(field_key, text)
    return None

In [241]:
def get_answer(topic):
    """Placeholder for answer generation; not implemented yet.

    Ignores ``topic`` and returns ``None``.
    Idea noted for later: maybe apply a lemmatizer here.
    """
    return None

In [371]:
def process_text(txt):
    """Normalise a whole document string for sentence splitting.

    Newlines are replaced by single spaces, then every run of characters
    other than ASCII letters, spaces and '.' is removed (digits and other
    punctuation disappear without leaving a space).

    Returns the cleaned string.
    """
    txt1 = re.sub('[\n]', " ", txt)
    # Raw string: '\.' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    txt1 = re.sub(r'[^A-Za-z \.]+', '', txt1)
    return txt1

In [372]:
def get_processed_sentences(snt_txt):
    """Split raw text into a list of sentences worth analysing.

    The text is cut at every '.', each piece is stripped of surrounding
    whitespace, and only pieces with at least five whitespace-separated
    words are kept, in their original order.
    """
    stripped_pieces = (piece.strip() for piece in snt_txt.split('.'))
    return [sentence for sentence in stripped_pieces if len(sentence.split()) >= 5]

In [374]:
def extract_sentence_topics(doc_string):
    """Map topic words to the sentences of a document that carry them.

    The document is cleaned with ``process_text`` and split with
    ``get_processed_sentences``. For every sentence (lower-cased) a single
    topic with a single top word is requested from ``extract_topic``, and the
    sentence's position is recorded under each returned word.

    Returns a tuple ``(sentences, topic_map)`` where ``sentences`` is the
    list of processed sentences and ``topic_map`` is a ``defaultdict(list)``
    from topic word to the list of sentence indices.
    """
    sentences = get_processed_sentences(process_text(doc_string))

    topic_map = defaultdict(list)
    for position, sentence in enumerate(sentences):
        first_topic = extract_topic(sentence.lower(), num_topics = 1, num_top_words = 1)[0]
        for word in first_topic:
            topic_map[word].append(position)
    return sentences, topic_map

In [None]:
def generate_Q_embedding(topic):
    """Placeholder for building a question embedding; not implemented yet.

    Ignores ``topic`` and returns ``None``.
    """
    return None

In [None]:
def generate_A_embedding(topic):
    """Placeholder for building an answer embedding; not implemented yet.

    Ignores ``topic`` and returns ``None``.
    """
    return None

In [None]:
def get_QA_similarity(ques, ans):
    """Placeholder for scoring question/answer similarity; not implemented yet.

    Ignores both arguments and returns ``None``.
    """
    return None

## The Main Function

In [383]:
def main_bot(pid = 4, ip = "What is my dob?"):
    """Answer one question about one patient's note and print the reply.

    Parameters
    ----------
    pid : key into ``df_text``
        Identifies the note to query. Defaults to 4, the note that was
        previously hard-coded.
    ip : str
        The question to answer. Defaults to the sample question that was
        previously hard-coded; replace with real user input when available.

    The question's topic words are extracted with ``extract_Q_topic`` and
    handed to ``get_extracted_answer`` together with the selected note.
    Prints the answer, or a fallback message when nothing could be extracted.
    Returns ``None``.
    """
    topic_q = extract_Q_topic(ip)
    # Use the requested patient's note rather than a fixed index.
    ans = get_extracted_answer(topic_q, df_text[pid])
    if ans is not None:
        print("Bot:>",ans)
    else:
        print("Enough information is not found in the clinical notes!")

In [384]:
# Run the bot once; it prints its reply.
main_bot()

Bot:> Date of Birth:  [**2080-1-4**]
