In [149]:
import pandas as pd
from sklearn.externals import joblib
import re
from nltk.stem.snowball import SnowballStemmer
from collections import defaultdict
import operator
import numpy as np
import sklearn.feature_extraction.text as text
from sklearn import decomposition
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.decomposition import PCA
from numpy.linalg import norm

In [2]:
base_path = r'D:\ORGANIZATION\UCSD_Life\Work\4. Quarter-3\Subjects\MED 277\Project\DATA\\'
data_file = base_path+"NOTEEVENTS.csv.gz"

In [3]:
df1 =  joblib.load(base_path+'data10.pkl')
df = df1[:50]

In [4]:
df = df.loc[df['CATEGORY'] == 'Discharge summary'] #Extracting only discharge summaries
df_text = df['TEXT']

## EXTRACT ALL THE TOPICS

In [5]:
'''Method that processes the entire document string'''
def process_text(txt):
    txt1 = re.sub('[\n]'," ",txt)
    txt1 = re.sub('[^A-Za-z \.]+', '', txt1)
    
    return txt1

In [6]:
'''Method that processes the document string not considering separate lines'''
def process(txt):
    txt1 = re.sub('[\n]'," ",txt)
    txt1 = re.sub('[^A-Za-z ]+', '', txt1)
    
    _wrds = txt1.split()
    stemmer = SnowballStemmer("english") ## May use porter stemmer
    wrds = [stemmer.stem(wrd) for wrd in _wrds]
    return wrds

In [7]:
'''Method that processes raw string and gets a processes list containing lines'''
def get_processed_sentences(snt_txt):
    snt_list = []
    for line in snt_txt.split('.'):
        line = line.strip()
        if len(line.split()) >= 5:
            snt_list.append(line)
    return snt_list

In [8]:
'''This method extracts topic from sentence'''
def extract_topic(str_arg, num_topics = 1, num_top_words = 3):
    vectorizer = text.CountVectorizer(input='content', analyzer='word', lowercase=True, stop_words='english')
    dtm = vectorizer.fit_transform(str_arg.split())
    vocab = np.array(vectorizer.get_feature_names())
    
    #clf = decomposition.NMF(n_components=num_topics, random_state=1) ## topic extraction
    clf = decomposition.LatentDirichletAllocation(n_components=num_topics, learning_method='online')
    clf.fit_transform(dtm)
    
    topic_words = []
    for topic in clf.components_:
        word_idx = np.argsort(topic)[::-1][0:num_top_words] ##[::-1] reverses the list
        topic_words.append([vocab[i] for i in word_idx])
    return topic_words

In [9]:
'''This method extracts topics of each sentence and returns a list'''
def extract_topics_all(doc_string):
    #One entry per sentence in list
    doc_str = process_text(doc_string)
    doc_str = get_processed_sentences(doc_str)
    
    res = []
    for i in range (0, len(doc_str)):
        snd_str = doc_str[i].lower()
        #print("Sending ----------------------------",snd_str,"==========",len(snd_str))
        tmp_topic = extract_topic(snd_str, num_topics = 2, num_top_words = 1)
        for top in tmp_topic:
            for wrd in top:
                res.append(wrd)
    return res

In [10]:
'''This function takes a dataframe and returns all the topics in the entire corpus'''
def extract_corpus_topics(arg_df):
    all_topics = set()
    cnt = 1
    for txt in arg_df:
        all_topics = all_topics.union(extract_topics_all(txt))
        print("Processed ",cnt," records")
        cnt += 1
    all_topics = list(all_topics)
    return all_topics

## GET A VECTORIZED REPRESENTATION OF ALL THE TOPICS

In [11]:
'''data_set = words list per document.
    vocabulary = list of all the words present
    _vocab = dict of word counts for words in vocabulary'''
def get_vocab_wrd_map(df_text):
    data_set = []
    vocabulary = []
    _vocab = defaultdict(int)
    for i in range(0,df_text.size):
        txt = process(df_text[i])
        data_set.append(txt)

        for wrd in txt:
            _vocab[wrd] += 1

        vocabulary = vocabulary + txt
        vocabulary = list(set(vocabulary))

        if(i%100 == 0):
            print("%5d records processed"%(i))
    return data_set, vocabulary, _vocab

In [50]:
'''vocab = return sorted list of most common words in vocabulary'''
def get_common_vocab(num_arg, vocab):
    vocab = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)
    vocab = vocab[:num_arg]
    return vocab

In [51]:
'''Convert vocabulary and most common words to map for faster access'''
def get_vocab_map(vocabulary, vocab):
    vocab_map = {}
    for i in range(0,len(vocab)):
        vocab_map[vocab[i][0]] = i 
    
    vocabulary_map = {}
    for i in range(0,len(vocabulary)):
        vocabulary_map[vocabulary[i]] = i
    
    return vocabulary_map, vocab_map

In [52]:
#data_set = each document split by spaces. Contains list of list of words
#vocabulary = contains all the unique words in all the documents.
#_vocab = contains the count of all the words in the vocabulary
data_set, vocabulary, _vocab = get_vocab_wrd_map(df_text)

    0 records processed


In [53]:
#vocab = cotnains the most frequent words as a list of tuples (words, count)
vocab = get_common_vocab(1000, _vocab)

In [54]:
#vocabulary_map = contains the index mapping of compete vocabulary
#vocab_map = contains the index mapping of context words.
vocabulary_map, vocab_map = get_vocab_map(vocabulary, vocab)

In [80]:
def get_embedding(word, data_set, vocab_map, wdw_size):
    embedding = [0]*len(vocab_map)
    for docs in data_set:
        for i in range(wdw_size, len(docs)-wdw_size):
            if docs[i] == word:
                for j in range(i-wdw_size, i-1):
                    if docs[j] in vocab_map:
                        embedding[vocab_map[docs[j]]] += 1
                for j in range(i+1, i+wdw_size):
                    if docs[j] in vocab_map:
                        embedding[vocab_map[docs[j]]] += 1
    total_words = sum(embedding)
    if total_words != 0:
        embedding[:] = [e/total_words for e in embedding]
    return embedding

In [73]:
#all_topics = contains all the topics extracted from entire data (all docs)
all_topics = extract_corpus_topics(df_text)

Processed  1  records
Processed  2  records
Processed  3  records
Processed  4  records
Processed  5  records
Processed  6  records
Processed  7  records
Processed  8  records
Processed  9  records
Processed  10  records
Processed  11  records
Processed  12  records
Processed  13  records
Processed  14  records
Processed  15  records
Processed  16  records
Processed  17  records
Processed  18  records
Processed  19  records
Processed  20  records
Processed  21  records
Processed  22  records
Processed  23  records
Processed  24  records
Processed  25  records
Processed  26  records
Processed  27  records
Processed  28  records
Processed  29  records
Processed  30  records
Processed  31  records
Processed  32  records
Processed  33  records
Processed  34  records
Processed  35  records
Processed  36  records
Processed  37  records
Processed  38  records
Processed  39  records
Processed  40  records
Processed  41  records
Processed  42  records
Processed  43  records
Processed  44  recor

In [89]:
def get_embedding_all(all_topics, data_set, vocab_map, wdw_size):
    embeddings = []
    for i in range(0, len(all_topics)):
        embeddings.append(get_embedding(all_topics[i], data_set, vocab_map, wdw_size))
    return embeddings

In [90]:
#embbeddings = all the topics transformed as per ngram model
embeddings = get_embedding_all(all_topics, data_set, vocab_map, 5)

### Dimensionality Reduction

In [101]:
pca = PCA(n_components=10)
embedding_short = pca.fit_transform(embeddings)
#print(pca.explained_variance_ratio_[:10]) 

[0.07208819 0.03403336 0.02948849 0.0287339  0.02751334 0.02716727
 0.02547945 0.01839411 0.01650039 0.01586124]


## GET A SIMILARITY FUNCTION

In [150]:
def cos_matrix_multiplication(matrix, vector):
    """
    Calculating pairwise cosine distance using matrix vector multiplication.
    """
    dotted = matrix.dot(vector)
    matrix_norms = np.linalg.norm(matrix, axis=1)
    vector_norm = np.linalg.norm(vector)
    matrix_vector_norms = np.multiply(matrix_norms, vector_norm)
    neighbors = np.divide(dotted, matrix_vector_norms)
    return neighbors

In [165]:
def get_most_similar_topics(embd, embeddings, all_topics, num_wrd=10):
    sim_top = []
    cos_sim = cos_matrix_multiplication(np.array(embeddings), embd)
    closest_match = cos_sim.argsort()[-num_wrd:][::-1]
    for i in range(0, closest_match.shape[0]):
        sim_top.append(all_topics[closest_match[i]])
    return sim_top

In [167]:
get_most_similar_topics(embedding_short[2], embedding_short, all_topics, num_wrd = len(all_topics))

['sensory',
 'cancelled',
 'schedule',
 'cycled',
 'bronchodilator',
 'shortly',
 'hypoxic',
 'limits',
 'consumption',
 'neurologically',
 'receive',
 'shaky',
 'plans',
 'relatively',
 'ducts',
 'segments',
 'prognosis',
 'calcifications',
 'lactate',
 'constriction',
 'wounds',
 'prophylactic',
 'requested',
 'phenylephrine',
 'premature',
 'goes',
 'nabs',
 'tubes',
 'titrated',
 'immediately',
 'drinks',
 'bleedpeptic',
 'infections',
 'meds',
 'drains',
 'organisms',
 'upcoming',
 'complications',
 'stools',
 'happens',
 'complains',
 'cerebral',
 'questions',
 'softener',
 'moving',
 'sedation',
 'sats',
 'worked',
 'information',
 'discharged',
 'contine',
 'leaflets',
 'structures',
 'blocks',
 'nontender',
 'dehiscence',
 'definite',
 'recovery',
 'drips',
 'tightness',
 'hemodialysis',
 'activity',
 'coronary',
 'hypothryoidism',
 'perscribed',
 'trivialphysiologic',
 'wegener',
 'quickly',
 'obtained',
 'fluids',
 'broadening',
 'attributed',
 'candidiasis',
 'wrapping',
 '