In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from num2words import num2words
from collections import Counter
from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from scipy.special import gammaln
from collections import Counter
from textblob import TextBlob

import pickle
import gensim
import imp
import lda2
import scipy
import operator
import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle

In [2]:
def processReviews(reviews, window=5, MAX_VOCAB_SIZE=1000):
    vectorizer = CountVectorizer(analyzer="word",tokenizer=None)
    train_data_features = vectorizer.fit_transform(reviews)
    words = vectorizer.get_feature_names()
    vocabulary = dict(zip(words,np.arange(len(words))))
    inv_vocabulary = dict(zip(np.arange(len(words)),words))
    wordOccurenceMatrix = train_data_features.toarray()
    return wordOccurenceMatrix, vocabulary, words

def get_cosine(a, b):
    return 1 - spatial.distance.cosine(a, b)

In [3]:
def coherence_score(X, topic_sentiment_df):
    X[X>1] = 1    
    totalcnt = len(topic_sentiment_df)
    total = 0
    for allwords in topic_sentiment_df:
        for word1 in allwords:
            for word2 in allwords:
                if word1 != word2:
                    ind1 = vocabulary[word1]
                    ind2 = vocabulary[word2]
                    total += np.log((np.matmul(X[:,ind1].T, X[:,ind2]) + 1.0)/np.sum(X[:,ind2]))
    return total/(2*totalcnt)

def kl_score(pk,qk):
    return (scipy.stats.entropy(pk,qk)*.5 + scipy.stats.entropy(qk,pk)*.5)

def get_hscore(dt_distribution, X, k):
    testlen = X.shape[0]
    all_kl_scores = np.zeros((testlen, testlen))
    for i in range(testlen-1):
        for j in range(i+1,testlen):
            score = kl_score(dt_distribution[i],dt_distribution[j])
            all_kl_scores[i,j] = score
            all_kl_scores[j,i] = score

    dt = np.zeros((X.shape[0], k))

    for i in range(X.shape[0]):
        dt[i, dt_distribution[i].argmax()]=1

    intradist = 0
    for i in range(k):
        cnt = dt[:,i].sum()
        tmp = np.outer(dt[:,i],dt[:,i])
        tmp = tmp * all_kl_scores
        intradist += tmp.sum()*1.0/(cnt*(cnt-1))
#         print(cnt, tmp.sum(), intradist)
    intradist = intradist/k
    

    interdist = 0
    for i in range(k):
       for j in range(k):
           if i != j:
             cnt_i = dt[:,i].sum()
             cnt_j = dt[:,j].sum()
             tmp = np.outer(dt[:,i], dt[:,j])
             tmp = tmp * all_kl_scores
             interdist += tmp.sum()*1.0/(cnt_i*cnt_j)
    interdist = interdist/(k*(k-1))
    return intradist/interdist

In [4]:
%%time
embeddings_index = gensim.models.KeyedVectors.load_word2vec_format('pubmed2018_w2v_200D/pubmed2018_w2v_200D.bin', binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CPU times: user 2min 15s, sys: 4.61 s, total: 2min 19s
Wall time: 2min 19s


In [5]:
icd_words = pd.read_csv("strict_medical_words.txt", header=None)[0].values.tolist()

In [6]:
# icd_words = pickle.load(open("unique_words.pickle",'rb'))

In [7]:
dataset = pd.read_pickle("dataset_cleaned_autocorrected")

In [8]:
dataset[8] = dataset[7].apply(lambda x: [item for item in x.split(" ") if item in icd_words])
dataset[9] = dataset[8].apply(lambda x: " ".join(x))

In [9]:
dataset.to_pickle("dataset_cleaned_autocorrected_strict_medical_only")

In [10]:
matrix, vocabulary, words = processReviews(dataset[9].values)

In [11]:
def grid_search(N_TOPICS, lambda_param, edges_threshold):
    k = N_TOPICS
    docs_edges = []
    count = 0
    ignored = []
    taken = []
    for idx, doc in enumerate(dataset[8].values):
        edges = []
        for i in doc:
            for j in doc:
                if i != j:
                    try:
                        a = embeddings_index[i]
                        b = embeddings_index[j]
                        if get_cosine(a, b) > edges_threshold:
                            edges.append((vocabulary[i], vocabulary[j]))
                    except:
                        try:
                            embeddings_index[i]
                            taken.append(i)
                        except:
                            ignored.append(i)
                        try:
                            embeddings_index[j]
                        except:
                            ignored.append(j)
                            taken.append(j)
                        pass
        docs_edges.append(edges)

    edge_dict = {}
    for i in docs_edges:
        for j in i:
            try:
                edge_dict[j[0]] += [j[1]]
            except:
                edge_dict[j[0]] = [j[1]]
    sampler = lda2.LdaSampler(n_topics=N_TOPICS, lambda_param=lambda_param)

    for it, phi in enumerate(sampler.run(matrix, edge_dict)):
        sampler.loglikelihood(docs_edges)

    t_words = sampler.getTopKWords(5, words)
    top_words = [t_words[i] for i in t_words.keys()]

    return coherence_score(matrix, top_words), get_hscore(sampler.theta(), matrix, k), t_words, sampler

In [34]:
N_TOPICS = [9]

In [35]:
edges_threshold = [0.3, 0.5, 0.7]

In [36]:
lambda_param = [0.3, 0.7, 1.0]

In [41]:
dump = []
for i in N_TOPICS:
    for j in edges_threshold:
        for k in lambda_param:
            dump.append((i, j, k, grid_search(N_TOPICS=i, edges_threshold=j, lambda_param=k)))

In [42]:
for idx, i in enumerate(dump):
    print("")
    print(idx, i[0], i[1], i[2], i[3][0], i[3][1])
    for j in i[3][2]:
        print(i[3][2][j])


(0, 9, 0.3, 0.3, -7.018596937541025, 0.1553310190629228)
[u'genetic', u'anesthesia', u'anesthetic', u'institution', u'schedule']
[u'lump', u'wound', u'skin', u'hiv', u'urine']
[u'urea', u'schedule', u'screening', u'preterm', u'sepsis']
[u'residual', u'angioplasty', u'proximal', u'artery', u'using']
[u'lung', u'allergy', u'oriented', u'urology', u'abdominal']
[u'pallor', u'routine', u'uterus', u'murmur', u'cyanosis']
[u'febrile', u'mucosa', u'multi', u'cell', u'global']
[u'cancer', u'diabetic', u'consciousness', u'cycle', u'event']
[u'trauma', u'restricted', u'wound', u'fever', u'knee']

(1, 9, 0.3, 0.7, -6.134952140520126, 0.16113032016033405)
[u'mechanical', u'generalized', u'transfusion', u'weakness', u'due']
[u'residual', u'angioplasty', u'proximal', u'using', u'artery']
[u'hydrocele', u'anesthesia', u'genetic', u'institution', u'anesthetic']
[u'routine', u'uterus', u'pallor', u'murmur', u'cyanosis']
[u'cell', u'multiple', u'hiv', u'skin', u'urine']
[u'allergy', u'lung', u'oriented

In [43]:
sam = dump[1]

In [75]:
sam[3][2]

{0: [u'mechanical', u'generalized', u'transfusion', u'weakness', u'due'],
 1: [u'residual', u'angioplasty', u'proximal', u'using', u'artery'],
 2: [u'hydrocele', u'anesthesia', u'genetic', u'institution', u'anesthetic'],
 3: [u'routine', u'uterus', u'pallor', u'murmur', u'cyanosis'],
 4: [u'cell', u'multiple', u'hiv', u'skin', u'urine'],
 5: [u'allergy', u'lung', u'oriented', u'surgical', u'abdominal'],
 6: [u'global', u'radiotherapy', u'node', u'carcinoma', u'cancer'],
 7: [u'diabetic', u'allergy', u'consciousness', u'cycle', u'event'],
 8: [u'stone', u'renal', u'preterm', u'platelet', u'respiratory']}

In [76]:
sampler = sam[3][3]

In [77]:
for idx, i in enumerate(sampler.theta().argmax(axis=1)):
    print(idx, i)

(0, 0)
(1, 0)
(2, 1)
(3, 2)
(4, 2)
(5, 2)
(6, 7)
(7, 7)
(8, 7)
(9, 7)
(10, 7)
(11, 7)
(12, 8)
(13, 6)
(14, 6)
(15, 6)
(16, 6)
(17, 6)
(18, 7)
(19, 7)
(20, 7)
(21, 7)
(22, 7)
(23, 8)
(24, 7)
(25, 7)
(26, 2)
(27, 2)
(28, 8)
(29, 8)
(30, 8)
(31, 2)
(32, 8)
(33, 8)
(34, 0)
(35, 2)
(36, 0)
(37, 8)
(38, 8)
(39, 2)
(40, 8)
(41, 4)
(42, 4)
(43, 4)
(44, 4)
(45, 4)
(46, 4)
(47, 4)
(48, 4)
(49, 4)
(50, 4)
(51, 0)
(52, 4)
(53, 7)
(54, 2)
(55, 5)
(56, 7)
(57, 8)
(58, 3)
(59, 5)
(60, 5)
(61, 5)
(62, 1)
(63, 5)
(64, 8)
(65, 0)
(66, 4)
(67, 5)
(68, 0)
(69, 5)
(70, 3)
(71, 3)
(72, 8)
(73, 8)
(74, 3)
(75, 3)
(76, 3)
(77, 3)
(78, 7)
(79, 6)
(80, 5)
(81, 5)
(82, 5)
(83, 5)
(84, 5)
(85, 6)
(86, 0)
(87, 1)
(88, 0)
(89, 1)
(90, 1)
(91, 1)
(92, 1)
(93, 1)
(94, 1)
(95, 0)
(96, 1)
(97, 0)
(98, 1)
(99, 1)


In [69]:
# 0 - weak
# 1 - heart stunt
# 2 - anesthetia
# 3 - pergnet
# 4 - HIV
# 5 - surgery
# 6 - cancer
# 7 - allergy
# 8 - stone

In [86]:
dataset[9][12]

u'unilateral ureter catheterization retrograde stone ureter unilateral abdomen ureteric ureteric fragment kidney'

In [87]:
print(dataset['Text'][12])

DEVKAMAL HOSPITAL AND RESEARCH CENTRE
Case No. CASE/HOSP20 P07056/P25187
Package Billed: DJ stent unilateral including cystoscopy, ureteric catheterization, retrograde pyelogram(S700055)
Ureteroscopy+stone removal with lithotripsy, upper ureter, unilateral(S700040)
DEVKAMAL
HOSPITAL देवगन ॐास्पिटल
DEVKAMAL HOSPITAL & RESEARCH CENTRE
(SUPER SPECIALITY HOSPITAL) Bajra Near ITI BusStand Ranchi-834005[Jharichand) -9204055638,07549000111 - www.devkamalhospital.com
IPD/18-19/01698 ASHA RANI
IPO No Patient Name Father/Spouse Company Doctor City
DISCHARGE SUMMARY
Receipt No Age/Gender Admission Date Discharge Date Bed/Room
AYUSHMAN Dr Raj Kumar Sharma Ranchi
38 Year Female 24/12/2015 26/12/2018 CABIN-102
PRESENTING COMPLAINTS : Came with C/O Pain (R) side of abdomen.
PAST HISTORY : No H/O Dm & HTN.
PHYSICAL FINDINGS :
FINAL DIAGNOSIS : Right Ureteric Stone.
CLINICAL IMPRESSION : Uneventful.
PROCEDURE DONE : Cystoscopy (N), Right Sided URS showed Ureteric Stone which was fragmented, it migrated

In [None]:
def plot_TSNE(dt_distribution):
    X_embedded = TSNE(n_components=2).fit_transform(dt_distribution)

    X = np.array([i[0] for i in X_embedded])
    Y = np.array([i[1] for i in X_embedded])
    C = dt_distribution.argmax(axis=1)
    for i in range(10):
        xx = X[[np.where(C == i)[0].tolist()]]
        yy = Y[[np.where(C == i)[0].tolist()]]
        plt.scatter(xx, yy, label=str(i))

    plt.title('Document-Topic Distribution')
    plt.legend(loc=(1.04,0))
    plt.show()
    
def get_doc_details(num):
    print("label: ", C[num])
    print(dataset[9][num])

In [None]:
dump[-1][3][3].argmax(axis=1)[0]

In [None]:
plot_TSNE

In [None]:
# for idx, i in enumerate(dump):
#     if idx == 23:
#         for j in i[3][2]:
#             print(i[3][2][j])

In [12]:
%%time
results =  grid_search(N_TOPICS=9, edges_threshold=0.3, lambda_param=1.0)

CPU times: user 6min 58s, sys: 50.6 ms, total: 6min 58s
Wall time: 6min 58s


In [13]:
results[0]

-7.478090917703349

In [32]:
results[3]

<lda2.LdaSampler at 0x7fe2a18f6190>

In [15]:
results

(-7.478090917703349,
 0.13183622785275587,
 {0: [u'unit', u'diabetic', u'consciousness', u'chemotherapy', u'event'],
  1: [u'abdominal', u'unilateral', u'renal', u'oriented', u'lung'],
  2: [u'inguinal', u'skin', u'cell', u'appendix', u'urine'],
  3: [u'reactive', u'urea', u'antibiotic', u'platelet', u'non'],
  4: [u'artery', u'using', u'angioplasty', u'dominant', u'proximal'],
  5: [u'breast', u'multiple', u'node', u'radiotherapy', u'cancer'],
  6: [u'schedule', u'distress', u'stage', u'preterm', u'sepsis'],
  7: [u'complication', u'uterus', u'murmur', u'pallor', u'cyanosis'],
  8: [u'tibia', u'joint', u'wound', u'fever', u'due']},
 <lda2.LdaSampler at 0x7f9e6391d810>)

In [None]:
# Centroid
# for i in results[2]:
#     a = (results[2][i])
#     avg_dist = []
#     for i in a:
#         sum = 0
#         for  j in a:
#             if i!=j:
#                 distance = get_cosine(embeddings_index[i], embeddings_index[j])
#                 sum += distance
#         avg_dist.append(sum)
#     print(np.array(avg_dist).argmin(), a[np.array(avg_dist).argmin()], a)