In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from num2words import num2words
from collections import Counter
from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from scipy.special import gammaln
from collections import Counter
from textblob import TextBlob

import pickle
import gensim
import imp
import lda2
import scipy
import operator
import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle

In [2]:
def processReviews(reviews, window=5, MAX_VOCAB_SIZE=1000):
    vectorizer = CountVectorizer(analyzer="word",tokenizer=None)
    train_data_features = vectorizer.fit_transform(reviews)
    words = vectorizer.get_feature_names()
    vocabulary = dict(zip(words,np.arange(len(words))))
    inv_vocabulary = dict(zip(np.arange(len(words)),words))
    wordOccurenceMatrix = train_data_features.toarray()
    return wordOccurenceMatrix, vocabulary, words

def get_cosine(a, b):
    return 1 - spatial.distance.cosine(a, b)

In [3]:
def coherence_score(X, topic_sentiment_df):
    X[X>1] = 1    
    totalcnt = len(topic_sentiment_df)
    total = 0
    for allwords in topic_sentiment_df:
        for word1 in allwords:
            for word2 in allwords:
                if word1 != word2:
                    ind1 = vocabulary[word1]
                    ind2 = vocabulary[word2]
                    total += np.log((np.matmul(X[:,ind1].T, X[:,ind2]) + 1.0)/np.sum(X[:,ind2]))
    return total/(2*totalcnt)

def kl_score(pk,qk):
    return (scipy.stats.entropy(pk,qk)*.5 + scipy.stats.entropy(qk,pk)*.5)

def get_hscore(dt_distribution, X, k):
    testlen = X.shape[0]
    all_kl_scores = np.zeros((testlen, testlen))
    for i in range(testlen-1):
        for j in range(i+1,testlen):
            score = kl_score(dt_distribution[i],dt_distribution[j])
            all_kl_scores[i,j] = score
            all_kl_scores[j,i] = score

    dt = np.zeros((X.shape[0], k))

    for i in range(X.shape[0]):
        dt[i, dt_distribution[i].argmax()]=1

    intradist = 0
    for i in range(k):
        cnt = dt[:,i].sum()
        tmp = np.outer(dt[:,i],dt[:,i])
        tmp = tmp * all_kl_scores
        intradist += tmp.sum()*1.0/(cnt*(cnt-1))
#         print(cnt, tmp.sum(), intradist)
    intradist = intradist/k
    

    interdist = 0
    for i in range(k):
       for j in range(k):
           if i != j:
             cnt_i = dt[:,i].sum()
             cnt_j = dt[:,j].sum()
             tmp = np.outer(dt[:,i], dt[:,j])
             tmp = tmp * all_kl_scores
             interdist += tmp.sum()*1.0/(cnt_i*cnt_j)
    interdist = interdist/(k*(k-1))
    return intradist/interdist

In [4]:
%%time
embeddings_index = gensim.models.KeyedVectors.load_word2vec_format('pubmed2018_w2v_200D/pubmed2018_w2v_200D.bin', binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CPU times: user 3min 25s, sys: 5.87 s, total: 3min 31s
Wall time: 3min 31s


In [25]:
icd_words = pd.read_csv("strict_medical_words.txt", header=None)[0].values.tolist()

In [26]:
# icd_words = pickle.load(open("unique_words.pickle",'rb'))

In [27]:
dataset = pd.read_pickle("dataset_cleaned_autocorrected")

In [28]:
dataset[8] = dataset[7].apply(lambda x: [item for item in x.split(" ") if item in icd_words])
dataset[9] = dataset[8].apply(lambda x: " ".join(x))

In [29]:
dataset.to_pickle("dataset_cleaned_autocorrected_strict_medical_only")

In [32]:
matrix, vocabulary, words = processReviews(dataset[9].values)

In [33]:
def grid_search(N_TOPICS, lambda_param, edges_threshold):
    k = N_TOPICS
    docs_edges = []
    count = 0
    ignored = []
    taken = []
    for idx, doc in enumerate(dataset[8].values):
        edges = []
        for i in doc:
            for j in doc:
                if i != j:
                    try:
                        a = embeddings_index[i]
                        b = embeddings_index[j]
                        if get_cosine(a, b) > edges_threshold:
                            edges.append((vocabulary[i], vocabulary[j]))
                    except:
                        try:
                            embeddings_index[i]
                            taken.append(i)
                        except:
                            ignored.append(i)
                        try:
                            embeddings_index[j]
                        except:
                            ignored.append(j)
                            taken.append(j)
                        pass
        docs_edges.append(edges)

    edge_dict = {}
    for i in docs_edges:
        for j in i:
            try:
                edge_dict[j[0]] += [j[1]]
            except:
                edge_dict[j[0]] = [j[1]]
    sampler = lda2.LdaSampler(n_topics=N_TOPICS, lambda_param=lambda_param)

    for it, phi in enumerate(sampler.run(matrix, edge_dict)):
        sampler.loglikelihood(docs_edges)



    t_words = sampler.getTopKWords(5, words)
    top_words = [t_words[i] for i in t_words.keys()]

    return coherence_score(matrix, top_words), get_hscore(sampler.theta(), matrix, k), t_words, sampler.theta()

In [34]:
N_TOPICS = [7, 9]

In [35]:
edges_threshold = [0.3, 0.5, 0.7]

In [36]:
lambda_param = [0.3, 0.7, 1.0]

In [37]:
dump = []
for i in N_TOPICS:
    for j in edges_threshold:
        for k in lambda_param:
            dump.append((i, j, k, grid_search(N_TOPICS=i, edges_threshold=j, lambda_param=k)))

In [38]:
for idx, i in enumerate(dump):
    print("")
    print(idx, i[0], i[1], i[2], i[3][0], i[3][1])
    for j in i[3][2]:
        print(i[3][2][j])


(0, 7, 0.3, 0.3, -9.10807374481516, 0.10943909337950633)
[u'hiv', u'appendix', u'renal', u'urine', u'cell']
[u'stage', u'distress', u'schedule', u'screening', u'sepsis']
[u'anesthetic', u'radiation', u'node', u'radiotherapy', u'cancer']
[u'non', u'sac', u'diabetic', u'hip', u'knee']
[u'allergy', u'cyanosis', u'diagnostic', u'chemotherapy', u'consciousness']
[u'allergy', u'abdominal', u'fever', u'oriented', u'lung']
[u'artery', u'using', u'angioplasty', u'dominant', u'proximal']

(1, 7, 0.3, 0.7, -8.432179062289256, 0.1548748443715706)
[u'stage', u'diabetic', u'consciousness', u'cycle', u'event']
[u'anesthesia', u'anesthetic', u'institution', u'genetic', u'sac']
[u'preterm', u'stage', u'screening', u'due', u'respiratory']
[u'residual', u'angioplasty', u'using', u'proximal', u'artery']
[u'urology', u'carcinoma', u'cancer', u'fever', u'abdominal']
[u'wound', u'stomach', u'skin', u'hiv', u'urine']
[u'murmur', u'pallor', u'cell', u'cyanosis', u'platelet']

(2, 7, 0.3, 1.0, -7.7812096553632

In [43]:
dataset[9][0]

u'multi trauma cerebrovascular accident fibula hemiplegia due parietal lobe weakness slurred associated special multi'

In [None]:
def plot_TSNE(dt_distribution):
    X_embedded = TSNE(n_components=2).fit_transform(dt_distribution)

    X = np.array([i[0] for i in X_embedded])
    Y = np.array([i[1] for i in X_embedded])
    C = dt_distribution.argmax(axis=1)
    for i in range(10):
        xx = X[[np.where(C == i)[0].tolist()]]
        yy = Y[[np.where(C == i)[0].tolist()]]
        plt.scatter(xx, yy, label=str(i))

    plt.title('Document-Topic Distribution')
    plt.legend(loc=(1.04,0))
    plt.show()
    
def get_doc_details(num):
    print("label: ", C[num])
    print(dataset[9][num])

In [44]:
dump[0]

(7,
 0.3,
 0.3,
 (-9.10807374481516,
  0.10943909337950633,
  {0: [u'hiv', u'appendix', u'renal', u'urine', u'cell'],
   1: [u'stage', u'distress', u'schedule', u'screening', u'sepsis'],
   2: [u'anesthetic', u'radiation', u'node', u'radiotherapy', u'cancer'],
   3: [u'non', u'sac', u'diabetic', u'hip', u'knee'],
   4: [u'allergy',
    u'cyanosis',
    u'diagnostic',
    u'chemotherapy',
    u'consciousness'],
   5: [u'allergy', u'abdominal', u'fever', u'oriented', u'lung'],
   6: [u'artery', u'using', u'angioplasty', u'dominant', u'proximal']},
  array([[0.00680272, 0.00680272, 0.00680272, 0.82312925, 0.14285714,
          0.00680272, 0.00680272],
         [0.03703704, 0.03703704, 0.03703704, 0.03703704, 0.03703704,
          0.77777778, 0.03703704],
         [0.01754386, 0.01754386, 0.01754386, 0.01754386, 0.01754386,
          0.71929825, 0.19298246],
         [0.01298701, 0.01298701, 0.92207792, 0.01298701, 0.01298701,
          0.01298701, 0.01298701],
         [0.00934579, 0.0093

In [None]:
# for idx, i in enumerate(dump):
#     if idx == 23:
#         for j in i[3][2]:
#             print(i[3][2][j])

In [None]:
%%time
results =  grid_search(N_TOPICS=9, edges_threshold=0.7, lambda_param=1.0)

In [None]:
results[2]

In [None]:
# Centroid
# for i in results[2]:
#     a = (results[2][i])
#     avg_dist = []
#     for i in a:
#         sum = 0
#         for  j in a:
#             if i!=j:
#                 distance = get_cosine(embeddings_index[i], embeddings_index[j])
#                 sum += distance
#         avg_dist.append(sum)
#     print(np.array(avg_dist).argmin(), a[np.array(avg_dist).argmin()], a)