In [41]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from num2words import num2words
from collections import Counter
from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from scipy.special import gammaln
from collections import Counter
from textblob import TextBlob

import imp
import lda2
import scipy
import operator
import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle

In [42]:
def processReviews(reviews, window=5, MAX_VOCAB_SIZE=1000):
    """Build a bag-of-words count matrix from raw text documents.

    Parameters
    ----------
    reviews : iterable of str
        Documents handed to ``CountVectorizer.fit_transform``.
    window : int, optional
        Currently unused; kept so existing callers keep working.
    MAX_VOCAB_SIZE : int, optional
        Currently unused (it is only referenced by the disabled vectorizer
        options in the comment below); kept for backward compatibility.

    Returns
    -------
    wordOccurenceMatrix : numpy.ndarray
        Dense document-by-term count matrix.
    vocabulary : dict
        Maps each term to its column index in ``wordOccurenceMatrix``.
    words : list
        Terms in column order.
    """
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None)
    # Disabled options, kept for reference:
    # preprocessor=None,stop_words="english",max_features=MAX_VOCAB_SIZE,max_df=.5,min_df=5)
    train_data_features = vectorizer.fit_transform(reviews)

    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); support both so the notebook runs on either.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = vectorizer.get_feature_names_out().tolist()
    else:
        words = vectorizer.get_feature_names()

    vocabulary = dict(zip(words, np.arange(len(words))))
    wordOccurenceMatrix = train_data_features.toarray()
    return wordOccurenceMatrix, vocabulary, words

def get_cosine(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    ``spatial.distance.cosine`` returns a distance (one minus the similarity),
    so the value is flipped back into a similarity here.
    """
    cosine_distance = spatial.distance.cosine(a, b)
    return 1 - cosine_distance

In [43]:
def coherence_score(X, topic_sentiment_df, vocab=None):
    """Score topics by how often their top words co-occur in documents.

    For every ordered pair of distinct words ``(w1, w2)`` inside a topic this
    adds ``log((D(w1, w2) + 1) / D(w2))``, where ``D(w1, w2)`` is the number of
    documents containing both words and ``D(w2)`` the number containing ``w2``.
    The grand total is divided by ``2 * number_of_topics``.

    Parameters
    ----------
    X : numpy.ndarray
        Document-by-term count matrix. It is NOT modified: counts are clipped
        to presence/absence on a copy.
    topic_sentiment_df : sequence of sequences of str
        One collection of words per topic.
    vocab : dict, optional
        Maps a word to its column index in ``X``. Defaults to the module-level
        ``vocabulary`` so existing two-argument calls behave as before.

    Returns
    -------
    float
        The averaged coherence value.

    Raises
    ------
    KeyError
        If a topic word is missing from the vocabulary mapping.
    ZeroDivisionError
        If ``topic_sentiment_df`` is empty.
    """
    if vocab is None:
        vocab = vocabulary

    # np.minimum allocates a new array, so the caller's counts stay intact
    # (assigning through X[X > 1] would binarise the shared matrix in place).
    presence = np.minimum(X, 1)

    totalcnt = len(topic_sentiment_df)
    total = 0
    for allwords in topic_sentiment_df:
        for word1 in allwords:
            for word2 in allwords:
                if word1 != word2:
                    ind1 = vocab[word1]
                    ind2 = vocab[word2]
                    # Dot product of two 0/1 columns = number of documents
                    # in which both words appear.
                    co_docs = np.matmul(presence[:, ind1].T, presence[:, ind2])
                    total += np.log((co_docs + 1.0) / np.sum(presence[:, ind2]))
    return total/(2*totalcnt)

def kl_score(pk,qk):
    """Return the symmetrised KL divergence between ``pk`` and ``qk``.

    The two directed divergences from ``scipy.stats.entropy`` are averaged
    with equal weight, so the result does not depend on argument order.
    """
    forward = scipy.stats.entropy(pk, qk)
    backward = scipy.stats.entropy(qk, pk)
    return (forward*.5 + backward*.5)

def get_hscore(dt_distribution, X, k):
    """Return the ratio of mean intra-cluster to mean inter-cluster distance.

    Each document is hard-assigned to its highest-weight topic; the distance
    between two documents is ``kl_score`` of their topic distributions.

    Parameters
    ----------
    dt_distribution : sequence of array-like
        Row ``i`` is the topic distribution of document ``i``.
    X : numpy.ndarray
        Only ``X.shape[0]`` (the number of documents) is used.
    k : int
        Number of topics / clusters.

    Returns
    -------
    float
        ``intra / inter``. Clusters with fewer than two documents contribute
        no intra-cluster term and empty clusters contribute no inter-cluster
        term (dividing by their zero pair counts would turn the whole score
        into NaN); the averages are taken over the remaining terms. NaN is
        returned when either average has no terms at all. When every cluster
        holds at least two documents the averages are over ``k`` and
        ``k * (k - 1)`` terms respectively.
    """
    n_docs = X.shape[0]

    # Symmetric matrix of pairwise distances; the diagonal stays at zero.
    all_kl_scores = np.zeros((n_docs, n_docs))
    for i in range(n_docs - 1):
        for j in range(i + 1, n_docs):
            score = kl_score(dt_distribution[i], dt_distribution[j])
            all_kl_scores[i, j] = score
            all_kl_scores[j, i] = score

    # Hard assignment: index of the largest entry in each document's row.
    labels = np.array([np.argmax(dt_distribution[i]) for i in range(n_docs)],
                      dtype=int)
    members = [np.flatnonzero(labels == topic) for topic in range(k)]

    # Mean distance over ordered pairs of distinct documents in one cluster.
    intra_terms = []
    for idx in members:
        cnt = len(idx)
        if cnt < 2:
            continue
        within = all_kl_scores[np.ix_(idx, idx)]
        intra_terms.append(within.sum() / (cnt * (cnt - 1)))

    # Mean distance over document pairs drawn from two different clusters.
    inter_terms = []
    for a in range(k):
        for b in range(k):
            if a == b or len(members[a]) == 0 or len(members[b]) == 0:
                continue
            between = all_kl_scores[np.ix_(members[a], members[b])]
            inter_terms.append(between.sum() / (len(members[a]) * len(members[b])))

    if len(intra_terms) == 0 or len(inter_terms) == 0:
        return float("nan")

    # np.mean rather than the builtin sum(): another cell in this notebook
    # rebinds the name ``sum`` at module level.
    intradist = np.mean(intra_terms)
    interdist = np.mean(inter_terms)
    return intradist/interdist

In [44]:
import gensim
# Load word2vec-format vectors from a binary file; other cells look up
# embeddings_index[word] to get a word's vector.
# NOTE(review): the file name suggests 200-dimensional PubMed vectors — confirm.
embeddings_index = gensim.models.KeyedVectors.load_word2vec_format('pubmed2018_w2v_200D/pubmed2018_w2v_200D.bin', binary=True)

In [45]:
import pickle

In [46]:
# Load the collection of words used below to filter the dataset's tokens.
# The with-block closes the file handle once loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("unique_words.pickle", 'rb') as words_file:
    icd_words = pickle.load(words_file)

In [47]:
# Load the pickled dataset; later cells read column 7 as space-separated text.
dataset = pd.read_pickle("dataset_cleaned_autocorrected")

In [48]:
# Build the lookup set once: if icd_words is a list, `item in icd_words` is a
# linear scan for every token of every row, while set membership is O(1).
icd_vocab = set(icd_words)
# Column 8: tokens of column 7 (split on single spaces) that appear in icd_words.
dataset[8] = dataset[7].apply(lambda x: [item for item in x.split(" ") if item in icd_vocab])
# Column 9: the kept tokens re-joined into one space-separated string.
dataset[9] = dataset[8].apply(lambda x: " ".join(x))

In [49]:
# Persist the dataset, including the filtered columns 8 and 9, to disk.
dataset.to_pickle("dataset_cleaned_autocorrected_medical_only")

In [50]:
# Vectorise the filtered text (column 9). These three module-level names are
# read as globals by coherence_score and grid_search.
matrix, vocabulary, words = processReviews(dataset[9].values)

In [51]:
def _resolve_token(token, cache):
    """Return ``(embedding, vocabulary index)`` for ``token``, or None if either lookup fails."""
    if token not in cache:
        try:
            cache[token] = (embeddings_index[token], vocabulary[token])
        except KeyError:
            # No embedding or no matrix column for this token: it cannot
            # take part in any edge.
            cache[token] = None
    return cache[token]


def _document_edges(doc, edges_threshold, cache):
    """List ``(index_a, index_b)`` for ordered pairs of distinct tokens in ``doc`` whose cosine similarity exceeds the threshold."""
    known = []
    for token in doc:
        hit = _resolve_token(token, cache)
        if hit is not None:
            known.append((token, hit[0], hit[1]))

    edges = []
    for tok_a, vec_a, id_a in known:
        for tok_b, vec_b, id_b in known:
            if tok_a != tok_b and get_cosine(vec_a, vec_b) > edges_threshold:
                edges.append((id_a, id_b))
    return edges


def grid_search(N_TOPICS, lambda_param, edges_threshold):
    """Fit one ``lda2.LdaSampler`` configuration and evaluate it.

    Reads the module-level ``dataset``, ``matrix``, ``vocabulary``, ``words``
    and ``embeddings_index``.

    Parameters
    ----------
    N_TOPICS : int
        Number of topics given to the sampler and to ``get_hscore``.
    lambda_param : float
        Passed through to ``lda2.LdaSampler``.
    edges_threshold : float
        Two distinct tokens of a document are linked when the cosine
        similarity of their embeddings is strictly greater than this value.

    Returns
    -------
    tuple
        ``(coherence, hscore, t_words, theta)`` where ``t_words`` is
        ``sampler.getTopKWords(5, words)`` and ``theta`` is ``sampler.theta()``.
    """
    k = N_TOPICS

    # Tokens lacking an embedding or a vocabulary entry are skipped. Lookups
    # are cached per token so each one is resolved once for the whole corpus.
    cache = {}
    docs_edges = [_document_edges(doc, edges_threshold, cache)
                  for doc in dataset[8].values]

    # Adjacency lists: source index -> every linked target index, in order of
    # appearance and with repeats preserved.
    edge_dict = {}
    for edges in docs_edges:
        for src, dst in edges:
            edge_dict.setdefault(src, []).append(dst)

    sampler = lda2.LdaSampler(n_topics=N_TOPICS, lambda_param=lambda_param)

    for it, phi in enumerate(sampler.run(matrix, edge_dict)):
        # NOTE(review): the return value is discarded; the call is kept in
        # case it has side effects inside lda2 — confirm.
        sampler.loglikelihood(docs_edges)

    t_words = sampler.getTopKWords(5, words)
    top_words = [t_words[i] for i in t_words.keys()]

    # Fetch theta once so the H-score and the returned value are the same object.
    theta = sampler.theta()

    # Hand coherence_score a copy so the shared count matrix can never be
    # altered between successive grid_search runs.
    return coherence_score(matrix.copy(), top_words), get_hscore(theta, matrix, k), t_words, theta

In [52]:
# Candidate topic counts for the grid search (the sweep cell is currently commented out).
N_TOPICS = [3, 7, 10]

In [53]:
# Candidate cosine-similarity cut-offs for linking two words in grid_search.
edges_threshold = [0.3, 0.5, 0.7]

In [54]:
# Candidate values for the lambda_param argument forwarded to lda2.LdaSampler.
lambda_param = [0.3, 0.7, 1.0]

In [55]:
# dump = []
# for i in N_TOPICS:
#     for j in edges_threshold:
#         for k in lambda_param:
#             dump.append((i, j, k, grid_search(N_TOPICS=i, edges_threshold=j, lambda_param=k)))

In [56]:
# for idx, i in enumerate(dump):
#     if i[3][0] > -6 and i[3][1] < 0.3:
#         print(idx, i[0], i[1], i[2], i[3][0], i[3][1])

In [57]:
# for idx, i in enumerate(dump):
#     if idx == 23:
#         for j in i[3][2]:
#             print(i[3][2][j])

In [58]:
# Single run with fixed settings; results is the 4-tuple
# (coherence, hscore, top words per topic, theta) returned by grid_search.
results =  grid_search(N_TOPICS=10, edges_threshold=0.7, lambda_param=0.3)

In [67]:
# For every topic, print the position and the word whose summed cosine
# similarity to the topic's other top words is smallest, then the word list.
# NOTE(review): argmin over similarities selects the LEAST related word; if
# the most central word was intended this should be argmax — confirm.
for topic_id in results[2]:
    topic_words = results[2][topic_id]
    similarity_totals = []
    for word in topic_words:
        # Dedicated accumulator name: binding ``sum`` here would replace the
        # builtin for every cell that runs afterwards.
        total_similarity = 0
        for other in topic_words:
            if word != other:
                total_similarity += get_cosine(embeddings_index[word], embeddings_index[other])
        similarity_totals.append(total_similarity)
    least_similar = np.array(similarity_totals).argmin()
    print(least_similar, topic_words[least_similar], topic_words)

(0, u'area', [u'area', u'carcinoma', u'bilateral', u'neck', u'node'])
(0, u'consciousness', [u'consciousness', u'event', u'free', u'chemotherapy', u'cycle'])
(2, u'front', [u'house', u'sepsis', u'front', u'respiratory', u'baby'])
(1, u'arch', [u'stage', u'arch', u'central', u'carcinoma', u'lower'])
(0, u'absent', [u'absent', u'new', u'valve', u'presenting', u'drug'])
(0, u'oriented', [u'oriented', u'category', u'lung', u'surgical', u'visit'])
(1, u'inguinal', [u'ray', u'inguinal', u'multiple', u'closed', u'cell'])
(4, u'lap', [u'skin', u'hiv', u'appendix', u'urine', u'lap'])
(0, u'angioplasty', [u'angioplasty', u'mild', u'dominant', u'proximal', u'disease'])
(0, u'due', [u'due', u'calculus', u'count', u'blood', u'renal'])


In [65]:
# Third element of grid_search's return value: the top-5 words for each topic.
results[2]

{0: [u'area', u'carcinoma', u'bilateral', u'neck', u'node'],
 1: [u'consciousness', u'event', u'free', u'chemotherapy', u'cycle'],
 2: [u'house', u'sepsis', u'front', u'respiratory', u'baby'],
 3: [u'stage', u'arch', u'central', u'carcinoma', u'lower'],
 4: [u'absent', u'new', u'valve', u'presenting', u'drug'],
 5: [u'oriented', u'category', u'lung', u'surgical', u'visit'],
 6: [u'ray', u'inguinal', u'multiple', u'closed', u'cell'],
 7: [u'skin', u'hiv', u'appendix', u'urine', u'lap'],
 8: [u'angioplasty', u'mild', u'dominant', u'proximal', u'disease'],
 9: [u'due', u'calculus', u'count', u'blood', u'renal']}