#### Dependencies

In [1]:
# !pip install numpy num2words nltk pandas Observations gensim
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# !pip install -U textblob
# !python -m textblob.download_corpora

#### Imports

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from num2words import num2words
from collections import Counter
from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from scipy.special import gammaln
from collections import Counter
from textblob import TextBlob
from sklearn.manifold import TSNE
from pylab import savefig
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from scipy.stats import chi2

import gzip
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import imp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim
import pickle
# import lda2
import lda_informativeness_usecase as lda
import scipy
import operator
import nltk
import os
import string
import copy
import pickle

In [3]:
# Reload the sampler module so that on-disk edits to
# lda_informativeness_usecase take effect without restarting the kernel.
imp.reload(lda)

<module 'lda_informativeness_usecase' from 'lda_informativeness_usecase.pyc'>

### Preprocessing Methods

In [100]:
def convert_numbers(k):
    """Blank out integer-like tokens of a token list, in place.

    Despite the name, numbers are not spelled out: every token that
    ``int()`` can parse is overwritten with a single space, and every other
    token is left untouched.

    Args:
        k: Mutable sequence of tokens (typically a list of strings).

    Returns:
        The same sequence object ``k``, modified in place.
    """
    for position, token in enumerate(k):
        try:
            int(token)
        except (TypeError, ValueError, OverflowError):
            # Not an integer literal (or not parseable at all): keep it.
            continue
        k[position] = " "
    return k

def get_cosine(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as one minus SciPy's cosine distance.
    """
    cosine_distance = spatial.distance.cosine(a, b)
    return 1 - cosine_distance

def preprocess(pd):
    """Normalise raw text into lists of lemmatised, stop-word-free tokens.

    Steps, in order: lower-case, replace punctuation/newlines/tabs with
    spaces, whitespace-tokenise, blank out integer tokens, re-join and
    re-tokenise (which drops the blanks), lemmatise, remove stop words.

    Reads the notebook-level globals ``w_tokenizer``, ``lemmatizer`` and
    ``stop_words`` and calls ``convert_numbers``.

    Args:
        pd: pandas Series of strings. The parameter name shadows the usual
            ``pandas`` alias inside this function; it is kept so existing
            callers are unaffected.

    Returns:
        pandas Series holding one list of tokens per input row.
    """
    texts = pd.str.lower()
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would leave all punctuation in place.
    texts = texts.str.replace('[{}]'.format('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n\t'), ' ', regex=True)
    texts = texts.apply(lambda text: convert_numbers(list(w_tokenizer.tokenize(text))))
    # Joining and tokenising again discards the " " placeholders left by
    # convert_numbers.
    texts = texts.str.join(' ')

    # Set membership is equivalent to scanning the stop-word list, but O(1).
    stop_set = set(stop_words)

    def _lemmatise_and_filter(text):
        lemmas = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]
        return [lemma for lemma in lemmas if lemma not in stop_set]

    return texts.apply(_lemmatise_and_filter)

def processReviews(reviews, window=5, MAX_VOCAB_SIZE=2000):
    """Vectorise documents into dense count and TF-IDF matrices.

    Args:
        reviews: Iterable of document strings.
        window: Unused; kept so existing callers keep working.
        MAX_VOCAB_SIZE: Passed as ``max_features`` to both vectorizers.

    Returns:
        Tuple ``(count_matrix, tfidf_matrix, vocabulary, words)`` where the
        two matrices are dense arrays, ``words`` is the list of feature
        names of the count vectorizer and ``vocabulary`` maps each of those
        words to its column index.
    """
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None, max_features=MAX_VOCAB_SIZE)
    count_matrix = vectorizer.fit_transform(reviews)

    tfidf_vectorizer = TfidfVectorizer(max_features=MAX_VOCAB_SIZE)
    tfidf_matrix = tfidf_vectorizer.fit_transform(reviews)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = list(vectorizer.get_feature_names_out())
    else:
        words = vectorizer.get_feature_names()

    vocabulary = dict(zip(words, np.arange(len(words))))
    return count_matrix.toarray(), tfidf_matrix.toarray(), vocabulary, words

### Score Methods

In [5]:
def coherence_score(X, topic_sentiment_df):
    """Score how often the top words of each topic co-occur in documents.

    For every ordered pair of distinct words within a topic, adds
    ``log((D(w1, w2) + 1) / D(w2))`` where ``D`` counts the documents that
    contain the word(s), then divides the total by twice the number of
    topics.

    Reads the notebook-level ``vocabulary`` (word -> column index of ``X``).

    Args:
        X: 2-D document-by-word matrix. Entries above 1 are treated as 1;
            the caller's array is NOT modified.
        topic_sentiment_df: Iterable of word collections, one per topic.
            Must be non-empty (an empty one raises ZeroDivisionError).

    Returns:
        The coherence score as a float.
    """
    # Clip on a copy: assigning into X would binarise the caller's matrix.
    presence = np.minimum(np.asarray(X), 1)
    totalcnt = len(topic_sentiment_df)
    total = 0
    for allwords in topic_sentiment_df:
        for word1 in allwords:
            for word2 in allwords:
                if word1 == word2:
                    continue
                ind1 = vocabulary[word1]
                ind2 = vocabulary[word2]
                co_occurrence = np.dot(presence[:, ind1], presence[:, ind2])
                total += np.log((co_occurrence + 1.0) / np.sum(presence[:, ind2]))
    return total/(2*totalcnt)

def kl_score(pk,qk):
    """Symmetrised KL divergence: the mean of KL(pk || qk) and KL(qk || pk)."""
    forward = scipy.stats.entropy(pk, qk)
    backward = scipy.stats.entropy(qk, pk)
    return forward * .5 + backward * .5

def get_hscore(dt_distribution, X, k):
    """Ratio of mean intra-topic to mean inter-topic document distance.

    Each document is assigned to the topic with the largest entry in its
    row of ``dt_distribution``; distances between documents are measured
    with ``kl_score`` on those rows.

    Args:
        dt_distribution: Indexable of per-document NumPy vectors (rows must
            support ``.argmax()``).
        X: Only ``X.shape[0]`` is used, as the number of documents.
        k: Number of topics.

    Returns:
        ``intradist / interdist``. Topics with fewer than two documents
        contribute no intra-topic term and empty topics contribute no
        inter-topic term; each average is taken over the contributing
        terms, which equals dividing by ``k`` and ``k * (k - 1)`` when every
        topic has at least two documents. Returns NaN when either average
        has no terms.
    """
    n_docs = X.shape[0]

    # Symmetric pairwise distance matrix; the diagonal stays zero.
    all_kl_scores = np.zeros((n_docs, n_docs))
    for i in range(n_docs - 1):
        for j in range(i + 1, n_docs):
            score = kl_score(dt_distribution[i], dt_distribution[j])
            all_kl_scores[i, j] = score
            all_kl_scores[j, i] = score

    # Hard assignment, then the document indices belonging to each topic.
    assigned = np.array([dt_distribution[i].argmax() for i in range(n_docs)], dtype=int)
    members = [np.flatnonzero(assigned == topic) for topic in range(k)]

    intra_terms = []
    for idx in members:
        cnt = len(idx)
        if cnt < 2:
            # cnt * (cnt - 1) would be zero: no pair to average over.
            continue
        pair_sum = all_kl_scores[np.ix_(idx, idx)].sum()
        intra_terms.append(pair_sum / float(cnt * (cnt - 1)))

    inter_terms = []
    for i in range(k):
        for j in range(k):
            if i == j or len(members[i]) == 0 or len(members[j]) == 0:
                continue
            pair_sum = all_kl_scores[np.ix_(members[i], members[j])].sum()
            inter_terms.append(pair_sum / float(len(members[i]) * len(members[j])))

    if not intra_terms or not inter_terms:
        return float("nan")

    intradist = sum(intra_terms) / len(intra_terms)
    interdist = sum(inter_terms) / len(inter_terms)
    return intradist / interdist

### Required Methods

In [6]:
def plot_simple_TSNE(data, title):
    """Embed ``data`` in two dimensions with t-SNE and show a scatter plot.

    Args:
        data: Matrix passed to ``TSNE.fit_transform`` (one row per point).
        title: Title of the figure.
    """
    embedding = TSNE(n_components=2).fit_transform(data)
    xs, ys = embedding[:, 0], embedding[:, 1]

    plt.scatter(xs, ys)
    plt.title(title)
    # The scatter above carries no label, so this legend has no entries.
    plt.legend(loc=(1.04,0))
    plt.show()

def plot_TSNE(dt_distribution, C, labels, printit, title):
    """Show a 2-D t-SNE embedding with one scatter series per label.

    Args:
        dt_distribution: Matrix passed to ``TSNE.fit_transform`` (one row
            per point).
        C: Array-like of integer group ids, one per row; points with
            ``C == i`` are drawn with ``labels[i]``.
        labels: Sequence of legend labels, indexed by group id.
        printit: If truthy, annotate every point with its row index.
        title: Title of the figure.
    """
    X_embedded = TSNE(n_components=2).fit_transform(dt_distribution)
    plt.figure(figsize=(10, 10))
    xs, ys = X_embedded[:, 0], X_embedded[:, 1]

    assignments = np.asarray(C)
    for group, label in enumerate(labels):
        # Boolean mask instead of a nested-list index, whose meaning changed
        # across NumPy releases.
        in_group = assignments == group
        plt.scatter(xs[in_group], ys[in_group], label=label)

    if printit:
        for idx, point in enumerate(X_embedded):
            plt.annotate(str(idx), (point[0], point[1]))
    plt.title(title)
    plt.legend(loc=(1.04,0))
    plt.show()
    
def get_doc_details(num):
    """Print the label and the preprocessed text of document ``num``.

    Reads the notebook-level globals ``C`` and ``dataset``.
    """
    # NOTE(review): no cell visible here assigns a global ``C`` (it only
    # appears as a parameter of plot_TSNE) -- confirm where it is defined.
    label = C[num]
    print("label: ", label)
    text = dataset[9].values[num]
    print(text)

### Read Data

In [7]:
def loadGloveModel(gloveFile):
    """Load whitespace-separated word embeddings from a text file.

    Every non-blank line is split on whitespace; the first field is the
    word and the remaining fields are parsed as floats.

    Args:
        gloveFile: Path of the embeddings text file.

    Returns:
        dict mapping each word to a 1-D NumPy float array. If a word occurs
        more than once, the last occurrence wins.

    Raises:
        ValueError: If a value field cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # The context manager closes the handle even when parsing fails.
    with open(gloveFile, 'r') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Blank line: nothing to index.
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done.",len(model)," words loaded!")
    return model

In [8]:
# Word -> embedding-vector dict built from the GloVe text file; the recorded
# run loaded 1,917,494 words.
embeddings_index = loadGloveModel("nongit_resources/glove.42B.300d.txt")

Loading Glove Model
('Done.', 1917494, ' words loaded!')


In [9]:
# embeddings_index = gensim.models.KeyedVectors.load_word2vec_format('glove.42B.300d.txt')

In [10]:
# def parse(path):
#     g = gzip.open(path, 'r')
#     for l in g:
#         yield eval(l)

In [79]:
def parse(path):
    """Lazily yield one evaluated Python object per line of a gzip file.

    Args:
        path: Path of the gzip-compressed file; every line must be a valid
            Python expression (for example a dict literal).

    Yields:
        The result of evaluating each line.
    """
    # SECURITY: eval() executes whatever expression a line contains. Only
    # use this on trusted files; for strict-JSON input, json.loads would be
    # the safe alternative.
    # The context manager closes the file once the generator is exhausted
    # or closed.
    with gzip.open(path, 'r') as g:
        for l in g:
            yield eval(l)

In [101]:
# Number of reviews kept for the experiment; the dataset is truncated to
# this many rows with head(N_docs).
N_docs = 2000

In [102]:
# parse() is a generator: nothing is read from disk until it is consumed.
dataset = parse("resources/reviews_Musical_Instruments_5.json.gz")

In [103]:
# Materialise every parsed record into a DataFrame. `dataset` is rebound from
# a generator to a DataFrame here, so re-running only this cell would build a
# frame from the column labels instead of the records.
dataset = pd.DataFrame(list(dataset))

In [104]:
# Keep only the first N_docs rows.
dataset = dataset.head(N_docs)

In [105]:
# Raw (unprocessed) review texts as a NumPy array.
reviews = dataset['reviewText'].values

In [106]:
# Values of the 'overall' column (presumably the review's star rating --
# confirm against the dataset description). The [:N_docs] slice changes
# nothing here because dataset was already cut with head(N_docs).
ratings = dataset['overall'].values[:N_docs]

In [107]:
# Sanity check: the recorded run shows 2000, matching N_docs.
len(ratings)

2000

In [108]:
# Shared NLP resources; preprocess() reads stop_words, lemmatizer and
# w_tokenizer as globals, so this cell must run before preprocess() is called.
stop_words = stopwords.words('english')
lemmatizer = nltk.stem.WordNetLemmatizer()
# NOTE(review): stemmer is not referenced by any cell visible here.
stemmer = nltk.stem.PorterStemmer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

In [109]:
# Column 8 holds the token list of each review; column 9 holds the same
# tokens joined back into one space-separated string.
dataset[8] = preprocess(dataset['reviewText'])
dataset[9] = dataset[8].str.join(" ")

In [110]:
# Count and TF-IDF matrices, the word -> column-index mapping and the list of
# feature names, all built from the cleaned review strings.
count_matrix, tfidf_matrix, vocabulary, words = processReviews(dataset[9].values)

In [111]:
# Shapes of both matrices; the recorded run shows (2000, 2000) for each.
count_matrix.shape, tfidf_matrix.shape

((2000, 2000), (2000, 2000))

### Find Edges

In [112]:
# Two words are linked by an edge only when the cosine similarity of their
# embeddings is strictly greater than this value.
edges_threshold = 0.8

In [113]:
# For every document, collect the pairs of vocabulary indices whose word
# embeddings have cosine similarity above edges_threshold. Each unordered pair
# is stored once per document, in the orientation in which it is first met.
# `taken` / `ignored` log the words of failed pairs that do / do not have an
# embedding (one entry per failed pair, so words repeat).
docs_edges, ignored, taken, count = [], [], [], 0
for idx, doc in enumerate(dataset[8].values):
    edges = []
    # Both orientations of every stored pair, for O(1) duplicate checks.
    seen_pairs = set()
    for i in doc:
        for j in doc:
            if i == j:
                continue
            try:
                a = embeddings_index[i]
                b = embeddings_index[j]
                if get_cosine(a, b) > edges_threshold:
                    pair = (vocabulary[i], vocabulary[j])
                    if pair not in seen_pairs:
                        seen_pairs.update((pair, pair[::-1]))
                        edges.append(pair)
            except KeyError:
                # A word is missing from the embeddings or from the
                # vocabulary; record both words of the pair the same way.
                for word in (i, j):
                    if word in embeddings_index:
                        taken.append(word)
                    else:
                        ignored.append(word)
    docs_edges.append(edges)

In [114]:
# Undirected adjacency lists: vocabulary index -> indices it shares an edge
# with in any document. Repeated neighbours are possible at this point.
edge_dict = {}
for doc_edges in docs_edges:
    for left, right in doc_edges:
        # Each endpoint is handled independently, so adding a new endpoint
        # never resets the neighbour list of one that already exists.
        edge_dict.setdefault(left, []).append(right)
        edge_dict.setdefault(right, []).append(left)

In [115]:
# Drop repeated neighbours; the order inside each list becomes whatever set
# iteration yields.
for node, neighbours in edge_dict.items():
    edge_dict[node] = list(set(neighbours))

In [116]:
# Number of vocabulary indices that take part in at least one edge
# (332 in the recorded run).
len(edge_dict)

332

## Run Model

In [117]:
# Sampler settings. k is an alias of N_TOPICS and is what get_hscore() is
# called with.
k = N_TOPICS = 9
# Passed to LdaSampler as n_sentiment.
N_SENTIMENT = 5
# Passed to LdaSampler as lambda_param.
lambda_param = 1.0

In [118]:
# Reload the sampler module again so the sampler built next uses the latest
# on-disk version of lda_informativeness_usecase.
imp.reload(lda)

<module 'lda_informativeness_usecase' from 'lda_informativeness_usecase.pyc'>

In [119]:
# Build the sampler with the topic count, sentiment count and lambda set in
# the parameter cell.
sampler = lda.LdaSampler(n_sentiment = N_SENTIMENT, n_topics=N_TOPICS, lambda_param=lambda_param)

In [120]:
# Ten rounds; each round calls sampler.run with maxiter=20 and then prints the
# log-likelihood reported by the sampler. The recorded run was interrupted
# during the fourth round.
for round_no in range(10):
    print(round_no)
    sampler.run(count_matrix, ratings, edge_dict, maxiter=20)
    print(sampler.loglikelihood(docs_edges))

0
-593823.3535130287
1
-591720.1200205017
2
-585608.8595618601
3


KeyboardInterrupt: 

In [None]:
# getTopKWords(5, words): presumably the 5 top words of every topic, keyed by
# topic -- confirm against LdaSampler. top_words flattens that mapping into a
# list with one entry per key.
t_words = sampler.getTopKWords(5, words)
top_words = []
for topic_key in t_words.keys():
    top_words.append(t_words[topic_key])
# NOTE(review): document_topic is not read by any cell visible here.
document_topic = sampler.theta().argmax(axis=1)

In [None]:
# Display the mapping returned by getTopKWords.
t_words

### Evaluation

In [None]:
# Coherence of the per-topic top words, measured on the document-word counts.
coherence_score(count_matrix, top_words)

In [None]:
# H-score of the rows of sampler.theta(); count_matrix is passed only for its
# number of rows (documents).
get_hscore(sampler.theta(), count_matrix, k)

In [None]:
# Hard assignment: index of the largest theta entry in every row.
topics_generated = sampler.theta().argmax(axis=1)
# Matrix product theta x phi. NOTE(review): presumably the reconstructed
# document-word distribution -- confirm against LdaSampler.
document_word_sampler = np.dot(sampler.theta(), sampler.phi())

In [None]:
# Display the list of top words per topic.
top_words

### Visualizing D-T

In [None]:
# One integer label per topic. range() is used instead of xrange() so the cell
# runs on both Python 2 and Python 3.
labels = list(range(N_TOPICS))
# labels = ['Treatment', 'Heart Surgery', 'Symptoms', 'Procedure', 'Pregnency', 'Clinical', 'Tests', 'Cancer', 'Abdomen']
print(labels)

In [None]:
# t-SNE of the theta x phi rows, one scatter series per assigned topic, with
# every point annotated by its document index (printit=True); no title is set.
plot_TSNE(document_word_sampler, topics_generated, labels, True, "")