In [1]:
from tqdm import tqdm
from collections import Counter
from itertools import combinations
from nltk.stem import PorterStemmer
from sklearn.externals import joblib 
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics.pairwise import euclidean_distances

import imp
import copy
import pickle
import multiprocessing

import numpy as np
import pandas as pd
import utils as my_utils
import ELJST_script_unigram as lda
import matplotlib.pyplot as plt



In [2]:
# Vocabulary document-frequency bounds (forwarded to lda.SentimentLDAGibbsSampler)
# and the number of sampling iterations passed to the run(...) calls.
min_df = 5
max_df = .5
maxIters = 5

# Model settings handed to lda.SentimentLDAGibbsSampler.
# `gamma` is a scalar here; a later cell rebinds it to a per-sentiment list.
beta = .01
gamma = 10
n_topics = 5
n_sentiment = 5
lambda_param = 1.0

In [3]:
# `dataset_name` and `n_docs` are concatenated into the resource file names loaded below.
# NOTE(review): `cutoff` is not referenced by any other cell visible here — confirm it is still needed.
cutoff = 0.3
dataset_name = "amazon_electronics"
n_docs = 100000

In [4]:
# Load the pickled dataset; later cells read its `text` and `sentiment` columns.
dataset = pd.read_pickle("resources/"+ dataset_name + "_" + str(n_docs) + "_dataset")

In [5]:
# Suffix used to pick the similar-words pickle file loaded in the next cell.
embedding_name = "glove_0.6"

In [6]:
# Load the precomputed similar-words structure: one mapping per document (it is
# indexed by document position and iterated with .keys() further down).
# NOTE(security): pickle.load can execute arbitrary code from the file — only
# load resources produced by a trusted pipeline.
# The `with` block closes the file handle, which the bare open(...) never did.
with open("resources/"+ dataset_name + "_" + str(n_docs) + "_" +embedding_name + ".pickle","rb") as similar_words_file:
    similar_words = pickle.load(similar_words_file)

In [7]:
# alpha: length-n_topics vector with every entry equal to 0.1/n_topics.
alpha = 0.1/n_topics * np.ones(n_topics)
# gamma: expand the scalar into n_sentiment equal entries.
# The scalar check keeps this cell re-runnable: once gamma is already a list,
# dividing it again would raise TypeError (list / int).
if np.isscalar(gamma):
    gamma = [gamma/(n_topics*n_sentiment)]*n_sentiment

In [8]:
# imp.reload(lda)

In [9]:
# Build the sampler from the settings defined in the cells above.
# NOTE(review): minlabel / maxlabel / SentimentRange are hard-coded here; presumably they
# describe the value range of dataset.sentiment — confirm against ELJST_script_unigram.
sampler = lda.SentimentLDAGibbsSampler(n_topics, alpha, beta, gamma, numSentiments=n_sentiment, minlabel = 0, 
                                       maxlabel = 5, SentimentRange = 5, max_df = max_df, min_df = min_df, 
                                       lambda_param = lambda_param)

In [10]:
%%time
# Initialise the sampler from the review texts and their sentiment labels.
# The recorded output shows this took about 3 minutes of wall time on 100000 documents.
sampler._initialize_(reviews = dataset.text.tolist(), labels = dataset.sentiment.tolist())

10696 (100000, 10696)
CPU times: user 3min 7s, sys: 7.41 s, total: 3min 15s
Wall time: 3min 15s


In [11]:
# Sum of every entry of the (documents x vocabulary) occurrence matrix.
sampler.wordOccuranceMatrix.sum()

3055259

In [20]:
# Debug alias: lets code written against `self` (such as the cell-level copy of
# the run() preamble below) execute directly in the notebook namespace.
self = sampler

In [24]:
# Debug scaffolding: bind run()'s parameter names as notebook-level variables.
reviews=dataset.text.tolist()
labels=dataset.sentiment.tolist()
# The next line and the `maxIters` line are self-assignments and change nothing;
# they only mirror run()'s keyword arguments.
similar_words=similar_words
mrf=True
maxIters=maxIters

In [None]:
%%time
# Cell-level copy of the first statements of run(); relies on `self`, `maxIters`
# and `similar_words` having been bound in earlier cells.
self.loglikelihoods = np.zeros(maxIters)
numDocs, vocabSize = self.wordOccuranceMatrix.shape

# For every document, flatten its {key: [related, ...]} mapping into a list of
# [key, related] pairs.
self.docs_edges = []
for i in similar_words:
    edges = []
    for j in i.keys():
        for p in i[j]:
            edges.append([j, p])
    self.docs_edges.append(edges)

In [12]:
def run(self, reviews, labels, similar_words, unlabeled_reviews=None, mrf = True, maxIters=100):
    """
    Resample the (topic, sentiment) assignment of every word token, maxIters times.

    self: sampler object providing wordOccuranceMatrix, the count arrays
        (n_dt, n_d, n_dts, n_vts, n_ts, vts), the topics / sentiments /
        probabilities_ts mappings and conditionalDistribution().
    reviews, labels, unlabeled_reviews: accepted for signature compatibility;
        not read by this function.
    similar_words: indexable by document position; each entry maps a key to an
        iterable of related items (presumably word indices — confirm against
        the resource that produced it).
    mrf: forwarded unchanged to self.conditionalDistribution.
    maxIters: number of full passes over all documents.

    Side effects: sets self.loglikelihoods (allocated as zeros and not filled
    in here), sets self.docs_edges, and updates the assignments and counts in
    place. Returns None.
    """
    self.loglikelihoods = np.zeros(maxIters)
    numDocs = self.wordOccuranceMatrix.shape[0]

    # One list of [key, related] pairs per document, in mapping order.
    self.docs_edges = [
        [[key, related] for key in doc_similar.keys() for related in doc_similar[key]]
        for doc_similar in similar_words
    ]

    for iteration in range(maxIters):
        print ("Starting iteration %d of %d" % (iteration + 1, maxIters))
        for d in trange(numDocs):
            for i, v in enumerate(word_indices(self.wordOccuranceMatrix[d, :])):
                # Remove token i of document d from all counts before resampling it.
                t = self.topics[(d, i)]
                s = self.sentiments[(d, i)]
                self.n_dt[d, t] -= 1
                self.n_d[d] -= 1
                self.n_dts[d, t, s] -= 1
                self.n_vts[v, t, s] -= 1
                self.n_ts[t, s] -= 1
                # NOTE(review): cleared unconditionally, even if other tokens of
                # word v are still assigned to (t, s) — confirm this is intended.
                self.vts[v,t,s] = 0

                # Draw a new (topic, sentiment) pair from the flattened conditional table.
                probabilities_ts = self.conditionalDistribution(d, v, similar_words[d], mrf)
                ind = sampleFromCategorical(probabilities_ts.flatten())
                t, s = np.unravel_index(ind, probabilities_ts.shape)

                # Keep the (unnormalised) weight of the pair that was drawn.
                self.probabilities_ts[(d,v)] = probabilities_ts[t,s]

                # Record the new assignment and add the token back to the counts.
                self.topics[(d, i)] = t
                self.sentiments[(d, i)] = s
                self.n_dt[d, t] += 1
                self.n_d[d] += 1
                self.n_dts[d, t, s] += 1
                self.n_vts[v, t, s] += 1
                self.n_ts[t, s] += 1
                self.vts[v,t,s] = 1

In [14]:
from tqdm import trange

In [16]:
def word_indices(wordOccuranceVec):
    """
    Lazily expand a per-vocabulary occurrence vector into word indices.

    Every position holding a non-zero entry is yielded once per occurrence
    (the entry is truncated with int()), in the order reported by nonzero().
    The indices therefore lie in [0, vocab_size - 1].
    """
    nonzero_positions = wordOccuranceVec.nonzero()[0]
    for term in nonzero_positions:
        remaining = int(wordOccuranceVec[term])
        while remaining > 0:
            yield term
            remaining -= 1
            

In [18]:

def sampleFromDirichlet(alpha):
    """
    Draw one sample from a Dirichlet distribution.

    alpha: concentration parameters (length d)
    Returns:
    a length-d vector drawn with numpy's global random state
    """
    draw = np.random.dirichlet(alpha)
    return draw

def sampleFromCategorical(theta):
    """
    Draw a single index from a categorical distribution.

    theta: non-negative weights (length d); they are normalised to sum to 1
        before sampling, so they do not need to be probabilities already
    Returns:
    index ind (0 <= ind < d) chosen with probability theta[ind] / sum(theta)
    """
    total = np.sum(theta)
    probabilities = theta/total
    # A single multinomial trial gives a one-hot vector; argmax is the drawn index.
    one_hot = np.random.multinomial(1, probabilities)
    return one_hot.argmax()


In [19]:
# Call the notebook-level run() defined above, passing the sampler as `self`.
# The recorded output shows this run was interrupted during iteration 1 (KeyboardInterrupt).
run(sampler, reviews=dataset.text.tolist(), labels=dataset.sentiment.tolist(), 
            similar_words=similar_words, mrf=True, maxIters=maxIters)

  0%|          | 1/100000 [00:00<4:41:30,  5.92it/s]

Starting iteration 1 of 5


  0%|          | 52/100000 [00:08<7:55:53,  3.50it/s]


KeyboardInterrupt: 

In [None]:
# Call the sampler's own run() method (from the lda module) rather than the
# notebook-level run() function defined above.
sampler.run(reviews=dataset.text.tolist(), labels=dataset.sentiment.tolist(), 
            similar_words=similar_words, mrf=True, maxIters=maxIters)

In [None]:
# Histogram of how many [key, related] edge pairs each document has.
Counter([len(i) for i in sampler.docs_edges])

# Evaluations

In [None]:
# Plot the sampler's recorded log-likelihood history.
# NOTE(review): the notebook-level run() only sets `loglikelihoods`; `loglikelihood_history`
# is presumably filled by sampler.run() — confirm before relying on this plot.
plt.plot(sampler.loglikelihood_history)

In [None]:
# Silhouette score of the documents, using each document's highest-scoring
# topic (argmax over dt_distribution) as its cluster label.
# Pass the occurrence matrix itself with metric='euclidean' instead of a
# precomputed euclidean_distances(...) matrix: for the 100000 documents loaded
# above the precomputed form needs a dense 100000 x 100000 float array, whereas
# silhouette_score evaluates the same distances internally in chunks.
silhouette_score(sampler.wordOccuranceMatrix,
                 sampler.dt_distribution.argmax(axis=1), metric='euclidean')

In [None]:
# Davies-Bouldin score with the same labelling: each document's argmax topic.
davies_bouldin_score(sampler.wordOccuranceMatrix, sampler.dt_distribution.argmax(axis=1))

In [None]:
# Coherence score from the project helper, given the occurrence matrix, the values of
# sampler.getTopKWords(5) (presumably the top-5 words per topic — confirm) and the vocabulary.
my_utils.coherence_score(sampler.wordOccuranceMatrix, list(sampler.getTopKWords(5).values()), sampler.vocabulary)

In [None]:
%%time
# H-score from the project helper.
# NOTE(review): the meaning of the trailing 500 is not visible here — check my_utils.get_hscore_multi.
my_utils.get_hscore_multi(sampler.dt_distribution, sampler.wordOccuranceMatrix, n_topics, 500)

In [None]:
# Log-likelihood as reported by the sampler in its current state.
sampler.loglikelihood()

In [None]:
# Perplexity as reported by the sampler in its current state.
sampler.perplexity()

### Appendix

In [None]:
# p = [item for sublist in dataset['cleaned'].tolist() for item in sublist]

In [None]:
# sorted(Counter(p))

In [None]:
# def process_l(s):
#     return [i.lemma_ for i in sp(s) if i.lemma_ not in '-PRON-']

In [None]:
# l = dataset['text'].tolist()

In [None]:
# pool = multiprocessing.Pool(n_cores)
# processed_l = pool.map(process_l, l)
# pool.close()

In [None]:
# joblib.dump(sampler, "resources/sampler_20iter_0.5_1")

In [None]:
# pickle_out = open("resources/amazon_muiscal_glove_0.4.pickle","wb")
# pickle.dump(similar_words, pickle_out)
# pickle_out.close()