In [1]:
from scipy import spatial, sparse
from scipy.stats import chi2
from collections import Counter
from num2words import num2words
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.externals import joblib 
from nltk.tokenize import word_tokenize
from sklearn.model_selection import KFold
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances

import os
import imp
import gzip
import copy
import nltk
import pickle
import scipy
import string
import gensim
import operator
import datetime
import multiprocessing

import numpy as np
import pandas as pd
#import LDA_ELJST as lda
import ELJST_script_unigram as lda
#import LJST_script_BTM as lda
#import ELJST_script_BTM as lda
import matplotlib.pyplot as plt



In [2]:
import utils as my_utils

In [3]:
def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    Args:
        path: Path to a gzip file in which every line is a Python expression
            (for example a dict literal).

    Yields:
        The object produced by evaluating each line, in file order.
    """
    # The context manager closes the gzip handle when the generator is
    # exhausted or closed; previously the handle was never closed explicitly.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY NOTE(review): eval() executes whatever expression the
            # line contains. Only feed this trusted files; if the data is
            # plain literals, ast.literal_eval / json.loads would be safer.
            yield eval(l)

def getDF(path):
    """Build a DataFrame with one row per record yielded by ``parse(path)``.

    Records are numbered 0, 1, 2, ... in the order they are yielded, and that
    number becomes the row index.

    Args:
        path: Path forwarded unchanged to ``parse``.

    Returns:
        pandas.DataFrame built with ``orient='index'`` from the numbered records.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

In [4]:
def process_df(df):
    """Add the ``cleaned`` and ``text`` columns to a review frame.

    ``cleaned`` holds whatever ``my_utils.preprocess`` returns for the
    ``reviewText`` column; ``text`` is each ``cleaned`` entry joined with
    single spaces. The frame is modified in place and also returned.

    Args:
        df: DataFrame with a ``reviewText`` column.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    # presumably one sequence of string tokens per review — confirm in my_utils
    token_lists = my_utils.preprocess(df['reviewText'])
    df['cleaned'] = token_lists
    df['text'] = df['cleaned'].apply(" ".join)
    return df

In [5]:
# Read the gzipped review file into a DataFrame (one row per parsed record)
# and display its (rows, columns) shape.
dataset = getDF('datasets/reviews_Musical_Instruments_5.json.gz')
dataset.shape

(10261, 9)

In [6]:
# Number of worker processes for the multiprocessing pool, and the divisor
# used to size the DataFrame chunks.
# NOTE(review): hard-coded; confirm the host actually has this many cores.
n_cores = 35

In [7]:
# Cut the reviews into contiguous row slices of about rows/n_cores each and
# preprocess the slices in parallel worker processes.
# max(1, ...) keeps the chunk size positive: with fewer rows than workers the
# plain division would give 0 and range() would raise on a zero step.
n = max(1, dataset.shape[0] // n_cores)
list_df = [dataset[i:i+n] for i in range(0, dataset.shape[0], n)]

pool = multiprocessing.Pool(n_cores)
try:
    processed_list_df = pool.map(process_df, list_df)
finally:
    # Release the workers even when preprocessing raises, and wait for them
    # to exit so no child processes are left behind.
    pool.close()
    pool.join()

# Reassemble the processed slices in their original order.
dataset = pd.concat(processed_list_df)
dataset.shape

(10261, 11)

In [8]:
# Count how many reviews carry each distinct value of the `overall` column.
Counter(dataset.overall)

Counter({5.0: 6938, 3.0: 772, 4.0: 2084, 2.0: 250, 1.0: 217})

In [9]:
def loadGloveModel(gloveFile):
    """Load word vectors from a whitespace-separated text file.

    Every non-blank line is split on whitespace: the first field is the word,
    the remaining fields are parsed as floats and stored as a numpy array.
    If a word appears more than once, the last occurrence wins.

    Args:
        gloveFile: Path to a UTF-8 encoded text file of word vectors.

    Returns:
        dict mapping each word to its 1-D numpy float array.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # `with` closes the file even if a line fails to parse; the handle was
    # previously left open.
    with open(gloveFile, 'r', encoding='utf8') as f:
        for line in tqdm(f):
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. trailing newline at end of file): there is
                # no word field to read, so skip it instead of raising.
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done.",len(model)," words loaded!")
    return model

In [10]:
%%time
# Length of the zero vector used later for words missing from the embeddings.
# NOTE(review): should match the vector length in the file loaded below
# (its name says 300d) — confirm if the file is swapped.
embedding_dim = 300
# word -> numpy vector lookup built by loadGloveModel.
embeddings_index = loadGloveModel("nongit_resources/glove.6B.300d.txt")

Loading Glove Model


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Done. 400000  words loaded!
CPU times: user 32.8 s, sys: 1.45 s, total: 34.2 s
Wall time: 33.9 s


In [11]:
# Hyperparameters handed to the sampler constructor and to sampler.run.
n_topics = 5
n_sentiment = 5
# Vector of length n_topics, every entry equal to 0.1/n_topics.
# NOTE(review): presumably the document-topic prior — confirm in the lda module.
alpha = 0.1/n_topics * np.ones(n_topics)
beta = .01
# gamma starts as a scalar and is immediately rebound to a list of length
# n_sentiment whose entries are all gamma/(n_topics*n_sentiment).
gamma = 10
gamma = [gamma/(n_topics*n_sentiment)]*n_sentiment

# Passed to sampler.run as maxIters.
maxiter = 10
# Passed to the sampler constructor as lambda_param; its meaning is defined
# in the lda module.
lambda_param = 1

In [12]:
# Display the gamma list computed in the hyperparameter cell.
gamma

[0.4, 0.4, 0.4, 0.4, 0.4]

In [13]:
# Re-import the sampler module so edits to its source take effect without
# restarting the kernel. importlib.reload replaces imp.reload because the
# `imp` module is deprecated and no longer exists from Python 3.12 on.
import importlib
importlib.reload(lda)

<module 'ELJST_script_unigram' from '/home/william18026/Embedding-LJST/ELJST_script_unigram.py'>

In [14]:
# Construct the Gibbs sampler from the hyperparameters defined above.
# NOTE(review): the semantics of minlabel / maxlabel / SentimentRange and of
# max_df / min_df live in the lda module — confirm they suit this dataset.
sampler = lda.SentimentLDAGibbsSampler(
    n_topics,
    alpha,
    beta,
    gamma,
    numSentiments=n_sentiment,
    minlabel=0,
    maxlabel=5,
    SentimentRange=5,
    max_df=.5,
    min_df=5,
    lambda_param=lambda_param,
)

In [15]:
# Initialise the sampler with the cleaned review texts and their `overall`
# values as labels. Later cells read sampler.words and
# sampler.wordOccuranceMatrix, so this must run before them.
sampler._initialize_(reviews = dataset.text.tolist(), labels = dataset.overall.tolist())

In [16]:
%%time
# Look up an embedding for every word in sampler.words, keeping that order.
# Words missing from embeddings_index all fall back to one shared all-zero
# float vector: it is built once instead of on every iteration, and np.array
# copies it into the stacked result, so sharing it is safe.
oov_embedding = np.zeros(embedding_dim)
word_embeddings = np.array(
    [embeddings_index.get(word, oov_embedding) for word in tqdm(sampler.words)]
)

HBox(children=(IntProgress(value=0, max=5181), HTML(value='')))


CPU times: user 136 ms, sys: 35.1 ms, total: 171 ms
Wall time: 169 ms


In [17]:
# Build a 0/1 word-by-word matrix: 1 where the cosine similarity of two word
# embeddings exceeds `cutoff`, 0 elsewhere. The diagonal is cleared so that
# no word is marked as similar to itself.
cutoff = .4
word_similarity = (cosine_similarity(word_embeddings) > cutoff).astype(int)
np.fill_diagonal(word_similarity, 0)

In [18]:
# Work on a copy of the sampler's occurrence matrix and clip every entry
# greater than 1 down to 1, then display the shape.
# NOTE(review): presumably rows are documents and columns are vocabulary
# words, per the shape shown in the output — confirm in the lda module.
wordOccuranceMatrixBinary = sampler.wordOccuranceMatrix.copy()
wordOccuranceMatrixBinary[wordOccuranceMatrixBinary > 1] = 1
wordOccuranceMatrixBinary.shape

(10261, 5181)

In [19]:
def get_edges(i):
    """Collect the similar-word neighbours of the words present in one document.

    Reads the module-level ``word_similarity`` matrix without modifying or
    copying it. A pair ``(p, q)`` is kept when ``word_similarity[p, q] == 1``
    and neither ``p`` nor ``q`` is among the indices where ``i`` equals 0.
    Every kept pair is recorded in both directions.

    Args:
        i: Occurrence vector of one document, indexed like the rows/columns
            of ``word_similarity``.  # assumes a 1-D array — TODO confirm

    Returns:
        dict mapping a word index to a sorted, duplicate-free list of the
        word indices paired with it. Words with no pair do not appear.
    """
    absent = np.where(i == 0)[0]
    n_rows, n_cols = word_similarity.shape

    # Indices that survive the masking. Slicing out just these rows/columns
    # avoids copying the whole similarity matrix for every document.
    rows_kept = np.setdiff1d(np.arange(n_rows, dtype=np.intp), absent)
    cols_kept = np.setdiff1d(np.arange(n_cols, dtype=np.intp), absent)
    present_block = word_similarity[np.ix_(rows_kept, cols_kept)]

    # Positions inside the sub-block, mapped back to original word indices.
    block_rows, block_cols = np.nonzero(present_block == 1)
    neighbours = {}
    for p, q in zip(rows_kept[block_rows], cols_kept[block_cols]):
        neighbours.setdefault(p, set()).add(q)
        neighbours.setdefault(q, set()).add(p)

    # Sorted lists make the output deterministic across runs.
    return {word: sorted(linked) for word, linked in neighbours.items()}

In [20]:
# %%time
# pool = multiprocessing.Pool(n_cores)
# similar_words = pool.map(get_edges, wordOccuranceMatrixBinary)
# pool.close()

In [21]:
# Load the precomputed per-document similar-word lists from disk.
# The `with` block closes the file handle, which the bare open() call left open.
# SECURITY NOTE(review): pickle.load can execute arbitrary code; only load
# pickle files produced by this project.
with open("resources/amazon_muiscal_glove_0.4.pickle", "rb") as pickle_in:
    similar_words = pickle.load(pickle_in)

In [22]:
# Run the sampler for `maxiter` iterations on the same reviews and labels
# used for initialisation, passing the precomputed similar-word lists.
# NOTE(review): mrf=True presumably switches on the word-similarity term that
# consumes similar_words — confirm in the lda module.
sampler.run(reviews=dataset.text.tolist(), labels=dataset.overall.tolist(), 
            similar_words=similar_words, mrf=True, maxIters=maxiter)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Starting iteration 1 of 10


HBox(children=(IntProgress(value=0, max=10261), HTML(value='')))


Starting iteration 2 of 10


HBox(children=(IntProgress(value=0, max=10261), HTML(value='')))


Starting iteration 3 of 10


HBox(children=(IntProgress(value=0, max=10261), HTML(value='')))


Starting iteration 4 of 10


HBox(children=(IntProgress(value=0, max=10261), HTML(value='')))


Starting iteration 5 of 10


HBox(children=(IntProgress(value=0, max=10261), HTML(value='')))


Starting iteration 6 of 10


HBox(children=(IntProgress(value=0, max=10261), HTML(value='')))


Starting iteration 7 of 10


HBox(children=(IntProgress(value=0, max=10261), HTML(value='')))


Starting iteration 8 of 10


HBox(children=(IntProgress(value=0, max=10261), HTML(value='')))


Starting iteration 9 of 10


HBox(children=(IntProgress(value=0, max=10261), HTML(value='')))


Starting iteration 10 of 10


HBox(children=(IntProgress(value=0, max=10261), HTML(value='')))





In [23]:
# Silhouette score of the hard assignment obtained by taking the argmax of
# each row of sampler.dt_distribution, evaluated on precomputed pairwise
# Euclidean distances between rows of the word-occurrence matrix.
silhouette_score(euclidean_distances(sampler.wordOccuranceMatrix),
                 sampler.dt_distribution.argmax(axis=1), metric='precomputed')

-0.018280391178034913

In [24]:
# Davies-Bouldin score for the same argmax labels, computed directly on the
# rows of the word-occurrence matrix.
davies_bouldin_score(sampler.wordOccuranceMatrix, sampler.dt_distribution.argmax(axis=1))

45.134137519121204

In [25]:
# Coherence score from my_utils, given the occurrence matrix, the values of
# sampler.getTopKWords(5) as a list, and the sampler vocabulary.
my_utils.coherence_score(sampler.wordOccuranceMatrix, list(sampler.getTopKWords(5).values()), sampler.vocabulary)

-15.628568384084634

In [31]:
%%time
# H-score from my_utils on the document-topic distribution and the occurrence matrix.
# NOTE(review): the meaning of the trailing 3000 is defined in
# my_utils.get_hscore_multi — presumably a sample size; confirm.
my_utils.get_hscore_multi(sampler.dt_distribution, sampler.wordOccuranceMatrix, n_topics, 3000)

CPU times: user 59.7 s, sys: 9.19 s, total: 1min 8s
Wall time: 1min 9s


0.10645005373207272

### Appendix

In [27]:
# pickle_out = open("resources/amazon_muiscal_glove_0.4.pickle","wb")
# pickle.dump(similar_words, pickle_out)
# pickle_out.close()