In [1]:
from scipy import spatial, sparse
from scipy.stats import chi2
from collections import Counter
from num2words import num2words
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.externals import joblib 
from nltk.tokenize import word_tokenize
from sklearn.model_selection import KFold
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances

import os
import imp
import gzip
import copy
import nltk
import pickle
import scipy
import string
import gensim
import operator
import datetime
import multiprocessing
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
#import LDA_ELJST as lda
import ELJST_script_unigram as lda
#import LJST_script_BTM as lda
#import ELJST_script_BTM as lda
import matplotlib.pyplot as plt



In [2]:
import utils as my_utils

In [3]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [4]:
def process_df(df):
    df['cleaned'] = my_utils.preprocess(df['reviewText'])
    df['text'] = df['cleaned'].apply(lambda x: " ".join(x))
    return df

In [5]:
dataset = getDF('datasets/reviews_Musical_Instruments_5.json.gz')
dataset.shape

(10261, 9)

In [6]:
n_cores = 35

In [7]:
n = int(dataset.shape[0]/n_cores)
list_df = [dataset[i:i+n] for i in range(0, dataset.shape[0],n)]

pool = multiprocessing.Pool(n_cores)
processed_list_df = pool.map(process_df, list_df)
pool.close()

dataset = pd.concat(processed_list_df)
dataset.shape

(10261, 11)

In [8]:
Counter(dataset.overall)

Counter({5.0: 6938, 3.0: 772, 4.0: 2084, 2.0: 250, 1.0: 217})

In [22]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r', encoding='utf8')
    model = {}
    for line in tqdm(f):
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [23]:
%%time
embedding_dim = 300
embeddings_index = loadGloveModel("nongit_resources/glove.6B.300d.txt")

Loading Glove Model


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Done. 400000  words loaded!
CPU times: user 32.5 s, sys: 1.16 s, total: 33.6 s
Wall time: 33.7 s


In [9]:
n_topics = 5
n_sentiment = 5
alpha = 0.1/n_topics * np.ones(n_topics)
beta = .01
gamma = 10
gamma = [gamma/(n_topics*n_sentiment)]*n_sentiment

maxiter = 20
lambda_param = 1

In [10]:
gamma

[0.4, 0.4, 0.4, 0.4, 0.4]

In [11]:
imp.reload(lda)

<module 'ELJST_script_unigram' from '/home/william18026/Embedding-LJST/ELJST_script_unigram.py'>

In [12]:
sampler = lda.SentimentLDAGibbsSampler(n_topics, alpha, beta, gamma, numSentiments=n_sentiment, minlabel = 0, 
                                       maxlabel = 5, SentimentRange = 5, max_df = .5, min_df = 5, 
                                       lambda_param = lambda_param)

In [13]:
sampler._initialize_(reviews = dataset.text.tolist(), labels = dataset.overall.tolist())

In [16]:
sampler.wordOccuranceMatrix.sum()

383692

In [24]:
%%time
word_embeddings = []

for word in tqdm(sampler.words):
    emb = embeddings_index.get(word, np.array([0]*embedding_dim))
    word_embeddings.append(emb)

word_embeddings = np.array(word_embeddings)

HBox(children=(IntProgress(value=0, max=4386), HTML(value='')))


CPU times: user 110 ms, sys: 27.8 ms, total: 137 ms
Wall time: 132 ms


In [25]:
cutoff = .5
word_similarity = cosine_similarity(word_embeddings)
word_similarity = word_similarity > cutoff
word_similarity = word_similarity.astype(int)
np.fill_diagonal(word_similarity, 0)

In [26]:
wordOccuranceMatrixBinary = sampler.wordOccuranceMatrix.copy()
wordOccuranceMatrixBinary[wordOccuranceMatrixBinary > 1] = 1
wordOccuranceMatrixBinary.shape

(10261, 4386)

In [27]:
np.sum(wordOccuranceMatrixBinary)

299596

In [28]:
Counter(np.array([i[1] for i in nltk.pos_tag(sampler.words)]))

Counter({'CD': 52,
         'NN': 2435,
         'IN': 45,
         'JJ': 1032,
         'RB': 288,
         'VBP': 177,
         'VB': 51,
         'NNS': 64,
         'VBD': 69,
         'VBZ': 25,
         'VBN': 13,
         'RBS': 1,
         'RBR': 10,
         'JJR': 33,
         'JJS': 28,
         'DT': 1,
         'MD': 4,
         'VBG': 19,
         'FW': 12,
         'WP': 1,
         'NNP': 21,
         'CC': 2,
         'UH': 1,
         'RP': 1,
         'WDT': 1})

In [29]:
pp = np.array([i[1] for i in nltk.pos_tag(sampler.words)])
pp[pp=='JJ'] = 1
pp[pp=='JJR'] = 1
pp[pp=='JJS'] = 1
pp[pp=='NN'] = 1
pp[pp=='NNS'] = 1
pp[pp=='NNP'] = 1
pp[pp=='NNPS'] = 1
pp[pp!='1'] = 0
pp = pp.astype(int)

In [30]:
wordOccuranceMatrixBinary[np.where(pp!=1)[0], :] = 0
wordOccuranceMatrixBinary[:, np.where(pp!=1)[0]] = 0
np.sum(wordOccuranceMatrixBinary)

219093

In [None]:
def get_edges(i):
    t = np.where(i==0)[0]
    k = word_similarity.copy()

    k[t, :] = 0
    k[:, t] = 0

    z, x = np.where(k==1)
    embeds = {}
    for p, q in zip(z, x):
        try:
            embeds[p] += [q]
        except:
            embeds[p] = [q]
        try:
            embeds[q] += [p]
        except:
            embeds[q] = [p]
    for i in embeds.keys():
        embeds[i] = list(set(embeds[i]))
    return embeds

In [None]:
# %%time
# pool = multiprocessing.Pool(n_cores)
# similar_words = pool.map(get_edges, wordOccuranceMatrixBinary)
# pool.close()
# pickle_out = open("resources/amazon_muiscal_glove_0.5_POS.pickle","wb")
# pickle.dump(similar_words, pickle_out)
# pickle_out.close()

In [17]:
similar_words = pickle.load(open("resources/amazon_muiscal_glove_0.3_POS.pickle","rb"))

In [18]:
sampler.run(reviews=dataset.text.tolist(), labels=dataset.overall.tolist(), 
            similar_words=similar_words, mrf=True, maxIters=0)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [32]:
Counter([len(i) for i in sampler.docs_edges])

Counter({0: 956,
         14: 299,
         26: 237,
         62: 83,
         82: 56,
         20: 264,
         36: 145,
         16: 295,
         22: 226,
         6: 327,
         4: 302,
         40: 112,
         72: 61,
         12: 306,
         38: 152,
         24: 201,
         46: 98,
         222: 11,
         92: 41,
         8: 318,
         2: 281,
         30: 173,
         10: 306,
         212: 19,
         304: 9,
         54: 102,
         42: 132,
         52: 85,
         100: 41,
         58: 85,
         28: 171,
         60: 81,
         132: 30,
         78: 49,
         18: 262,
         32: 164,
         74: 57,
         34: 162,
         174: 17,
         250: 10,
         64: 68,
         450: 1,
         162: 18,
         116: 23,
         104: 41,
         70: 63,
         176: 25,
         66: 88,
         48: 88,
         624: 2,
         1756: 1,
         50: 87,
         704: 2,
         828: 3,
         136: 21,
         160: 24,
         166: 19,

In [None]:
plt.plot(sampler.loglikelihood_history)

In [None]:
silhouette_score(euclidean_distances(sampler.wordOccuranceMatrix),
                 sampler.dt_distribution.argmax(axis=1), metric='precomputed')

In [None]:
davies_bouldin_score(sampler.wordOccuranceMatrix, sampler.dt_distribution.argmax(axis=1))

In [None]:
my_utils.coherence_score(sampler.wordOccuranceMatrix, list(sampler.getTopKWords(5).values()), sampler.vocabulary)

In [None]:
%%time
my_utils.get_hscore_multi(sampler.dt_distribution, sampler.wordOccuranceMatrix, n_topics, 3000)

In [21]:
np.exp(-sampler.loglikelihood()/sampler.wordOccuranceMatrix.sum())

2793.3712108451223

In [None]:
joblib.dump(sampler, "resources/sampler_20iter_0.5_1")

### Appendix

In [None]:
# pickle_out = open("resources/amazon_muiscal_glove_0.4.pickle","wb")
# pickle.dump(similar_words, pickle_out)
# pickle_out.close()