#### Dependencies

In [1]:
# !pip install numpy num2words nltk pandas Observations gensim
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# !pip install -U textblob
# !python -m textblob.download_corpora

#### Imports

In [2]:
from scipy import spatial, sparse
from scipy.stats import chi2
from collections import Counter
from num2words import num2words
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.externals import joblib 
from nltk.tokenize import word_tokenize
from sklearn.model_selection import KFold
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import os
import imp
import gzip
import copy
import nltk
import pickle
import scipy
import string
import gensim
import operator
import datetime

import numpy as np
import pandas as pd
import LDA_ILJST as lda
import matplotlib.pyplot as plt

In [3]:
# Re-import LDA_ILJST so edits made to the module since it was first imported
# take effect without restarting the kernel.
imp.reload(lda)

<module 'LDA_ILJST' from 'LDA_ILJST.pyc'>

### Preprocessing Methods

In [4]:
def convert_numbers(k):
    """Blank out every token of ``k`` that parses as an integer.

    Despite the name, numbers are not spelled out: each token for which
    ``int(token)`` succeeds is overwritten with a single space. The list is
    modified in place and is also returned.

    Args:
        k: list of string tokens.

    Returns:
        The same list object, with integer tokens replaced by ``" "``.
    """
    for i, token in enumerate(k):
        try:
            # Only whether the token parses as an int matters. The earlier
            # version also called num2words() here and threw the result away.
            int(token)
        except (TypeError, ValueError):
            # Not an integer literal: leave the token untouched. Catching only
            # these two keeps real errors (and KeyboardInterrupt) visible.
            continue
        k[i] = " "
    return k

def preprocess(pd):
    """Turn a Series of raw review strings into a Series of token lists.

    Steps: lower-case, replace punctuation/newlines/tabs with spaces, blank
    out integer tokens, lemmatise, and drop stop words.

    Relies on the module-level ``w_tokenizer``, ``lemmatizer`` and
    ``stop_words`` objects and on ``convert_numbers``.

    NOTE(review): stop words are filtered *after* lemmatisation, so lemmatised
    forms that are not themselves stop words survive (the sample output in this
    notebook shows 'wa' and 'ha'). The order is left unchanged here because the
    cached resources loaded elsewhere in the notebook may depend on the
    resulting vocabulary - confirm before changing it.

    Args:
        pd: pandas Series of strings. The name shadows the pandas alias inside
            this function; it is kept only for backward compatibility.

    Returns:
        pandas Series whose elements are lists of token strings.
    """
    # Work on a separate local name instead of rebinding the parameter.
    texts = pd.str.lower()
    # regex=True is explicit: pandas >= 2.0 treats the pattern as a literal by
    # default, which would silently leave all punctuation in place.
    texts = texts.str.replace('[{}]'.format('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n\t'), ' ', regex=True)
    tokens = texts.apply(lambda x: w_tokenizer.tokenize(x))
    tokens = tokens.apply(convert_numbers)
    # Re-join and re-tokenise so the " " placeholders left by convert_numbers vanish.
    texts = tokens.str.join(' ')

    # A set gives O(1) membership tests; the result is the same as with the list.
    stop_set = set(stop_words)
    lemmas = texts.apply(lambda x: [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(x)])
    return lemmas.apply(lambda x: [item for item in x if item not in stop_set])

def processReviews(reviews, window=5, MAX_VOCAB_SIZE=2000):
    """Vectorise pre-tokenised review strings into count and tf-idf matrices.

    Args:
        reviews: iterable of strings, one per document.
        window: unused; kept so existing callers keep working.
        MAX_VOCAB_SIZE: upper bound on the vocabulary size (``max_features``).

    Returns:
        Tuple ``(count_matrix, tfidf_matrix, vocabulary, words)`` where the two
        matrices are dense arrays, ``words`` is the list of feature names of
        the count vectorizer and ``vocabulary`` maps each word to its index in
        ``words``. The tf-idf vectorizer's own feature names are not returned.
    """
    vectorizer = CountVectorizer(analyzer="word", tokenizer=None, max_df=0.7, max_features=MAX_VOCAB_SIZE)
    count_matrix = vectorizer.fit_transform(reviews)
    tfidf_vectorizer = TfidfVectorizer(max_features=MAX_VOCAB_SIZE, max_df=0.7)
    tfidf_matrix = tfidf_vectorizer.fit_transform(reviews)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the newer
    # accessor when it exists and fall back for older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = vectorizer.get_feature_names_out().tolist()
    else:
        words = vectorizer.get_feature_names()

    vocabulary = dict(zip(words, np.arange(len(words))))
    return count_matrix.toarray(), tfidf_matrix.toarray(), vocabulary, words

### Score Methods

In [5]:
def coherence_score(X, topic_sentiment_df, vocab=None):
    """Co-occurrence based coherence over lists of top words.

    For every ordered pair of distinct words ``(w1, w2)`` inside each word
    list, adds ``log((D(w1, w2) + 1) / D(w2))`` where ``D`` counts rows
    (documents) of the clipped matrix in which the word(s) occur, then divides
    the grand total by ``2 * len(topic_sentiment_df)``.

    Args:
        X: document-by-word count array. It is NOT modified: clipping to
            presence/absence happens on a copy (the earlier version clipped
            the caller's array in place).
        topic_sentiment_df: iterable of word lists, one per topic.
        vocab: optional mapping word -> column index of ``X``. Defaults to the
            module-level ``vocabulary``.

    Returns:
        The averaged coherence value.

    Raises:
        KeyError: if a word is missing from the vocabulary mapping.
        ZeroDivisionError: if ``topic_sentiment_df`` is empty.
    """
    if vocab is None:
        vocab = vocabulary

    # Same clipping as ``X[X > 1] = 1`` but written to a new array.
    presence = np.minimum(X, 1)

    totalcnt = len(topic_sentiment_df)
    total = 0
    for allwords in topic_sentiment_df:
        for word1 in allwords:
            for word2 in allwords:
                if word1 != word2:
                    col1 = presence[:, vocab[word1]]
                    col2 = presence[:, vocab[word2]]
                    total += np.log((np.matmul(col1.T, col2) + 1.0) / np.sum(col2))
    return total / (2 * totalcnt)

In [6]:
def kl_score(pk,qk):
    """Symmetrised relative entropy: the mean of KL(pk || qk) and KL(qk || pk)."""
    forward = scipy.stats.entropy(pk, qk)
    backward = scipy.stats.entropy(qk, pk)
    return (forward * .5 + backward * .5)

In [7]:
def get_hscore(dt_distribution, X, k):
    """Ratio of mean intra-cluster to mean inter-cluster divergence.

    Each document is assigned to the cluster given by the argmax of its row in
    ``dt_distribution``; distances between documents are ``kl_score`` values.

    Args:
        dt_distribution: indexable by document, each entry a distribution
            supporting ``argmax()``.
        X: only ``X.shape[0]`` (the number of documents) is read.
        k: number of clusters.

    Returns:
        ``intra / inter``. A cluster with fewer than two members makes one of
        the averages divide by zero, so the result can be NaN or inf.
    """
    n_docs = X.shape[0]

    # Symmetric pairwise distance matrix; the diagonal stays at zero.
    pairwise = np.zeros((n_docs, n_docs))
    for a in range(n_docs - 1):
        for b in range(a + 1, n_docs):
            pairwise[a, b] = pairwise[b, a] = kl_score(dt_distribution[a], dt_distribution[b])

    # One-hot membership: row d has a single 1 in the column of its cluster.
    membership = np.zeros((n_docs, k))
    for d in range(n_docs):
        membership[d, dt_distribution[d].argmax()] = 1

    # Average distance over ordered pairs inside each cluster, then over clusters.
    intradist = 0
    for c in range(k):
        members = membership[:, c]
        size = members.sum()
        within = np.outer(members, members) * pairwise
        intradist += within.sum() * 1.0 / (size * (size - 1))
    intradist = intradist / k

    # Average distance over pairs drawn from two different clusters.
    interdist = 0
    for c1 in range(k):
        for c2 in range(k):
            if c1 == c2:
                continue
            size1 = membership[:, c1].sum()
            size2 = membership[:, c2].sum()
            across = np.outer(membership[:, c1], membership[:, c2]) * pairwise
            interdist += across.sum() * 1.0 / (size1 * size2)
    interdist = interdist / (k * (k - 1))

    return intradist / interdist

### Required Methods

In [8]:
def get_cosine(a, b):
    """Cosine similarity of two vectors (one minus SciPy's cosine distance)."""
    distance = spatial.distance.cosine(a, b)
    return 1 - distance

In [9]:
def parse(path):
    """Lazily yield one evaluated record per line of a gzip-compressed file.

    Args:
        path: path to a gzip file in which each line is a Python literal
            expression (for example a dict).

    Yields:
        The object produced by evaluating each line.
    """
    # The with-block closes the handle once the generator is exhausted or
    # closed; previously the file was never closed.
    with gzip.open(path, 'r') as handle:
        for line in handle:
            # SECURITY: eval() executes arbitrary code contained in the file.
            # Only use this on trusted dumps; consider ast.literal_eval or
            # json.loads if the data format allows it.
            yield eval(line)

In [10]:
def loadGloveModel(gloveFile):
    """Read a whitespace-separated embedding text file into a dict.

    Each non-empty line is expected to be ``word v1 v2 ... vn``.

    Args:
        gloveFile: path to the text file.

    Returns:
        dict mapping each word to a 1-D numpy float array.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # with-block so the (potentially very large) file is always closed.
    with open(gloveFile,'r') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # A blank line would otherwise raise IndexError on splitLine[0].
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done.",len(model)," words loaded!")
    return model

### Read Data

In [11]:
%%time
# Load the pre-trained GloVe vectors into a {word: vector} dict.
# NOTE(review): in the cells visible here the result is only referenced by the
# commented-out edge-building code in the Appendix - confirm whether this
# multi-minute load is still needed.
embeddings_index = loadGloveModel("nongit_resources/glove.42B.300d.txt")

Loading Glove Model
('Done.', 1917494, ' words loaded!')
CPU times: user 2min 56s, sys: 7.58 s, total: 3min 3s
Wall time: 3min 3s


In [12]:
# Number of reviews to work with.
N_docs = 5000

In [13]:
# Load the cached review frame and keep only the columns used below.
dataset = pd.read_pickle("resources/reviews_Home_and_Kitchen_5_5k_pd")
# Truncate the frame itself to N_docs so that everything derived from it
# (reviews, ratings, token columns, count matrix) has the same number of rows.
# Previously only `ratings` was truncated. .copy() makes the later column
# assignments operate on an independent frame.
dataset = dataset[['asin', 'helpful', 'overall', 'reviewText']].head(N_docs).copy()
reviews = dataset['reviewText'].values
ratings = dataset['overall'].values

In [14]:
# Sanity check: how many ratings were kept.
len(ratings)

5000

In [15]:
# Shared NLP helpers; preprocess() reads w_tokenizer, lemmatizer and stop_words
# as module-level globals.
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = stopwords.words('english')

In [16]:
# Column 8 holds the token list of each review; column 9 holds the same tokens
# re-joined into one space-separated string.
dataset[8] = preprocess(dataset['reviewText'])
dataset[9] = dataset[8].apply(" ".join)

In [17]:
# Peek at the first two rows, including the token-list (8) and joined-text (9) columns.
dataset.head(2)

Unnamed: 0,asin,helpful,overall,reviewText,8,9
0,615391206,"[0, 0]",5.0,My daughter wanted this book and the price on ...,"[daughter, wanted, book, price, amazon, wa, be...",daughter wanted book price amazon wa best ha a...
1,615391206,"[0, 0]",5.0,I bought this zoku quick pop for my daughterr ...,"[bought, zoku, quick, pop, daughterr, zoku, qu...",bought zoku quick pop daughterr zoku quick mak...


In [18]:
# Vectorise the joined token strings. `vocabulary` (word -> column index) and
# `words` (column index -> word) are read as globals by later cells.
count_matrix, tfidf_matrix, vocabulary, words = processReviews(dataset[9].values)

In [19]:
# Both matrices should be (number of documents, vocabulary size).
count_matrix.shape, tfidf_matrix.shape

((5000, 2000), (5000, 2000))

### Find Edges

In [20]:
# Load the pre-computed per-document edge lists.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
# The with-block closes the handle, which was previously left open.
with open("resources/docs_edges_home_5k.pickle","rb") as pickle_in:
    docs_edges = pickle.load(pickle_in)

In [21]:
# Build an undirected adjacency list {node: [neighbours, ...]} from the
# per-document edge lists; every edge (a, b) is recorded in both directions.
#
# setdefault() replaces the earlier try/except. That version reset an existing
# neighbour list whenever only one endpoint of an edge was already known: the
# KeyError raised for the unknown endpoint sent control to the except branch,
# which overwrote BOTH entries and so silently dropped previously stored edges.
edge_dict = {}
for i in docs_edges:
    for j in i:
        edge_dict.setdefault(j[0], []).append(j[1])
        edge_dict.setdefault(j[1], []).append(j[0])

In [22]:
# Drop duplicate neighbours; only existing keys are reassigned, so the dict
# keeps its size while it is being iterated.
for node, neighbours in edge_dict.items():
    edge_dict[node] = list(set(neighbours))

In [23]:
# Number of distinct nodes that appear in at least one edge.
len(edge_dict)

352

## Run Model

In [29]:
# Hyper-parameters for the run below.
maxiter = 20          # forwarded to sampler.run(..., maxiter=maxiter) on every outer iteration
lambda_param = 1.0    # forwarded to lda.LdaSampler(lambda_param=...)
N_ITERATAIONS = 5     # number of outer iterations (run + log-likelihood + checkpoint) per fold; name is misspelled but referenced as-is below
N_SENTIMENT = 5       # forwarded to lda.LdaSampler(n_sentiment=...)
k = N_TOPICS = 5      # forwarded to lda.LdaSampler(n_topics=...)

In [30]:
# Reload LDA_ILJST again so the run below uses the latest version of the module.
imp.reload(lda)

<module 'LDA_ILJST' from 'LDA_ILJST.pyc'>

In [31]:
# 5-fold cross-validation splitter; shuffling is left at its default (off), so
# the folds are contiguous blocks of rows.
kf = KFold(n_splits=5)

In [32]:
# Every run writes its checkpoints and logs into its own timestamped directory.
date_time = str(datetime.datetime.now())
dump_dir = os.path.join("dumps", date_time)
# makedirs (not mkdir) so that a missing parent "dumps" directory is created too.
os.makedirs(dump_dir)

df_matrix = scipy.sparse.load_npz('resources/df_home_sparse_5k.npz')

# enumerate() supplies the fold number instead of a hand-maintained counter.
for fold, (train_index, test_index) in enumerate(kf.split(count_matrix)):
    X_train, X_test = count_matrix[train_index], count_matrix[test_index]
    y_train, y_test = ratings[train_index], ratings[test_index]

    sampler = lda.LdaSampler(n_sentiment = N_SENTIMENT, n_topics=N_TOPICS, lambda_param=lambda_param)
    sampler.store_data(X_train, y_train, X_test, y_test, df_matrix, words, edge_dict)

    likelihood_history = []
    print("")
    print("fold:", fold)

    for i in range(N_ITERATAIONS):
        print(datetime.datetime.now().time(), " iteration:", i)
        sampler.run(X_train, y_train, X_test, edge_dict, maxiter=maxiter)
        lk_now = sampler.loglikelihood(docs_edges)
        likelihood_history.append(lk_now)
        print(lk_now)
        # Checkpoint the sampler after every outer iteration.
        sampler_name = ('sampler_home_5k_fold_' + str(fold) + "_maxiter_"
                        + str(maxiter) + "_iter_" + str(i) + "_in_" + str(N_ITERATAIONS))
        joblib.dump(sampler, os.path.join(dump_dir, sampler_name))

    # One log-likelihood value per outer iteration, written once per fold.
    history_name = "likelihood_history_home_fold_" + str(fold) + '.txt'
    pd.DataFrame(likelihood_history).to_csv(os.path.join(dump_dir, history_name), header=None)


('fold:', 0)
(datetime.time(9, 26, 11, 683287), ' iteration:', 0)
-1590952.5664538702
(datetime.time(9, 32, 26, 587684), ' iteration:', 1)
-1597463.7130789987
(datetime.time(9, 38, 37, 375327), ' iteration:', 2)
-1589945.1111659922
(datetime.time(9, 44, 50, 750254), ' iteration:', 3)
-1587959.7227113906
(datetime.time(9, 51, 2, 633023), ' iteration:', 4)
-1589218.1883371684

('fold:', 1)
(datetime.time(9, 57, 19, 190927), ' iteration:', 0)
-1566258.1511642437
(datetime.time(10, 4, 1, 402211), ' iteration:', 1)
-1568810.7159251615
(datetime.time(10, 10, 38, 778922), ' iteration:', 2)
-1573077.7770527145
(datetime.time(10, 23, 58, 342967), ' iteration:', 4)
-1566883.1997949232

('fold:', 2)
(datetime.time(10, 30, 35, 524156), ' iteration:', 0)
-1549967.6383213208
(datetime.time(10, 37, 20, 395349), ' iteration:', 1)
-1552903.9873292781
(datetime.time(10, 44, 4, 356142), ' iteration:', 2)
-1551938.6838152695
(datetime.time(10, 50, 49, 953331), ' iteration:', 3)
-1552033.725998888
(dateti

# Evaluation

In [None]:
# Load a previously dumped sampler for evaluation.
# NOTE(review): the path is hard-coded and names an "electronics" dump in a
# fixed timestamp directory, whereas the training cell in this notebook writes
# "sampler_home_5k_..." files into a new timestamped directory - confirm this
# is the checkpoint you intend to evaluate.
sampler = joblib.load("dumps/2019-07-27 06:40:21.499611/sampler_electronics_5k_fold_0_maxiter_1_iter_0_in_2")

In [None]:
# Top five words per key returned by the sampler, flattened into a list of
# word collections in key order.
t_words = sampler.getTopKWords(5, sampler.words)
top_words = []
for key in t_words.keys():
    top_words.append(t_words[key])
# Index of the largest theta entry in each row.
document_topic = sampler.theta().argmax(axis=1)

In [None]:
# Display the raw top-word structure returned by the sampler.
t_words

In [None]:
# Coherence of the top words, measured against the sampler's count matrix.
coherence_score(sampler.get_count_matrix(), top_words)

In [None]:
%%time
# Intra/inter-cluster divergence ratio, with clusters taken from the argmax of theta.
get_hscore(sampler.theta(), sampler.get_count_matrix(), sampler.n_topics)

In [None]:
# Display the per-topic word lists that were scored above.
top_words

# Entropy Calculations

In [None]:
# 0/1 indicator of the strictly positive entries of the sampler's df matrix.
idf_matrix = (sampler.get_df_matrix() > 0).astype(int)

In [None]:
# Column sums of the indicator matrix: in how many rows each column is non-zero.
word_freq = idf_matrix.sum(axis=0)

In [None]:
# Multiply the indicators by word_freq, then divide by the number of rows.
# NOTE(review): presumably an element-wise broadcast, which assumes idf_matrix
# is a plain ndarray - for a sparse matrix or np.matrix `*` is a matrix
# product; confirm the type returned by get_df_matrix().
# NOTE(review): idf_matrix is overwritten with a function of itself, so running
# this cell twice changes the result.
idf_matrix = idf_matrix * word_freq
idf_matrix = idf_matrix * 1.0/idf_matrix.shape[0]
# Entropy taken along the first axis of the transposed sampler.nmz, i.e. one
# value per row of sampler.nmz.
document_topic_entropy = scipy.stats.entropy(sampler.nmz.transpose())

In [None]:
# Row sums of the count matrix.
word_len = sampler.get_count_matrix().sum(axis=1)

In [None]:
# Correlation matrix between the per-row entropy and the per-row count total.
print (np.corrcoef(document_topic_entropy, word_len))

In [None]:
# Mean of the non-zero entries in each row of idf_matrix. A row with no
# non-zero entry divides by zero here.
document_popularity = np.true_divide(idf_matrix.sum(1),(idf_matrix != 0).sum(1)) #scipy.stats.mstats.gmean(idf_matrix,axis=1)
print (np.corrcoef(document_topic_entropy, document_popularity))

In [None]:
# Weight sampler.nmzs by sampler.nmz (broadcast over a new trailing axis), then
# renormalise so that the last axis sums to one.
normalized_dts = sampler.nmzs * sampler.nmz[:,:,np.newaxis]
normalized_dts /= normalized_dts.sum(axis=-1)[:,:,np.newaxis]

In [None]:
# Entropy of every innermost vector of normalized_dts, followed by summary
# statistics along axis 1 of the resulting 2-D array.
document_topic_sentiment_crossentropy = np.array(
    [[scipy.stats.entropy(cell) for cell in row] for row in normalized_dts]
)
document_ts_entropy_min = document_topic_sentiment_crossentropy.min(axis=1)
document_ts_entropy_mean = document_topic_sentiment_crossentropy.mean(axis=1)
# Despite the "_var" suffix this holds the standard deviation (sqrt of the variance).
document_ts_entropy_var = document_topic_sentiment_crossentropy.std(axis=1)

In [None]:
# Display the spread (square root of the variance) of the entropies computed above.
document_ts_entropy_var

In [None]:
# idf_matrix = (sampler.wordOccuranceMatrix > 0).astype(int)
# word_freq = idf_matrix.sum(axis=0)
# idf_matrix = idf_matrix * word_freq
# idf_matrix = idf_matrix * 1.0/idf_matrix.shape[0]
# document_topic_entropy = scipy.stats.entropy(sampler.dt_distribution.transpose())
# word_len = sampler.wordOccuranceMatrix.sum(axis=1)
# print (np.corrcoef(document_topic_entropy, word_len))
# #document_topic_entropy_len_normalized = document_topic_entropy * np.sqrt(word_len/2)
# document_popularity = np.true_divide(idf_matrix.sum(1),(idf_matrix != 0).sum(1)) #scipy.stats.mstats.gmean(idf_matrix,axis=1)
# print (np.corrcoef(document_topic_entropy, document_popularity))
# normalized_dts = sampler.dts_distribution * sampler.dt_distribution[:,:,np.newaxis]
# normalized_dts /= normalized_dts.sum(axis=-1)[:,:,np.newaxis]
# document_topic_sentiment_crossentropy = np.array([[scipy.stats.entropy(j) for j in i] for i in normalized_dts])
# document_ts_entropy_min = document_topic_sentiment_crossentropy.min(axis=1)
# document_ts_entropy_mean = document_topic_sentiment_crossentropy.mean(axis=1)
# document_ts_entropy_var = np.sqrt(document_topic_sentiment_crossentropy.var(axis=1))

# Appendix

In [None]:
# %%time
# edges_threshold = 0.8
# docs_edges, ignored, taken, count = [], [], [], 0
# for idx, doc in enumerate(dataset[8].values):
#     edges = []
#     print(idx)
#     for i in doc:
#         for j in doc:
#             if i != j:
#                 try:
#                     a = embeddings_index[i]
#                     b = embeddings_index[j]
#                     if get_cosine(a, b) > edges_threshold and (vocabulary[i], vocabulary[j]) not in edges and (vocabulary[j], vocabulary[i]) not in edges:
#                         edges.append((vocabulary[i], vocabulary[j]))
#                 except:
#                     try:
#                         embeddings_index[i]
#                         taken.append(i)
#                     except:
#                         ignored.append(i)
#                     try:
#                         embeddings_index[j]
#                     except:
#                         ignored.append(j)
#                         taken.append(j)
#                     pass
#     docs_edges.append(edges)

In [None]:
# pickle_out = open("resources/docs_edges_home_5k.pickle","wb")
# pickle.dump(docs_edges, pickle_out)
# pickle_out.close()

In [None]:
## Sentence wise
# dataset = dataset[['asin', 'helpful', 'overall', 'reviewText']]
# dataset['n_words'] = dataset['reviewText'].apply(lambda x: len(w_tokenizer.tokenize(x)))
# dataset['sentences'] = dataset['reviewText'].apply(lambda x: [i.strip() for i in x.split(".")])
# dataset['sentence_word_density'] = dataset['reviewText'].apply(lambda x: len(w_tokenizer.tokenize(x))/ len(x.split(".")))
# dataset.to_csv("reviews_Musical_Instruments_5.csv")

In [None]:
# edge_dict, ignored, taken, count = {}, [], [], 0
# for idxi, i in enumerate(vocabulary.keys()):
#     print(idxi)
#     for idxj, j in enumerate(vocabulary.keys()):
#         if i != j:
#             try:
#                 a = embeddings_index[i]
#                 b = embeddings_index[j]
#                 if get_cosine(a, b) > edges_threshold:
#                     try:
#                         edge_dict[vocabulary[i]] += [vocabulary[j]]
#                         edge_dict[vocabulary[j]] += [vocabulary[i]]
#                     except:
#                         edge_dict[vocabulary[i]] = [vocabulary[j]]
#                         edge_dict[vocabulary[j]] = [vocabulary[i]]
#             except:
#                 try:
#                     embeddings_index[i]
#                     taken.append(i)
#                 except:
#                     ignored.append(i)
#                 try:
#                     embeddings_index[j]
#                 except:
#                     ignored.append(j)
#                     taken.append(j)
#                 pass

In [None]:
# df = {}

# for idx, i in enumerate(dataset[8].values):
#     print(idx)
#     for j in i:
#         try:
#             df[j] += [idx]
#         except:
#             df[j] = [idx]

# for i in df.keys():
#     df[i] = len(list(set(df[i])))

# df_vector = []
# for i in dataset[8].values:
#     d = [0]*len(vocabulary.keys())
#     for j in i:
#         if j in vocabulary.keys():
#             d[vocabulary[j]] = df[j]
#     df_vector.append(d)

# csr = sparse.csr_matrix(np.array(df_vector))
# scipy.sparse.save_npz('resources/df_movies_5k.npz', csr)
# np.array(scipy.sparse.load_npz('resources/df_movies_5k.npz').todense())

In [None]:
# dataset = parse("nongit_resources/reviews_Electronics_5.json.gz")
# dataset = pd.DataFrame(list(dataset))
# dataset = dataset.head(N_docs)
# dataset.to_pickle("resources/reviews_Electronics_5")