#### Imports

In [1]:
from scipy import spatial, sparse
from scipy.stats import chi2
from collections import Counter
from num2words import num2words
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.externals import joblib 
from nltk.tokenize import word_tokenize
from sklearn.model_selection import KFold
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import os
import imp
import gzip
import copy
import nltk
import pickle
import scipy
import string
import gensim
import operator
import datetime

import numpy as np
import pandas as pd
#import LDA_ELJST as lda
#import ELJST_script_unigram as lda
#import LJST_script_BTM as lda
import ELJST_script_BTM as lda
import matplotlib.pyplot as plt



In [2]:
import utils as my_utils

In [3]:
st = PorterStemmer()

### Read Data

In [4]:
dataset = pd.read_csv("musical_review.csv",engine='python') #pd.read_csv('stf_50k.csv')
dataset.sentiment_score = dataset.sentiment_score.astype(int)
dataset["clean_sentence"] = dataset["clean_sentence"].apply(lambda x: " ".join([st.stem(i) for i in x.split()]))

In [5]:
dataset.head(2)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiment_score,clean_sentence,wordlen
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5,good,1393545600,"02 28, 2014",5,much write exactli suppo filter pop sound reco...,21
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5,Jake,1363392000,"03 16, 2013",5,product exactli quit affordablei realiz doubl ...,45


In [6]:
dataset.sentiment_score.value_counts()

5    5856
4    1658
3     605
2     189
1     176
Name: sentiment_score, dtype: int64

In [7]:
maxiter = 10
lambda_param = 1
N_SENTIMENT = 5
n_topics = 5
alpha = 0.1/n_topics * np.ones(n_topics)
gamma = 10
gamma = [gamma/(n_topics*N_SENTIMENT)]*N_SENTIMENT
beta = .01

In [8]:
from sklearn.decomposition import LatentDirichletAllocation

lda2 = LatentDirichletAllocation(n_topics=n_topics)

In [9]:
from tqdm import tqdm

def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r',encoding='utf8')
    model = {}
    for line in tqdm(f):
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [10]:
embedding_dim = 200

In [11]:
%%time
embeddings_index = loadGloveModel('C:/Users/asengup6/Documents/models/glove.6B.{}d.txt'.format(embedding_dim))

Loading Glove Model


400000it [01:21, 4923.15it/s]


Done. 400000  words loaded!
Wall time: 1min 21s


### LJST Unigram model

In [58]:
sampler = lda.SentimentLDAGibbsSampler(n_topics, alpha, beta, gamma, numSentiments=N_SENTIMENT, minlabel = 0, 
                                       maxlabel = 5, SentimentRange = 5, max_df = .5, min_df = 5, lambda_param = 0.8)

In [59]:
sampler._initialize_(reviews=list(dataset.clean_sentence), labels=list(dataset.sentiment_score), unlabeled_reviews=[],skipgramwindow=5)

In [60]:
sampler.wordOccuranceMatrix.shape

(8484, 2723)

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [16]:
'''
word_similarity ={}
words_without_emb = 0
cutoff = .5

for i in tqdm(range(sampler.wordOccuranceMatrix.shape[0])):
    words_embeddings = []
    for val in list(np.where(sampler.wordOccuranceMatrix[i] > 0)[0]):
        word = sampler.vectorizer.get_feature_names()[val]

        if len(word.split()) == 1:
            emb = embeddings_index.get(word,np.array([0]*embedding_dim))
        else:
            emb = np.array([0]*embedding_dim)
            count = 0
            for w in word.split():
                if w in embeddings_index:
                    count += 1
                emb = emb + embeddings_index.get(w,np.array([0]*embedding_dim))
            if count != 0:
                emb = emb/count

        words_embeddings.append(emb)
        
    words_embeddings = cosine_similarity(np.array(words_embeddings))    
    words_embeddings = words_embeddings > cutoff
    words_embeddings = words_embeddings.astype(int)
    
    word_similarity[i] = words_embeddings
    
pickle.dump(word_similarity,open('word_similarity_amazon_musical5_cutoff.pkl','wb'))
'''

"\nword_similarity ={}\nwords_without_emb = 0\ncutoff = .5\n\nfor i in tqdm(range(sampler.wordOccuranceMatrix.shape[0])):\n    words_embeddings = []\n    for val in list(np.where(sampler.wordOccuranceMatrix[i] > 0)[0]):\n        word = sampler.vectorizer.get_feature_names()[val]\n\n        if len(word.split()) == 1:\n            emb = embeddings_index.get(word,np.array([0]*embedding_dim))\n        else:\n            emb = np.array([0]*embedding_dim)\n            count = 0\n            for w in word.split():\n                if w in embeddings_index:\n                    count += 1\n                emb = emb + embeddings_index.get(w,np.array([0]*embedding_dim))\n            if count != 0:\n                emb = emb/count\n\n        words_embeddings.append(emb)\n        \n    words_embeddings = cosine_similarity(np.array(words_embeddings))    \n    words_embeddings = words_embeddings > cutoff\n    words_embeddings = words_embeddings.astype(int)\n    \n    word_similarity[i] = words_embed

In [17]:
word_embeddings = []

for word in tqdm(sampler.vectorizer.get_feature_names()):
    emb = embeddings_index.get(word,np.array([0]*embedding_dim))
    word_embeddings.append(emb)
    
word_embeddings = np.array(word_embeddings)

100%|███████████████████████████████████████████████████████████████████████████| 2723/2723 [00:00<00:00, 26767.72it/s]


In [18]:
word_similarity = cosine_similarity(word_embeddings)

In [19]:
word_similarity

array([[ 1.        , -0.03067062, -0.04051867, ...,  0.06807656,
        -0.02699289,  0.16327779],
       [-0.03067062,  1.        , -0.06469126, ...,  0.02309567,
        -0.06563686,  0.04846905],
       [-0.04051867, -0.06469126,  1.        , ...,  0.08963072,
         0.01120404,  0.05127578],
       ...,
       [ 0.06807656,  0.02309567,  0.08963072, ...,  1.        ,
        -0.01658845,  0.19891084],
       [-0.02699289, -0.06563686,  0.01120404, ..., -0.01658845,
         1.        ,  0.05814918],
       [ 0.16327779,  0.04846905,  0.05127578, ...,  0.19891084,
         0.05814918,  1.        ]])

In [20]:
cutoff = .5

word_similarity = word_similarity > cutoff
word_similarity = word_similarity.astype(int)

In [None]:
sampler.run(reviews=list(dataset.clean_sentence), labels=list(dataset.sentiment_score), unlabeled_reviews=[], similar_words = word_similarity, mrf = True, maxIters=maxiter)


  0%|                                                                                           | 0/10 [00:00<?, ?it/s]

Starting iteration 1 of 10


 10%|████████                                                                        | 1/10 [08:40<1:18:06, 520.73s/it]

Starting iteration 2 of 10


 20%|████████████████                                                                | 2/10 [17:56<1:10:50, 531.29s/it]

Starting iteration 3 of 10


 30%|████████████████████████                                                        | 3/10 [26:52<1:02:09, 532.74s/it]

Starting iteration 4 of 10


 40%|████████████████████████████████▊                                                 | 4/10 [35:52<53:28, 534.76s/it]

Starting iteration 5 of 10


 50%|█████████████████████████████████████████                                         | 5/10 [44:42<44:26, 533.30s/it]

Starting iteration 6 of 10


In [22]:
sampler.conditionalDistribution(0,0,word_similarity,True,True)

[[1.19237411e-01 9.44232487e-05 3.07533036e-04 4.08698037e-04
  8.79083797e-01]
 [2.54317927e-05 2.30507799e-06 1.01918748e-05 2.79899808e-05
  1.57428028e-04]
 [8.43027294e-06 8.53239889e-06 1.54516468e-05 1.28547451e-05
  2.04802440e-04]
 [2.18392528e-05 3.45789587e-05 3.92078662e-09 4.40688525e-06
  1.97935844e-04]
 [2.07957036e-05 1.02152325e-05 2.21329706e-05 5.81800780e-06
  7.69927485e-05]] [[1.11748361 1.00004079 1.00004079 1.00004079 1.00004079]
 [1.         1.         1.00004079 1.00004079 1.11748361]
 [1.11748361 1.         1.         1.11743803 1.00004079]
 [1.11748361 1.11748361 1.00004079 1.11743803 1.        ]
 [1.00004079 1.         1.11743803 1.00004079 1.11748361]] [[1.11622128 1.0014951  1.0014951  1.0014951  1.0014951 ]
 [1.         1.         1.0014951  1.0014951  1.11622128]
 [1.11622128 1.         1.         1.11455491 1.0014951 ]
 [1.11622128 1.11622128 1.0014951  1.11455491 1.        ]
 [1.0014951  1.         1.11455491 1.0014951  1.11622128]]
[[1.44269592e-01 

array([[1.44269592e-01, 9.17310346e-05, 2.98764594e-04, 3.97045158e-04,
        8.54019188e-01],
       [2.46687863e-05, 2.23592088e-06, 9.90128204e-06, 2.71919250e-05,
        1.90477782e-04],
       [1.02000877e-05, 8.27640927e-06, 1.49880654e-05, 1.55295615e-05,
        1.98963073e-04],
       [2.64240903e-05, 4.18383145e-05, 3.80899639e-09, 5.32387029e-06,
        1.91997359e-04],
       [2.02027725e-05, 9.90875440e-06, 2.67384009e-05, 5.65212362e-06,
        9.31562703e-05]])

In [50]:
sampler.n_dt

array([[  0.,   0.,   0.,  76.,   0.],
       [107.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0., 192.,   0.],
       ...,
       [  0.,   0., 110.,   0.,   0.],
       [  0., 170.,   0.,   0.,   0.],
       [  0., 121.,   1.,   0.,   0.]])

In [51]:
sampler.dt_distribution

array([[2.62812089e-04, 2.62812089e-04, 2.62812089e-04, 9.98948752e-01,
        2.62812089e-04],
       [9.99253035e-01, 1.86741363e-04, 1.86741363e-04, 1.86741363e-04,
        1.86741363e-04],
       [1.04112441e-04, 1.04112441e-04, 1.04112441e-04, 9.99583550e-01,
        1.04112441e-04],
       ...,
       [1.81653043e-04, 1.81653043e-04, 9.99273388e-01, 1.81653043e-04,
        1.81653043e-04],
       [1.17577895e-04, 9.99529688e-01, 1.17577895e-04, 1.17577895e-04,
        1.17577895e-04],
       [1.63800164e-04, 9.91154791e-01, 8.35380835e-03, 1.63800164e-04,
        1.63800164e-04]])

In [52]:
sampler.n_dts

array([[[  0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.],
        [  0.,  76.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.]],

       [[  0.,   0., 105.,   0.,   2.],
        [  0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.]],

       [[  0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.],
        [  0.,   5., 111.,  76.,   0.],
        [  0.,   0.,   0.,   0.,   0.]],

       ...,

       [[  0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   5.,   5., 100.],
        [  0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.]],

       [[  0.,   0.,   0.,   0.,   0.],
        [  0., 170.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.],
        [  0.,   0.,   0.,   0.,   0.],
        [  0.,   0.

In [53]:
sampler.dts_distribution

array([[[1.00000000e-01, 1.00000000e-01, 1.00000000e-01, 1.00000000e-01,
         6.00000000e-01],
        [1.00000000e-01, 1.00000000e-01, 1.00000000e-01, 1.00000000e-01,
         6.00000000e-01],
        [1.00000000e-01, 1.00000000e-01, 1.00000000e-01, 1.00000000e-01,
         6.00000000e-01],
        [1.04166667e-03, 9.90625000e-01, 1.04166667e-03, 1.04166667e-03,
         6.25000000e-03],
        [1.00000000e-01, 1.00000000e-01, 1.00000000e-01, 1.00000000e-01,
         6.00000000e-01]],

       [[7.42115028e-04, 7.42115028e-04, 9.74768089e-01, 7.42115028e-04,
         2.30055659e-02],
        [1.00000000e-01, 1.00000000e-01, 1.00000000e-01, 1.00000000e-01,
         6.00000000e-01],
        [1.00000000e-01, 1.00000000e-01, 1.00000000e-01, 1.00000000e-01,
         6.00000000e-01],
        [1.00000000e-01, 1.00000000e-01, 1.00000000e-01, 1.00000000e-01,
         6.00000000e-01],
        [1.00000000e-01, 1.00000000e-01, 1.00000000e-01, 1.00000000e-01,
         6.00000000e-01]],

      

In [27]:
lda2.fit(sampler.wordOccuranceMatrix)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=5, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [28]:
dt = lda2.transform(sampler.wordOccuranceMatrix)

In [29]:
dt

array([[0.9569813 , 0.01060561, 0.01078143, 0.0108171 , 0.01081457],
       [0.70973751, 0.00638399, 0.27112138, 0.00639147, 0.00636564],
       [0.82370132, 0.04960377, 0.11618825, 0.00526918, 0.00523747],
       ...,
       [0.0065411 , 0.26133197, 0.00662374, 0.00664748, 0.71885572],
       [0.00533946, 0.26301878, 0.0735857 , 0.00539578, 0.65266028],
       [0.09207113, 0.15719974, 0.0055186 , 0.00553124, 0.73967929]])

In [30]:
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [31]:
Counter(sampler.dt_distribution.argmax(axis=1))

Counter({0: 1714, 1: 1601, 2: 1762, 3: 1606, 4: 1801})

In [32]:
Counter(dt.argmax(axis=1))

Counter({0: 2275, 1: 2152, 2: 604, 3: 1380, 4: 2073})

In [33]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances

In [34]:
cosine_doc = cosine_distances(sampler.wordOccuranceMatrix)
cosine_doc.shape

(8484, 8484)

In [35]:
cosine_doc

array([[0.        , 0.61745397, 0.78932476, ..., 0.91507922, 0.9729631 ,
        0.97745826],
       [0.61745397, 0.        , 0.81578748, ..., 0.9783426 , 0.97931428,
        0.91376747],
       [0.78932476, 0.81578748, 0.        , ..., 0.93866044, 0.96094167,
        0.96743552],
       ...,
       [0.91507922, 0.9783426 , 0.93866044, ..., 0.        , 0.61427286,
        0.47932069],
       [0.9729631 , 0.97931428, 0.96094167, ..., 0.61427286, 0.        ,
        0.69283273],
       [0.97745826, 0.91376747, 0.96743552, ..., 0.47932069, 0.69283273,
        0.        ]])

In [54]:
silhouette_score(euclidean_distances(sampler.wordOccuranceMatrix),sampler.dt_distribution.argmax(axis=1),metric='precomputed')

-0.0030386771795631695

In [37]:
silhouette_score(euclidean_distances(sampler.wordOccuranceMatrix),dt.argmax(axis=1),metric='precomputed')

-0.0043106057580392775

In [55]:
davies_bouldin_score(sampler.wordOccuranceMatrix,sampler.dt_distribution.argmax(axis=1))

  score = (intra_dists[:, None] + intra_dists) / centroid_distances


48.8862831735247

In [39]:
davies_bouldin_score(sampler.wordOccuranceMatrix,dt.argmax(axis=1))

  score = (intra_dists[:, None] + intra_dists) / centroid_distances


8.588950650541125

In [56]:
my_utils.coherence_score(sampler.wordOccuranceMatrix, list(sampler.getTopKWords(5).values()), sampler.vocabulary)

-14.049816459458132

In [57]:
my_utils.get_hscore(sampler.dt_distribution[:1000],sampler.wordOccuranceMatrix[:1000],n_topics)

0.0466363716258515

In [42]:
def print_top_words(model, feature_names, n_top_words):
    output = []
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        output.append([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        #print(message)
    return output

In [43]:
tf_feature_names = sampler.vectorizer.get_feature_names()

In [44]:
my_utils.coherence_score(sampler.wordOccuranceMatrix, print_top_words(lda2, tf_feature_names, 5), sampler.vocabulary)

-16.775766438285192

In [45]:
my_utils.get_hscore(dt[:1000],sampler.wordOccuranceMatrix[:1000],n_topics)

0.19994489337908955

In [6]:
#count_matrix, _, vocabulary, words = my_utils.processReviews(dataset['clean_sentence'].values)

In [65]:
cosine_similarity(embeddings_index["turkey"].reshape(1, -1),embeddings_index["fridge"].reshape(1, -1))

array([[0.06756489]])

In [71]:
cosine_similarity(embeddings_index["spaceship"].reshape(1, -1),embeddings_index["fridge"].reshape(1, -1))

array([[0.10478221]])