#### Imports

In [2]:
from scipy import spatial, sparse
from scipy.stats import chi2
from collections import Counter
from num2words import num2words
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.externals import joblib 
from nltk.tokenize import word_tokenize
from sklearn.model_selection import KFold
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import os
import imp
import gzip
import copy
import nltk
import pickle
import scipy
import string
import gensim
import operator
import datetime

import numpy as np
import pandas as pd
#import LDA_ELJST as lda
import ELJST_script_unigram as lda
#import LJST_script_BTM as lda
#import ELJST_script_BTM as lda
import matplotlib.pyplot as plt



In [2]:
import utils as my_utils

In [None]:
# Porter stemmer instance; used to stem every token of `clean_sentence` when the dataset is read.
st = PorterStemmer()

### Read Data

In [None]:
# Load the review data with the python parsing engine.
# Alternative dataset used previously: 'stf_50k.csv'.
dataset = pd.read_csv("musical_review.csv", engine='python')

# Cast the sentiment labels to int.
dataset["sentiment_score"] = dataset["sentiment_score"].astype(int)


def _stem_sentence(sentence):
    """Stem each whitespace-separated token and re-join the tokens with single spaces."""
    return " ".join(st.stem(token) for token in sentence.split())


dataset["clean_sentence"] = dataset["clean_sentence"].apply(_stem_sentence)

In [None]:
# Preview the first two rows of the loaded data.
dataset.head(2)

In [None]:
# Number of rows per sentiment_score value.
dataset.sentiment_score.value_counts()

In [3]:
# --- Sampler hyperparameters -------------------------------------------------
maxiter = 10        # passed to sampler.run(maxIters=...)
lambda_param = 1    # forwarded to the sampler constructor
N_SENTIMENT = 5
n_topics = 5
beta = .01

# Vector of length n_topics with every entry equal to 0.1 / n_topics.
alpha = np.full(n_topics, 0.1 / n_topics)

# A total of 10 divided over every (topic, sentiment) pair, repeated once per sentiment.
gamma = [10 / (n_topics * N_SENTIMENT) for _ in range(N_SENTIMENT)]

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Plain scikit-learn LDA with the same number of topics as the sampler; it is
# fitted on the sampler's word-occurrence matrix further down and compared
# against the sampler's output.
# `n_components` replaces the `n_topics` keyword, which scikit-learn deprecated
# in 0.19 and removed in 0.21.
lda2 = LatentDirichletAllocation(n_components=n_topics)

In [3]:
from tqdm import tqdm

def loadGloveModel(gloveFile):
    """Read a whitespace-separated embedding text file into a dict.

    Each non-empty line is expected to hold a token followed by its vector
    components, e.g. ``word 0.1 -0.2 ...``.

    Parameters
    ----------
    gloveFile : str
        Path of the UTF-8 encoded embedding file.

    Returns
    -------
    dict
        Maps each token to a 1-D numpy float array built from the remaining
        fields of its line.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # The context manager closes the file even if parsing is interrupted.
    with open(gloveFile, 'r', encoding='utf8') as f:
        for line in tqdm(f):
            splitLine = line.split()
            if not splitLine:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done.",len(model)," words loaded!")
    return model

In [4]:
# Dimensionality of the GloVe vectors. It must match the vector file that is
# loaded (glove.6B.300d.txt), because zero vectors of this length stand in for
# words that have no embedding.
embedding_dim = 300

In [5]:
%%time
# Parse the GloVe text file into a {word: vector} dict via loadGloveModel.
# NOTE(review): the file name indicates 300-dimensional vectors; keep embedding_dim in sync with it.
embeddings_index = loadGloveModel('nongit_resources/glove.6B.300d.txt')

1891it [00:00, 9343.84it/s]

Loading Glove Model


30866it [00:03, 9361.01it/s]


KeyboardInterrupt: 

### LJST Unigram model

In [None]:
# Instantiate the SentimentLDAGibbsSampler from the imported `lda` module,
# using the hyperparameters defined above.
sampler = lda.SentimentLDAGibbsSampler(
    n_topics,
    alpha,
    beta,
    gamma,
    numSentiments=N_SENTIMENT,
    minlabel=0,
    maxlabel=5,
    SentimentRange=5,
    max_df=.5,
    min_df=5,
    lambda_param=lambda_param,
)

In [None]:
# Initialise the sampler with the stemmed reviews and their labels; no unlabeled reviews are supplied.
sampler._initialize_(
    reviews=list(dataset["clean_sentence"]),
    labels=list(dataset["sentiment_score"]),
    unlabeled_reviews=[],
)

In [None]:
# Shape of the sampler's word-occurrence matrix.
sampler.wordOccuranceMatrix.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [None]:
# Disabled code, kept as a bare string literal so none of it executes: an earlier
# variant that built one thresholded cosine-similarity matrix per row of
# wordOccuranceMatrix (over the words with a positive count in that row) and
# pickled the resulting dict.
'''
word_similarity ={}
words_without_emb = 0
cutoff = .5

for i in tqdm(range(sampler.wordOccuranceMatrix.shape[0])):
    words_embeddings = []
    for val in list(np.where(sampler.wordOccuranceMatrix[i] > 0)[0]):
        word = sampler.vectorizer.get_feature_names()[val]

        if len(word.split()) == 1:
            emb = embeddings_index.get(word,np.array([0]*embedding_dim))
        else:
            emb = np.array([0]*embedding_dim)
            count = 0
            for w in word.split():
                if w in embeddings_index:
                    count += 1
                emb = emb + embeddings_index.get(w,np.array([0]*embedding_dim))
            if count != 0:
                emb = emb/count

        words_embeddings.append(emb)
        
    words_embeddings = cosine_similarity(np.array(words_embeddings))    
    words_embeddings = words_embeddings > cutoff
    words_embeddings = words_embeddings.astype(int)
    
    word_similarity[i] = words_embeddings
    
pickle.dump(word_similarity,open('word_similarity_amazon_musical5_cutoff.pkl','wb'))
'''

In [None]:
# Build one embedding row per vocabulary term, in the vectorizer's feature order.
#
# The zero fallback for terms without an embedding must have the same length as
# the loaded vectors; otherwise np.array() produces a ragged object array and
# cosine_similarity fails. The length is therefore taken from the loaded
# vectors themselves, falling back to embedding_dim only when nothing was loaded.
_sample_vector = next(iter(embeddings_index.values()), None)
_vector_dim = embedding_dim if _sample_vector is None else len(_sample_vector)

# Built once instead of once per word; it is only read, never modified.
_missing_embedding = np.zeros(_vector_dim)

word_embeddings = np.array([
    embeddings_index.get(word, _missing_embedding)
    for word in tqdm(sampler.vectorizer.get_feature_names())
])

In [None]:
# Pairwise cosine similarity between all rows of the word-embedding matrix.
word_similarity = cosine_similarity(word_embeddings)

In [None]:
# Inspect the raw similarity matrix.
word_similarity

In [None]:
# Binarise the similarity matrix: 1 where the similarity exceeds the cutoff, 0 elsewhere.
cutoff = .5
word_similarity = (word_similarity > cutoff).astype(int)

In [None]:
# Run the sampler with mrf=True, passing the binary word-similarity matrix.
sampler.run(
    reviews=list(dataset["clean_sentence"]),
    labels=list(dataset["sentiment_score"]),
    unlabeled_reviews=[],
    similar_words=word_similarity,
    mrf=True,
    maxIters=maxiter,
)


In [None]:
# Run the sampler with mrf=False; all other arguments match the mrf=True run.
sampler.run(
    reviews=list(dataset["clean_sentence"]),
    labels=list(dataset["sentiment_score"]),
    unlabeled_reviews=[],
    similar_words=word_similarity,
    mrf=False,
    maxIters=maxiter,
)


In [None]:
# Spot-check the sampler's conditional distribution for the arguments (0, 0).
# NOTE(review): presumably document 0 / word position 0, with the two True flags
# toggling sampler options; confirm against ELJST_script_unigram.
sampler.conditionalDistribution(0,0,word_similarity,True,True)

In [None]:
# Inspect sampler.n_dt (presumably document-topic counts; confirm in the sampler module).
sampler.n_dt

In [None]:
# Inspect sampler.dt_distribution; its row-wise argmax is used below as the sampler's topic label per row.
sampler.dt_distribution

In [None]:
# Inspect sampler.n_dts (presumably document-topic-sentiment counts; confirm in the sampler module).
sampler.n_dts

In [None]:
# Inspect sampler.dts_distribution (presumably the document-topic-sentiment distribution; confirm in the sampler module).
sampler.dts_distribution

In [None]:
# Fit the scikit-learn LDA baseline on the same word-occurrence matrix the sampler uses.
lda2.fit(sampler.wordOccuranceMatrix)

In [None]:
# Baseline LDA output for every row of the word-occurrence matrix; its row-wise argmax is used below as the baseline topic label.
dt = lda2.transform(sampler.wordOccuranceMatrix)

In [None]:
# Inspect the baseline LDA output.
dt

In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [None]:
# How many rows have each topic as their argmax under the sampler.
Counter(sampler.dt_distribution.argmax(axis=1))

In [None]:
# How many rows have each topic as their argmax under the baseline LDA.
Counter(dt.argmax(axis=1))

In [None]:
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances

In [None]:
# Pairwise cosine distances between the rows of the word-occurrence matrix; show the resulting shape.
cosine_doc = cosine_distances(sampler.wordOccuranceMatrix)
cosine_doc.shape

In [None]:
# Inspect the cosine-distance matrix.
cosine_doc

In [None]:
# Silhouette score of the sampler's argmax topic labels, using precomputed Euclidean distances between matrix rows.
silhouette_score(euclidean_distances(sampler.wordOccuranceMatrix),sampler.dt_distribution.argmax(axis=1),metric='precomputed')

In [None]:
# Silhouette score of the baseline LDA's argmax topic labels, using precomputed Euclidean distances between matrix rows.
silhouette_score(euclidean_distances(sampler.wordOccuranceMatrix),dt.argmax(axis=1),metric='precomputed')

In [None]:
# Davies-Bouldin score of the sampler's argmax topic labels over the word-occurrence matrix.
davies_bouldin_score(sampler.wordOccuranceMatrix,sampler.dt_distribution.argmax(axis=1))

In [None]:
# Davies-Bouldin score of the baseline LDA's argmax topic labels over the word-occurrence matrix.
davies_bouldin_score(sampler.wordOccuranceMatrix,dt.argmax(axis=1))

In [None]:
# Coherence score (project helper in `utils`) for the values returned by sampler.getTopKWords(5).
my_utils.coherence_score(sampler.wordOccuranceMatrix, list(sampler.getTopKWords(5).values()), sampler.vocabulary)

In [None]:
# H-score (project helper in `utils`) for the sampler, computed on the first 1000 rows only.
my_utils.get_hscore(sampler.dt_distribution[:1000],sampler.wordOccuranceMatrix[:1000],n_topics)

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Return the highest-weighted feature names of every topic in `model`.

    Despite its name, this function prints nothing; it only returns the lists.

    Parameters
    ----------
    model : object
        Fitted model exposing `components_`, an iterable of per-topic weight
        arrays that support `argsort()`.
    feature_names : sequence
        Feature names indexed by the positions used in `components_`.
    n_top_words : int
        Number of names to keep per topic.

    Returns
    -------
    list of list
        One inner list per topic, ordered from highest to lowest weight.
    """
    return [
        # argsort is ascending, so the reversed tail slice walks the largest weights first.
        [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        for topic in model.components_
    ]

In [None]:
# Feature names of the sampler's vectorizer, used to turn baseline LDA component indices into words.
tf_feature_names = sampler.vectorizer.get_feature_names()

In [None]:
# Coherence score (project helper in `utils`) for the baseline LDA's top 5 words per topic.
my_utils.coherence_score(sampler.wordOccuranceMatrix, print_top_words(lda2, tf_feature_names, 5), sampler.vocabulary)

In [None]:
# H-score (project helper in `utils`) for the baseline LDA, computed on the first 1000 rows only.
my_utils.get_hscore(dt[:1000],sampler.wordOccuranceMatrix[:1000],n_topics)

In [None]:
#count_matrix, _, vocabulary, words = my_utils.processReviews(dataset['clean_sentence'].values)