In [1]:
import numpy as np
import pandas as pd
import scipy
import operator
import nltk
import os
import string
import copy
import copy
import pickle

import utils as my_utils

import multiprocessing
from scipy import spatial
from collections import Counter
from scipy.special import gammaln
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [2]:
# Load the pickled review DataFrame (the path suggests Amazon "musical" reviews).
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
dataset = pd.read_pickle("datasets/datadf_amazon_musical")

In [3]:
# Peek at the first two rows to sanity-check the columns that were loaded.
dataset.head(2)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,sentiment,summary,unixReviewTime,reviewTime,cleaned,text
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014","[much, write, doe, exactly, supposed, filter, ...",much write doe exactly supposed filter pop sou...
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013","[product, doe, exactly, quite, affordable, rea...",product doe exactly quite affordable realized ...


In [4]:
# (rows, columns) of the loaded frame.
dataset.shape

(10254, 11)

In [5]:
# Vectorise the preprocessed review text with the project helper.
# NOTE(review): presumably returns a document-term count matrix, its TF-IDF
# counterpart, the vocabulary mapping, and the terms indexed by matrix column
# (later cells index `words` by LDA component column) -- confirm against
# utils.processReviews.
count_matrix, tfidf_matrix, vocabulary, words = my_utils.processReviews(dataset['text'].values)

In [6]:
# Number of highest-weight words kept per topic when summarising a fitted model.
n_top_words = 5

In [7]:
# Candidate topic counts (K); one LDA model is fitted and scored per value.
# NOTE(review): the saved outputs further down also list K=5 and K=10, which
# are not in this grid -- they appear to come from an earlier run.
topics_grid = [40, 45, 50, 60]

In [8]:
def fit_model_multi(k, random_state=100):
    """Fit an LDA topic model with ``k`` topics on the global ``count_matrix``.

    Callable with a single positional argument so it can be handed to
    ``multiprocessing.Pool.map`` together with a grid of topic counts.

    Parameters
    ----------
    k : int
        Number of topics (forwarded as ``n_components``).
    random_state : int, optional
        Seed forwarded to ``LatentDirichletAllocation``. Defaults to 100,
        the value that used to be hard-coded here.

    Returns
    -------
    LatentDirichletAllocation
        The fitted model.
    """
    model = LatentDirichletAllocation(n_components=k, random_state=random_state)
    model.fit(count_matrix)
    # Flush so the progress line shows up promptly even when this runs inside
    # a pool worker whose stdout is buffered.
    print("done:", k, flush=True)
    return model

In [9]:
# Fit one LDA model per candidate topic count in parallel. The context manager
# tears the worker processes down even if one of the fits raises; pool.map
# blocks until every result is back, so no work is cut short on exit.
# NOTE(review): the workers need to resolve fit_model_multi and count_matrix
# from this notebook's namespace, which presumably relies on the "fork" start
# method -- confirm before running on a platform that uses "spawn".
with multiprocessing.Pool() as pool:
    models_dump = pool.map(fit_model_multi, topics_grid)

done: 5
done: 10
done: 40
done: 45
done: 60
done: 50


In [10]:
# Score every fitted model: collect the top words of each topic, infer the
# document-topic distribution, and print the H score for that topic count.
# NOTE(review): the saved output under this cell is labelled "Coherance Score"
# while the active print below is "H Score" -- the output appears to be from
# an earlier run; re-run the cell to refresh it.
for k, model in zip(topics_grid, models_dump):

    # topic index -> its `n_top_words` highest-weight words, heaviest first.
    topic_words = {
        topic: [words[i] for i in np.argsort(comp)[::-1][:n_top_words]]
        for topic, comp in enumerate(model.components_)
    }

    # One list of top words per topic, in topic order. Copied directly rather
    # than via a join/split round trip, which would break up any term that
    # itself contains ", ". Only consumed by the commented-out coherence
    # metric below.
    sample_df = [list(top_words) for top_words in topic_words.values()]

    # Per-document topic distribution inferred by the fitted model.
    dt_distribution = model.transform(count_matrix)

    print("\nK:", k)
    print("Running Metrics...")
    print("H Score:", my_utils.get_hscore_multi(dt_distribution, count_matrix, k))
#     print("Log Likelihood:", model.score(count_matrix))
#     print("Perplexity:", model.perplexity(count_matrix))
#     print("Coherance Score:", my_utils.coherence_score(count_matrix, sample_df, vocabulary))
#     print("Silhouette Score:", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)))
#     print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))


K: 5
Running Metrics...
Coherance Score: -14.152827361263855

K: 10
Running Metrics...
Coherance Score: -13.841778117894894

K: 40
Running Metrics...
Coherance Score: -17.303930307969264

K: 45
Running Metrics...
Coherance Score: -17.257824214715814

K: 50
Running Metrics...
Coherance Score: -17.5610545865298

K: 60
Running Metrics...
Coherance Score: -17.748357701429146


# Appendix

In [11]:
# def get_evaluations_multi(model):
#     topic_words = {}
#     for topic, comp in enumerate(model.components_):
#         word_idx = np.argsort(comp)[::-1][:n_top_words]
#         topic_words[topic] = [words[i] for i in word_idx]

#     sample_df = []
#     for topic, word in topic_words.items():
#         sample_df.append(', '.join(word).split(", "))

#     dt_distribution = model.transform(count_matrix)

#     h_score =  my_utils.get_hscore_multi(dt_distribution, count_matrix, k)
#     likelihood =  model.score(count_matrix)
#     perplexity = model.perplexity(count_matrix)
#     coherance_score = my_utils.coherence_score(count_matrix, sample_df, vocabulary)
#     silhouette = silhouette_score(count_matrix, dt_distribution.argmax(axis=1))
#     return [h_score, likelihood, perplexity, coherance_score, silhouette]

In [12]:
# for k, model in zip(topics_grid, models_dump):

#     topic_words = {}
#     for topic, comp in enumerate(model.components_):
#         word_idx = np.argsort(comp)[::-1][:n_top_words]
#         topic_words[topic] = [words[i] for i in word_idx]

#     sample_df = []
#     for topic, word in topic_words.items():
#         sample_df.append(', '.join(word).split(", "))

#     dt_distribution = model.transform(count_matrix)

#     print("\nK:", k)
#     print("Running Metrics...")
#     print("Coherance Score:", my_utils.coherence_score(count_matrix, sample_df, vocabulary))
#     print("Silhouette Score:", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)))
#     print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))

In [13]:
# print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))

In [14]:
#print("H-Score:", my_utils.get_hscore(dt_distribution, count_matrix, k))

In [15]:
# import matplotlib.pyplot as plt
# from sklearn.manifold import TSNE

# X_embedded = TSNE(n_components=2).fit_transform(dt_distribution)

# X_embedded.shape

# plt.figure(figsize=(10, 10))
# plt.scatter([i[0] for i in X_embedded], [i[1] for i in X_embedded], c=dt_distribution.argmax(axis=1))