In [1]:
import numpy as np
import pandas as pd
import scipy
import operator
import nltk
import os, glob
import string
import copy
import copy
import pickle
import datetime
import joblib, multiprocessing
import utils as my_utils

from scipy import spatial
from collections import Counter
from scipy.special import gammaln
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [2]:
# --- Vocabulary pruning (forwarded to CountVectorizer below) ---
min_df = 5            # CountVectorizer(min_df=...)
max_df = 0.5          # CountVectorizer(max_df=...)
max_features = 50000  # CountVectorizer(max_features=...)

# --- LDA training / reporting ---
n_cores = 40     # passed as n_jobs to LatentDirichletAllocation
max_iter = 20    # passed as max_iter to LatentDirichletAllocation
n_top_words = 5  # number of highest-weighted words kept per topic

In [3]:
# Discover every entry under datasets/. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to make the processing order (and
# therefore the printed results) reproducible between runs and machines.
# NOTE: the following cell reassigns `datasets` to an explicit subset.
datasets = sorted(glob.glob("datasets/*"))

In [4]:
# Restrict the run to an explicit subset of dataset files, built from the
# category part of each file name.
datasets = [
    "datasets/amazon_{}_20000_dataset".format(category)
    for category in ("home", "movies", "kindle")
]

In [5]:
# For every dataset file: vectorise the text, fit an LDA model, extract the
# top words of each topic and print one line of evaluation metrics.
for dataset_path in datasets:
    print(dataset_path)

    # NOTE(security): unpickling can execute arbitrary code; only load
    # dataset files that come from a trusted source.
    # The path and the loaded frame use separate names so the loop variable
    # is not silently rebound from a str to a DataFrame.
    dataset = pd.read_pickle(dataset_path)
    n_topics = 5 # * dataset.sentiment.unique().shape[0]
    print(n_topics)

    vectorizer = CountVectorizer(analyzer="word",tokenizer=None,preprocessor=None,
                                 stop_words="english", max_features=max_features,
                                 max_df=max_df, min_df=min_df)

    # The sparse count matrix is densified here; presumably the my_utils
    # metrics expect a dense array -- TODO confirm before switching to sparse.
    count_matrix = vectorizer.fit_transform(dataset.text.tolist()).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when available and fall back on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        words = vectorizer.get_feature_names_out().tolist()
    else:
        words = vectorizer.get_feature_names()

    # term -> position of that term in `words`
    vocabulary = dict(zip(words,np.arange(len(words))))

    # NOTE(review): no random_state is set, so results vary between runs.
    model = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter, n_jobs=n_cores, verbose=0)

    dt_distribution = model.fit_transform(count_matrix)

    # Highest-weighted n_top_words terms of each topic, best first.
    topic_words = {}
    for topic, comp in enumerate(model.components_):
        word_idx = np.argsort(comp)[::-1][:n_top_words]
        topic_words[topic] = [words[i] for i in word_idx]

    # One list of top words per topic, in topic order. A plain copy replaces
    # the former ', '.join(...).split(", ") round trip, which produced the
    # same lists but would have split any token containing ", ".
    sample_df = [list(top_words) for top_words in topic_words.values()]

    # Metrics are evaluated in the same order as they are printed.
    h_score = my_utils.get_hscore_multi(dt_distribution, count_matrix, n_topics, 2000)
    # Hard cluster assignment: the most probable topic of each document,
    # computed once and shared by both clustering scores.
    doc_topic_labels = dt_distribution.argmax(axis=1)

    print(h_score,
          silhouette_score(count_matrix, doc_topic_labels),
          davies_bouldin_score(count_matrix, doc_topic_labels),
          my_utils.coherence_score(count_matrix, sample_df, vocabulary),
          model.score(count_matrix),
          model.perplexity(count_matrix),
          my_utils.coherence_score2(count_matrix, sample_df, vocabulary))

datasets/amazon_home_20000_dataset
5
0.2157596405788876 -0.020556973228071054 12.192311123891825 -17.807297364916728 -2112241.42092325 1193.2323401602293 0.23750912107115826
datasets/amazon_movies_20000_dataset
5
0.2566076086057828 -0.03150088003006012 13.465772068442675 -17.335867987395055 -2185895.3925461858 1765.046905293361 0.21519446877325574
datasets/amazon_kindle_20000_dataset
5
0.2332396807207907 -0.023920764673080814 11.683450762105448 -17.1253583415197 -1953577.6085018432 1215.3122360418997 0.2454554416582656


In [6]:
# Load the Amazon electronics dataset used by the single-dataset cells below
# (this rebinds `dataset`, which the loop above also assigns).
# NOTE(security): read_pickle unpickles the file, which can execute arbitrary
# code -- only load files from a trusted source.
dataset = pd.read_pickle("datasets/amazon_electronics_20000_dataset")

In [7]:
# Word-level bag-of-words vectoriser with English stop words removed; the
# vocabulary limits come from the configuration cell.
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words="english",
    min_df=min_df,
    max_df=max_df,
    max_features=max_features,
)

In [8]:
# Fit the vectoriser on the review texts and densify the resulting
# document-term count matrix.
count_matrix = vectorizer.fit_transform(dataset.text.tolist()).toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back on older releases. Either way `words` is a
# plain list of str.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()

In [9]:
# Lookup table: term -> position of that term in `words` (numpy integers).
word_positions = np.arange(len(words))
vocabulary = {term: position for term, position in zip(words, word_positions)}

In [10]:
# Use the shared `max_iter` setting from the configuration cell instead of a
# duplicated literal, so this model stays consistent with the per-dataset loop.
# NOTE(review): `n_topics` is not defined in this cell; it is the value left
# behind by the per-dataset loop above, so that cell must have run first.
# NOTE(review): no random_state is set, so results vary between runs.
model = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter, n_jobs=n_cores, verbose=0)

In [11]:
# Fit the LDA model on the dense count matrix. As the last expression of the
# cell, the fitted estimator's repr is shown as the cell output.
model.fit(count_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=20,
                          mean_change_tol=0.001, n_components=5, n_jobs=40,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [12]:
# Highest-weighted n_top_words terms of each topic, best first.
topic_words = {}
for topic, comp in enumerate(model.components_):
    word_idx = np.argsort(comp)[::-1][:n_top_words]
    topic_words[topic] = [words[i] for i in word_idx]

# One list of top words per topic, in topic order. A plain copy replaces the
# former ', '.join(...).split(", ") round trip, which produced the same lists
# but would have split any token containing ", ".
sample_df = [list(top_words) for top_words in topic_words.values()]

# Per-document topic distribution from the fitted model.
dt_distribution = model.transform(count_matrix)

In [13]:
# Evaluate the metrics in their original order, then print them on one line.
h_score = my_utils.get_hscore_multi(dt_distribution, count_matrix, n_topics, 2000)

# Hard cluster assignment: the most probable topic of each document.
doc_topic_labels = dt_distribution.argmax(axis=1)
silhouette = silhouette_score(count_matrix, doc_topic_labels)
davies_bouldin = davies_bouldin_score(count_matrix, doc_topic_labels)

coherence = my_utils.coherence_score(count_matrix, sample_df, vocabulary)
log_likelihood = model.score(count_matrix)
perplexity = model.perplexity(count_matrix)

print(h_score, silhouette, davies_bouldin, coherence, log_likelihood, perplexity)

0.23908709683394405 -0.00920992794407115 9.603978826006756 -20.158525241452985 -2117031.303965419 1085.6390075371771


# Appendix

In [14]:
# print("H Score:", my_utils.get_hscore_multi(dt_distribution, count_matrix, n_topics, 2000))
# print("Silhouette Score:", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)))
# print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))
# print("Coherence Score:", my_utils.coherence_score(count_matrix, sample_df, vocabulary))
# print("Log Likelihood:", model.score(count_matrix))
# print("Perplexity:", model.perplexity(count_matrix))

In [15]:
# for k, model in zip(topics_grid, models_dump):

#     topic_words = {}
#     for topic, comp in enumerate(model.components_):
#         word_idx = np.argsort(comp)[::-1][:n_top_words]
#         topic_words[topic] = [words[i] for i in word_idx]

#     sample_df = []
#     for topic, word in topic_words.items():
#         sample_df.append(', '.join(word).split(", "))

#     dt_distribution = model.transform(count_matrix)

#     print("\nK:", k)
#     print("Running Metrics...")
#     print("H Score:", my_utils.get_hscore_multi(dt_distribution, count_matrix, k, 3000))
#     print("Log Likelihood:", model.score(count_matrix))
#     print("Perplexity:", model.perplexity(count_matrix))
#     print("Coherence Score:", my_utils.coherence_score(count_matrix, sample_df, vocabulary))
#     print("Silhouette Score:", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)))
#     print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))
    
# #     print my_utils.coherence_score(count_matrix, sample_df, vocabulary), "\t", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)), "\t", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1))

In [16]:
# def get_evaluations_multi(model):
#     topic_words = {}
#     for topic, comp in enumerate(model.components_):
#         word_idx = np.argsort(comp)[::-1][:n_top_words]
#         topic_words[topic] = [words[i] for i in word_idx]

#     sample_df = []
#     for topic, word in topic_words.items():
#         sample_df.append(', '.join(word).split(", "))

#     dt_distribution = model.transform(count_matrix)

#     h_score =  my_utils.get_hscore_multi(dt_distribution, count_matrix, k)
#     likelihood =  model.score(count_matrix)
#     perplexity = model.perplexity(count_matrix)
#     coherance_score = my_utils.coherence_score(count_matrix, sample_df, vocabulary)
#     silhouette = silhouette_score(count_matrix, dt_distribution.argmax(axis=1))
#     return [h_score, likelihood, perplexity, coherance_score, silhouette]

In [17]:
# for k, model in zip(topics_grid, models_dump):

#     topic_words = {}
#     for topic, comp in enumerate(model.components_):
#         word_idx = np.argsort(comp)[::-1][:n_top_words]
#         topic_words[topic] = [words[i] for i in word_idx]

#     sample_df = []
#     for topic, word in topic_words.items():
#         sample_df.append(', '.join(word).split(", "))

#     dt_distribution = model.transform(count_matrix)

#     print("\nK:", k)
#     print("Running Metrics...")
#     print("Coherence Score:", my_utils.coherence_score(count_matrix, sample_df, vocabulary))
#     print("Silhouette Score:", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)))
#     print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))

In [18]:
# print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))

In [19]:
#print("H-Score:", my_utils.get_hscore(dt_distribution, count_matrix, k))

In [20]:
# import matplotlib.pyplot as plt
# from sklearn.manifold import TSNE

# X_embedded = TSNE(n_components=2).fit_transform(dt_distribution)

# X_embedded.shape

# plt.figure(figsize=(10, 10))
# plt.scatter([i[0] for i in X_embedded], [i[1] for i in X_embedded], c=dt_distribution.argmax(axis=1))