In [1]:
import numpy as np
import pandas as pd
import scipy
import operator
import nltk
import os, glob
import string
import copy
import copy
import pickle
import datetime
import joblib, multiprocessing
import utils as my_utils

from scipy import spatial
from collections import Counter
from scipy.special import gammaln
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [2]:
min_df = 5
max_df = .5
max_features = 50000

n_cores = 10
max_iter = 20
n_top_words = 5

In [3]:
datasets = glob.glob("datasets/*")

In [4]:
for dataset in datasets:
    print(dataset)

    dataset = pd.read_pickle(dataset)
    n_topics = 5 * dataset.sentiment.unique().shape[0]
    print(n_topics)

    vectorizer = CountVectorizer(analyzer="word",tokenizer=None,preprocessor=None,
                                 stop_words="english", max_features=max_features,
                                 max_df=max_df, min_df=min_df)

    count_matrix = vectorizer.fit_transform(dataset.text.tolist()).toarray()
    words = vectorizer.get_feature_names()

    vocabulary = dict(zip(words,np.arange(len(words))))

    model = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter, n_jobs=n_cores, verbose=0)

    dt_distribution = model.fit_transform(count_matrix)

    topic_words = {}
    for topic, comp in enumerate(model.components_):
        word_idx = np.argsort(comp)[::-1][:n_top_words]
        topic_words[topic] = [words[i] for i in word_idx]

    sample_df = []
    for topic, word in topic_words.items():
        sample_df.append(', '.join(word).split(", "))


    print(my_utils.get_hscore_multi(dt_distribution, count_matrix, n_topics, 2000), 
          silhouette_score(count_matrix, dt_distribution.argmax(axis=1)),
          davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)),
          my_utils.coherence_score(count_matrix, sample_df, vocabulary),
          model.score(count_matrix),
          model.perplexity(count_matrix))

datasets/amazon_home_20000_dataset
25
0.4271302085198412 -0.07371590027560043 9.242989748366956 -25.857186877536762 -2174235.587381761 1469.0166597318778
datasets/amazon_movies_20000_dataset
25
0.4409421370210823 -0.06882561030861212 11.951344837048973 -23.532061086580203 -2260069.8443661346 2274.726145103683
datasets/amazon_electronics_20000_dataset
25
0.45916268840771896 -0.06120290744754176 7.5281213350236795 -22.533250912797488 -2170470.98212741 1295.1332948017164
datasets/amazon_kindle_20000_dataset
25
0.4510683336048458 -0.055130329595544786 10.892115939728226 -21.32837119049423 -2032921.3300296064 1621.7023759542858
datasets/twitter_airline_9061_dataset
15
0.38436392190535473 -0.00814053267267209 7.196767405583188 -24.77351297973521 -368986.1031729426 517.7264774213326
datasets/imdb_reviews_20000_dataset
10
0.4173425952489424 -0.011608944779453512 14.666484476589588 -16.268223616906205 -4046438.448057927 2534.3618591214254


In [None]:
dataset = pd.read_pickle("datasets/amazon_electronics_20000_dataset")

In [None]:
vectorizer = CountVectorizer(analyzer="word",tokenizer=None,preprocessor=None,
                             stop_words="english", max_features=max_features,
                             max_df=max_df, min_df=min_df)

In [None]:
count_matrix = vectorizer.fit_transform(dataset.text.tolist()).toarray()
words = vectorizer.get_feature_names()

In [None]:
vocabulary = dict(zip(words,np.arange(len(words))))

In [None]:
model = LatentDirichletAllocation(n_components=n_topics, max_iter=20, n_jobs=n_cores, verbose=0)

In [None]:
model.fit(count_matrix)

In [None]:
topic_words = {}
for topic, comp in enumerate(model.components_):
    word_idx = np.argsort(comp)[::-1][:n_top_words]
    topic_words[topic] = [words[i] for i in word_idx]

sample_df = []
for topic, word in topic_words.items():
    sample_df.append(', '.join(word).split(", "))

dt_distribution = model.transform(count_matrix)

In [None]:
print(my_utils.get_hscore_multi(dt_distribution, count_matrix, n_topics, 2000), 
      silhouette_score(count_matrix, dt_distribution.argmax(axis=1)),
      davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)),
      my_utils.coherence_score(count_matrix, sample_df, vocabulary),
      model.score(count_matrix),
      model.perplexity(count_matrix))

# Appendix

In [None]:
# print("H Score:", my_utils.get_hscore_multi(dt_distribution, count_matrix, n_topics, 2000))
# print("Silhouette Score:", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)))
# print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))
# print("Coherance Score:", my_utils.coherence_score(count_matrix, sample_df, vocabulary))
# print("Log Likelihood:", model.score(count_matrix))
# print("Perplexity:", model.perplexity(count_matrix))

In [None]:
# for k, model in zip(topics_grid, models_dump):

#     topic_words = {}
#     for topic, comp in enumerate(model.components_):
#         word_idx = np.argsort(comp)[::-1][:n_top_words]
#         topic_words[topic] = [words[i] for i in word_idx]

#     sample_df = []
#     for topic, word in topic_words.items():
#         sample_df.append(', '.join(word).split(", "))

#     dt_distribution = model.transform(count_matrix)

#     print("\nK:", k)
#     print("Running Metrics...")
#     print("H Score:", my_utils.get_hscore_multi(dt_distribution, count_matrix, k, 3000))
#     print("Log Likelihood:", model.score(count_matrix))
#     print("Perplexity:", model.perplexity(count_matrix))
#     print("Coherance Score:", my_utils.coherence_score(count_matrix, sample_df, vocabulary))
#     print("Silhouette Score:", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)))
#     print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))
    
# #     print my_utils.coherence_score(count_matrix, sample_df, vocabulary), "\t", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)), "\t", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1))

In [None]:
# def get_evaluations_multi(model):
#     topic_words = {}
#     for topic, comp in enumerate(model.components_):
#         word_idx = np.argsort(comp)[::-1][:n_top_words]
#         topic_words[topic] = [words[i] for i in word_idx]

#     sample_df = []
#     for topic, word in topic_words.items():
#         sample_df.append(', '.join(word).split(", "))

#     dt_distribution = model.transform(count_matrix)

#     h_score =  my_utils.get_hscore_multi(dt_distribution, count_matrix, k)
#     likelihood =  model.score(count_matrix)
#     perplexity = model.perplexity(count_matrix)
#     coherance_score = my_utils.coherence_score(count_matrix, sample_df, vocabulary)
#     silhouette = silhouette_score(count_matrix, dt_distribution.argmax(axis=1))
#     return [h_score, likelihood, perplexity, coherance_score, silhouette]

In [None]:
# for k, model in zip(topics_grid, models_dump):

#     topic_words = {}
#     for topic, comp in enumerate(model.components_):
#         word_idx = np.argsort(comp)[::-1][:n_top_words]
#         topic_words[topic] = [words[i] for i in word_idx]

#     sample_df = []
#     for topic, word in topic_words.items():
#         sample_df.append(', '.join(word).split(", "))

#     dt_distribution = model.transform(count_matrix)

#     print("\nK:", k)
#     print("Running Metrics...")
#     print("Coherance Score:", my_utils.coherence_score(count_matrix, sample_df, vocabulary))
#     print("Silhouette Score:", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)))
#     print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))

In [None]:
# print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))

In [None]:
#print("H-Score:", my_utils.get_hscore(dt_distribution, count_matrix, k))

In [None]:
# import matplotlib.pyplot as plt
# from sklearn.manifold import TSNE

# X_embedded = TSNE(n_components=2).fit_transform(dt_distribution)

# X_embedded.shape

# plt.figure(figsize=(10, 10))
# plt.scatter([i[0] for i in X_embedded], [i[1] for i in X_embedded], c=dt_distribution.argmax(axis=1))