In [1]:
import numpy as np
import pandas as pd
import scipy
import operator
import nltk
import os
import string
import copy
import copy
import pickle
import datetime
import joblib

import utils as my_utils

from sklearn.feature_extraction.text import CountVectorizer

import multiprocessing
from scipy import spatial
from collections import Counter
from scipy.special import gammaln
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [2]:
# Load the pre-built dataset from disk; later cells read its `text` column.
# NOTE(review): unpickling can execute arbitrary code - only load files that
# come from a trusted source.
dataset_path = "resources/electronics_glove_random_0.35_dataset"
dataset = pd.read_pickle(dataset_path)

In [3]:
# Run identification. Neither value is referenced by the other cells visible
# in this notebook; they appear to label the experiment - TODO confirm usage.
cutoff = 0.35
title = "electronics_glove_random_50k"

# Vocabulary pruning options handed to CountVectorizer below.
min_df, max_df, max_features = 5, 0.5, 50000

# `n_cores` is passed as `n_jobs` when fitting LDA; `n_docs` is not referenced
# by the other visible cells.
n_cores, n_docs = 30, 50000

In [4]:
# Bag-of-words vectorizer; the pruning thresholds come from the configuration
# cell (min_df, max_df, max_features).
vectorizer = CountVectorizer(
    analyzer="word",            # tokenize into words (not character n-grams)
    tokenizer=None,             # keep scikit-learn's built-in tokenizer
    preprocessor=None,          # keep scikit-learn's built-in preprocessing
    stop_words="english",       # drop the built-in English stop-word list
    max_features=max_features,  # cap on vocabulary size
    max_df=max_df,              # upper document-frequency threshold
    min_df=min_df,              # lower document-frequency threshold
)

In [5]:
# Learn the vocabulary and build the document-term count matrix.
# NOTE(review): `.toarray()` densifies the sparse result, which can be very
# large (rows = documents, columns = vocabulary terms). It is kept because the
# matrix is handed to `my_utils` helpers whose sparse support is not visible
# from this notebook - confirm before switching to the sparse form.
count_matrix = vectorizer.fit_transform(dataset.text.tolist()).toarray()

# Vocabulary terms, positioned to match the columns of `count_matrix`.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older releases. `.tolist()`
# keeps `words` a plain list of str, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = vectorizer.get_feature_names()

In [6]:
# Map each vocabulary term to its position in `words` (values are numpy
# integers produced by `np.arange`).
vocabulary = {
    term: position
    for term, position in zip(words, np.arange(len(words)))
}

In [7]:
# Candidate topic counts; one LDA model is fitted and evaluated per entry.
topics_grid = [
    5,
    10,
    15,
    25,
    40,
    60,
]

In [13]:
def fit_model_multi(k, max_iter=10, random_state=100):
    """Fit an LDA topic model with `k` topics on the global `count_matrix`.

    Parameters
    ----------
    k : int
        Number of topics, passed to LDA as `n_components`.
    max_iter : int, optional
        Passed to LDA as `max_iter`. Defaults to 10, the value that was
        previously hard-coded.
    random_state : int, optional
        Passed to LDA as `random_state`. Defaults to 100, the value that was
        previously hard-coded.

    Returns
    -------
    LatentDirichletAllocation
        The fitted model.

    Notes
    -----
    Reads the notebook-level `count_matrix` (training data) and `n_cores`
    (used as `n_jobs`). Prints ``done: <k>`` once fitting has finished.
    """
    model = LatentDirichletAllocation(
        n_components=k,
        random_state=random_state,
        max_iter=max_iter,
        n_jobs=n_cores,
        verbose=0,
    )
    model.fit(count_matrix)
    # Persisting the model is disabled: `folder_name` is not defined anywhere
    # in the visible notebook, so the line below would raise if re-enabled.
    #     joblib.dump(model, "dumps/" + folder_name + "/sampler_n_topics_" + str(k))
    print("done:", k)
    return model

In [14]:
# Fit one LDA model per candidate topic count; `models_dump[i]` is the model
# for `topics_grid[i]`. The comprehension keeps the loop variable out of the
# notebook namespace.
models_dump = [fit_model_multi(k) for k in topics_grid]

done: 5
done: 10
done: 15
done: 25
done: 40
done: 60


In [16]:
# Number of highest-weighted words kept per topic when building the word
# lists passed to the coherence metric.
n_top_words = 5

In [None]:
# Evaluate every fitted model. Each metric is computed inside its own print
# call so results appear progressively while the slower metrics are running.
for k, model in zip(topics_grid, models_dump):

    # Top `n_top_words` vocabulary terms of each topic, highest weight first;
    # one inner list per row of `model.components_`.
    sample_df = [
        [words[i] for i in np.argsort(comp)[::-1][:n_top_words]]
        for comp in model.components_
    ]

    # Per-document topic distribution for the training matrix.
    dt_distribution = model.transform(count_matrix)

    # Hard label per document: the index of its highest-weighted topic.
    # Computed once and shared by both clustering metrics below.
    doc_labels = dt_distribution.argmax(axis=1)

    print("\nK:", k)
    print("Running Metrics...")
    # NOTE(review): the meaning of the trailing 3000 is defined inside
    # `my_utils.get_hscore_multi` and is not visible here - confirm.
    print("H Score:", my_utils.get_hscore_multi(dt_distribution, count_matrix, k, 3000))
    print("Log Likelihood:", model.score(count_matrix))
    print("Perplexity:", model.perplexity(count_matrix))
    print("Coherance Score:", my_utils.coherence_score(count_matrix, sample_df, vocabulary))
    print("Silhouette Score:", silhouette_score(count_matrix, doc_labels))
    print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, doc_labels))

#     print my_utils.coherence_score(count_matrix, sample_df, vocabulary), "\t", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)), "\t", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1))


K: 5
Running Metrics...
H Score: 0.20297895852514314
Log Likelihood: -3220913.4961886797
Perplexity: 1144.0985755591178
Coherance Score: -14.933474054956928
Silhouette Score: -0.025005600618462548
Davies Bouldin Score: 10.985223129933836

K: 10
Running Metrics...
H Score: 0.30427975716286004
Log Likelihood: -2530422.08926428
Perplexity: 1591.1438919527286
Coherance Score: -15.265987629954054
Silhouette Score: -0.0738346403400513
Davies Bouldin Score: 11.111119260694156

K: 15
Running Metrics...
H Score: 0.35168474161661106
Log Likelihood: -2547742.980402375
Perplexity: 1673.498584894309
Coherance Score: -16.576713005906534
Silhouette Score: -0.11371370324662419
Davies Bouldin Score: 10.537380277916155

K: 25
Running Metrics...
H Score: 0.4153811644465827
Log Likelihood: -2577468.086238364
Perplexity: 1824.8876454664792
Coherance Score: -16.02266155326091
Silhouette Score: -0.11607971782311524
Davies Bouldin Score: 10.294885543725893

K: 40
Running Metrics...
H Score: 0.456550922509334

# Appendix

In [None]:
# def get_evaluations_multi(model):
#     topic_words = {}
#     for topic, comp in enumerate(model.components_):
#         word_idx = np.argsort(comp)[::-1][:n_top_words]
#         topic_words[topic] = [words[i] for i in word_idx]

#     sample_df = []
#     for topic, word in topic_words.items():
#         sample_df.append(', '.join(word).split(", "))

#     dt_distribution = model.transform(count_matrix)

#     h_score =  my_utils.get_hscore_multi(dt_distribution, count_matrix, k)
#     likelihood =  model.score(count_matrix)
#     perplexity = model.perplexity(count_matrix)
#     coherance_score = my_utils.coherence_score(count_matrix, sample_df, vocabulary)
#     silhouette = silhouette_score(count_matrix, dt_distribution.argmax(axis=1))
#     return [h_score, likelihood, perplexity, coherance_score, silhouette]

In [None]:
# for k, model in zip(topics_grid, models_dump):

#     topic_words = {}
#     for topic, comp in enumerate(model.components_):
#         word_idx = np.argsort(comp)[::-1][:n_top_words]
#         topic_words[topic] = [words[i] for i in word_idx]

#     sample_df = []
#     for topic, word in topic_words.items():
#         sample_df.append(', '.join(word).split(", "))

#     dt_distribution = model.transform(count_matrix)

#     print("\nK:", k)
#     print("Running Metrics...")
#     print("Coherance Score:", my_utils.coherence_score(count_matrix, sample_df, vocabulary))
#     print("Silhouette Score:", silhouette_score(count_matrix, dt_distribution.argmax(axis=1)))
#     print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))

In [None]:
# print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, dt_distribution.argmax(axis=1)))

In [None]:
#print("H-Score:", my_utils.get_hscore(dt_distribution, count_matrix, k))

In [None]:
# import matplotlib.pyplot as plt
# from sklearn.manifold import TSNE

# X_embedded = TSNE(n_components=2).fit_transform(dt_distribution)

# X_embedded.shape

# plt.figure(figsize=(10, 10))
# plt.scatter([i[0] for i in X_embedded], [i[1] for i in X_embedded], c=dt_distribution.argmax(axis=1))