In [1]:
import numpy as np
import pandas as pd
import scipy
import operator
import nltk
import os
import string
import copy
import copy
import pickle

import utils as my_utils

from scipy import spatial
from collections import Counter
from scipy.special import gammaln
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import silhouette_score, davies_bouldin_score




In [2]:
# Load the question/answer DataFrame from a pickle file; the path is relative to the
# notebook's working directory.
# NOTE: unpickling can execute arbitrary code, so only load this file from a trusted source.
dataset = pd.read_pickle("resources/data_amazon_QA_Electronics_5kanswers_noduplicates_pd")

In [3]:
# Add two derived columns to the frame (mutates `dataset` in place):
#   6 -> whatever my_utils.preprocess returns for the raw answers
#        (the recorded preview shows one list of tokens per answer)
#   7 -> the entries of column 6 joined into a single space-separated string
dataset[6] = my_utils.preprocess(dataset["answerText"])
dataset[7] = dataset[6].apply(" ".join)

In [4]:
# Preview the first rows, including the derived columns 6 and 7 added in the previous cell.
dataset.head()

Unnamed: 0,askerID,questionText,answererID,helpful,answerText,6,7
0,AMD370KY9I1WK,Good for rabbit ears? I live in a basement apa...,A1GNC9LPUQ8HTG,"[6, 6]",I purchased this Motorola signal booster for m...,"[purchased, motorola, signal, booster, rabbit,...",purchased motorola signal booster rabbit ear c...
1,AMD370KY9I1WK,Good for rabbit ears? I live in a basement apa...,A3NYJZ71CESSP8,"[3, 3]",When an amplifier receives a bad signal all yo...,"[amplifier, receives, bad, signal, get, amplif...",amplifier receives bad signal get amplified ba...
2,AMD370KY9I1WK,Good for rabbit ears? I live in a basement apa...,A3UD50M7M72150,"[1, 1]","hey, i'm in the same prob here, looking for so...","[hey, prob, looking, something, boost, weak, s...",hey prob looking something boost weak signal r...
3,AMD370KY9I1WK,Good for rabbit ears? I live in a basement apa...,ACNON4BZACUUL,"[2, 2]",This motorola signal booster did not work at a...,"[motorola, signal, booster, work, wa, waste, m...",motorola signal booster work wa waste money re...
4,AMD370KY9I1WK,Good for rabbit ears? I live in a basement apa...,A2AMFW65LLALFI,"[1, 1]",I emailed Motorola TechSupport about this prod...,"[emailed, motorola, techsupport, product, repl...",emailed motorola techsupport product reply got...


In [5]:
# (number of rows, number of columns) of the frame, derived columns included.
dataset.shape

(5006, 7)

In [6]:
# Run the project helper on the joined answer strings (column 7).
# How the results are used in the cells below:
#   count_matrix -> input to LDA fit/transform and to all three metrics
#   words        -> indexed by feature position to look up a topic's top terms
#   vocabulary   -> passed through to my_utils.coherence_score
#   tfidf_matrix -> not referenced by any other cell visible in this notebook
# NOTE(review): exact types/shapes are defined by my_utils.processReviews — confirm there.
count_matrix, tfidf_matrix, vocabulary, words = my_utils.processReviews(dataset[7].values)

In [7]:
# Number of highest-weight words kept per topic when building the word lists
# that are passed to the coherence metric.
n_top_words = 5

In [8]:
# Candidate topic counts: one LDA model is fitted (n_components=k) and scored for each value.
topics_grid = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60]

In [9]:
# Fit one LDA model per candidate topic count and keep the fitted estimators, in the
# same order as `topics_grid`, so the evaluation cell can zip the two together.
models_dump = []

for k in topics_grid:
    print("\nK:", k)

    # NOTE: scikit-learn's LatentDirichletAllocation is fitted with variational Bayes,
    # not a sampler; the message text is kept as-is to match the recorded output.
    print("Running Sampler...")

    # Fixed random_state keeps each fit reproducible across runs.
    model = LatentDirichletAllocation(n_components=k, random_state=100)

    # A fresh estimator is created on every iteration and fit() returns that same
    # estimator, so it can be stored directly — deep-copying it only duplicated
    # the fitted parameters in memory.
    models_dump.append(model.fit(count_matrix))


K: 5
Running Sampler...

K: 10
Running Sampler...

K: 15
Running Sampler...

K: 20
Running Sampler...

K: 25
Running Sampler...

K: 30
Running Sampler...

K: 35
Running Sampler...

K: 40
Running Sampler...

K: 45
Running Sampler...

K: 50
Running Sampler...

K: 60
Running Sampler...


In [10]:
# Score every fitted model: topic coherence of its top words, plus two clustering
# metrics computed on `count_matrix` using each document's dominant topic as its label.
for k, model in zip(topics_grid, models_dump):

    # topic index -> its `n_top_words` terms, highest component weight first.
    topic_words = {
        topic: [words[i] for i in np.argsort(comp)[::-1][:n_top_words]]
        for topic, comp in enumerate(model.components_)
    }

    # One word list per topic, in topic order. Built straight from the lists above
    # instead of joining them into a string and splitting it again.
    sample_df = list(topic_words.values())

    # Per-document topic weights; each document is labelled with the index of its
    # largest weight. Computed once and shared by both clustering metrics.
    dt_distribution = model.transform(count_matrix)
    doc_labels = dt_distribution.argmax(axis=1)

    print("\nK:", k)
    print("Running Metrics...")
    print("Coherance Score:", my_utils.coherence_score(count_matrix, sample_df, vocabulary))
    print("Silhouette Score:", silhouette_score(count_matrix, doc_labels))
    print("Davies Bouldin Score:", davies_bouldin_score(count_matrix, doc_labels))


K: 5
Running Metrics...
Coherance Score: -20.34627476995326
Silhouette Score: -0.012382507985640275
Davies Bouldin Score: 13.558338644770268

K: 10
Running Metrics...
Coherance Score: -20.723920528719564
Silhouette Score: -0.02436223455032383
Davies Bouldin Score: 11.34931241451998

K: 15
Running Metrics...
Coherance Score: -22.116107282829603
Silhouette Score: -0.03053084539860494
Davies Bouldin Score: 10.993603321453707

K: 20
Running Metrics...
Coherance Score: -22.33979993018161
Silhouette Score: -0.048350412647433644
Davies Bouldin Score: 10.420580958024985

K: 25
Running Metrics...
Coherance Score: -22.732478311602513
Silhouette Score: -0.055207712674092614
Davies Bouldin Score: 10.02381433795611

K: 30
Running Metrics...
Coherance Score: -23.43778915174925
Silhouette Score: -0.05257189706452568
Davies Bouldin Score: 9.379022271815437

K: 35
Running Metrics...
Coherance Score: -24.158014181735535
Silhouette Score: -0.05051810528289234
Davies Bouldin Score: 8.957463286998427

K: 

# Appendix

In [11]:
#print("H-Score:", my_utils.get_hscore(dt_distribution, count_matrix, k))

# TSNE

In [12]:
# import matplotlib.pyplot as plt
# from sklearn.manifold import TSNE

# X_embedded = TSNE(n_components=2).fit_transform(dt_distribution)

# X_embedded.shape

# plt.figure(figsize=(10, 10))
# plt.scatter([i[0] for i in X_embedded], [i[1] for i in X_embedded], c=dt_distribution.argmax(axis=1))