In [31]:
def softmax(x):
    """
    Compute the softmax function for each row of the input x.

    Arguments:
    x -- A N dimensional vector or M x N dimensional numpy matrix.

    Return:
    x -- You are allowed to modify x in-place
    """
    orig_shape = x.shape

    if len(x.shape) > 1:
        # Matrix
        exp_minmax = lambda x: np.exp(x - np.max(x))
        denom = lambda x: 1.0 / np.sum(x)
        x = np.apply_along_axis(exp_minmax,1,x)
        denominator = np.apply_along_axis(denom,1,x) 
        
        if len(denominator.shape) == 1:
            denominator = denominator.reshape((denominator.shape[0],1))
        
        x = x * denominator
    else:
        # Vector
        x_max = np.max(x)
        x = x - x_max
        numerator = np.exp(x)
        denominator =  1.0 / np.sum(numerator)
        x = numerator.dot(denominator)
    
    assert x.shape == orig_shape
    return x

In [28]:
import pandas as pd
import json
import numpy  as np

topics_to_labels = np.load("./output/topics_to_labels.npz",allow_pickle=True) #['probs', 'label']
# print(topics_to_labels.files, topics_to_labels["probs"].shape)



beta = np.load("./output/beta.npz",allow_pickle=True)
beta = softmax(beta["beta"])
print("beta",beta,beta.shape, sum(beta[0]))

topic_term_dists = beta.tolist()
print("topic_term_dists", topic_term_dists[0][0])


theta = np.load("./output/theta.train.npz",allow_pickle=True)
print("theta", theta.files, theta["theta"].shape, sum(theta["theta"][0]))
doc_topic_dists = theta["theta"].tolist()
print("doc_topic_dists",doc_topic_dists)




with open("./output/doc_lengths.json", 'r') as j:
    doc_lengths = json.load(j)
    doc_lengths = doc_lengths["doc_lengths"]
    print("doc_lengths",len(doc_lengths))


with open("./output/vocab.json", 'r') as j:
    vocab = json.load(j)
    print("vocab", len(vocab))
    
with open("./output/term_frequency.json", 'r') as j:
    term_frequency = json.load(j)
    term_frequency = term_frequency["term_frequency"]
    print("term_frequency",len(term_frequency))
    

#vocab = np.load("./output/vocab.npz",allow_pickle=True)
#print(theta.files, theta["theta"].shape, theta["ids"])

data = {'topic_term_dists': topic_term_dists, 
        'doc_topic_dists': doc_topic_dists,
        'doc_lengths': doc_lengths,
        'vocab': vocab,
        'term_frequency': term_frequency}


beta [[2.22097357e-04 2.55454832e-04 1.38177803e-04 ... 1.50896682e-04
  3.41899261e-05 4.58155789e-05]
 [9.13303441e-05 8.18080172e-05 1.44014804e-04 ... 1.14352915e-03
  1.76270900e-04 4.00731794e-05]
 [1.04876049e-04 9.44790436e-06 6.43347773e-05 ... 5.19751017e-04
  1.25429914e-04 9.40967872e-06]
 ...
 [7.17815544e-04 3.22260171e-04 3.05933059e-04 ... 1.28534632e-04
  2.10773658e-03 1.63505586e-03]
 [1.81852153e-04 3.49413084e-05 7.42962860e-05 ... 2.25242821e-03
  2.74141235e-04 1.10094753e-04]
 [5.60056785e-04 6.23332727e-04 1.74375087e-04 ... 1.83867217e-04
  1.62618627e-04 1.99188819e-05]] (50, 2000) 1.0000000142792493
topic_term_dists 0.0002220973571377929
theta ['theta', 'ids'] (31998, 50) 1.0000000565778464
doc_topic_dists 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [32]:
import pyLDAvis
movies_vis_data = pyLDAvis.prepare(**data)
pyLDAvis.display(movies_vis_data)