In [31]:
def softmax(x):
    """
    Compute the softmax function for each row of the input x.

    Arguments:
    x -- A N dimensional vector or M x N dimensional numpy matrix.

    Return:
    x -- You are allowed to modify x in-place
    """
    orig_shape = x.shape

    if len(x.shape) > 1:
        # Matrix
        exp_minmax = lambda x: np.exp(x - np.max(x))
        denom = lambda x: 1.0 / np.sum(x)
        x = np.apply_along_axis(exp_minmax,1,x)
        denominator = np.apply_along_axis(denom,1,x) 
        
        if len(denominator.shape) == 1:
            denominator = denominator.reshape((denominator.shape[0],1))
        
        x = x * denominator
    else:
        # Vector
        x_max = np.max(x)
        x = x - x_max
        numerator = np.exp(x)
        denominator =  1.0 / np.sum(numerator)
        x = numerator.dot(denominator)
    
    assert x.shape == orig_shape
    return x

In [35]:
import pandas as pd
import json
import numpy  as np

topics_to_labels = np.load("./output10/topics_to_labels.npz",allow_pickle=True) #['probs', 'label']
# print(topics_to_labels.files, topics_to_labels["probs"].shape)



beta = np.load("./output10/beta.npz",allow_pickle=True)
beta = softmax(beta["beta"])
print("beta",beta,beta.shape, sum(beta[0]))

topic_term_dists = beta.tolist()
print("topic_term_dists", topic_term_dists[0][0])


theta = np.load("./output10/theta.train.npz",allow_pickle=True)
print("theta", theta.files, theta["theta"].shape, sum(theta["theta"][0]))
doc_topic_dists = theta["theta"].tolist()
print("doc_topic_dists",doc_topic_dists)




with open("./output10/doc_lengths.json", 'r') as j:
    doc_lengths = json.load(j)
    doc_lengths = doc_lengths["doc_lengths"]
    print("doc_lengths",len(doc_lengths))


with open("./output10/vocab.json", 'r') as j:
    vocab = json.load(j)
    print("vocab", len(vocab))
    
with open("./output10/term_frequency.json", 'r') as j:
    term_frequency = json.load(j)
    term_frequency = term_frequency["term_frequency"]
    print("term_frequency",len(term_frequency))
    

#vocab = np.load("./output/vocab.npz",allow_pickle=True)
#print(theta.files, theta["theta"].shape, theta["ids"])

data = {'topic_term_dists': topic_term_dists, 
        'doc_topic_dists': doc_topic_dists,
        'doc_lengths': doc_lengths,
        'vocab': vocab,
        'term_frequency': term_frequency}


beta [[9.05907745e-06 1.56537806e-04 9.70548628e-05 ... 1.20407188e-05
  3.10495870e-05 1.47940339e-05]
 [4.69702925e-05 5.01911710e-04 4.03543992e-04 ... 3.78587397e-05
  2.77031596e-04 8.41783755e-03]
 [4.06075088e-04 3.01978247e-04 4.13178675e-04 ... 2.23017147e-05
  4.28159409e-05 2.42227126e-06]
 ...
 [1.65537596e-04 1.88635739e-04 6.78751986e-04 ... 1.62625012e-05
  2.40104755e-03 4.98769748e-03]
 [1.68502292e-04 1.15715026e-04 1.01662293e-04 ... 2.55043404e-05
  1.24023455e-05 2.42250302e-06]
 [5.15084230e-04 2.60610886e-04 1.62894698e-04 ... 3.19240607e-05
  2.04781768e-05 1.87799075e-06]] (10, 2000) 1.0000000539943708
topic_term_dists 9.059077449511052e-06
theta ['theta', 'ids'] (31998, 10) 1.0000000493600965
doc_topic_dists 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [36]:
import pyLDAvis
movies_vis_data = pyLDAvis.prepare(**data)
pyLDAvis.display(movies_vis_data)