In [37]:
def softmax(x):
    """
    Compute a numerically stable softmax over the last axis of x.

    Every 1-D slice along the last axis (a "row") is shifted by its own
    maximum before exponentiation so that large inputs do not overflow.
    The shift leaves the result unchanged because softmax is invariant to
    adding a constant to all entries of a row.

    Arguments:
    x -- A N dimensional vector, an M x N dimensional numpy matrix, or a
         higher-rank array (softmax is then taken over the last axis).
         Array-likes such as nested lists are accepted too.

    Return:
    A new floating-point array with the same shape as x whose entries along
    the last axis are non-negative and sum to 1. The input is not modified.
    """
    x = np.asarray(x)
    orig_shape = x.shape

    # Reduce over the last axis; a 0-d input has no axes, so reduce over all.
    axis = -1 if x.ndim > 0 else None

    # keepdims=True keeps the reduced axis with length 1 so the per-row
    # maximum and sum broadcast back against the full array without reshapes.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exponentials = np.exp(shifted)
    result = exponentials / np.sum(exponentials, axis=axis, keepdims=True)

    assert result.shape == orig_shape
    return result

In [52]:
import json
from pathlib import Path

import numpy as np
import pandas as pd

def data_format(file):
    """
    Load one model output directory and assemble the keyword arguments that
    the notebook passes to pyLDAvis.prepare.

    Arguments:
    file -- Path of the output directory (a trailing slash is optional). It
            must contain:
              beta.npz            -- archive with an array named "beta"
              theta.train.npz     -- archive with an array named "theta"
              doc_lengths.json    -- object with a "doc_lengths" entry
              vocab.json          -- the vocabulary, used as loaded
              term_frequency.json -- object with a "term_frequency" entry

    Return:
    A dict with the keys 'topic_term_dists', 'doc_topic_dists',
    'doc_lengths', 'vocab' and 'term_frequency'. 'topic_term_dists' is the
    row-wise softmax of "beta"; 'doc_topic_dists' is "theta" unchanged. Both
    arrays are converted to nested Python lists.
    """
    model_dir = Path(file)

    # NOTE(security): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code. Only point this at trusted files.
    # The archives are opened as context managers so their file handles are
    # closed as soon as the arrays have been read.
    with np.load(model_dir / "beta.npz", allow_pickle=True) as beta_archive:
        # softmax makes every row of beta sum to 1.
        topic_term_dists = softmax(beta_archive["beta"]).tolist()

    with np.load(model_dir / "theta.train.npz", allow_pickle=True) as theta_archive:
        # theta is used as stored -- presumably already normalised per
        # document; TODO confirm against the code that wrote it.
        doc_topic_dists = theta_archive["theta"].tolist()

    # An explicit encoding keeps the result independent of the platform's
    # default locale (relevant for non-ASCII vocabulary entries).
    with open(model_dir / "doc_lengths.json", "r", encoding="utf-8") as handle:
        doc_lengths = json.load(handle)["doc_lengths"]

    with open(model_dir / "vocab.json", "r", encoding="utf-8") as handle:
        vocab = json.load(handle)

    with open(model_dir / "term_frequency.json", "r", encoding="utf-8") as handle:
        term_frequency = json.load(handle)["term_frequency"]

    data = {'topic_term_dists': topic_term_dists,
            'doc_topic_dists': doc_topic_dists,
            'doc_lengths': doc_lengths,
            'vocab': vocab,
            'term_frequency': term_frequency}
    return data
# Load the three model output directories in order; each name holds the dict
# returned by data_format for the matching directory.
topic10_data, topic20_data, topic50_data = map(
    data_format,
    ("./output10/", "./output20/", "./output50/"),
)

In [48]:
import pyLDAvis
# Unpack the dict loaded from ./output10/ as keyword arguments to
# pyLDAvis.prepare. Note that movies_vis_data is rebound by the later
# visualization cells, so it always holds the most recently run model.
movies_vis_data = pyLDAvis.prepare(**topic10_data)
# Last expression of the cell: its value becomes the cell's displayed output.
pyLDAvis.display(movies_vis_data)

In [53]:
# Same steps for the dict loaded from ./output20/; this overwrites the
# movies_vis_data produced by any previously run visualization cell.
movies_vis_data = pyLDAvis.prepare(**topic20_data)
# Last expression of the cell: its value becomes the cell's displayed output.
pyLDAvis.display(movies_vis_data)

In [50]:
# Same steps for the dict loaded from ./output50/; this overwrites the
# movies_vis_data produced by any previously run visualization cell.
movies_vis_data = pyLDAvis.prepare(**topic50_data)
# Last expression of the cell: its value becomes the cell's displayed output.
pyLDAvis.display(movies_vis_data)