In [30]:
import io
import pandas as pd
import gensim

In [2]:
from stability import *
from stablelda import StableLDA

#### Train a 25-topic model on the StackExchange dataset using Stable LDA

In [10]:
# --- input / output locations ---
bow_file = 'data/stackexchange.bow'      # corpus file given to the trainer and the loader
vocab_file = 'data/stackexchange.vocab'  # vocabulary file that accompanies the corpus
output_dir = 'data/output/'              # trailing slash matters: file names are appended directly

# --- model settings, passed positionally to StableLDA ---
num_topics = 25
num_words = 5000
# NOTE(review): alpha/beta/eta are forwarded untouched to StableLDA; see that class
# for their exact meaning.
alpha = 1
beta = 0.01
eta = 1000

# --- training run ---
epochs = 5
rand_seed = 42

In [11]:
# Fit Stable LDA on the corpus with the settings from the configuration cell.
stablelda = StableLDA(num_topics, num_words, alpha, beta, eta, rand_seed, output_dir)
stablelda.train(bow_file, vocab_file, epochs)

# Read the theta/phi result files back from the output directory and wrap
# everything in a TopicModel for inspection.
theta_path = output_dir + 'theta.dat'
phi_path = output_dir + 'phi.dat'
docs, vocab, theta, phi = load_topic_model_results(bow_file, vocab_file, theta_path, phi_path)
tm = TopicModel(num_topics, theta, phi, docs, vocab)

# Show the 10 highest-ranked words of every topic.
tm.print_top_n_words(10)

--------running Stable LDA model----------
--------- loading data ----------------
./train -f data/stackexchange.bow -v data/stackexchange.vocab -c data/output/cluster.dat -z data/output/z.dat -t 25 -w 5000 -a 1 -b 0.01 -e 1000 -n 2 -r 42 -o data/output/
game player plai server world team mode friend singl onlin
alt look map imag appear img screen time enter red
build citi ship space resourc unit train engin research explor
new account updat avail origin access purchas free old releas
strong differ chang charact base gener follow exampl requir allow
place right area left awai wall jump room head door
item armor equip chest inventori enchant slot potion gear craft
set save file option steam open kbd click creat download
control connect xbox support drive devic plai internet machin port
need want start try run time abl second end turn
block spawn water mob farm villag tree stone zombi plant
power point increas number high health stat mean give valu
bui gold upgrad store trade cost monei 

#### compute model perplexity

In [12]:
# Evaluate the fitted model on the documents it was trained on. The call is the
# cell's last expression, so its return value is displayed as the cell output
# (the perplexity, per the log the helper prints).
# NOTE: `compute_perlexity` (sic) is the name provided by `stability`; keep the spelling.
compute_perlexity(docs, theta, phi)

compute likelihood
likelihood: -15041245.50530322
perplexity: 846.1708574906029


846.1708574906029

#### Compute model coherence
We use Gensim's coherence method, so we first need to prepare `gensim_bow` and `id2word`.

In [17]:
# Top 10 words of each topic; passed as `topics` to the coherence computations.
# NOTE(review): presumably a list of word lists, one per topic -- confirm against TopicModel.
topics = tm.get_top_n_words(10)

In [31]:
#### load the raw text and tokenise it on whitespace, one token list per line --
#### needed by the window-based topic coherence measures
with io.open(bow_file, mode='r', encoding='utf-8') as bow_fh:
    raw_lines = bow_fh.read().splitlines()
texts = [line.split() for line in raw_lines]

In [32]:
#### build the gensim inputs: the token <-> id dictionary (id2word) and the
#### bag-of-words form of every document (gensim_bow)
id2word = gensim.corpora.Dictionary(texts)
gensim_bow = list(map(id2word.doc2bow, texts))

In [33]:
# Score the topics with the 'c_uci' coherence measure, then report the value.
c_uci_score = compute_coherence(gensim_bow, texts, id2word, topics, coherence_score='c_uci')
print('topic coherence c_uci', c_uci_score)

('topic coherence c_uci', 0.5956598849278567)


In [34]:
# Score the topics with the 'c_v' coherence measure, then report the value.
c_v_score = compute_coherence(gensim_bow, texts, id2word, topics, coherence_score='c_v')
print('topic coherence c_v', c_v_score)

('topic coherence c_v', 0.5854098401649387)
