In [53]:
import matplotlib.pyplot as plt
from bokeh.io import show, output_file
from bokeh.plotting import figure
import numpy as np
from gensim.models import CoherenceModel

from pipeline.lda import LDABuilder
from pipeline.paths import Paths
from pipeline.utils import read_doc_by_line

In [18]:
paths = Paths()
text_corpus = [doc.split() for doc in read_doc_by_line(paths.trigram_corpus_filepath)]

In [4]:
lda_builder = LDABuilder('sdfg')
lda = lda_builder.get_model(n_topics=100, from_scratch=False)
dictionary = lda_builder.get_corpus_dict()
bow_corpus = lda_builder.get_trigram_bow_corpus(dictionary)

Loading LDA model (n_topics=100)...
Loading trigram dict...
Loading bow corpus...


In [22]:
cm_umass = CoherenceModel(model=lda, corpus=bow_corpus, dictionary=dictionary, coherence='u_mass')

CPU times: user 136 ms, sys: 11.5 ms, total: 148 ms
Wall time: 147 ms


In [40]:
cm_cv    = CoherenceModel(model=lda, texts=text_corpus, dictionary=dictionary, coherence='c_v', processes=6)

In [32]:
cm_uci   = CoherenceModel(model=lda, texts=text_corpus, dictionary=dictionary, coherence='c_uci', processes=6)

CPU times: user 161 ms, sys: 11.6 ms, total: 173 ms
Wall time: 173 ms


In [33]:
cm_npmi  = CoherenceModel(model=lda, texts=text_corpus, dictionary=dictionary, coherence='c_npmi', processes=6)

CPU times: user 138 ms, sys: 7.69 ms, total: 145 ms
Wall time: 144 ms


In [None]:
c_models = [cm_umass, cm_cv, cm_uci, cm_npmi]
names = [m.coherence for m in c_models]
coherences = [m.get_coherence() for m in c_models]

  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


In [52]:
p = figure(x_range=names, plot_height=250, title="Coherences for LDA with 100 topics")
p.vbar(x=names, top=[coherences], width=0.8)

p.xgrid.grid_line_color = None
p.y_range.start = 0

show(p)

ValueError: Out of range float values are not JSON compliant

## Compare num topics

In [54]:
topic_vals = [50, 75, 100, 200, 1000]

In [None]:
coherences = []

for n_topics in topic_vals:
    lda = lda_builder.get_model(n_topics=n_topics, from_scratch=False)
    cm = CoherenceModel(model=lda, texts=text_corpus, dictionary=dictionary, coherence='c_npmi', processes=6)
    coherence = (np.array(cm.get_coherence()), np.array(cm.get_coherence_per_topic()).mean())
    coherences.append(coherence)

In [55]:
plt.semilogx(topic_vals, coherences)
plt.show()

[-1.0864560023754493,
 -1.7560081378030006,
 -1.8568036121255773,
 -3.617197021766814,
 -2.3150805339318494,
 -2.65126129641372,
 -1.482100307505131,
 -1.5787855030377604,
 -1.3134906877557917,
 -1.4887938725689225,
 -1.4764812924201507,
 -1.9224592800258278,
 -1.789395091633088,
 -1.7549402765687916,
 -1.97464792560929,
 -1.5070318629542552,
 -1.320643827576097,
 -1.867985238707897,
 -1.3610433873694774,
 -1.6383246653495827,
 -2.096778245099747,
 -1.4715172940698698,
 -2.064014696404994,
 -1.3446011736749146,
 -1.1550954992419944,
 -1.1893968147906184,
 -1.2024748840399309,
 -2.955253897206931,
 -2.67701130919099,
 -1.6904824862717414,
 -1.4090338666161317,
 -1.0664415218546486,
 -1.2505672338618743,
 -1.1537889004966753,
 -2.893154094227949,
 -1.4585709446762471,
 -1.9002401599613754,
 -2.02662447879079,
 -1.9743209909094366,
 -1.6621757332946803,
 -1.1927798324757182,
 -2.151373835172457,
 -2.6632124024580897,
 -1.4332641240897095,
 -1.5983224784080778,
 -1.4231186368885411,
 -2.48