In [75]:
def compute_hellinger(model, NUMTOPICS):
    """Print and return the mean pairwise Hellinger distance between topics.

    Each topic ``0 .. NUMTOPICS-1`` is rendered with
    ``model.print_topic(topic_id, topn=30)``, converted to a list of
    ``(token_id, probability)`` pairs by ``make_topics_bow``, and every
    unordered pair of topics is compared with ``gensim.matutils.hellinger``.

    Parameters
    ----------
    model
        Topic model exposing ``print_topic(topic_id, topn=...)`` and whatever
        ``make_topics_bow`` needs (``id2word.doc2bow``).
    NUMTOPICS : int
        Number of topics to compare.

    Returns
    -------
    float
        Average distance over all topic pairs, or ``nan`` when there are
        fewer than two topics (no pairs exist).
    """
    # With fewer than two topics there is nothing to compare; averaging an
    # empty list would raise ZeroDivisionError, so report "undefined" instead.
    if NUMTOPICS < 2:
        avgdist = float('nan')
        print(avgdist)
        return avgdist

    import itertools
    from gensim.matutils import hellinger

    # Build each topic's bag-of-words once instead of once per pair.
    topic_bows = [
        make_topics_bow(model, model.print_topic(topic_id, topn=30))
        for topic_id in range(NUMTOPICS)
    ]
    distlst = [
        hellinger(bow_a, bow_b)
        for bow_a, bow_b in itertools.combinations(topic_bows, 2)
    ]

    avgdist = sum(distlst)/len(distlst)
    print(avgdist)
    return avgdist

In [76]:
def make_topics_bow(model, topic):
    """Convert a printed topic string into ``(token_id, probability)`` pairs.

    Parameters
    ----------
    model
        Object whose ``id2word.doc2bow([token])`` returns a list of
        ``(token_id, count)`` pairs for known tokens.
    topic : str
        Topic description made of ``probability*"token"`` terms joined by
        ``' + '``, e.g. ``'0.010*"cat" + 0.009*"dog"'`` (the layout of the
        string returned by ``model.print_topic()``).

    Returns
    -------
    list of (token_id, float)
        One entry per term, in the order the terms appear in ``topic``.
        An empty or whitespace-only ``topic`` yields an empty list.
    """
    topic_bow = []
    if not topic.strip():
        return topic_bow
    # Split on the full ' + ' separator rather than a bare '+', so tokens that
    # themselves contain '+' (e.g. "c++") are not cut apart.
    for term in topic.split(' + '):
        # Only the first '*' separates probability from token; the token may
        # contain further '*' characters.
        prob, _, quoted = term.partition('*')
        # Drop the surrounding double quotes, keeping the token itself intact.
        token = quoted.strip()[1:-1]
        # Look up the integer id of the token in the model's dictionary.
        token_id = model.id2word.doc2bow([token])[0][0]
        topic_bow.append((token_id, float(prob)))
    return topic_bow


In [77]:
def runlda(fileinput, NUMTOPICS=30, NUMPASSES=10, NUMITERATIONS=10, random_state=None):
    """Train an LDA model on a text file and print evaluation metrics.

    The file is read one document per line; each line is stripped of trailing
    ``\\r``/``\\n`` characters and split on single spaces into tokens.

    Parameters
    ----------
    fileinput : str
        Path of the text file to read.
    NUMTOPICS : int
        Passed to ``LdaModel`` as ``num_topics``.
    NUMPASSES : int
        Passed to ``LdaModel`` as ``passes``.
    NUMITERATIONS : int
        Passed to ``LdaModel`` as ``iterations``.
    random_state : optional
        Passed to ``LdaModel`` as ``random_state``. The default ``None``
        keeps the previous unseeded behaviour; pass an int for repeatable
        results.

    Returns
    -------
    dict
        ``'hellinger'``: whatever ``compute_hellinger`` returns,
        ``'c_v'``: the c_v coherence score,
        ``'u_mass'``: the UMass coherence score.
    """
    print('runlda...')
    from gensim.corpora import Dictionary
    from gensim.models.ldamodel import LdaModel
    from gensim.models.coherencemodel import CoherenceModel

    # Context manager guarantees the file is closed even if reading fails.
    with open(fileinput, 'r') as fr:
        docs = [line.strip('\r\n').split(' ') for line in fr]

    dct = Dictionary(docs)
    # Build the bag-of-words corpus once; it is needed for both training and
    # the UMass coherence computation.
    corpus = [dct.doc2bow(doc) for doc in docs]
    model = LdaModel(corpus=corpus, id2word=dct, num_topics=NUMTOPICS, passes=NUMPASSES,
                     iterations=NUMITERATIONS, random_state=random_state)

    ### Evaluation ###

    # Compute Hellinger
    hellinger_avg = compute_hellinger(model, NUMTOPICS)

    # Compute Coherence Score using c_v
    coherence_cv = CoherenceModel(model=model, texts=docs, dictionary=dct, coherence='c_v').get_coherence()
    print('\nCoherence Score with c_v: ', coherence_cv)

    # Compute Coherence Score using UMass
    coherence_umass = CoherenceModel(model=model, corpus=corpus, coherence="u_mass").get_coherence()
    print('\nCoherence Score with UMass: ', coherence_umass)

    return {'hellinger': hellinger_avg, 'c_v': coherence_cv, 'u_mass': coherence_umass}

In [78]:
# Evaluate each corpus with the same settings (30 topics, 20 passes,
# 20 iterations), printing an empty line after every run.
for corpus_path in ('LDA/documents.txt', 'LDA/documents-15.txt', 'LDA/documents-all.txt'):
    runlda(corpus_path, 30, 20, 20)
    print("")

runlda...
0.5911229655989294

Coherence Score with c_v:  0.4548458497511928

Coherence Score with UMass:  -3.836796280455304

runlda...
0.6038398759487558

Coherence Score with c_v:  0.46985668337466957

Coherence Score with UMass:  -4.514284954247871

runlda...
0.6192977682697359

Coherence Score with c_v:  0.492495595642238

Coherence Score with UMass:  -4.2076082552698795

