In [1]:
import os
import gensim
from gensim import models
from gensim.models import CoherenceModel
import gensim.corpora as corpora
from pprint import pprint

from gensim.corpora.textcorpus import TextCorpus
from gensim.test.utils import datapath
from gensim import utils
import pandas as pd
import numpy as np

In [2]:
current_path = os.path.dirname(os.path.abspath("__file__"))
newsgroups_processed_data_loc = f"{current_path}/../data/processed/20_newsgroups/"
test_docs = [ele for ele in os.listdir(newsgroups_processed_data_loc) if 'test' in ele]
train_docs = [ele for ele in os.listdir(newsgroups_processed_data_loc) if 'train' in ele]

In [3]:
class corpus_read(TextCorpus):
    stopwords = set('for a of the and to in on'.split())

    def get_texts(self):
        for doc in self.getstream():
            yield [word for word in utils.to_unicode(doc).lower().split() if word not in self.stopwords]

    def __len__(self):
        self.length = sum(1 for _ in self.get_texts())
        return self.length

In [4]:
doc_coherence_score_dict = {}
for test_doc in test_docs:
    print(test_doc, end=' - ')
    loc = newsgroups_processed_data_loc + test_doc
    
    texts = [text for text in corpus_read(datapath(loc)).get_texts()]
    id2word = corpora.Dictionary(texts)
    corpus = [id2word.doc2bow(text) for text in texts]
    n_topics = 20
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=n_topics)
    
    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('coherence score: ', coherence_lda)
    doc_coherence_score_dict[test_doc] = coherence_lda

test_norm.txt - coherence score:  0.40457549674444027
test_lemma.txt - coherence score:  0.4464866114467193
test_no_stop.txt - coherence score:  0.47180629516285527
test_lemma_no_stop.txt - coherence score:  0.49356349928001786
test_np_lemma_only.txt - coherence score:  0.5207310089906139
test_np_lemma.txt - coherence score:  0.4395579835256781
test_np_no_stop.txt - coherence score:  0.5143619860424391
test_np_no_stop_only.txt - coherence score:  0.49548898187542845
test_np_lemma_no_stop.txt - coherence score:  0.4932273867731273


In [5]:
for train_doc in train_docs:
    print(train_doc, end=' - ')
    loc = newsgroups_processed_data_loc + train_doc
    
    texts = [text for text in corpus_read(datapath(loc)).get_texts()]
    id2word = corpora.Dictionary(texts)
    corpus = [id2word.doc2bow(text) for text in texts]
    n_topics = 20
    
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=n_topics)
    
    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('coherence score: ', coherence_lda)
    doc_coherence_score_dict[train_doc] = coherence_lda

train_norm.txt - coherence score:  0.4560275354213529
train_lemma.txt - coherence score:  0.46645915274220123
train_no_stop.txt - coherence score:  0.5369845373822405
train_lemma_no_stop.txt - coherence score:  0.5218632086370676
train_np_lemma.txt - coherence score:  0.4610959441630717
train_np_lemma_only.txt - coherence score:  0.5138049077732546
train_np_no_stop_only.txt - coherence score:  0.5308114580042114
train_np_no_stop.txt - coherence score:  0.5601706561705528
train_np_lemma_no_stop.txt - coherence score:  0.5629877120068836


In [8]:
foo = pd.DataFrame.from_dict(doc_coherence_score_dict, orient='index')
foo

Unnamed: 0,0
test_norm.txt,0.404575
test_lemma.txt,0.446487
test_no_stop.txt,0.471806
test_lemma_no_stop.txt,0.493563
test_np_lemma_only.txt,0.520731
test_np_lemma.txt,0.439558
test_np_no_stop.txt,0.514362
test_np_no_stop_only.txt,0.495489
test_np_lemma_no_stop.txt,0.493227
train_norm.txt,0.456028


In [10]:
foo.loc[['np' not in ele for ele in foo.index],:].mean()

0    0.474721
dtype: float64

In [11]:
foo.loc[['np' in ele for ele in foo.index],:].mean()

0    0.509224
dtype: float64