# Auxiliary notebook for Topic Coherence calculation

This notebook is an auxiliary tool for calculating the so-called coherence score of pre-trained LDA and NMF topic models.

In [None]:
import numpy as np
from gensim.corpora import Dictionary
from tqdm.notebook import tqdm
import os
import json
import warnings

In [None]:
# Suppress every Python warning from this point on (presumably to keep the
# cell outputs free of library deprecation noise).
warnings.filterwarnings('ignore')

In [None]:
# Output directory for the coherence plots (PNG) and score tables (CSV).
TARGET_DIR = 'coherence_scores'

In [None]:
# Create the output directory (including missing parents). `exist_ok=True`
# makes this a no-op when the directory is already there and avoids the
# check-then-create race of a separate `os.path.exists` test.
os.makedirs(TARGET_DIR, exist_ok=True)

## I. Corpus

In [None]:
class JSONCorpus:
    """Re-iterable corpus of tokenized documents stored as JSON files.

    Every regular file located directly inside ``dpath`` is parsed as one
    JSON document (presumably a list of string tokens, since the documents
    are fed straight into gensim's ``Dictionary``). Documents are read
    lazily from disk on every pass, so the corpus can be iterated repeatedly
    without keeping all documents in memory.

    Attributes:
        dpath: Directory containing the JSON documents.
        dictionary: gensim ``Dictionary`` built from all documents when the
            corpus is constructed.
    """

    def __init__(self, dpath):
        """Remember the data directory and build the dictionary from it.

        Args:
            dpath: Path to the directory holding one JSON file per document.
        """
        self.dpath = dpath
        self.dictionary = Dictionary(self._gen_documents())

    def _gen_documents(self):
        """Yield the parsed content of each file in ``dpath``, one at a time.

        Files are visited in ``os.listdir`` order. Sub-directories (for
        example a stray ``.ipynb_checkpoints``) are skipped instead of
        crashing ``open``.

        Raises:
            json.JSONDecodeError: If a file does not contain valid JSON.
        """
        for fname in os.listdir(self.dpath):
            fpath = os.path.join(self.dpath, fname)
            if not os.path.isfile(fpath):
                continue
            # JSON is UTF-8 by specification; do not depend on the locale's
            # default encoding.
            with open(fpath, 'r', encoding='utf-8') as file:
                tokenized_doc = json.load(file)
            # Yield only after the file is closed so no handle stays open
            # while the consumer processes the document.
            yield tokenized_doc

    def __iter__(self):
        """Start a fresh pass over all documents."""
        yield from self._gen_documents()

In [None]:
# Directory holding the preprocessed (tokenized) documents, one JSON file each.
DIR_PREPROCESSED_DATA = 'preprocessed_data'

In [None]:
# Build the corpus; construction already makes one full pass over the
# documents to create `corpus.dictionary`.
corpus = JSONCorpus(DIR_PREPROCESSED_DATA)

In [None]:
# Vocabulary pruning thresholds, passed to Dictionary.filter_extremes.
MIN_DF = 5  # used as `no_below`: minimum number of documents a token must appear in
MAX_DF_RATIO = 0.5  # used as `no_above`: maximum fraction of documents a token may appear in

In [None]:
# Prune rare and overly common tokens in place and report the effect.
vocab = corpus.dictionary
num_orig = len(vocab)

# keep_n=None disables the cap on the vocabulary size.
vocab.filter_extremes(no_below=MIN_DF, no_above=MAX_DF_RATIO, keep_n=None)
num_kept = len(vocab)

print(f'Number of tokens before filtering: {num_orig}')
print(f'Total number of filtered tokens: {num_orig - num_kept}')
print(f'Number of tokens after filtering: {num_kept}')

## II. Coherence calculation

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models.nmf import Nmf
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Where are the pre-trained models? :-)
# Locations of the pre-trained gensim models. The loading loops expect each
# model in its own sub-directory, saved as <subdir>/<subdir>.model.
DIR_LDA_MODELS = os.path.join('models', 'gensim', 'lda')
DIR_NMF_MODELS = os.path.join('models', 'gensim', 'nmf')

# Coherence model parameters
COH_METRIC = 'c_v'  # passed to CoherenceModel as `coherence`
COH_NUM_PROCESSES = 16  # passed to CoherenceModel as `processes`

### LDA

In [None]:
# Collect one (num_topics, coherence) pair per pre-trained LDA model.
lda_coherence_scores = []

for subdir in tqdm(os.listdir(DIR_LDA_MODELS)):
    # Each model is stored as <subdir>/<subdir>.model
    model_path = os.path.join(DIR_LDA_MODELS, subdir, f'{subdir}.model')
    lda_model = LdaModel.load(model_path)
    coherence_model = CoherenceModel(
        lda_model,
        texts=corpus,
        dictionary=corpus.dictionary,
        coherence=COH_METRIC,
        processes=COH_NUM_PROCESSES,
    )
    score = coherence_model.get_coherence()
    lda_coherence_scores.append((lda_model.num_topics, score))

In [None]:
# Tabulate the LDA scores, ordered by number of topics (tuple sort).
lda_rows = sorted(lda_coherence_scores)
df_lda_res = pd.DataFrame(lda_rows, columns=['num_topics', 'coherence_score_cv'])
df_lda_res

In [None]:
# Line plot of the LDA coherence score against the number of topics,
# written to TARGET_DIR as a PNG.
fig, ax = plt.subplots(figsize=(14, 8))
ax.set_title('Coherence vs. number of topics (LDA)')
ax.plot(df_lda_res['num_topics'], df_lda_res['coherence_score_cv'])
ax.set_xlabel('Number of topics')
ax.set_ylabel('Coherence score (CV)')
ax.set_xticks(range(0, 101, 10))
fig.savefig(os.path.join(TARGET_DIR, 'lda_coherence_plot.png'))

In [None]:
# Persist the LDA score table (without the index column).
lda_csv_path = os.path.join(TARGET_DIR, 'lda_coherence_scores.csv')
df_lda_res.to_csv(lda_csv_path, index=False)

### NMF

In [None]:
# Collect one (num_topics, coherence) pair per pre-trained NMF model.
nmf_coherence_scores = []

for subdir in tqdm(os.listdir(DIR_NMF_MODELS)):
    # Each model is stored as <subdir>/<subdir>.model
    model_path = os.path.join(DIR_NMF_MODELS, subdir, f'{subdir}.model')
    nmf_model = Nmf.load(model_path)
    coherence_model = CoherenceModel(
        nmf_model,
        texts=corpus,
        dictionary=corpus.dictionary,
        coherence=COH_METRIC,
        processes=COH_NUM_PROCESSES,
    )
    score = coherence_model.get_coherence()
    nmf_coherence_scores.append((nmf_model.num_topics, score))

In [None]:
# Display the raw (num_topics, coherence) pairs in the order they were computed.
nmf_coherence_scores

In [None]:
# Tabulate the NMF scores, ordered by number of topics (tuple sort).
nmf_rows = sorted(nmf_coherence_scores)
df_nmf_res = pd.DataFrame(nmf_rows, columns=['num_topics', 'coherence_score_cv'])
df_nmf_res

In [None]:
# Line plot of the NMF coherence score against the number of topics,
# written to TARGET_DIR as a PNG.
fig, ax = plt.subplots(figsize=(14, 8))
ax.set_title('Coherence vs. number of topics (NMF)')
ax.plot(df_nmf_res['num_topics'], df_nmf_res['coherence_score_cv'])
ax.set_xlabel('Number of topics')
ax.set_ylabel('Coherence score (CV)')
ax.set_xticks(range(0, 101, 10))
fig.savefig(os.path.join(TARGET_DIR, 'nmf_coherence_plot.png'))

In [None]:
# Persist the NMF score table (without the index column).
nmf_csv_path = os.path.join(TARGET_DIR, 'nmf_coherence_scores.csv')
df_nmf_res.to_csv(nmf_csv_path, index=False)