# Auxiliary notebook for training multiple topic models (LDA, NMF)
This notebook is an auxiliary tool for training both LDA and NMF topic models.  
The user specifies a list of topic counts; one LDA model and one NMF model are trained for each of them.

In [None]:
import numpy as np
from gensim.corpora import Dictionary
from tqdm.notebook import tqdm
import os
import json
import warnings

In [None]:
# Suppress every warning raised from this point on to keep cell output clean.
warnings.filterwarnings('ignore')

In [None]:
def serialize_model(model, fpath):
    """Save ``model`` to ``fpath``, creating the parent directory if needed.

    Parameters
    ----------
    model
        Any object exposing a ``save(path)`` method.
    fpath : str
        Destination file path. A bare file name (no directory part) is
        handed to ``model.save`` as-is, without creating any directory.
    """
    dpath = os.path.dirname(fpath)

    # ``os.makedirs('')`` raises, so only create a directory when the path
    # actually has one; ``exist_ok`` avoids the check-then-create race.
    if dpath:
        os.makedirs(dpath, exist_ok=True)
    model.save(fpath)

## I. Corpora

In [None]:
class JSONCorpus:
    """Streaming corpus of tokenized documents stored as JSON files.

    Every regular file directly inside ``dpath`` is parsed with ``json.load``
    and yielded as one document. A gensim ``Dictionary`` is built from one
    full pass over the documents at construction time and exposed as
    ``self.dictionary``.
    """

    def __init__(self, dpath):
        self.dpath = dpath
        self.dictionary = Dictionary(self._gen_documents())

    def _gen_documents(self):
        """Yield one parsed document per file, in sorted file-name order."""
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # document order identical across passes and across runs.
        for fname in sorted(os.listdir(self.dpath)):
            fpath = os.path.join(self.dpath, fname)
            # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints),
            # which cannot be opened as documents.
            if not os.path.isfile(fpath):
                continue
            # Decode as UTF-8 explicitly instead of the platform default,
            # and close the file before handing the document to the consumer.
            with open(fpath, 'r', encoding='utf-8') as file:
                tokenized_doc = json.load(file)
            yield tokenized_doc

    def __iter__(self):
        yield from self._gen_documents()
            
class BoWCorpus:
    """Re-iterable bag-of-words view over a corpus of tokenized documents.

    Each document produced by ``corpus`` is converted on the fly with
    ``dictionary.doc2bow``; nothing is cached between iterations.
    """

    def __init__(self, corpus, dictionary):
        self.dictionary = dictionary
        self.corpus = corpus

    def __iter__(self):
        # Lazy conversion: one document is transformed per item requested.
        yield from (self.dictionary.doc2bow(tokens) for tokens in self.corpus)

In [None]:
# Directory whose files are read by JSONCorpus (one JSON file per tokenized document).
DIR_PREPROCESSED_DATA = 'preprocessed_data'

In [None]:
# Streams documents from disk; construction also builds the gensim Dictionary
# (available as corpus.dictionary) from one full pass over the files.
corpus = JSONCorpus(DIR_PREPROCESSED_DATA)

In [None]:
# Vocabulary pruning thresholds handed to Dictionary.filter_extremes.
# Passed as `no_below`: per the gensim docs, tokens found in fewer documents are dropped.
MIN_DF = 5
# Passed as `no_above`: per the gensim docs, tokens found in more than this fraction of documents are dropped.
MAX_DF_RATIO = 0.5

In [None]:
# Remember the vocabulary size before pruning so the effect can be reported.
num_orig = len(corpus.dictionary)
# Prune in place; keep_n=None means no cap on the number of surviving tokens (per the gensim docs).
corpus.dictionary.filter_extremes(no_below=MIN_DF, no_above=MAX_DF_RATIO, keep_n=None)

# Report vocabulary size before/after and how many tokens were removed.
print(f'Number of tokens before filtering: {num_orig}')
print(f'Total number of filtered tokens: {num_orig - len(corpus.dictionary)}')
print(f'Number of tokens after filtering: {len(corpus.dictionary)}')

In [None]:
# Lazy bag-of-words view: each document is converted with the (already filtered) dictionary on iteration.
bow_corpus = BoWCorpus(corpus, corpus.dictionary)

### TF-IDF model

In [None]:
from gensim.models.tfidfmodel import TfidfModel

In [None]:
# SMART weighting scheme passed to TfidfModel as `smartirs`.
# Per the gensim docs: 'l' = logarithmic term frequency, 't' = idf, 'c' = cosine normalization.
TFIDF_SMARTIRS = 'ltc'

In [None]:
# Fit the TF-IDF model on the streamed bag-of-words corpus.
tfidf_model = TfidfModel(corpus=bow_corpus, dictionary=bow_corpus.dictionary, smartirs=TFIDF_SMARTIRS)
# Transform every document and keep the result in memory as a list;
# dictionary.num_docs only supplies the progress-bar total.
tfidf_corpus = [tfidf_model[doc_bow] for doc_bow in tqdm(bow_corpus, total=bow_corpus.dictionary.num_docs)]

In [None]:
# Persist the fitted TF-IDF model under models/gensim/tfidf/.
serialize_model(tfidf_model, os.path.join('models', 'gensim', 'tfidf', 'tfidf.model'))

## II. Topic models

In [None]:
from gensim.models.ldamodel import LdaModel
from gensim.models.nmf import Nmf

import pyLDAvis
import pyLDAvis.gensim

In [None]:
def create_lda_pyldavis(lda_model, bow_corpus, fpath):
    """Write a pyLDAvis visualisation of an LDA model to an HTML file.

    Parameters
    ----------
    lda_model
        Trained gensim LDA model handed to ``pyLDAvis.gensim.prepare``.
    bow_corpus
        Iterable of bag-of-words documents that also exposes the gensim
        dictionary as ``bow_corpus.dictionary``.
    fpath : str
        Destination HTML path; its parent directory is created if missing.
    """
    # Create an in-memory corpus
    inmemory_bow_corpus = list(bow_corpus)

    # pyLDAvis data preparation
    lda_vis_data = pyLDAvis.gensim.prepare(lda_model, inmemory_bow_corpus, bow_corpus.dictionary)

    # Output: HTML
    # ``os.makedirs('')`` raises, so skip directory creation for a bare file
    # name; ``exist_ok`` avoids the check-then-create race.
    dpath = os.path.dirname(fpath)
    if dpath:
        os.makedirs(dpath, exist_ok=True)
    pyLDAvis.save_html(lda_vis_data, fpath)

def create_nmf_pyldavis(nmf_model, tfidf_corpus, dictionary, fpath):
    """Write a pyLDAvis visualisation of an NMF model to an HTML file.

    Parameters
    ----------
    nmf_model
        Trained gensim NMF model; ``get_topics``, ``get_document_topics``
        and ``num_topics`` are used.
    tfidf_corpus
        Iterable of sparse TF-IDF documents (sequences of ``(id, weight)``).
    dictionary
        gensim ``Dictionary`` supplying the vocabulary and the collection
        frequencies (``cfs``).
    fpath : str
        Destination HTML path; its parent directory is created if missing.

    Notes
    -----
    Documents that are empty, or that receive no topic weight at all, are
    left out of the visualisation because their topic distribution cannot be
    normalised. A document's "length" is the number of entries in its TF-IDF
    vector.
    """
    num_terms = len(dictionary)
    num_topics = nmf_model.num_topics

    # ``dictionary[i]`` (unlike a direct ``dictionary.id2token[i]`` lookup)
    # populates gensim's lazily built id -> token mapping on first use.
    vocab = np.array([dictionary[i] for i in range(num_terms)])
    term_frequency = np.array([dictionary.cfs[i] for i in range(num_terms)])

    topic_rows = []
    kept_lengths = []
    for doc_tfidf in tfidf_corpus:
        doc_length = len(doc_tfidf)
        if doc_length == 0:
            # Empty document filtering
            continue

        topic_row = np.zeros(num_topics)
        for topic_id, weight in nmf_model.get_document_topics(doc_tfidf):
            topic_row[topic_id] = weight

        total = topic_row.sum()
        if total <= 0:
            # No topic mass: normalising would give 0/0 = NaN.
            continue

        # Normalization: row sum must be equal to one
        topic_rows.append(topic_row / total)
        kept_lengths.append(doc_length)

    # reshape keeps a well-formed (0, num_topics) array if nothing was kept.
    doc_topic_dists = np.array(topic_rows).reshape(-1, num_topics)
    doc_lengths = np.array(kept_lengths, dtype=float)

    # Normalization: row sum must be equal to one
    topic_term_dists = nmf_model.get_topics()
    topic_term_dists = topic_term_dists / topic_term_dists.sum(axis=1, keepdims=True)

    # pyLDAvis data preparation
    nmf_vis_data = pyLDAvis.prepare(topic_term_dists=topic_term_dists,
                                    doc_topic_dists=doc_topic_dists,
                                    doc_lengths=doc_lengths,
                                    vocab=vocab,
                                    term_frequency=term_frequency)

    # Output: HTML
    # ``os.makedirs('')`` raises, so skip directory creation for a bare file
    # name; ``exist_ok`` avoids the check-then-create race.
    dpath = os.path.dirname(fpath)
    if dpath:
        os.makedirs(dpath, exist_ok=True)
    pyLDAvis.save_html(nmf_vis_data, fpath)

In [None]:
# Hyperparameters used by the model-training loop below.
UPDATE_EVERY = 1 # Online learning (LdaModel `update_every`)
NUM_PASSES = 5 # Sufficient - selected during convergence monitoring (`passes` for both LDA and NMF)
NUM_ITERATIONS = 200 # Sufficient - selected during convergence monitoring (LdaModel `iterations`)
CHUNK_SIZE = 2000 # `chunksize` for both LDA and NMF
RANDOM_STATE = 42 # Fixed `random_state` for both LDA and NMF
LIST_NUM_TOPICS = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] # One LDA and one NMF model is trained per value

In [None]:
for num_topics in tqdm(LIST_NUM_TOPICS):
    # --- LDA: trained on the streamed bag-of-words corpus ---
    lda_model = LdaModel(
        corpus=bow_corpus,
        id2word=bow_corpus.dictionary,
        num_topics=num_topics,
        chunksize=CHUNK_SIZE,
        passes=NUM_PASSES,
        iterations=NUM_ITERATIONS,
        update_every=UPDATE_EVERY,
        random_state=RANDOM_STATE,
    )
    # Each model gets its own sub-directory named after the topic count.
    serialize_model(
        model=lda_model,
        fpath=os.path.join('models', 'gensim', 'lda', f'lda_{num_topics}', f'lda_{num_topics}.model'),
    )
    create_lda_pyldavis(
        lda_model=lda_model,
        bow_corpus=bow_corpus,
        fpath=os.path.join('pyldavis', 'lda', f'lda_{num_topics}.html'),
    )

    # --- NMF: trained on the in-memory TF-IDF corpus ---
    nmf_model = Nmf(
        corpus=tfidf_corpus,
        id2word=bow_corpus.dictionary,
        num_topics=num_topics,
        chunksize=CHUNK_SIZE,
        passes=NUM_PASSES,
        random_state=RANDOM_STATE,
    )
    serialize_model(
        model=nmf_model,
        fpath=os.path.join('models', 'gensim', 'nmf', f'nmf_{num_topics}', f'nmf_{num_topics}.model'),
    )
    create_nmf_pyldavis(
        nmf_model=nmf_model,
        tfidf_corpus=tfidf_corpus,
        dictionary=bow_corpus.dictionary,
        fpath=os.path.join('pyldavis', 'nmf', f'nmf_{num_topics}.html'),
    )