# Topic modeling using Non-negative matrix factorization (NMF)

In [None]:
import numpy as np
from gensim.corpora import Dictionary
from tqdm.notebook import tqdm
import os
import json
import warnings

In [None]:
# Silence warnings of every category for the rest of the session.
# NOTE(review): this also hides deprecation warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')

## I. Corpora

In [None]:
class JSONCorpus:
    """Streaming corpus of tokenized documents stored as JSON files.

    Every regular file in ``dpath`` is parsed with ``json.load`` and yielded
    as one document. Documents are read from disk again on every iteration,
    so the corpus can be iterated any number of times without holding all
    documents in memory.

    Attributes are plain instance fields:
        dpath: directory the documents are read from.
        dictionary: gensim ``Dictionary`` built from all documents at
            construction time.
    """

    def __init__(self, dpath):
        """Store the directory and build the dictionary with one full pass over it."""
        self.dpath = dpath
        self.dictionary = Dictionary(self._gen_documents())

    def _gen_documents(self):
        """Yield the parsed content of each file in ``dpath``, in sorted file-name order."""
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # document order identical across passes and across machines, which
        # everything that is fitted on (or aligned with) this corpus relies on.
        for fname in sorted(os.listdir(self.dpath)):
            path = os.path.join(self.dpath, fname)
            # Skip sub-directories (e.g. notebook checkpoint folders) instead
            # of failing when trying to open them.
            if not os.path.isfile(path):
                continue
            # JSON text is UTF-8; do not depend on the platform default encoding.
            with open(path, 'r', encoding='utf-8') as file:
                tokenized_doc = json.load(file)
            # Yield after the file is closed so no handle stays open while the
            # consumer processes the document.
            yield tokenized_doc

    def __iter__(self):
        """Iterate over the documents, re-reading them from disk."""
        yield from self._gen_documents()
            
class BoWCorpus:
    """Lazy bag-of-words view over a corpus of tokenized documents.

    Nothing is converted up front: each document of ``corpus`` is turned into
    its bag-of-words form with ``dictionary.doc2bow`` only while iterating.
    """

    def __init__(self, corpus, dictionary):
        """Keep references to the tokenized corpus and the dictionary used for conversion."""
        self.corpus, self.dictionary = corpus, dictionary

    def __iter__(self):
        """Yield the ``doc2bow`` result for every document of the wrapped corpus."""
        yield from map(self.dictionary.doc2bow, self.corpus)

In [None]:
# Directory handed to JSONCorpus; its files are loaded as JSON documents.
DIR_PREPROCESSED_DATA = 'preprocessed_data'

In [None]:
# Constructing the corpus already makes one full pass over the directory to build its dictionary.
corpus = JSONCorpus(DIR_PREPROCESSED_DATA)

In [None]:
# Document-frequency thresholds passed to Dictionary.filter_extremes.
MIN_DF = 5  # passed as `no_below`
MAX_DF_RATIO = 0.5  # passed as `no_above`

In [None]:
# Remember the vocabulary size before pruning so the effect can be reported below.
num_orig = len(corpus.dictionary)
# Prunes the dictionary in place, so `len(corpus.dictionary)` changes after this call.
# NOTE(review): keep_n=None presumably disables the cap on vocabulary size - confirm in the gensim docs.
corpus.dictionary.filter_extremes(no_below=MIN_DF, no_above=MAX_DF_RATIO, keep_n=None)

print(f'Number of tokens before filtering: {num_orig}')
print(f'Total number of filtered tokens: {num_orig - len(corpus.dictionary)}')
print(f'Number of tokens after filtering: {len(corpus.dictionary)}')

In [None]:
# Lazy bag-of-words view of the corpus; conversion uses the (already filtered) dictionary.
bow_corpus = BoWCorpus(corpus, corpus.dictionary)

## II. TF-IDF model

In [None]:
from gensim.models.tfidfmodel import TfidfModel

In [None]:
# Weighting scheme code passed as `smartirs` to TfidfModel.
# NOTE(review): presumably SMART notation (term-frequency / document-frequency / normalization letters) - see the gensim TfidfModel docs.
TFIDF_SMARTIRS = 'ltc'

In [None]:
# Fit the TF-IDF weighting on the bag-of-words corpus.
tfidf_model = TfidfModel(corpus=bow_corpus, dictionary=corpus.dictionary, smartirs=TFIDF_SMARTIRS)
# Materialize the weighted corpus as a list; it is reused by every NMF model fitted below.
# `num_docs` of the dictionary is only used to give the progress bar a total.
tfidf_corpus = [tfidf_model[doc_bow] for doc_bow in tqdm(bow_corpus, total=bow_corpus.dictionary.num_docs)]

## III. NMF model

In [None]:
from gensim.models.nmf import Nmf
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

In [None]:
# NMF model parameters
NUM_PASSES = 5  # passed as `passes`
CHUNK_SIZE = 2000  # passed as `chunksize`
RANDOM_STATE = 42  # passed as `random_state` to every Nmf model
LIST_NUM_TOPICS = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]  # candidate topic counts to compare

# Coherence model parameters
COH_METRIC = 'c_v'  # passed as `coherence`
COH_NUM_PROCESSES = 16  # passed as `processes`

In [None]:
# Maps each candidate number of topics to the coherence score of the model fitted with it.
coherence_scores = {}

for num_topics in tqdm(LIST_NUM_TOPICS):
    # Fit one NMF model per candidate topic count on the TF-IDF weighted corpus.
    model = Nmf(corpus=tfidf_corpus,
                id2word=corpus.dictionary, 
                num_topics=num_topics, 
                passes=NUM_PASSES, 
                chunksize=CHUNK_SIZE, 
                random_state=RANDOM_STATE)
    # The coherence model receives the tokenized texts (`corpus`), not the weighted vectors.
    coherence_model = CoherenceModel(model, texts=corpus, dictionary=corpus.dictionary, coherence=COH_METRIC, processes=COH_NUM_PROCESSES)
    coherence_scores[num_topics] = coherence_model.get_coherence()
    print(f'Num topics: {num_topics} | Coherence: {coherence_scores[num_topics]}')

In [None]:
# Candidate with the highest coherence score; on ties the first one in insertion order wins.
NUM_TOPICS = max(coherence_scores, key=coherence_scores.get)
print(f'Recommended number of topics (based on Coherence score): {NUM_TOPICS}')

In [None]:
# Candidate topic counts (x) and their coherence scores (y), in the order they were evaluated.
x = list(coherence_scores.keys())
y = list(coherence_scores.values())


# Line plot of coherence score against the number of topics.
plt.figure(figsize=(12, 8))
plt.plot(x, y)
plt.title('Coherence score vs. number of topics')
plt.xlabel('Number of topics')
plt.ylabel('Coherence score')
plt.show()

In [None]:
# Refit the model with the selected number of topics; all other parameters
# are the same as in the coherence comparison above.
final_model = Nmf(corpus=tfidf_corpus,
                  id2word=corpus.dictionary, 
                  num_topics=NUM_TOPICS, 
                  passes=NUM_PASSES, 
                  chunksize=CHUNK_SIZE, 
                  random_state=RANDOM_STATE)

## IV. Model evaluation

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from wordcloud import WordCloud
import pyLDAvis
import pyLDAvis.gensim

In [None]:
# Presumably switches pyLDAvis to inline notebook display (going by its name).
# NOTE(review): no pyLDAvis visualization is created in the visible part of this notebook.
pyLDAvis.enable_notebook()

In [None]:
def get_dominant_topic(doc_bow):
    """Return the ID of the most dominant topic of a single document.

    Args:
        doc_bow: Vector of one document, passed unchanged to
            ``final_model.get_document_topics``.

    Returns:
        The topic ID with the highest weight among the (topic ID, weight)
        pairs reported by the model. On ties the pair listed first wins.

    Raises:
        IndexError: If the model reports no topics for the document.
    """
    doc_topics = list(final_model.get_document_topics(doc_bow))
    if not doc_topics:
        # Same exception type as indexing into an empty result would produce.
        raise IndexError('the model reported no topics for the document')
    # Only the maximum is needed, so a full sort is unnecessary.
    return max(doc_topics, key=lambda topic: topic[1])[0]

def create_df(corpus, num_documents):
    """Create an auxiliary data frame which holds data for further visualizations.

    Args:
        corpus: Iterable of tokenized documents that also exposes a
            ``dictionary`` attribute providing ``doc2bow``.
        num_documents: Number of documents in ``corpus``; only used as the
            total of the progress bar.

    Returns:
        A ``pandas.DataFrame`` with one row per document and the columns
        ``words`` (the tokenized document), ``word_count`` (its length) and
        ``dominant_topic`` (topic ID from ``get_dominant_topic``).
    """
    words = []
    word_counts = []
    dominant_topics = []

    for doc in tqdm(corpus, total=num_documents):
        doc_bow = corpus.dictionary.doc2bow(doc)
        # The NMF model was fitted on TF-IDF weighted vectors, so documents
        # must be weighted the same way before they are assigned to topics.
        doc_vec = tfidf_model[doc_bow]
        words.append(doc)
        word_counts.append(len(doc))
        try:
            dominant_topics.append(get_dominant_topic(doc_vec))
        except Exception:
            # Best effort: documents for which no dominant topic can be
            # determined fall back to topic 0. `Exception` (instead of a bare
            # `except`) keeps this long-running loop interruptible.
            dominant_topics.append(0)

    return pd.DataFrame({'words': words,
                         'word_count': word_counts,
                         'dominant_topic': dominant_topics})

In [None]:
# One row per document; the dictionary's `num_docs` only sizes the progress bar.
df_documents = create_df(corpus=corpus, num_documents=corpus.dictionary.num_docs)

### Distribution of document word counts

In [None]:
# Distribution of the number of tokens per document over the whole corpus.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm the installed version still provides it.
plt.figure(figsize=(14, 7))
sns.distplot(df_documents['word_count'])
plt.title('Distribution of document word counts')
plt.xlabel('Word count')
plt.show()

### Distribution of topic sizes

In [None]:
# Number of documents whose dominant topic is each topic ID.
plt.figure(figsize=(14, 7))
# Pass the column explicitly as `x`: newer seaborn releases treat the first
# positional argument as `data`, so a positional Series is not portable.
sns.countplot(x=df_documents['dominant_topic'])
plt.title('Number of documents per topic')
plt.xlabel('Topic ID')
plt.show()

In [None]:
# Number of documents per dominant topic ID.
counter = Counter(df_documents['dominant_topic'])

In [None]:
# IDs of the (up to) 6 topics with the most documents, most frequent first.
MC_6_TOPICS = [x[0] for x in counter.most_common(6)]
# IDs of the (up to) 6 topics with the fewest documents, least frequent first
# (the full ranking is walked backwards from its last entry).
# Topics that are dominant in no document never appear in `counter`, so they cannot show up here.
LC_6_TOPICS = [x[0] for x in counter.most_common()[-1:-7:-1]]

### Distributions of word counts across 6 most frequent topics

In [None]:
# Word-count distribution of the documents of each of the 6 most frequent
# topics, one topic per cell of a 2 x 3 grid.
nrow = 2
ncol = 3

fig, ax = plt.subplots(nrow, ncol, figsize=(23, 10), sharex='row')

for i in range(nrow):
    for j in range(ncol):
        # Row-major position of the cell, so the six cells map to list
        # entries 0..5 exactly once each.
        topic_id = MC_6_TOPICS[i * ncol + j]
        ax[i, j].set_title(f'Topic: {topic_id}')
        sns.distplot(df_documents.query(f'dominant_topic == {topic_id}')['word_count'], ax=ax[i, j])
        ax[i, j].set_xlabel('Word count')

### Distributions of word counts across 6 least frequent topics

In [None]:
# Word-count distribution of the documents of each of the 6 least frequent
# topics, one topic per cell of a 2 x 3 grid.
nrow = 2
ncol = 3

fig, ax = plt.subplots(nrow, ncol, figsize=(23, 10), sharex='row')

for i in range(nrow):
    for j in range(ncol):
        # Row-major position of the cell, so the six cells map to list
        # entries 0..5 exactly once each.
        topic_id = LC_6_TOPICS[i * ncol + j]
        ax[i, j].set_title(f'Topic: {topic_id}')
        sns.distplot(df_documents.query(f'dominant_topic == {topic_id}')['word_count'], ax=ax[i, j])
        ax[i, j].set_xlabel('Word count')

### Word clouds for 6 most frequent topics

In [None]:
# One word cloud per topic in MC_6_TOPICS, laid out on a 2 x 3 grid.
nrow = 2
ncol = 3
fig, ax = plt.subplots(nrow, ncol, figsize=(25, 10), sharex='row')
# Running position in MC_6_TOPICS; advanced once per grid cell.
aux_id = 0

for i in range(nrow):
    for j in range(ncol):
        topic_id = MC_6_TOPICS[(aux_id)]
        aux_id += 1
        # The cloud is built from the topic's 20 top (word, weight) pairs.
        wordcloud = WordCloud(background_color='white', collocations=False, max_words=20).generate_from_frequencies(dict(final_model.show_topic(topic_id, topn=20)))
        ax[i, j].set_title(f'Topic: {topic_id}')
        ax[i, j].imshow(wordcloud)
        ax[i, j].axis('off')

### Word clouds for 6 least frequent topics

In [None]:
# One word cloud per topic in LC_6_TOPICS, laid out on a 2 x 3 grid.
nrow = 2
ncol = 3
fig, ax = plt.subplots(nrow, ncol, figsize=(25, 10), sharex='row')
# Running position in LC_6_TOPICS; advanced once per grid cell.
aux_id = 0

for i in range(nrow):
    for j in range(ncol):
        topic_id = LC_6_TOPICS[(aux_id)]
        aux_id += 1
        # The cloud is built from the topic's 20 top (word, weight) pairs.
        wordcloud = WordCloud(background_color='white', collocations=False, max_words=20).generate_from_frequencies(dict(final_model.show_topic(topic_id, topn=20)))
        ax[i, j].set_title(f'Topic: {topic_id}')
        ax[i, j].imshow(wordcloud)
        ax[i, j].axis('off')

### Word clouds for all topics

In [None]:
# One word cloud per topic of the final model, `ncol` clouds per row.
# The grid is sized from NUM_TOPICS so that every topic is drawn and no topic
# ID outside the model is requested.

ncol = 4
nrow = -(-NUM_TOPICS // ncol)  # ceiling division
# 8 inches of height per row reproduces the 60 x 40 figure for 20 topics.
# squeeze=False keeps `ax` two-dimensional even for a single row.
fig, ax = plt.subplots(nrow, ncol, figsize=(60, 8 * nrow), sharex='row', constrained_layout=True, squeeze=False)
axes = ax.ravel()

topic_ids = range(NUM_TOPICS)

for topic_id in topic_ids:
    # The cloud is built from the topic's 20 top (word, weight) pairs.
    wordcloud = WordCloud(background_color='white', collocations=False, max_words=20).generate_from_frequencies(dict(final_model.show_topic(topic_id, topn=20)))
    axes[topic_id].set_title(f'Topic: {topic_id}', fontsize=45)
    axes[topic_id].imshow(wordcloud)

# Hide ticks and frames of every cell, including unused cells of the last row.
for axis in axes:
    axis.axis('off')
plt.savefig(f'word_clouds_nmf_{NUM_TOPICS}.png') # Save into a file