Code study based on:
https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/

In [1]:
import sys
# !{sys.executable} -m spacy download en_core_web_sm  # one-time install of the model loaded in process_words
import re, numpy as np, pandas as pd
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

To create the input data, I downloaded several financial documents and saved them in a JSON file named 'article.json'.

In [2]:
# Load the documents saved in 'article.json' into a DataFrame.
df = pd.read_json('article.json')

# Preview the first rows; as the cell's last expression it is displayed by the notebook.
df.head()

Unnamed: 0,content
0,A surprisingly sharp rebound in the economy an...
1,There's a systematic underestimation of the ec...
2,"said Jonathan Golub, chief U.S. equity strate..."
3,"Everywhere I look, there's an upside,"
4,"added Golub, who on Tuesday raised his year-e..."


In [3]:
# Use re to strip e-mail addresses, redundant whitespace and apostrophes, then tokenize.
def sent_to_words(sentences):
    """Yield one list of tokens for every document in ``sentences``.

    Each document goes through three cleaning passes (e-mail-like tokens are
    removed, runs of whitespace are collapsed to a single space, single quotes
    are dropped) and is then tokenized with ``gensim.utils.simple_preprocess``
    using ``deacc=True``.

    Parameters
    ----------
    sentences : iterable
        Documents to tokenize. Entries that are not ``str`` are coerced with
        ``str()`` before any cleaning is applied.

    Yields
    ------
    list of str
        The tokens of one document; exactly one list per input entry, in
        input order.
    """
    # Raw strings: '\S' and '\s' inside a normal string literal are invalid
    # escape sequences and make current Python versions emit warnings.
    email_pattern = re.compile(r'\S*@\S*\s?')
    whitespace_pattern = re.compile(r'\s+')

    for sent in sentences:
        # Coerce first so the regex passes never receive a non-string entry
        # (re.sub raises TypeError on anything that is not str/bytes).
        text = str(sent)
        text = email_pattern.sub('', text)       # remove emails
        text = whitespace_pattern.sub(' ', text)  # remove newline chars / extra spaces
        text = text.replace("'", "")             # remove single quotes
        yield gensim.utils.simple_preprocess(text, deacc=True)

# Pull the 'content' column out as a plain Python list of documents, then
# tokenize each one; list() drains the generator so the result can be reused.
data = df.content.values.tolist()
data_words = list(sent_to_words(data))

In [4]:
# Phrase detection, one model per n-gram level.
# Bigram level: fit on the tokenized documents, then wrap the fitted model in a Phraser.
bigram = gensim.models.Phrases(data_words, threshold=100, min_count=5)  # a higher threshold yields fewer phrases
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Trigram level: fit on the documents after the bigram model has been applied to them.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# !python3 -m spacy download en_core_web_sm  # run in terminal once; this is the model loaded below
def process_words(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'], stop_words=None):
    """Form bigrams/trigrams, lemmatize, and keep only selected parts of speech.

    Steps, in order:
      1. tokenize every document with ``simple_preprocess``;
      2. apply the module-level ``bigram_mod`` and ``trigram_mod`` phrase models;
      3. lemmatize with spaCy's ``en_core_web_sm`` pipeline (parser and NER
         disabled) and keep the lemmas whose POS tag is in ``allowed_postags``;
      4. run the lemmas through ``simple_preprocess`` once more.

    Parameters
    ----------
    texts : iterable
        Documents to process. Each document may be a list/tuple of tokens or a
        single string.
    allowed_postags : list of str
        spaCy coarse POS tags (``token.pos_``) to keep.
    stop_words : iterable of str, optional
        Words to drop in steps 1 and 4. The default ``None`` removes nothing,
        so no stopword filtering takes place unless a list is supplied.

    Returns
    -------
    list of list of str
        One list of processed tokens per input document.
    """
    excluded = frozenset(stop_words) if stop_words else frozenset()

    def _tokenize(doc):
        # Join token lists with spaces instead of tokenizing the Python repr
        # of the list (brackets, quotes, commas); anything else goes through str().
        if isinstance(doc, (list, tuple)):
            text = " ".join(map(str, doc))
        else:
            text = str(doc)
        return [word for word in simple_preprocess(text) if word not in excluded]

    texts = [_tokenize(doc) for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    # bigram_mod is applied again here before trigram_mod, as in the original pipeline.
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    # nlp.pipe processes the documents as a stream rather than one call per document.
    texts_out = [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(" ".join(sent) for sent in texts)
    ]

    # Second pass after lemmatization: re-tokenize and apply the optional stopword filter.
    return [_tokenize(doc) for doc in texts_out]

# Run the phrase-merging / lemmatization pipeline on the tokenized documents.
data_ready = process_words(data_words)  # processed Text Data!

In [5]:
# Dictionary: maps every distinct token in the processed documents to an integer id.
id2word = corpora.Dictionary(data_ready)

# Corpus: one bag-of-words (token id, count) list per document.
corpus = list(map(id2word.doc2bow, data_ready))

# Fit a 4-topic LDA model; random_state is fixed so the run is repeatable.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    random_state=100,
    update_every=1,
    chunksize=10,
    passes=10,
    alpha='symmetric',
    iterations=100,
    per_word_topics=True,
)

# Show the highest-weighted words of each topic.
pprint(lda_model.print_topics())

[(0,
  '0.034*"be" + 0.015*"yield" + 0.015*"do" + 0.008*"have" + 0.008*"market" + '
  '0.008*"year" + 0.008*"high" + 0.008*"stock" + 0.008*"investor" + '
  '0.008*"growth"'),
 (1,
  '0.018*"stock" + 0.016*"here" + 0.015*"analysis" + 0.015*"tuesday" + '
  '0.014*"be" + 0.012*"year" + 0.011*"research" + 0.011*"pick" + 0.011*"right" '
  '+ 0.011*"access"'),
 (2,
  '0.026*"earning" + 0.026*"be" + 0.019*"year" + 0.019*"have" + '
  '0.015*"economy" + 0.015*"poll" + 0.012*"strategist" + 0.012*"end" + '
  '0.012*"gain" + 0.012*"likely"'),
 (3,
  '0.025*"be" + 0.018*"say" + 0.018*"company" + 0.013*"market" + '
  '0.013*"inflation" + 0.013*"year" + 0.011*"high" + 0.011*"yield" + '
  '0.010*"day" + 0.010*"tilson"')]


In [6]:
# Interactive topic visualization.
# Newer pyLDAvis releases renamed the gensim bridge module from
# `pyLDAvis.gensim` to `pyLDAvis.gensim_models`; try the new name first and
# fall back to the old one so the cell runs with either.
import pyLDAvis
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis