In [13]:
import re
import spacy
from gensim import corpora # necessary for building dictionary and bag of words representation
from gensim import models # necessary for Phrases, LDA model and TFIDF

In [14]:
# Path to the plain-text corpus; a line that starts with a newline marks a document boundary.
corpus = "snf_html_corpus_simple.txt"

# Documents whose accumulated text is this many characters or fewer are discarded.
MIN_DOC_CHARS = 380

raw_corpus = []

with open(corpus, "r", encoding="utf8") as inf:
    doc = ""
    for line in inf:
        if not line.startswith("\n"):  # we are inside a document
            # Strip trailing whitespace, turn non-breaking spaces into plain
            # spaces, lower-case, and append to the current document.
            # NOTE(review): lines are concatenated without a separator, so the
            # last word of one line and the first word of the next are fused
            # when a document spans several lines -- confirm this is intended.
            doc += line.rstrip().replace("\xa0", " ").lower()
        else:  # we are at a document boundary
            if len(doc) > MIN_DOC_CHARS:  # ignore shorter documents
                raw_corpus.append(doc)  # should end up with 110 articles in list of docs
            doc = ""
    # Flush the final document: without this, the last article is lost whenever
    # the file does not end with an empty boundary line.
    if len(doc) > MIN_DOC_CHARS:
        raw_corpus.append(doc)
    doc = ""

# inspect the raw_corpus:
# raw_corpus

In [15]:
# Whitespace-tokenise every document: each string in raw_corpus becomes a list of tokens.
corpus = list(map(str.split, raw_corpus))

In [16]:
# Load spaCy's English pipeline. The "en" shortcut link only exists in spaCy 2.x
# and was removed in spaCy 3, where spacy.load("en") raises OSError. Try the
# legacy shortcut first so existing environments behave exactly as before, then
# fall back to the full package name.
try:
    nlp = spacy.load("en")
except OSError:
    nlp = spacy.load("en_core_web_sm")

In [17]:
# Fit a gensim phrase model on the tokenised corpus.
trained_phrases = models.phrases.Phrases(corpus)
# Wrap the fitted model in a Phraser, which is what gets applied to texts.
phraser = models.phrases.Phraser(trained_phrases)
# Run every tokenised text through the phraser.
corpus_phrased = [phraser[tokens] for tokens in corpus]
# To look at the first text after phrasing, evaluate:
# corpus_phrased[0]

In [18]:
# Build the gensim Dictionary (vocabulary) from the phrased texts.
dictionary = corpora.Dictionary(corpus_phrased)
# Convert each phrased text to its bag-of-words representation via the dictionary.
mapped_corpus = list(map(dictionary.doc2bow, corpus_phrased))
# To inspect the first entries of the second mapped document, evaluate:
# mapped_corpus[1][0:10]

In [19]:
def print_topics_gensim(topic_model, total_topics=1, weight_threshold=0.0001, num_terms=None):
    """Pretty-print the top terms of the first ``total_topics`` topics.

    Adapted from Sarkar (2016), Text Analytics with Python.

    Args:
        topic_model: object exposing ``show_topic(topic_id, topn=...)`` that
            returns a list of ``(term, weight)`` pairs (e.g. a gensim LdaModel).
        total_topics: number of topics to print; topic ids 0 .. total_topics-1.
        weight_threshold: terms whose absolute weight is below this value are
            dropped before printing.
        num_terms: maximum number of terms printed per topic. ``None`` (or 0)
            prints every term that survives the threshold filter.

    Prints, for each topic, a ``Topic #<n>`` header (1-based), the list of
    ``(term, weight rounded to 4 decimals)`` pairs, and a blank line.
    """
    default_topn = 10  # gensim's default number of terms returned by show_topic
    # Ask the model for enough terms to honour num_terms; previously the default
    # of 10 silently capped the output even when more terms were requested.
    topn = max(default_topn, num_terms) if num_terms else default_topn
    for index in range(total_topics):
        terms = topic_model.show_topic(index, topn=topn)
        terms = [(word, round(wt, 4)) for word, wt in terms if abs(wt) >= weight_threshold]
        print('Topic #' + str(index + 1))
        print(terms[:num_terms] if num_terms else terms)
        print()

def train_lda_model_gensim(corpus, total_topics=2, iterations=1000, random_state=None):
    """Train a gensim LDA model on TF-IDF-weighted bag-of-words vectors.

    Adapted from Sarkar (2016), Text Analytics with Python.

    Args:
        corpus: iterable of tokenised documents (each an iterable of string
            tokens). It is materialised once so one-shot iterables also work.
        total_topics: number of topics to fit (``num_topics`` of LdaModel).
        iterations: forwarded to LdaModel's ``iterations`` argument.
        random_state: forwarded to LdaModel's ``random_state``. The default
            ``None`` keeps the original unseeded behaviour; pass an int to
            make the fitted topics reproducible between runs.

    Returns:
        The trained ``models.LdaModel``.
    """
    # The documents are traversed twice (vocabulary, then bag of words), so a
    # generator argument would otherwise be exhausted after the first pass.
    documents = list(corpus)
    dictionary = corpora.Dictionary(documents)
    mapped_corpus = [dictionary.doc2bow(text) for text in documents]
    # Re-weight raw term counts with TF-IDF before fitting LDA.
    tfidf = models.TfidfModel(mapped_corpus)
    corpus_tfidf = tfidf[mapped_corpus]
    lda = models.LdaModel(
        corpus_tfidf,
        id2word=dictionary,
        iterations=iterations,
        num_topics=total_topics,
        random_state=random_state,
    )
    return lda

In [20]:
def run_model_for_X_topics(X=2):
    """Fit an LDA model with X topics on corpus_phrased and print 10 terms per topic."""
    fitted_model = train_lda_model_gensim(corpus_phrased, total_topics=X)
    print_topics_gensim(
        topic_model=fitted_model,
        total_topics=X,
        num_terms=10,
    )

In [21]:
# Fit and display a 10-topic model on the phrased corpus.
run_model_for_X_topics(X=10)

Topic #1
[('wings', 0.0003), ('retirees', 0.0003), ('conservation', 0.0003), ('asylum', 0.0003), ('paramecia', 0.0003), ('magnetic', 0.0003), ('holmium', 0.0003), ('nature', 0.0003), ('junk', 0.0002), ('sequences', 0.0002)]

Topic #2
[('patient', 0.0004), ('gravitational', 0.0003), ('public', 0.0003), ('science_studies', 0.0003), ('collections', 0.0003), ('indienne', 0.0003), ('games', 0.0003), ('species', 0.0003), ('justice', 0.0003), ('science', 0.0003)]

Topic #3
[('she', 0.0005), ('memristors', 0.0004), ('drugs', 0.0004), ('her', 0.0004), ('rudolf_von', 0.0004), ('i', 0.0003), ('language', 0.0003), ('temporary', 0.0003), ('women', 0.0003), ('populists', 0.0003)]

Topic #4
[('development', 0.0006), ('felli', 0.0005), ('caffeine', 0.0004), ('global', 0.0004), ('research', 0.0004), ("it's", 0.0003), ('sustainable', 0.0003), ('countries', 0.0003), ('her', 0.0003), ('international', 0.0003)]

Topic #5
[('selenium', 0.0006), ('brain', 0.0004), ('oscillations', 0.0003), ('paralytic', 0.00