In [1]:
from __future__ import unicode_literals
import re, glob, codecs, os
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess, lemmatize
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy


# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')


In [3]:
articles_filenames = sorted(glob.glob("./Text/*.txt"))

In [4]:

d = list()
for article in articles_filenames:
    with open(article, encoding='utf-8') as f:
        book = os.path.basename(article.split('.')[0])
        d.append(pd.DataFrame({'text': book, 'lines': f.readlines()}))
doc = pd.concat(d)
doc.tail(1000)

Unnamed: 0,text,lines
365,,Ignore such things as pork and pies.\n
366,,And when the moon begins to wane there are all...
367,,"One, he told me. There's a creek you passed ov..."
368,,And some foolish people still think they are a...
369,,"We left our buggy and slipped out, in a skiff,..."
370,,And as night drooped her mantle over everythin...
371,,IX. A Fortnight In Foster.\n
372,,HOW good it is to leave the conventions and ar...
373,,I arrived at Foster.\n
374,,"On the station, I was introduced to a local st..."


In [5]:
# Convert to list
data = doc.lines.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:10])

['Australian Essays ',
 'Francis W. L. Adams ',
 'Melbourne ',
 'William Inglis &amp; Co. ',
 '1886 ',
 'To Matthew Arnold in England. ',
 'Master, with this I send you, as a boy ',
 'that watches from below some cross-bow bird ',
 'swoop on his quarry carried up aloft, ',
 'and cries a cry of victory to his flight ']


In [6]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:10])

[['australian', 'essays'], ['francis', 'adams'], ['melbourne'], ['william', 'inglis', 'amp', 'co'], [], ['to', 'matthew', 'arnold', 'in', 'england'], ['master', 'with', 'this', 'send', 'you', 'as', 'boy'], ['that', 'watches', 'from', 'below', 'some', 'cross', 'bow', 'bird'], ['swoop', 'on', 'his', 'quarry', 'carried', 'up', 'aloft'], ['and', 'cries', 'cry', 'of', 'victory', 'to', 'his', 'flight']]


In [7]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[10]]])




['with', 'sheer', 'joy', 'of', 'achievementso', 'to', 'you']


In [8]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [10]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])
nlp.max_length = 11000000
# def gensimlemm(texts):
#     texts_out = []
#     for sent in texts:
#         doc = " ".join(sent)
#         print(doc)
#         if len(doc) > 0:
#             lemmatized_out = [wd.decode('utf-8').split('/')[0] for wd in lemmatize(doc) if wd.decode('utf-8').split('/')[1]=='NN']
#             texts_out.append(lemmatized_out)
#     return texts_out
# # Do lemmatization keeping only noun
# data_lemmatized = gensimlemm(data_words_bigrams)

# Do lemmatization keeping only noun, adj, vb, adv
# data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN'])

print(data_lemmatized[:10])

[['essay'], ['francis'], [], ['ingli'], [], ['england'], ['master', 'boy'], [], ['swoop', 'quarry'], ['cry', 'victory', 'flight']]


In [11]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])


[[(0, 1)]]


In [12]:
id2word[100]

'intention'

In [13]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:100]]

[[('essay', 1)],
 [('francis', 1)],
 [],
 [('ingli', 1)],
 [],
 [('england', 1)],
 [('boy', 1), ('master', 1)],
 [],
 [('quarry', 1), ('swoop', 1)],
 [('cry', 1), ('flight', 1), ('victory', 1)],
 [('joy', 1)],
 [('sea', 1), ('voice', 1)],
 [('wave', 1), ('wind', 1)],
 [('acknowledgment', 1)],
 [('honour', 1), ('truth', 1)],
 [],
 [('content', 1)],
 [('page', 1)],
 [('preface', 1)],
 [],
 [('civilization', 1)],
 [],
 [('gordon', 1), ('poetry', 1)],
 [],
 [('salvation_army', 1)],
 [],
 [('civilization', 1)],
 [],
 [('culture', 1)],
 [],
 [('dialogue', 1)],
 [('introduction', 1)],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [('preface', 1)],
 [('honour', 1),
  ('dialogue', 1),
  ('acquaintancenay', 1),
  ('admiration', 1),
  ('analyst', 1),
  ('anyone', 1),
  ('artist', 1),
  ('attempt', 1),
  ('author', 2),
  ('book', 2),
  ('character', 3),
  ('charge', 1),
  ('considering', 1),
  ('critic', 1),
  ('criticism', 1),
  ('declare', 1),
  ('disraeli', 2),
  ('endymion'

In [15]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)


In [16]:
# View the topics in LDA model
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.036*"life" + 0.029*"fact" + 0.026*"year" + 0.023*"voice" + '
  '0.023*"thought" + 0.022*"lady" + 0.022*"foot" + 0.021*"science" + '
  '0.020*"idea" + 0.016*"point"'),
 (1,
  '0.137*"man" + 0.031*"woman" + 0.021*"fire" + 0.020*"hill" + '
  '0.018*"bushranger" + 0.018*"native" + 0.017*"bush" + 0.017*"jerry" + '
  '0.014*"friend" + 0.014*"wind"'),
 (2,
  '0.060*"nothing" + 0.047*"night" + 0.034*"poet" + 0.022*"person" + '
  '0.018*"sun" + 0.017*"love" + 0.017*"instance" + 0.017*"road" + 0.016*"gold" '
  '+ 0.016*"death"'),
 (3,
  '0.055*"thing" + 0.027*"something" + 0.027*"country" + 0.019*"reason" + '
  '0.017*"spirit" + 0.016*"mark" + 0.015*"truth" + 0.014*"art" + '
  '0.013*"culture" + 0.013*"sense"'),
 (4,
  '0.038*"people" + 0.030*"place" + 0.028*"head" + 0.024*"book" + '
  '0.022*"literature" + 0.022*"work" + 0.019*"house" + 0.018*"study" + '
  '0.017*"name" + 0.014*"manner"'),
 (5,
  '0.062*"hand" + 0.042*"eye" + 0.032*"mind" + 0.025*"part" + 0.024*"power" + '
  '0.018*"r

In [17]:
#Compute Model Perplexity and Coherence Score
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.623644026469496

Coherence Score:  0.5683633240990822


In [18]:
# Visualize the topics-keywords
# Visualize the topics
num_topics=10
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
html_name ='LDA_Vis_with_'+str(num_topics)+'_Topics.html'
pyLDAvis.save_html(vis, html_name )
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [20]:
# Building LDA Mallet Model
mallet_path = os.getenv('MALLET_HOME')+'/bin/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
mallet_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet)
vis_mallet = pyLDAvis.gensim.prepare(mallet_model, corpus, id2word)
html_name ='LDA_Vis_Mallet_with_'+str(num_topics)+'_Topics.html'
pyLDAvis.save_html(vis_mallet, html_name )
vis_mallet

In [21]:
# Show Topics
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

[(0,
  [('horse', 0.032761670383757595),
   ('tree', 0.022699889769027662),
   ('mile', 0.018406008869748008),
   ('bush', 0.017944576892511985),
   ('fire', 0.0166628214001897),
   ('side', 0.01648337563126458),
   ('river', 0.015945038324489222),
   ('ground', 0.014240303519700582),
   ('road', 0.01400958753108257),
   ('hill', 0.013907047091696788)]),
 (1,
  [('day', 0.052838836281262756),
   ('time', 0.03920768085975056),
   ('house', 0.03370253789724611),
   ('money', 0.017371491788381252),
   ('lady', 0.017068577223458756),
   ('work', 0.015501323604946727),
   ('mr', 0.015172068643074452),
   ('week', 0.013407262047439054),
   ('gentleman', 0.012011221009100606),
   ('evening', 0.011181498505182474)]),
 (2,
  [('night', 0.04420023375808888),
   ('day', 0.036517574617292395),
   ('hour', 0.019555859631118333),
   ('sun', 0.01567889620570712),
   ('air', 0.015536360785655236),
   ('light', 0.014510105761281679),
   ('bed', 0.014282049089198666),
   ('street', 0.013455343652897744)

In [None]:
#How to find the optimal number of topics for LDA?
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=40, step=6)

In [None]:
# Show graph
limit=40; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model and print the topics
optimal_model = model_list[3]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [None]:
#18. Finding the dominant topic in each sentence
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.tail(20)

In [None]:
#Find the most representative document for each topic
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

In [None]:
#Topic distribution across documents
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics