https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
df = pd.read_pickle('metatable_preprocessed.pkl')

In [3]:
df.head()

Unnamed: 0,filename,title,year,author,years_of_life,time_summary,time_book,name,username,tradition_country,tradition,country,text,text_tokenized,text_pymystem_list,text_pymystem
0,texts/ablesimov_melnik_koldun_obmanshchik_i_sv...,"Мельник — колдун, обманщик и сват",,Аблесимов,1742–1783,Читается за 6 минут,40 мин,,,Русская литература->18 век,Русская литература,18 век,На краю леса перед мельницей мельник Фаддей ст...,"[на, краю, леса, перед, мельницей, мельник, фа...","[на, край, лес, перед, мельница, мельник, фадд...",на край лес перед мельница мельник фаддей стро...
1,texts/abramov_o_chem_plachut_loshadi.txt,О чём плачут лошади,1972.0,Абрамов,1920–1983,Читается за 2 минуты,11 мин,Сергей Симиненко,siminenko,Русская литература->Советская,Русская литература,Советская,"Рассказчик любит лошадей, которым живётся очен...","[рассказчик, любит, лошадей, которым, живётся,...","[рассказчик, любить, лошадь, который, житься, ...",рассказчик любить лошадь который житься очень ...
2,texts/abramov_alka.txt,Алька,1972.0,Абрамов,1920–1983,Читается за 6 минут,"1,5 ч",,,Русская литература->Советская,Русская литература,Советская,Лето. В последний раз главная героиня Аля Амос...,"[лето, последний, раз, главная, героиня, аля, ...","[лето, в, последний, раз, главный, героиня, ал...",лето в последний раз главный героиня аля амосо...
3,texts/abramov_bratja_i_sestry.txt,Братья и сёстры,1972.0,Абрамов,1920–1983,Читается за 3 минуты,7 ч,,,Русская литература->Советская,Русская литература,Советская,Пекашинский мужик Степан Андреянович Ставров с...,"[пекашинский, мужик, степан, андреянович, став...","[пекашинский, мужик, степан, андреянович, став...",пекашинский мужик степан андреянович ставр сру...
4,texts/abe_zhencshina_v_peskah.txt,Женщина в песках,1962.0,Абэ,1924–1993,Читается за 9 минут,4 ч,В. С. Санович,sanovich,Прочая литература->Японская,Прочая литература,Японская,Однажды в августе человек отправляется в трехд...,"[однажды, августе, человек, отправляется, трех...","[однажды, в, август, человек, отправляться, в,...",однажды в август человек отправляться в трехдн...


In [4]:
%%time

# Build the bigram and trigram models
bigram = gensim.models.Phrases(df['text_pymystem_list'], 
                               min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(df['text_pymystem_list'], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

CPU times: user 57.1 s, sys: 295 ms, total: 57.4 s
Wall time: 57.4 s


In [5]:
trigram_mod[bigram_mod[df['text_pymystem_list'][2]]][0:20]

['лето',
 'в',
 'последний',
 'раз',
 'главный_героиня',
 'аля',
 'амосов',
 'быть',
 'в',
 'родной',
 'деревня',
 'летовка',
 'в',
 'прошлый',
 'год',
 'на',
 'похороны',
 'мать',
 'теперь',
 'она']

In [6]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

### Create the Dictionary and Corpus needed for Topic Modeling

In [7]:
%%time

id2word = corpora.Dictionary(df['text_pymystem_list'])

# Create Corpus
data_words_bigrams = make_bigrams(df['text_pymystem_list'])

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_words_bigrams]

CPU times: user 16.7 s, sys: 231 ms, total: 17 s
Wall time: 18.3 s


In [8]:
corpus[:1]

[[(0, 5),
  (1, 13),
  (2, 22),
  (3, 1),
  (4, 1),
  (5, 3),
  (6, 1),
  (7, 1),
  (8, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 5),
  (14, 11),
  (15, 1),
  (16, 3),
  (17, 5),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 3),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 4),
  (29, 3),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (35, 5),
  (36, 4),
  (37, 1),
  (38, 3),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 2),
  (45, 1),
  (46, 2),
  (47, 3),
  (48, 4),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 2),
  (53, 1),
  (54, 2),
  (55, 7),
  (56, 2),
  (57, 2),
  (58, 1),
  (59, 2),
  (60, 1),
  (61, 1),
  (62, 3),
  (63, 2),
  (64, 6),
  (65, 3),
  (66, 2),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 3),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 2),
  (77, 1),
  (78, 2),
  (79, 1),
  (80, 1),
  (81, 7),
  (82, 2),
  (83, 2),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 2),
  (88, 1),
  (89, 1),
  (90, 1),
  (91, 3),
  (92, 1),
  (93

In [9]:
id2word[50]

'гулять'

In [10]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('а', 5),
  ('анкудин', 13),
  ('анюта', 22),
  ('балалайка', 1),
  ('бежать', 1),
  ('без', 3),
  ('безделье', 1),
  ('бесплатенмельник', 1),
  ('благодарить', 1),
  ('благословлять', 1),
  ('боярин', 1),
  ('брифли', 1),
  ('быть', 5),
  ('в', 11),
  ('вдали', 1),
  ('ведь', 3),
  ('велеть', 5),
  ('верить', 1),
  ('вернуться', 1),
  ('весть', 1),
  ('ветер', 1),
  ('видение', 1),
  ('видеть', 3),
  ('вино', 1),
  ('внешне', 1),
  ('возвращение', 1),
  ('вокруг', 1),
  ('вопрос', 4),
  ('ворожба', 3),
  ('ворожить', 1),
  ('ворота', 1),
  ('вперед', 1),
  ('вращать', 1),
  ('все', 5),
  ('встречать', 4),
  ('вступать', 1),
  ('выбор', 3),
  ('вызывать', 1),
  ('выносить', 1),
  ('выпивать', 1),
  ('выпроваживать', 1),
  ('выражать', 1),
  ('вырастать', 2),
  ('выслушивать', 1),
  ('выходить', 2),
  ('глаз', 3),
  ('говорить', 4),
  ('гоняться', 1),
  ('гулять', 1),
  ('давать', 1),
  ('давно', 2),
  ('даже', 1),
  ('двор', 2),
  ('дворянин', 7),
  ('дворянский', 2),
  ('девушка', 2

### Building the Topic Model

In [12]:
%%time

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

CPU times: user 3min 5s, sys: 4min 5s, total: 7min 10s
Wall time: 2min 16s


### View the topics in LDA model

In [13]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.067*"я" + 0.019*"мы" + 0.013*"и" + 0.012*"в" + 0.010*"мой" + 0.009*"быть" '
  '+ 0.007*"не" + 0.007*"андрей" + 0.006*"русский" + 0.006*"петр"'),
 (1,
  '0.051*"и" + 0.021*"он" + 0.019*"я" + 0.016*"не" + 0.014*"в" + 0.013*"быть" '
  '+ 0.012*"бог" + 0.011*"они" + 0.011*"а" + 0.010*"на"'),
 (2,
  '0.015*"григорий" + 0.011*"в" + 0.009*"война" + 0.008*"генерал" + '
  '0.008*"солдат" + 0.008*"петр" + 0.008*"армия" + 0.007*"сергей" + '
  '0.007*"виктор" + 0.007*"фронт"'),
 (3,
  '0.068*"она" + 0.022*"ее" + 0.019*"в" + 0.018*"и" + 0.014*"свой" + '
  '0.014*"дочь" + 0.013*"с" + 0.011*"он" + 0.010*"девушка" + 0.010*"любовь"'),
 (4,
  '0.025*"и" + 0.019*"на" + 0.015*"мальчик" + 0.012*"в" + 0.011*"иван" + '
  '0.007*"быть" + 0.007*"деревня" + 0.007*"а" + 0.007*"не" + 0.006*"решать"'),
 (5,
  '0.009*"розенкранц" + 0.008*"итен" + 0.008*"равик" + 0.008*"петенька" + '
  '0.008*"санчо" + 0.007*"гильденстерн" + 0.006*"съезд" + 0.006*"дик" + '
  '0.006*"хоакин" + 0.006*"корделия"'),
 (6,
  '0.

### Compute Model Perplexity and Coherence Score

In [15]:
%%time

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=df['text_pymystem_list'],
                                     dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.789358766613633

Coherence Score:  0.4075117219154987
CPU times: user 27.6 s, sys: 32.8 s, total: 1min
Wall time: 1min 26s


### Visualize the topics-keywords

In [16]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

### Building LDA Mallet Model

In [None]:
# проблема с Mallet: 
#                'CalledProcessError: Command '/mallet-2.0.8/bin/mallet import-file --preserve-case 
#                 --keep-sequence --remove-stopwords --token-regex "\S+" --input /tmp/84e43d_corpus.txt 
#                 --output /tmp/84e43d_corpus.mallet' returned non-zero exit status 127'

In [None]:
%%time

# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = '/mallet-2.0.8/bin/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

### How to find the optimal number of topics for LDA?

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus,
                                                 num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
%%time

# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word,
                                                        corpus=corpus,
                                                        texts=df['text_pymystem_list'],
                                                        start=2, limit=40, step=6)

In [None]:
# Show graph
limit=40; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model and print the topics
optimal_model = model_list[3]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

### Finding the dominant topic in each sentence

In [None]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic, 4),
                                                                  topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

### Find the most representative document for each topic

In [None]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head()

### Topic distribution across documents

In [None]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics