In [7]:
import logging
import warnings

import pandas as pd
import numpy as np

import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from gensim.corpora.dictionary import Dictionary
from gensim.models import Phrases
from gensim.models import LdaModel
from gensim.models import CoherenceModel
from pprint import pprint

SELECT_YEAR = 2017

DATA_PATH = '../data'
PLOT_PATH = '../docs/plots/{}'.format(SELECT_YEAR)

logging.basicConfig(format="%(levelname)s %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [8]:
# Data Read
data_file = '{}/{}_lattes.pkl.xz'.format(DATA_PATH, SELECT_YEAR)
%time df = pd.read_pickle(data_file, compression='xz')
print('{} docs shape'.format(df.shape))
df.head(5)

CPU times: user 74.1 ms, sys: 16 ms, total: 90.2 ms
Wall time: 96.8 ms
(8652, 5) docs shape


Unnamed: 0,id,titulo,clean,wct,wcc
79,104124422364023,Exploiting photo location and direction for cl...,"[exploit, photo, locat, direct, cluster, base,...",9,9
80,104124422364023,A gold-standard social media corpus for urban ...,"[gold, standard, social, media, corpus, urban,...",8,7
94,104124422364023,A Framework for Spatial Analytics using Hetero...,"[framework, spatial, analyt, use, heterogen, d...",9,7
140,105670521813027,Gene expression analysis in Musa acuminata dur...,"[gene, express, analysi, musa, acuminata, comp...",12,9
141,105670521813027,Mitogenome sequence accuracy using different e...,"[mitogenom, sequenc, accuraci, use, differ, el...",7,7


In [3]:
docs = list(df.clean)

bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

INFO 09:57:43: collecting all words and their counts
INFO 09:57:43: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO 09:57:43: collected 49313 word types from a corpus of 75656 words (unigram + bigrams) and 8652 sentences
INFO 09:57:43: using 49313 counts as vocab in Phrases<0 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000>


In [4]:
dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below=20, no_above=0.5)

corpus = [dictionary.doc2bow(doc) for doc in docs]

print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

INFO 09:57:43: adding document #0 to Dictionary(0 unique tokens: [])
INFO 09:57:43: built Dictionary(8683 unique tokens: ['base', 'cluster', 'direct', 'discoveri', 'exploit']...) from 8652 documents (total 77652 corpus positions)
INFO 09:57:43: discarding 7908 tokens: [('interest', 12), ('photo', 6), ('corpus', 16), ('gold', 11), ('acuminata', 1), ('compat', 4), ('incognita', 1), ('meloidogyn', 1), ('musa', 1), ('elucid', 4)]...
INFO 09:57:43: keeping 775 tokens which were in no less than 20 and no more than 4326 (=50.0%) documents
INFO 09:57:43: resulting dictionary: Dictionary(775 unique tokens: ['base', 'cluster', 'direct', 'discoveri', 'exploit']...)
Number of unique tokens: 775
Number of documents: 8652


In [5]:
# Set training parameters.
num_topics = 20
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha=.1,
    #alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

abor"
INFO 09:58:20: topic #18 (0.100): 0.075*"multi" + 0.073*"optim" + 0.050*"power" + 0.046*"object" + 0.035*"low" + 0.034*"parallel" + 0.034*"effici" + 0.028*"multipl" + 0.026*"use" + 0.025*"differ"
INFO 09:58:20: topic #3 (0.100): 0.063*"measur" + 0.061*"use" + 0.059*"featur" + 0.055*"select" + 0.046*"base" + 0.038*"inform" + 0.034*"predict" + 0.027*"build" + 0.025*"composit" + 0.024*"event"
INFO 09:58:20: topic #19 (0.100): 0.184*"data" + 0.056*"problem" + 0.056*"new" + 0.029*"stream" + 0.028*"schedul" + 0.028*"algorithm" + 0.027*"approach" + 0.026*"heurist" + 0.023*"base" + 0.023*"scientif"
INFO 09:58:20: topic #16 (0.100): 0.167*"process" + 0.048*"relat" + 0.037*"genom" + 0.034*"busi" + 0.033*"learn" + 0.032*"correl" + 0.031*"area" + 0.030*"singl" + 0.026*"transfer" + 0.025*"analysi"
INFO 09:58:20: topic diff=0.019947, rho=0.216544
INFO 09:58:20: PROGRESS: pass 16, at document #8652/8652
INFO 09:58:21: merging changes from 652 documents into a model of 8652 documents
INFO 09:58:

In [6]:
top_topics = model.top_topics(corpus) # , num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

INFO 09:58:28: CorpusAccumulator accumulated stats from 1000 documents
INFO 09:58:28: CorpusAccumulator accumulated stats from 2000 documents
INFO 09:58:28: CorpusAccumulator accumulated stats from 3000 documents
INFO 09:58:28: CorpusAccumulator accumulated stats from 4000 documents
INFO 09:58:28: CorpusAccumulator accumulated stats from 5000 documents
INFO 09:58:28: CorpusAccumulator accumulated stats from 6000 documents
INFO 09:58:29: CorpusAccumulator accumulated stats from 7000 documents
INFO 09:58:29: CorpusAccumulator accumulated stats from 8000 documents
Average topic coherence: -10.3684.
[([(0.056757476, 'learn'),
   (0.046844088, 'environ'),
   (0.045141548, 'support'),
   (0.037409183, 'interact'),
   (0.03520138, 'virtual'),
   (0.03326227, 'model'),
   (0.031308163, 'framework'),
   (0.031144893, 'brazilian'),
   (0.029663369, 'collabor'),
   (0.029566228, 'architectur'),
   (0.02941902, 'awar'),
   (0.027589697, 'experi'),
   (0.026949156, 'base'),
   (0.025955357, 'contex

In [7]:
# Compute Perplexity
print('\nPerplexity: ', model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=model, texts=docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

INFO 09:58:32: -6.477 per-word bound, 89.1 perplexity estimate based on a held-out corpus of 8652 documents with 53291 words
INFO 09:58:32: using ParallelWordOccurrenceAccumulator(processes=3, batch_size=64) to estimate probabilities from sliding windows

Perplexity:  -6.477441019881129
INFO 09:58:33: serializing accumulator to return to master...
INFO 09:58:33: serializing accumulator to return to master...
INFO 09:58:33: serializing accumulator to return to master...
INFO 09:58:33: accumulator serialized
INFO 09:58:33: accumulator serialized
INFO 09:58:33: accumulator serialized
INFO 09:58:33: 3 accumulators retrieved from output queue
INFO 09:58:33: accumulated word occurrence stats for 8458 virtual documents

Coherence Score:  0.3607534164248959


In [8]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(model, corpus, dictionary)
vis

INFO 09:58:36: NumExpr defaulting to 4 threads.
