In [14]:
import pandas as pd
import pickle
import nltk
from gensim import corpora, models
import pyLDAvis.gensim

import warnings
warnings.filterwarnings("ignore")

In [2]:
df_vec=pd.read_pickle('assets/df_vec.pkl')

In [3]:
# code help from J's lesson LDA_in_class
texts = [df_vec.columns[df_vec.loc[index,:].to_numpy().nonzero()] for index in df_vec.index]
dictionary = corpora.Dictionary(texts)      
corpus = [dictionary.doc2bow(text) for text in texts] 

In [4]:
pickle.dump(corpus, open('assets/corpus.pkl', 'wb'))
dictionary.save('assets/dictionary.gensim')

In [5]:
texts[0]

Index(['cookson eisenack', 'dinoflagel cyst', 'geolog societi',
       'geolog survei', 'lentin william', 'lower cretac', 'north sea',
       'shown figur', 'societi london', 'stover evitt'],
      dtype='object')

# Latent Dirichlet Allocation

In [6]:
lda_model = models.ldamodel.LdaModel(corpus,
                                            num_topics=6,
                                            id2word=dictionary,
                                            passes=20,
                                           eval_every = 5,
                                           decay = .4,
                                           offset = 60.0,
                                            minimum_probability = 0.05
                                           )
lda_model.save('assets/gensim_lda_model.gensim')

## Taking a closer look at the topics

In [8]:
topics = lda_model.print_topics(num_words=3)
for topic in topics:
    print(topic)

(0, '0.031*"deep sea" + 0.027*"earli eocen" + 0.027*"paleocen eocen"')
(1, '0.037*"geolog societi" + 0.029*"geolog survei" + 0.028*"special public"')
(2, '0.023*"gulf coast" + 0.020*"geolog societi" + 0.019*"coastal plain"')
(3, '0.049*"spore pollen" + 0.048*"pollen spore" + 0.042*"dinoflagel cyst"')
(4, '0.027*"sedimentari structur" + 0.025*"grain size" + 0.024*"new york"')
(5, '0.070*"palaeoclimatolog palaeoecolog" + 0.068*"palaeogeographi palaeoclimatolog" + 0.046*"geol soc"')


## Scoring

In [9]:
from gensim.models.coherencemodel import CoherenceModel

coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=texts,
                                     dictionary=dictionary,
                                     coherence='c_v')

coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)   # higher is better
print('Perplexity:', lda_model.log_perplexity(corpus))  #lower is better

Coherence Score:  0.7529275071597289
Perplexity: -4.9752722579859


## Visualize LDA model in pyLDAvis

In [15]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)

In [16]:
#save viz to output file
pyLDAvis.save_html((pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)), 'pyldavis_output.html' )