In [2]:
%matplotlib inline
import pandas as pd
import pyLDAvis.gensim
import warnings
import matplotlib.pyplot as plt
import re
import gensim
from gensim import corpora, models
import logging


# Notebook-wide setup: switch pyLDAvis into notebook mode, silence every
# warning category, and route log records (DEBUG and above) to a log file.
pyLDAvis.enable_notebook()
warnings.filterwarnings('ignore')

log_settings = {
    'filename': 'LDA/logs/glda_single_tweet.log',
    'level': logging.DEBUG,
    'format': '%(asctime)s %(message)s',
}
logging.basicConfig(**log_settings)

In [3]:
# Load the saved gensim token dictionary from disk and print its summary.
dict_path = 'LDA/data/single_tweet_doc_AmericanCrime_1000_dict.dict'
logging.debug("reading dictionary")
dictionary = corpora.Dictionary.load(dict_path)
print(dictionary)

Dictionary(13649 unique tokens: [u'abctvevent', u'yellow', u'jihad', u'hanging', u'woody']...)


In [4]:
# Open the serialized bag-of-words corpus (MmCorpus reader over the .mm file).
corpus = corpora.MmCorpus('LDA/data/single_tweet_doc_AmericanCrime_1000_corpus.mm')
# Print the corpus summary: number of documents, features and non-zero entries.
# (A bare `corpus[:5]` used to sit here; as a mid-cell expression its value was
# discarded and never displayed, so it has been removed.)
print(corpus)

MmCorpus(50951 documents, 13649 features, 447664 non-zero entries)


In [5]:
# LDA hyperparameters, kept in one dict so they are easy to find and tune.
# `random_state` fixes the seed used during training so that re-running the
# notebook from a fresh kernel reproduces the same topics.
lda_params = {'num_topics': 20, 'passes': 10, 'alpha': 0.1, 'random_state': 42}

# train the lda model
lda = models.LdaModel(corpus, id2word=dictionary,
                      num_topics=lda_params['num_topics'],
                      passes=lda_params['passes'],
                      alpha=lda_params['alpha'],
                      random_state=lda_params['random_state'])

# Last expression of the cell, so the returned topic descriptions are rendered
# as the cell output.
lda.show_topics()

[(3,
  u'0.140*rich + 0.134*environment + 0.132*acronym + 0.025*trump + 0.023*many + 0.016*significant + 0.015*people + 0.014*actually + 0.013*protect + 0.013*suffer'),
 (15,
  u'0.100*back + 0.034*follow + 0.030*tweet + 0.020*movie + 0.019*today + 0.019*party + 0.019*tonight + 0.016*great + 0.016*release + 0.012*america'),
 (5,
  u'0.113*wait + 0.107*second + 0.036*hush + 0.023*look + 0.017*school + 0.017*play + 0.016*black + 0.015*remember + 0.015*terminal + 0.015*significant'),
 (0,
  u'0.043*top + 0.039*high + 0.022*yes + 0.017*mean + 0.014*hell + 0.013*come + 0.013*list + 0.013*parent + 0.013*face + 0.012*country'),
 (9,
  u'0.032*always + 0.023*sucha + 0.020*terminal + 0.017*really + 0.015*child + 0.014*stuff + 0.013*look + 0.012*wish + 0.012*way + 0.012*seem'),
 (19,
  u'0.045*life + 0.036*happy + 0.035*real + 0.029*guy + 0.028*free + 0.026*think + 0.022*watch + 0.021*thought + 0.015*youtube + 0.015*gift'),
 (8,
  u'0.036*significant + 0.028*knowledge + 0.027*laugh + 0.024*see +

In [6]:
# Build the pyLDAvis data for the trained model; leaving the result as the
# last expression lets the notebook render it as the cell output.
lda_vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
lda_vis