In [1]:
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

In [2]:
from gensim.models.ldamodel import LdaModel

In [3]:
from gensim import corpora

In [4]:
import pandas as pd

In [5]:
docs = pd.read_csv('/home/irlab/Documents/dataset.csv')

In [6]:
from nltk.tokenize import word_tokenize

In [7]:
corpus = []
corpus = [word_tokenize(doc) for doc in docs.Header]

In [8]:
from nltk.corpus import stopwords

In [9]:
from nltk.stem.wordnet import WordNetLemmatizer

Preprocess Training document

In [10]:
lda_corpus = []
for docs in corpus:
    docs = [doc.lower() for doc in docs]
    docs = [doc for doc in docs if doc.isalpha()]
    docs = [doc for doc in docs if doc not in stopwords.words('english')]
    docs = [WordNetLemmatizer().lemmatize(doc) for doc in docs]
    #docs = [WordNetLemmatizer() for doc in docs]
    lda_corpus.append(docs)

Document After preprocessing

In [11]:
for doc in lda_corpus:
    print(doc)

['innovation', 'database', 'management', 'computer', 'science', 'engineering']
['high', 'performance', 'prime', 'field', 'multiplication', 'gpu']
['enchanted', 'scissors', 'scissor', 'interface', 'support', 'cutting', 'interactive', 'fabrication']
['detection', 'channel', 'degradation', 'attack', 'intermediary', 'node', 'linear', u'network']
['pinning', 'complex', 'network', 'betweenness', 'centrality', 'strategy']
['analysis', 'design', 'memoryless', 'interconnect', 'encoding', 'scheme']
['dynamic', 'bluescreens']
['quantitative', 'assured', 'forwarding', 'service']
['automatic', 'sanitization', 'social', 'network', 'data', 'prevent', 'inference', u'attack']
['radar', 'ranging', 'capability', 'human', 'body', 'monitoring', u'system']
['architecture', 'main', 'memory', 'system', 'gbps', 'operation']
['service', 'customization', 'via', 'houdini']
['business', 'policy', 'modeling', 'enforcement', u'database']
['high', 'speed', 'high', 'linearity', 'ota', 'power', 'supply', 'voltage']
['p

# Latent Dirichlet Allocation

Build dictionary of words to be used for analysis and convert it to bag of words

In [12]:
dictionary = Dictionary(lda_corpus)
corpus = [dictionary.doc2bow(doc) for doc in lda_corpus]

In [13]:
import pickle
pickle.dump(corpus,open('/home/irlab/Documents/corpus.pkl','wb'))
dictionary.save('/home/irlab/Documents/dictionary.gensim')

1. Train LDA model
2. Find top 10 topics
3. Topics will be represented in (topic_id,words_characterizing_topic) 

In [21]:
ldamodel = LdaModel(corpus,num_topics=10,id2word=dictionary,passes=15)
ldamodel.save('/home/irlab/Documents/ldamodel5.gensim')

In [22]:
topics = ldamodel.print_topics(num_words=5)

Print topic_id and words related to the topic

In [23]:
for topic in topics:
    print(topic)

(0, u'0.043*"query" + 0.022*"using" + 0.013*"synthesis" + 0.009*"noise" + 0.009*"device"')
(1, u'0.025*"system" + 0.021*"network" + 0.015*"power" + 0.014*"model" + 0.011*"amplifier"')
(2, u'0.021*"network" + 0.020*"data" + 0.015*"internet" + 0.013*"dynamic" + 0.012*"modeling"')
(3, u'0.051*"web" + 0.013*"search" + 0.013*"semantic" + 0.012*"classification" + 0.012*"automatic"')
(4, u'0.016*"approach" + 0.015*"method" + 0.015*"based" + 0.014*"web" + 0.010*"environment"')
(5, u'0.025*"algorithm" + 0.018*"using" + 0.012*"space" + 0.010*"object" + 0.009*"interaction"')
(6, u'0.017*"database" + 0.015*"based" + 0.013*"web" + 0.013*"ad" + 0.012*"system"')
(7, u'0.074*"network" + 0.039*"sensor" + 0.032*"wireless" + 0.023*"cmos" + 0.014*"mobile"')
(8, u'0.033*"design" + 0.017*"filter" + 0.016*"system" + 0.015*"implementation" + 0.012*"linear"')
(9, u'0.067*"data" + 0.028*"system" + 0.016*"management" + 0.013*"database" + 0.013*"network"')


Test data for testing LDA model

In [24]:
lda_test_corpus = []
test_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'
test_corpus = word_tokenize(test_doc)
test_corpus = [doc.lower() for doc in test_corpus]
test_corpus = [doc for doc in test_corpus if doc.isalpha()]
test_corpus = [doc for doc in test_corpus if doc not in stopwords.words('english')]
test_corpus = [WordNetLemmatizer().lemmatize(doc) for doc in test_corpus]
lda_test_corpus.append(test_corpus)

1. Print the bow of the test doc
2. Show affinity of the doc to a topic

In [25]:
test_corpus = [dictionary.doc2bow(doc) for doc in lda_test_corpus]
test_corpus = test_corpus[0]
print(test_corpus)
print(ldamodel.get_document_topics(test_corpus))

[(133, 1), (248, 1), (470, 1), (617, 1), (2000, 1)]
[(0, 0.016666668), (1, 0.016666668), (2, 0.20270884), (3, 0.016667383), (4, 0.01666854), (5, 0.016671354), (6, 0.23667616), (7, 0.016671246), (8, 0.44393015), (9, 0.01667303)]


# Visualization of the topics

In [27]:
import pyLDAvis.gensim

1. Saved model in ldamodel5.gensim
2. Saved dictionary in dictionary.gensim
3. Saved corpus in corpus.pkl

In [30]:
v_dictionary = dictionary.load('/home/irlab/Documents/dictionary.gensim')
v_corpus = pickle.load(open('/home/irlab/Documents/corpus.pkl'))
v_lda = ldamodel.load('/home/irlab/Documents/ldamodel5.gensim')

In [32]:
lda_display = pyLDAvis.gensim.prepare(v_lda,v_corpus,v_dictionary,s)
pyLDAvis.display(lda_display)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))
