In [1]:
import os
import logging

from gensim import corpora, models, similarities

# Show gensim's progress messages: INFO level and above, each line prefixed
# with a timestamp and the record's level name.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

In [2]:
# Reuse the dictionary and the Matrix Market corpus saved by the first tutorial.
# Both files are loaded below, so both must exist before loading is attempted;
# checking only the dictionary would let a missing corpus file fail inside
# MmCorpus instead of reaching the hint in the else branch.
dict_path = '../../data/deerwester.dict'
corpus_path = '../../data/deerwester.mm'

if os.path.exists(dict_path) and os.path.exists(corpus_path):
    dictionary = corpora.Dictionary.load(dict_path)
    corpus = corpora.MmCorpus(corpus_path)
    print('Used files from first tutorial')
else:
    # NOTE: `dictionary` and `corpus` stay undefined here, so the cells below
    # cannot run until the first tutorial has produced the data files.
    print('Please go through first tutorial to create initial dataset')

2018-06-23 12:48:46,328 : INFO : loading Dictionary object from ../data/deerwester.dict
2018-06-23 12:48:46,331 : INFO : loaded ../data/deerwester.dict
2018-06-23 12:48:46,334 : INFO : loaded corpus index from ../data/deerwester.mm.index
2018-06-23 12:48:46,334 : INFO : initializing cython corpus reader from ../data/deerwester.mm
2018-06-23 12:48:46,338 : INFO : accepted corpus with 9 documents, 12 features, 28 non-zero entries


Used files from first tutorial


In [3]:
# Fit a TF-IDF model: this pass over the corpus collects document frequencies
# and derives the IDF weights (see the log output below).
tfidf = models.TfidfModel(corpus=corpus)

2018-06-23 12:48:46,348 : INFO : collecting document frequencies
2018-06-23 12:48:46,350 : INFO : PROGRESS: processing document #0
2018-06-23 12:48:46,354 : INFO : calculating IDF weights for 9 documents and 11 features (28 matrix non-zeros)


In [4]:
# A tiny hand-made document: ids 0 and 1, each paired with the value 1.
doc_bow = [(token_id, 1) for token_id in range(2)]
# Indexing the model with a single document transforms just that document.
print(tfidf[doc_bow])

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


### Transformation
Generally, the model won't transform the whole corpus when you call `model[corpus]`. Instead, it returns a lazy wrapper that computes and yields the transformed documents one at a time as you iterate over it.

In [5]:
# Wrap the whole corpus in the TF-IDF transformation, then walk through it
# and show every transformed document.
corpus_tfidf = tfidf[corpus]
for tfidf_doc in corpus_tfidf:
    print(tfidf_doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


### Chaining Transformations
Transformations can be chained: the output of one transformation can be fed as the input to the next.

In [6]:
# Train a 2-topic LSI model on top of the TF-IDF corpus, then stack the LSI
# transformation over it: bag-of-words -> TF-IDF -> LSI.
lsi = models.LsiModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=2,
)
corpus_lsi = lsi[corpus_tfidf]

2018-06-23 12:48:46,385 : INFO : using serial LSI version on this node
2018-06-23 12:48:46,388 : INFO : updating model with new documents
2018-06-23 12:48:46,390 : INFO : preparing a new chunk of documents
2018-06-23 12:48:46,393 : INFO : using 100 extra samples and 2 power iterations
2018-06-23 12:48:46,394 : INFO : 1st phase: constructing (12, 102) action matrix
2018-06-23 12:48:46,397 : INFO : orthonormalizing (12, 102) action matrix
2018-06-23 12:48:46,401 : INFO : 2nd phase: running dense svd on (12, 9) matrix
2018-06-23 12:48:46,403 : INFO : computing the final decomposition
2018-06-23 12:48:46,406 : INFO : keeping 2 factors (discarding 47.565% of energy spectrum)
2018-06-23 12:48:46,408 : INFO : processed documents up to #9
2018-06-23 12:48:46,410 : INFO : topic #0(1.594): 0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"
2018-06-23 12:48:46,411 : INFO : topic #

In [7]:
# Log both topics and display them as (topic id, weighted-word string) pairs.
lsi.print_topics(num_topics=2)

2018-06-23 12:48:46,427 : INFO : topic #0(1.594): 0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"
2018-06-23 12:48:46,430 : INFO : topic #1(1.476): -0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"


[(0,
  '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

In [8]:
# Show each document's position in the 2-topic LSI space.
for lsi_doc in corpus_lsi:
    print(lsi_doc)

[(0, 0.0660078339609041), (1, -0.5200703306361846)]
[(0, 0.19667592859142632), (1, -0.7609563167700043)]
[(0, 0.08992639972446576), (1, -0.7241860626752504)]
[(0, 0.07585847652178265), (1, -0.6320551586003423)]
[(0, 0.10150299184980227), (1, -0.5737308483002952)]
[(0, 0.7032108939378308), (1, 0.16115180214025907)]
[(0, 0.8774787673119828), (1, 0.16758906864659556)]
[(0, 0.9098624686818574), (1, 0.14086553628719167)]
[(0, 0.6165825350569281), (1, -0.053929075663892684)]


In [9]:
# Round-trip the trained model through disk: save it, then load it back
# from the same location.
lsi_path = '../../data/model.lsi'
lsi.save(lsi_path)
lsi = models.LsiModel.load(lsi_path)

2018-06-23 12:48:46,499 : INFO : saving Projection object under ../data/model.lsi.projection, separately None
2018-06-23 12:48:46,503 : INFO : saved ../data/model.lsi.projection
2018-06-23 12:48:46,506 : INFO : saving LsiModel object under ../data/model.lsi, separately None
2018-06-23 12:48:46,508 : INFO : not storing attribute projection
2018-06-23 12:48:46,510 : INFO : not storing attribute dispatcher
2018-06-23 12:48:46,512 : INFO : saved ../data/model.lsi
2018-06-23 12:48:46,514 : INFO : loading LsiModel object from ../data/model.lsi
2018-06-23 12:48:46,517 : INFO : loading id2word recursively from ../data/model.lsi.id2word.* with mmap=None
2018-06-23 12:48:46,519 : INFO : setting ignored attribute projection to None
2018-06-23 12:48:46,521 : INFO : setting ignored attribute dispatcher to None
2018-06-23 12:48:46,523 : INFO : loaded ../data/model.lsi
2018-06-23 12:48:46,526 : INFO : loading LsiModel object from ../data/model.lsi.projection
2018-06-23 12:48:46,528 : INFO : loaded ..

In [10]:
# LDA training is randomized, so an unseeded run assigns documents to different
# topics every time the notebook is executed. Pin the seed so that
# Restart & Run All gives repeatable results.
LDA_SEED = 42

# Train directly on the bag-of-words corpus with 100 topics.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=100, random_state=LDA_SEED)
corpus_lda = lda[corpus]

# Each printed document is a list of (topic id, probability) pairs.
for doc in corpus_lda:
    print(doc)

2018-06-23 12:48:46,541 : INFO : using symmetric alpha at 0.01
2018-06-23 12:48:46,544 : INFO : using symmetric eta at 0.01
2018-06-23 12:48:46,548 : INFO : using serial LDA version on this node
2018-06-23 12:48:46,551 : INFO : running online (single-pass) LDA training, 100 topics, 1 passes over the supplied corpus of 9 documents, updating model once every 9 documents, evaluating perplexity every 9 documents, iterating 50x with a convergence threshold of 0.001000
2018-06-23 12:48:46,562 : INFO : -124.615 per-word bound, 32566585683365025254941702810102661120.0 perplexity estimate based on a held-out corpus of 9 documents with 29 words
2018-06-23 12:48:46,563 : INFO : PROGRESS: pass 0, at document #9/9
2018-06-23 12:48:46,570 : INFO : topic #80 (0.010): 0.083*"user" + 0.083*"system" + 0.083*"graph" + 0.083*"trees" + 0.083*"eps" + 0.083*"computer" + 0.083*"time" + 0.083*"interface" + 0.083*"response" + 0.083*"human"
2018-06-23 12:48:46,572 : INFO : topic #75 (0.010): 0.083*"user" + 0.083

[(60, 0.7525)]
[(8, 0.85857147)]
[(73, 0.80200005)]
[(73, 0.80200005)]
[(69, 0.7525)]
[(2, 0.505)]
[(92, 0.67)]
[(55, 0.7525)]
[(93, 0.7525)]


In [11]:
# HDP inference is randomized as well; pin the seed so that re-running the
# notebook gives repeatable topic weights.
HDP_SEED = 42

# Unlike LDA above, no topic count is passed to the HDP model.
hdp = models.HdpModel(corpus, id2word=dictionary, random_state=HDP_SEED)
corpus_hdp = hdp[corpus]

# Each printed document is a list of (topic id, weight) pairs.
for doc in corpus_hdp:
    print(doc)

2018-06-23 12:48:46,658 : INFO : (0, '0.244*interface + 0.181*trees + 0.147*user + 0.079*system + 0.066*survey + 0.060*computer + 0.054*time + 0.045*response + 0.042*graph + 0.041*human')
2018-06-23 12:48:46,664 : INFO : (1, '0.539*human + 0.131*computer + 0.065*user + 0.051*system + 0.051*graph + 0.050*trees + 0.042*survey + 0.026*eps + 0.016*time + 0.015*minors')
2018-06-23 12:48:46,666 : INFO : (2, '0.202*minors + 0.144*graph + 0.136*user + 0.112*time + 0.096*survey + 0.091*eps + 0.081*response + 0.049*trees + 0.043*human + 0.032*system')
2018-06-23 12:48:46,667 : INFO : (3, '0.331*response + 0.190*minors + 0.115*user + 0.079*interface + 0.073*system + 0.056*time + 0.051*survey + 0.044*graph + 0.034*computer + 0.015*trees')
2018-06-23 12:48:46,669 : INFO : (4, '0.202*graph + 0.188*system + 0.172*survey + 0.143*interface + 0.100*computer + 0.053*response + 0.044*time + 0.033*trees + 0.025*eps + 0.021*user')
2018-06-23 12:48:46,671 : INFO : (5, '0.206*interface + 0.154*time + 0.144*us

[(0, 0.3708519919537779), (1, 0.4884949698748099), (2, 0.035486426129223016), (3, 0.0265292697860892), (4, 0.019880416242032052), (5, 0.014943835776605754), (6, 0.011211220511232587)]
[(0, 0.8922968668186962), (1, 0.027221409126523554), (2, 0.02037429719776326), (3, 0.015171615527825266), (4, 0.011360423801781979)]
[(0, 0.849480804333551), (1, 0.0378882171609596), (2, 0.02849628147093631), (3, 0.021224801370406252), (4, 0.01590435739430635), (5, 0.011955066052224336)]
[(0, 0.05588048362490624), (1, 0.8314413298630288), (2, 0.028540219078447546), (3, 0.02122751826631778), (4, 0.015904912858907168), (5, 0.011955064060721287)]
[(0, 0.8118508868456358), (1, 0.047186784568816385), (2, 0.035756120838126154), (3, 0.026568944838739884), (4, 0.01988032983961449), (5, 0.014943842801397777), (6, 0.011211220541425352)]
[(0, 0.6241524518854794), (1, 0.0944785315487649), (2, 0.07104544730168587), (3, 0.05304966763682866), (4, 0.039760055937439254), (5, 0.029887665241162632), (6, 0.022422440996214387