# Topics and Transformations

In [1]:
import os
import logging
# Configure the root logger: show INFO and above, each record rendered as
# "<timestamp> : <level name> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
from gensim import corpora, models, similarities
# Load the dictionary and the Matrix Market corpus written by the first
# tutorial. Both files are loaded below, so both must be checked; testing only
# the dictionary would let a missing corpus file fail inside MmCorpus instead.
dict_path = '/tmp/deerwester.dict'
corpus_path = '/tmp/deerwester.mm'
if os.path.exists(dict_path) and os.path.exists(corpus_path):
    dictionary = corpora.Dictionary.load(dict_path)
    corpus = corpora.MmCorpus(corpus_path)
    print("Used files generated from first tutorial")
else:
    # Fail fast with a clear message: merely printing would leave `dictionary`
    # and `corpus` unbound, and the following cells would then stop with an
    # unrelated-looking NameError.
    raise IOError("Please run first tutorial to generate data set")

Used files generated from first tutorial


In [3]:
# Step 1 -- initialize a model: build the TF-IDF transformation from `corpus`.
tfidf = models.TfidfModel(corpus=corpus)

In [4]:
# Step 2 -- use the model to transform vectors.
# `doc_bow` is a small hand-written bag-of-words vector of (id, count) pairs.
doc_bow = [(0, 1), (1, 1)]
doc_tfidf = tfidf[doc_bow]
print(doc_tfidf)

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


In [5]:
# Apply the TF-IDF model to the whole corpus, then print every transformed
# document, one per line.
corpus_tfidf = tfidf[corpus]
for tfidf_doc in corpus_tfidf:
    print(tfidf_doc)

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(1, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.44424552527467476), (6, 0.3244870206138555), (7, 0.3244870206138555)]
[(0, 0.5710059809418182), (6, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(2, 0.49182558987264147), (6, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (4, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(5, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


In [6]:
# Initialize an LSI transformation with two topics on top of the TF-IDF corpus;
# `dictionary` supplies the id -> word mapping.
lsi = models.LsiModel(corpus=corpus_tfidf, num_topics=2, id2word=dictionary)

# Create a double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi.
corpus_lsi = lsi[corpus_tfidf]

In [7]:
# Show both LSI topics; the cell's last expression is displayed as its output.
lsi.print_topics(num_topics=2)

[(0,
  u'0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"time" + 0.060*"response" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  u'-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

In [8]:
# Both the bow -> tfidf and the tfidf -> lsi transformations are actually
# executed here, on the fly, as the wrapped corpus is iterated.
for lsi_doc in corpus_lsi:
    print(lsi_doc)

[(0, 0.066007833960903678), (1, -0.52007033063618491)]
[(0, 0.19667592859142555), (1, -0.76095631677000453)]
[(0, 0.08992639972446366), (1, -0.72418606267525076)]
[(0, 0.075858476521780932), (1, -0.63205515860034289)]
[(0, 0.10150299184980174), (1, -0.57373084830029553)]
[(0, 0.70321089393783076), (1, 0.16115180214025748)]
[(0, 0.87747876731198282), (1, 0.16758906864659379)]
[(0, 0.90986246868185738), (1, 0.14086553628718995)]
[(0, 0.61658253505692806), (1, -0.053929075663893648)]


In [9]:
# Persist the LSI model to disk and read it straight back
# (same for tfidf, lda, ...).
lsi_path = '/tmp/model.lsi'
lsi.save(lsi_path)
lsi = models.LsiModel.load(lsi_path)

In [14]:
# Weight of every document on LSI topic 0 (used as the x coordinate).
# Look the topic up by id instead of by list position: each transformed
# document is a sparse list of (topic_id, weight) pairs, and gensim may omit
# topics whose weight is (near) zero, so the first entry is not guaranteed to
# belong to topic 0. A missing topic counts as weight 0.0.
x = [dict(doc).get(0, 0.0) for doc in corpus_lsi]
x

[0.066007833960903678,
 0.19667592859142555,
 0.08992639972446366,
 0.075858476521780932,
 0.10150299184980174,
 0.70321089393783076,
 0.87747876731198282,
 0.90986246868185738,
 0.61658253505692806]

In [13]:
# Weight of every document on LSI topic 1 (used as the y coordinate).
# Look the topic up by id instead of by list position: each transformed
# document is a sparse list of (topic_id, weight) pairs, and gensim may omit
# topics whose weight is (near) zero. Positional access could then pick the
# wrong topic or raise IndexError when only one entry is present. A missing
# topic counts as weight 0.0.
y = [dict(doc).get(1, 0.0) for doc in corpus_lsi]
y

[-0.52007033063618491,
 -0.76095631677000453,
 -0.72418606267525076,
 -0.63205515860034289,
 -0.57373084830029553,
 0.16115180214025748,
 0.16758906864659379,
 0.14086553628718995,
 -0.053929075663893648]

In [17]:
import matplotlib.pyplot as plt
# Scatter the documents in the 2-D LSI space, one red dot per document.
# Uses an explicit figure/axes pair and labels the plot so that it can be read
# on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(x, y, 'ro')
ax.axis([0, 1, -1, 1])  # fixed window: x in [0, 1], y in [-1, 1]
ax.set_xlabel('LSI topic 0')
ax.set_ylabel('LSI topic 1')
ax.set_title('Documents in the two-topic LSI space')
plt.show()