In [12]:
from gensim import corpora, models, similarities

from pprint import pprint
import logging

# Show timestamped INFO-level log records so the library's progress messages
# appear in the cell outputs below.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

In [2]:
# Load the previously saved token dictionary and the Matrix Market corpus from
# disk, then print the corpus summary.
# NOTE(review): the log output recorded for this cell references ../data/...
# while the code reads ../../data/... — confirm which relative path is correct
# for the notebook's current location and re-run.
dictionary = corpora.Dictionary.load('../../data/deerwester.dict')
corpus = corpora.MmCorpus('../../data/deerwester.mm')
print(corpus)

2018-06-23 16:08:07,564 : INFO : loading Dictionary object from ../data/deerwester.dict
2018-06-23 16:08:07,568 : INFO : loaded ../data/deerwester.dict
2018-06-23 16:08:07,572 : INFO : loaded corpus index from ../data/deerwester.mm.index
2018-06-23 16:08:07,573 : INFO : initializing cython corpus reader from ../data/deerwester.mm
2018-06-23 16:08:07,576 : INFO : accepted corpus with 9 documents, 12 features, 28 non-zero entries


MmCorpus(9 documents, 12 features, 28 non-zero entries)


In [3]:
# Fit a 2-topic LSI model on the corpus; id2word supplies the token mapping
# used when topics are reported as weighted words.
lsi = models.LsiModel(
    corpus,
    id2word=dictionary,
    num_topics=2,
)

2018-06-23 16:08:07,587 : INFO : using serial LSI version on this node
2018-06-23 16:08:07,591 : INFO : updating model with new documents
2018-06-23 16:08:07,594 : INFO : preparing a new chunk of documents
2018-06-23 16:08:07,597 : INFO : using 100 extra samples and 2 power iterations
2018-06-23 16:08:07,599 : INFO : 1st phase: constructing (12, 102) action matrix
2018-06-23 16:08:07,601 : INFO : orthonormalizing (12, 102) action matrix
2018-06-23 16:08:07,606 : INFO : 2nd phase: running dense svd on (12, 9) matrix
2018-06-23 16:08:07,609 : INFO : computing the final decomposition
2018-06-23 16:08:07,611 : INFO : keeping 2 factors (discarding 43.156% of energy spectrum)
2018-06-23 16:08:07,613 : INFO : processed documents up to #9
2018-06-23 16:08:07,615 : INFO : topic #0(3.341): 0.644*"system" + 0.404*"user" + 0.301*"eps" + 0.265*"time" + 0.265*"response" + 0.240*"computer" + 0.221*"human" + 0.206*"survey" + 0.198*"interface" + 0.036*"graph"
2018-06-23 16:08:07,617 : INFO : topic #1(2

### Similarities
Given a query document typed in by a user, how can we find the most similar documents in the corpus?

In [5]:
# Build the query vector: lowercase and whitespace-tokenize the text, convert
# the tokens with the dictionary's doc2bow, then project the result with the
# LSI model.
doc = "Human computer interaction"
query_tokens = doc.lower().split()
vec_bow = dictionary.doc2bow(query_tokens)
vec_lsi = lsi[vec_bow]
print(vec_lsi)

[(0, 0.461821004532716), (1, 0.07002766527900026)]


In [6]:
# Transform the corpus into LSI space and index it for similarity queries.
# num_features is passed explicitly (one feature per LSI topic) so that
# MatrixSimilarity does not need an extra pass over the transformed corpus
# just to infer the vector width.
index = similarities.MatrixSimilarity(lsi[corpus], num_features=lsi.num_topics)

2018-06-23 16:10:52,148 : INFO : creating matrix with 9 documents and 2 features


In [7]:
# Persist the similarity index to disk and immediately load it back from the
# same file.
# NOTE(review): the log output recorded for this cell references
# ../data/deerwester.index — confirm the relative path and re-run.
index_path = '../../data/deerwester.index'
index.save(index_path)
index = similarities.MatrixSimilarity.load(index_path)

2018-06-23 16:11:57,122 : INFO : saving MatrixSimilarity object under ../data/deerwester.index, separately None
2018-06-23 16:11:57,125 : INFO : saved ../data/deerwester.index
2018-06-23 16:11:57,126 : INFO : loading MatrixSimilarity object from ../data/deerwester.index
2018-06-23 16:11:57,127 : INFO : loaded ../data/deerwester.index


### Find the documents most similar to `vec_lsi`
Note:
The score is a cosine similarity, so `-1 <= x <= 1`; a greater value means the document is more similar to the query.

In [15]:
# Query the index with the LSI-space query vector and print the resulting
# similarity scores (one per indexed document, as the output below shows).
sims = index[vec_lsi]
print(sims)

[ 0.998093    0.93748635  0.9984453   0.9865886   0.90755945 -0.12416792
 -0.10639259 -0.09879463  0.05004177]


In [16]:
# Rank the documents from most to least similar as (document position, score)
# pairs and pretty-print the ranking.
# The scores are taken straight from the index instead of from the current
# value of `sims`, so this cell can be re-run safely: sorting an already
# sorted list of (position, score) tuples would fail when negating a tuple.
sims = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])
pprint(sims)

[(2, 0.9984453),
 (0, 0.998093),
 (3, 0.9865886),
 (1, 0.93748635),
 (4, 0.90755945),
 (8, 0.05004177),
 (7, -0.09879463),
 (6, -0.10639259),
 (5, -0.12416792)]
