# Similarity Queries

In [1]:
import logging

# Configure the root logger: INFO and above, each record rendered as
# "<timestamp> : <level> : <message>".
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [2]:
from gensim import corpora, models, similarities

# On-disk inputs for this notebook.
DICT_PATH = '/tmp/deerwester.dict'
CORPUS_PATH = '/tmp/deerwester.mm'  # comes from the first tutorial, "From strings to vectors"

# `dictionary` is used below for doc2bow() and as the model's id2word mapping.
dictionary = corpora.Dictionary.load(DICT_PATH)
corpus = corpora.MmCorpus(CORPUS_PATH)
print(corpus)

MmCorpus(9 documents, 12 features, 28 non-zero entries)


Build an LSI model that maps the corpus into a 2-D LSI space.

In [3]:
# Fit a two-topic LSI model on the corpus; `dictionary` is passed as the id2word mapping.
NUM_TOPICS = 2
lsi = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

Now search for "Human computer interaction": convert the query to a bag-of-words vector, then project it into the same LSI space.

In [4]:
doc = "Human computer interaction"

# Lowercase and whitespace-tokenize the query, then convert it to bag-of-words form.
query_tokens = doc.lower().split()
vec_bow = dictionary.doc2bow(query_tokens)
print(vec_bow)

# Convert the bag-of-words query to LSI space.
vec_lsi = lsi[vec_bow]
print(vec_lsi)

[(1, 1), (2, 1)]
[(0, 0.46182100453271535), (1, 0.070027665278999993)]


In [12]:
# Project every corpus document into the 2-D LSI space. `corpus_lsi` is defined
# here so the cell runs on a fresh kernel without relying on state from another cell.
corpus_lsi = lsi[corpus]

# Each projected document is a sparse list of (topic_id, weight) pairs, so look
# weights up by topic id rather than by position; a topic missing from a document
# counts as 0.0. The loop variable is not called `doc`, to avoid shadowing the
# query string defined earlier.
doc_points = [dict(doc_lsi) for doc_lsi in corpus_lsi]
x = [point.get(0, 0.0) for point in doc_points]  # weight on topic 0
y = [point.get(1, 0.0) for point in doc_points]  # weight on topic 1

In [11]:
import matplotlib.pyplot as plt

# The query is a sparse list of (topic_id, weight) pairs; look its two
# coordinates up by topic id so that a missing topic plots as 0.0 instead of
# raising IndexError or picking the wrong weight.
query_point = dict(vec_lsi)

fig, ax = plt.subplots()
ax.plot(x, y, 'bo', label='corpus documents')
ax.plot([query_point.get(0, 0.0)], [query_point.get(1, 0.0)], 'ro', label='query')
ax.axis([-3, 3, -3, 3])  # fixed window: [xmin, xmax, ymin, ymax]
ax.set_xlabel('LSI topic 0')
ax.set_ylabel('LSI topic 1')
ax.set_title('Corpus documents and query in 2-D LSI space')
ax.legend()
plt.show()

In [8]:
# Transform the corpus to LSI space and index it. Passing `num_features`
# explicitly (the model's topic count) spares MatrixSimilarity an extra scan of
# the corpus to infer the vector width.
index = similarities.MatrixSimilarity(lsi[corpus], num_features=lsi.num_topics)

In [9]:
# Round-trip the index through disk: save it, then load it back from the same path.
INDEX_PATH = '/tmp/deerwester.index'
index.save(INDEX_PATH)
index = similarities.MatrixSimilarity.load(INDEX_PATH)

Run the similarity query: compare the LSI-space query vector against every indexed document.

In [10]:
# Perform a similarity query against the corpus.
sims = index[vec_lsi]

# Print (document_number, document_similarity) 2-tuples.
doc_scores = list(enumerate(sims))
print(doc_scores)

[(0, 0.99809301), (1, 0.93748635), (2, 0.99844527), (3, 0.9865886), (4, 0.90755945), (5, -0.12416792), (6, -0.10639259), (7, -0.098794632), (8, 0.050041769)]


In [13]:
# Rank documents by decreasing similarity to the query. The scores are read
# from the index rather than from the current value of `sims`, so this cell is
# idempotent: re-running it no longer tries to sort its own already-sorted
# list of (document_number, similarity) tuples, which would raise a TypeError.
sims = sorted(enumerate(index[vec_lsi]), key=lambda item: item[1], reverse=True)
print(sims)

[(2, 0.99844527), (0, 0.99809301), (3, 0.9865886), (1, 0.93748635), (4, 0.90755945), (8, 0.050041769), (7, -0.098794632), (6, -0.10639259), (5, -0.12416792)]
