In [None]:
# IPython line magic: render matplotlib figures inline in the notebook output.
%matplotlib inline


Similarity Queries
==================

Demonstrates querying a corpus for similar documents.



In [1]:
import logging

# Show INFO-level log records, each prefixed with a timestamp and severity,
# so the library's progress messages appear in the cell outputs.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

Creating the Corpus
-------------------


In [2]:
from collections import defaultdict
from gensim import corpora

documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Step 1: lowercase every document, split it on whitespace and discard stop words.
stoplist = set('for a of the and to in'.split())
texts = []
for document in documents:
    kept_words = [word for word in document.lower().split() if word not in stoplist]
    texts.append(kept_words)

# Step 2: count how many times each remaining token occurs across the whole corpus.
frequency = defaultdict(int)
for token in (tok for text in texts for tok in text):
    frequency[token] += 1

# Step 3: keep only the tokens that occur more than once corpus-wide.
texts = [[token for token in text if frequency[token] > 1] for text in texts]

# Build the token dictionary from the filtered texts, then convert each text
# to its bag-of-words representation with that dictionary.
dictionary = corpora.Dictionary(texts)
corpus = []
for text in texts:
    corpus.append(dictionary.doc2bow(text))

2024-01-08 14:16:26,428 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-01-08 14:16:26,429 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)
2024-01-08 14:16:26,429 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)", 'datetime': '2024-01-08T14:16:26.429829', 'gensim': '4.3.2', 'python': '3.10.11 | packaged by conda-forge | (main, May 10 2023, 19:07:22) [Clang 14.0.6 ]', 'platform': 'macOS-14.1.2-x86_64-i386-64bit', 'event': 'created'}


Similarity interface
--------------------

In [3]:
from gensim import models

# Fit an LSI model on the bag-of-words corpus; num_topics=2 gives a
# two-dimensional LSI space, i.e. two topics.
lsi = models.LsiModel(
    corpus,
    id2word=dictionary,
    num_topics=2,
)

2024-01-08 14:16:59,416 : INFO : using serial LSI version on this node
2024-01-08 14:16:59,417 : INFO : updating model with new documents
2024-01-08 14:16:59,418 : INFO : preparing a new chunk of documents
2024-01-08 14:16:59,419 : INFO : using 100 extra samples and 2 power iterations
2024-01-08 14:16:59,419 : INFO : 1st phase: constructing (12, 102) action matrix
2024-01-08 14:16:59,421 : INFO : orthonormalizing (12, 102) action matrix
2024-01-08 14:16:59,424 : INFO : 2nd phase: running dense svd on (12, 9) matrix
2024-01-08 14:16:59,425 : INFO : computing the final decomposition
2024-01-08 14:16:59,426 : INFO : keeping 2 factors (discarding 43.156% of energy spectrum)
2024-01-08 14:16:59,427 : INFO : processed documents up to #9
2024-01-08 14:16:59,428 : INFO : topic #0(3.341): 0.644*"system" + 0.404*"user" + 0.301*"eps" + 0.265*"time" + 0.265*"response" + 0.240*"computer" + 0.221*"human" + 0.206*"survey" + 0.198*"interface" + 0.036*"graph"
2024-01-08 14:16:59,428 : INFO : topic #1(2

- 就本教程而言，只需了解有关 LSI 的两件事：
- 首先，它只是另一种变换：将向量从一个空间变换到另一个空间。
- 其次，LSI 的好处在于能够识别术语（在我们的例子中是文档中的单词）与主题之间的模式和关系。
- 我们的 LSI 空间是二维的（`num_topics = 2`），因此有两个主题。

In [10]:
doc = "Human computer interaction"

# Lowercase the query and split it on whitespace before building its bag-of-words vector.
query_tokens = doc.lower().split()
vec_bow = dictionary.doc2bow(query_tokens)

# Convert the bag-of-words query to LSI space.
vec_lsi = lsi[vec_bow]
print(vec_lsi)

[(0, 0.4618210045327148), (1, -0.07002766527899958)]


### Initializing query structures

In [7]:
from gensim import similarities

# Transform the whole corpus to LSI space, then index the transformed vectors.
corpus_lsi = lsi[corpus]
index = similarities.MatrixSimilarity(corpus_lsi)
index

2024-01-08 14:31:12,928 : INFO : creating matrix with 9 documents and 2 features


<gensim.similarities.docsim.MatrixSimilarity at 0x1b62bf3d0>

<div class="alert alert-danger"><h4>Warning</h4><p>
仅当整个向量集能够放入内存时，类 similarities.MatrixSimilarity 才适用。例如，使用此类时，一个包含 100 万个文档的语料库在 256 维 LSI 空间中将需要 2GB 内存。
</p>
<p>
如果没有 2GB 的可用内存，您将需要使用 similarities.Similarity 类。此类通过将索引拆分到磁盘上的多个文件（称为分片）中，从而在固定大小的内存中运行。它在内部使用 similarities.MatrixSimilarity 和 similarities.SparseMatrixSimilarity，因此速度仍然很快，只是稍微复杂一些。
</p>
</div>

In [None]:
# Index persistence is handled through the standard save() and load() functions.
# This holds for all similarity index classes (similarities.Similarity,
# similarities.MatrixSimilarity and similarities.SparseMatrixSimilarity).
# NOTE(review): the location below is a hard-coded absolute path under /tmp —
# confirm it exists and is writable on the platform running this notebook.
index_path = '/tmp/deerwester.index'
index.save(index_path)
index = similarities.MatrixSimilarity.load(index_path)

### Performing queries

In [11]:
# Query the index with the LSI-space query vector; the cell's value is the
# array of similarities, one entry per indexed document.
index[vec_lsi]

array([ 0.998093  ,  0.93748635,  0.9984453 ,  0.9865886 ,  0.90755945,
       -0.12416792, -0.10639259, -0.09879464,  0.05004176], dtype=float32)

In [19]:
# Perform a similarity query against the corpus.
sims = index[vec_lsi]

# Print (document_number, document_similarity) 2-tuples.
print([(doc_number, similarity) for doc_number, similarity in enumerate(sims)])

# The cosine measure returns similarities in the range <-1, 1> (the greater,
# the more similar), so the first document has a score of 0.998093.

[(0, 0.998093), (1, 0.93748635), (2, 0.9984453), (3, 0.9865886), (4, 0.90755945), (5, -0.12416792), (6, -0.10639259), (7, -0.09879464), (8, 0.050041765)]


In [22]:
# Sort the similarities in descending order to obtain the final answer to the
# query "Human computer interaction".
sorted_sims = list(enumerate(sims))
sorted_sims.sort(key=lambda pair: -pair[1])  # negated score => highest similarity first

for doc_position, doc_score in sorted_sims:
    print(doc_score, documents[doc_position])

0.9984453 The EPS user interface management system
0.998093 Human machine interface for lab abc computer applications
0.9865886 System and human system engineering testing of EPS
0.93748635 A survey of user opinion of computer system response time
0.90755945 Relation of user perceived response time to error measurement
0.050041765 Graph minors A survey
-0.09879464 Graph minors IV Widths of trees and well quasi ordering
-0.10639259 The intersection graph of paths in trees
-0.12416792 The generation of random binary unordered trees


如果使用标准的布尔全文检索（standard boolean fulltext search）方法，那么 ``The EPS user interface management system`` 和 ``Relation of user perceived response time to error measurement`` 根本不会被检索到，更不可能得分这么高，因为它们与查询文本 ``"Human computer interaction"`` 没有任何共同的单词。然而，在应用 LSI 后，我们可以观察到它们都获得了相当高的相似度分数。事实上，这种语义泛化能力正是我们当初应用变换并进行主题建模的原因。