In [1]:
%matplotlib inline


# Topics and Transformations

In [2]:
import logging

# Send INFO-level (and above) log records to the notebook output, formatted as
# "timestamp : level : message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In this tutorial, I will show how to transform documents from one vector representation
into another. This process serves two goals:

1. To bring out hidden structure in the corpus, discover relationships between
   words and use them to describe the documents in a new and
   (hopefully) more semantic way.
2. To make the document representation more compact. This improves both efficiency
   (the new representation consumes fewer resources) and efficacy (marginal data
   trends are ignored, which reduces noise).

## Creating the Corpus



In [2]:
from collections import defaultdict
from gensim import corpora

# Toy corpus: nine short document titles, one string per document.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Lowercase each document, split it on whitespace and drop the stop words.
stoplist = set('for a of the and to in'.split())
texts = []
for document in documents:
    texts.append([word for word in document.lower().split() if word not in stoplist])

# Count how often every token occurs across the whole corpus.
frequency = defaultdict(int)
for token in (word for text in texts for word in text):
    frequency[token] += 1

# Keep only the tokens that occur more than once in the corpus.
repeated_tokens = {token for token, count in frequency.items() if count > 1}
texts = [[token for token in text if token in repeated_tokens] for text in texts]

# Build the token <-> integer id mapping from the filtered texts, then encode
# every document as a bag-of-words vector with that mapping.
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

### Creating a transformation

In [3]:
# step 1 -- initialize a model
from gensim import models

# Build the Tf-Idf transformation from the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

### Transforming vectors

- tfidf 被视为只读对象，可用于将任何向量从旧表示（词袋整数计数）转换为新表示（TfIdf 实值权重）

- Once the transformation model has been initialized, it can be used on any vectors (provided they come from the same vector space, of course), even if they were not used in the training corpus at all. This is achieved by a process called folding-in for LSA, by topic inference for LDA etc.

In [4]:
# step 2 -- use the model to transform vectors
# A single bag-of-words document: token ids 0 and 1, each with a count of 1.
doc_bow = [(0, 1), (1, 1)]
# Indexing the model with a vector yields that vector in the Tf-Idf representation.
print(tfidf[doc_bow])

[(0, 0.7071067811865476), (1, 0.7071067811865476)]


In [5]:
# Apply the Tf-Idf transformation to the whole corpus and print one document per line.
corpus_tfidf = tfidf[corpus]
for tfidf_doc in corpus_tfidf:
    print(tfidf_doc)

# Contents of the tokenized corpus `texts`:
#   [['human', 'interface', 'computer'],
#    ['survey', 'user', 'computer', 'system', 'response', 'time'],
#    ['eps', 'user', 'interface', 'system'],
#    ['system', 'human', 'system', 'eps'],
#    ['user', 'response', 'time'],
#    ['trees'],
#    ['graph', 'trees'],
#    ['graph', 'minors', 'trees'],
#    ['graph', 'minors', 'survey']]

# Word vector `dictionary`, listed as  id: token: frequency
#    00: 'computer': 2,
#    01: 'human': 2,
#    02: 'interface': 2,
#    03: 'response': 2,
#    04: 'survey': 2,
#    05: 'system': 4,
#    06: 'time': 2,
#    07: 'user': 3,
#    08: 'eps': 2,
#    09: 'trees': 3,
#    10: 'graph': 3,
#    11: 'minors': 2

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]
[(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), (6, 0.44424552527467476), (7, 0.3244870206138555)]
[(2, 0.5710059809418182), (5, 0.4170757362022777), (7, 0.4170757362022777), (8, 0.5710059809418182)]
[(1, 0.49182558987264147), (5, 0.7184811607083769), (8, 0.49182558987264147)]
[(3, 0.6282580468670046), (6, 0.6282580468670046), (7, 0.45889394536615247)]
[(9, 1.0)]
[(9, 0.7071067811865475), (10, 0.7071067811865475)]
[(9, 0.5080429008916749), (10, 0.5080429008916749), (11, 0.695546419520037)]
[(4, 0.6282580468670046), (10, 0.45889394536615247), (11, 0.6282580468670046)]


### 注意

- 调用 model[corpus] 仅在旧的 corpus 文档流周围创建一个包装器 - 实际转换是在文档迭代期间即时完成的
- 我们无法在调用 corpus_transformed = model[corpus] 时转换整个语料库，因为这意味着将结果存储在主内存中，这与 gensim 内存独立的目标相矛盾
- 如果您想多次迭代转换 corpus_transformed (注意转换成本高昂)请首先将生成的语料库序列化到磁盘，然后继续使用它

#### 生成2个主题的LSI模型

In [10]:
# Transformations can also be chained, one on top of another.
# Here Latent Semantic Indexing maps the Tf-Idf corpus into a latent 2-D space
# (2-D because we set num_topics=2).
lsi_model = models.LsiModel(
    corpus_tfidf,
    id2word=dictionary,
    num_topics=2,
)

# Double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi.
corpus_lsi = lsi_model[corpus_tfidf]

2023-12-29 19:50:13,912 : INFO : using serial LSI version on this node
2023-12-29 19:50:13,913 : INFO : updating model with new documents
2023-12-29 19:50:13,915 : INFO : preparing a new chunk of documents
2023-12-29 19:50:13,917 : INFO : using 100 extra samples and 2 power iterations
2023-12-29 19:50:13,918 : INFO : 1st phase: constructing (12, 102) action matrix
2023-12-29 19:50:13,922 : INFO : orthonormalizing (12, 102) action matrix
2023-12-29 19:50:13,930 : INFO : 2nd phase: running dense svd on (12, 9) matrix
2023-12-29 19:50:13,934 : INFO : computing the final decomposition
2023-12-29 19:50:13,936 : INFO : keeping 2 factors (discarding 47.565% of energy spectrum)
2023-12-29 19:50:13,937 : INFO : processed documents up to #9
2023-12-29 19:50:13,939 : INFO : topic #0(1.594): -0.703*"trees" + -0.538*"graph" + -0.402*"minors" + -0.187*"survey" + -0.061*"system" + -0.060*"time" + -0.060*"response" + -0.058*"user" + -0.049*"computer" + -0.035*"interface"
2023-12-29 19:50:13,940 : INFO

In [11]:
# The corpus can be split into 2 topics.
# "trees", "graph" and "minors" are related words tied to the first topic (#0); all other words relate to the second topic (#1).
# As expected, the first five documents are more strongly related to the second topic, while the remaining four are more strongly related to the first:
lsi_model.print_topics(2)

# Notes:
# 1. A positive value indicates positive correlation; a negative value indicates negative correlation.
# 2. The larger the magnitude of a value, the higher the word's weight in the topic.

2023-12-29 19:50:51,020 : INFO : topic #0(1.594): -0.703*"trees" + -0.538*"graph" + -0.402*"minors" + -0.187*"survey" + -0.061*"system" + -0.060*"time" + -0.060*"response" + -0.058*"user" + -0.049*"computer" + -0.035*"interface"
2023-12-29 19:50:51,022 : INFO : topic #1(1.476): -0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"


[(0,
  '-0.703*"trees" + -0.538*"graph" + -0.402*"minors" + -0.187*"survey" + -0.061*"system" + -0.060*"time" + -0.060*"response" + -0.058*"user" + -0.049*"computer" + -0.035*"interface"'),
 (1,
  '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"')]

In [12]:
# Iterating the wrapper is what actually runs both the bow->tfidf and the
# tfidf->lsi transformations, on the fly, one document at a time.
for lsi_doc, title in zip(corpus_lsi, documents):
    print(lsi_doc, title)

[(0, -0.06600783396090276), (1, -0.5200703306361842)] Human machine interface for lab abc computer applications
[(0, -0.19667592859142433), (1, -0.7609563167700051)] A survey of user opinion of computer system response time
[(0, -0.08992639972446302), (1, -0.7241860626752507)] The EPS user interface management system
[(0, -0.07585847652178028), (1, -0.6320551586003427)] System and human system engineering testing of EPS
[(0, -0.10150299184980069), (1, -0.573730848300296)] Relation of user perceived response time to error measurement
[(0, -0.7032108939378308), (1, 0.1611518021402568)] The generation of random binary unordered trees
[(0, -0.8774787673119832), (1, 0.16758906864659276)] The intersection graph of paths in trees
[(0, -0.9098624686818582), (1, 0.14086553628718887)] Graph minors IV Widths of trees and well quasi ordering
[(0, -0.6165825350569287), (1, -0.05392907566389443)] Graph minors A survey


#### 生成4个主题的LSI模型

In [None]:
# Initialize an LSI transformation over the Tf-Idf corpus with four topics.
lsi_model = models.LsiModel(
    corpus_tfidf,
    id2word=dictionary,
    num_topics=4,
)

In [17]:
# Show the four topics; each is reported as a weighted combination of words.
lsi_model.print_topics(4)

[(0,
  '0.703*"trees" + 0.538*"graph" + 0.402*"minors" + 0.187*"survey" + 0.061*"system" + 0.060*"response" + 0.060*"time" + 0.058*"user" + 0.049*"computer" + 0.035*"interface"'),
 (1,
  '-0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"time" + -0.320*"response" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"'),
 (2,
  '0.456*"time" + 0.456*"response" + -0.352*"eps" + -0.340*"human" + -0.318*"interface" + -0.277*"system" + 0.272*"survey" + 0.213*"user" + -0.183*"trees" + 0.114*"minors"'),
 (3,
  '-0.583*"trees" + 0.556*"minors" + 0.399*"survey" + 0.256*"graph" + -0.211*"time" + -0.211*"response" + -0.160*"user" + 0.081*"human" + 0.038*"interface" + 0.035*"system"')]

In [25]:
# Compute how the original corpus relates to each of the 4 topics.
# Double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi.
corpus_lsi = lsi_model[corpus_tfidf]
for lsi_doc in corpus_lsi:
    print(lsi_doc)

[(0, 0.06600783396090418), (1, -0.5200703306361848), (2, -0.37649581219168915), (3, 0.08615954148141182)]
[(0, 0.1966759285914244), (1, -0.7609563167700053), (2, 0.5080674581001665), (3, -0.03779696482160011)]
[(0, 0.08992639972446449), (1, -0.724186062675251), (2, -0.40898973155376395), (3, -0.015188930947954999)]
[(0, 0.07585847652178201), (1, -0.6320551586003428), (2, -0.5393533605733899), (3, 0.07827545681079975)]
[(0, 0.10150299184980077), (1, -0.5737308483002961), (2, 0.6709338585295913), (3, -0.33929517906883067)]
[(0, 0.7032108939378315), (1, 0.16115180214025768), (2, -0.18266089635241337), (3, -0.582595633708498)]
[(0, 0.8774787673119836), (1, 0.16758906864659381), (2, -0.10880822642632751), (3, -0.23079422499324245)]
[(0, 0.9098624686818579), (1, 0.14086553628718984), (2, 0.0008711787488690781), (3, 0.22107010276744518)]
[(0, 0.6165825350569278), (1, -0.053929075663894085), (2, 0.255686979595993), (3, 0.7179497748057158)]


In [18]:
# Same projection as a lazy wrapper, printed next to each document's original text.
corpus_lsi = lsi_model[corpus_tfidf]
for lsi_doc, title in zip(corpus_lsi, documents):
    print(lsi_doc, title)

[(0, 0.06600783396090418), (1, -0.5200703306361848), (2, -0.37649581219168915), (3, 0.08615954148141182)] Human machine interface for lab abc computer applications
[(0, 0.1966759285914244), (1, -0.7609563167700053), (2, 0.5080674581001665), (3, -0.03779696482160011)] A survey of user opinion of computer system response time
[(0, 0.08992639972446449), (1, -0.724186062675251), (2, -0.40898973155376395), (3, -0.015188930947954999)] The EPS user interface management system
[(0, 0.07585847652178201), (1, -0.6320551586003428), (2, -0.5393533605733899), (3, 0.07827545681079975)] System and human system engineering testing of EPS
[(0, 0.10150299184980077), (1, -0.5737308483002961), (2, 0.6709338585295913), (3, -0.33929517906883067)] Relation of user perceived response time to error measurement
[(0, 0.7032108939378315), (1, 0.16115180214025768), (2, -0.18266089635241337), (3, -0.582595633708498)] The generation of random binary unordered trees
[(0, 0.8774787673119836), (1, 0.16758906864659381),

# Model Save/Load
Model persistence is achieved with the `save()` and `load()` functions:



In [16]:
# Model persistence is achieved with the save() and load() functions.
import glob
import os
import tempfile

# Reserve a unique path for the model; delete=False keeps the file on disk after
# the handle is closed so the model can be loaded back from it by name.
with tempfile.NamedTemporaryFile(prefix='model-', suffix='.lsi', delete=False) as tmp:
    lsi_model.save(tmp.name)  # same for tfidf, lda, ...

try:
    loaded_lsi_model = models.LsiModel.load(tmp.name)
finally:
    # LsiModel.save() also writes companion files that use the requested name as
    # a prefix (e.g. "<name>.projection"), so remove every file sharing that
    # prefix instead of only the main one; otherwise they pile up in the temp
    # directory. Running this in `finally` also cleans up when loading fails.
    for saved_path in glob.glob(glob.escape(tmp.name) + '*'):
        os.unlink(saved_path)

2023-12-29 20:09:46,134 : INFO : Projection lifecycle event {'fname_or_handle': '/var/folders/c5/6z8x0w0s2mv90f5wxjdl0x440000gn/T/model-9bzvfugl.lsi.projection', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-12-29T20:09:46.134964', 'gensim': '4.3.2', 'python': '3.10.11 | packaged by conda-forge | (main, May 10 2023, 19:07:22) [Clang 14.0.6 ]', 'platform': 'macOS-14.1.2-x86_64-i386-64bit', 'event': 'saving'}
2023-12-29 20:09:46,136 : INFO : saved /var/folders/c5/6z8x0w0s2mv90f5wxjdl0x440000gn/T/model-9bzvfugl.lsi.projection
2023-12-29 20:09:46,137 : INFO : LsiModel lifecycle event {'fname_or_handle': '/var/folders/c5/6z8x0w0s2mv90f5wxjdl0x440000gn/T/model-9bzvfugl.lsi', 'separately': 'None', 'sep_limit': 10485760, 'ignore': ['projection', 'dispatcher'], 'datetime': '2023-12-29T20:09:46.137596', 'gensim': '4.3.2', 'python': '3.10.11 | packaged by conda-forge | (main, May 10 2023, 19:07:22) [Clang 14.0.6 ]', 'platform': 'macOS-14.1.2-x86_64-i386-64b