In [1]:
%matplotlib inline

In [2]:
import pprint

In [3]:
# Raw corpus: nine short one-sentence documents. The cells below tokenise these
# strings, build a vocabulary from them and index them for similarity queries.
text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [4]:
# Very frequent English words that should be ignored during tokenisation
stoplist = {'for', 'a', 'of', 'the', 'and', 'to', 'in'}

# Tokenise each document: lowercase it, split on whitespace, drop stopwords
texts = [
    [token for token in doc.lower().split() if token not in stoplist]
    for doc in text_corpus
]
texts

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

In [5]:
# Count how many times each token occurs across the whole corpus.
# Counter replaces the hand-rolled defaultdict(int) loop and does not leave
# loop variables behind in the notebook namespace.
from collections import Counter

frequency = Counter(token for text in texts for token in text)

# Only keep words that appear more than once in the corpus; a document may end up
# with very few tokens (or none) if all of its words are unique.
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
pprint.pprint(processed_corpus)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [37]:
# Tokens that survive the "appears more than once" filter, with their corpus-wide counts
{token: count for token, count in frequency.items() if count > 1}

{'human': 2,
 'interface': 2,
 'computer': 2,
 'survey': 2,
 'user': 3,
 'system': 4,
 'response': 2,
 'time': 2,
 'eps': 2,
 'trees': 3,
 'graph': 3,
 'minors': 2}

In [6]:
# Before proceeding, we want to associate each word in the corpus with a unique integer ID.
# This dictionary defines the vocabulary of all words that our processing knows about.
# It is built from processed_corpus, i.e. only from tokens that occur more than once.
from gensim import corpora

dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>


In [18]:
# Show the token -> integer ID mapping (pprint lists the keys alphabetically, not by ID)
pprint.pprint(dictionary.token2id)

{'computer': 0,
 'eps': 8,
 'graph': 10,
 'human': 1,
 'interface': 2,
 'minors': 11,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'trees': 9,
 'user': 7}


In [25]:
# Document frequencies: token ID -> number of documents that contain the token
# (e.g. ID 5, "system", maps to 3 even though the word occurs 4 times in total)
dictionary.dfs

{1: 2, 2: 2, 0: 2, 4: 2, 7: 3, 5: 3, 3: 2, 6: 2, 8: 2, 9: 3, 10: 3, 11: 2}

In [13]:
# List the vocabulary ordered by integer ID instead of alphabetically
sorted_words = sorted(
    dictionary.token2id.items(),
    key=lambda token_and_id: token_and_id[1],
)
pprint.pprint(sorted_words)

[('computer', 0),
 ('human', 1),
 ('interface', 2),
 ('response', 3),
 ('survey', 4),
 ('system', 5),
 ('time', 6),
 ('user', 7),
 ('eps', 8),
 ('trees', 9),
 ('graph', 10),
 ('minors', 11)]


In [7]:
# Use the doc2bow method to create a bag-of-words representation of a document;
# it returns a sparse representation of the word counts as (token ID, count) pairs.
# Words missing from the vector are implicitly zero: here "interaction" is not in the
# dictionary, so its count is implicitly 0 and, being sparse, it is simply not listed.
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)

[(0, 1), (1, 1)]


In [8]:
# Convert the entire original corpus into a list of bag-of-words vectors
bow_corpus = [
    dictionary.doc2bow(tokens)
    for tokens in processed_corpus
]
pprint.pprint(bow_corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


In [9]:
# Apply a model: tf-idf
# The tf-idf model transforms vectors from the bag-of-words representation into a vector space
# where the frequency counts are weighted by the relative rarity of each word in the corpus.
from gensim import models

# train the model
# For TfIdf, "training" simply means going through the supplied corpus once and computing
# the document frequencies of all its features.
# Training other models, such as Latent Semantic Analysis or Latent Dirichlet Allocation,
# is more involved and therefore takes more time.
tfidf = models.TfidfModel(bow_corpus)

# transform the "system minors" string
words = "system minors".lower().split()
print(tfidf[dictionary.doc2bow(words)])

# Explanation
# The ID corresponding to "system" (5) has been weighted lower than the ID corresponding to "minors" (11)
# because the weighting is driven by document frequency (see dictionary.dfs):
#   "system" appears in 3 of the 9 documents (4 occurrences in total)
#   "minors" appears in only 2 documents

[(5, 0.5898341626740045), (11, 0.8075244024440723)]


In [10]:
# Transform the whole corpus with TfIdf and index it, in preparation for similarity queries
from gensim import similarities

# num_features must equal the vocabulary size. Derive it from the dictionary rather
# than hard-coding 12, so the index stays correct if the corpus or the filtering changes.
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))
index

<gensim.similarities.docsim.SparseMatrixSimilarity at 0x11cc419f0>

In [19]:
# Query the similarity of query_document against every document in the corpus.
# The query is lowercased before splitting, like the other query cells: every token in
# the dictionary is lowercase, so a capitalised query word would otherwise be dropped
# silently by doc2bow.
query_document = 'system engineering'.lower().split()
query_bow = dictionary.doc2bow(query_document)  # query_bow = [(5, 1)]  # i.e. "system" occurs once, "engineering" is not in the dictionary
sims = index[tfidf[query_bow]]
print(list(enumerate(sims)))    # shows that 'system engineering' is most similar to the 4th sentence (index 3) of the corpus below

# Corpus contents
# text_corpus = [
#     "Human machine interface for lab abc computer applications",
#     "A survey of user opinion of computer system response time",
#     "The EPS user interface management system",
#     "System and human system engineering testing of EPS",
#     "Relation of user perceived response time to error measurement",
#     "The generation of random binary unordered trees",
#     "The intersection graph of paths in trees",
#     "Graph minors IV Widths of trees and well quasi ordering",
#     "Graph minors A survey",
# ]

[(0, 0.0), (1, 0.32448703), (2, 0.41707572), (3, 0.7184812), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]


In [17]:
# Sort by descending similarity score to make the result easier to read
for document_number, score in sorted(
    enumerate(sims),
    key=lambda indexed_score: indexed_score[1],
    reverse=True,
):
    print(document_number, score)

3 0.7184812
2 0.41707572
1 0.32448703
0 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0
