# 核心概念

1. document
2. corpus
3. vector
4. model

## document

In [14]:
# A "document" is just a piece of text; here it is a single plain string.
document = "Human machine interface for lab abc computer applications"

## corpus

In [15]:
# A "corpus" is a collection of documents; this toy corpus holds nine short titles.
text_corpus = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

### stop words filter

In [16]:
# Common English function words that are dropped from every document.
stoplist = {*"for a of the and to in".split(" ")}
stoplist

{'a', 'and', 'for', 'in', 'of', 'the', 'to'}

In [17]:
# Tokenise each document: lowercase it, split on whitespace, drop stop words.
texts = [
    [token for token in raw_text.lower().split() if token not in stoplist]
    for raw_text in text_corpus
]
texts

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

### word frequency count

In [18]:
# Tally how many times each token occurs across the whole corpus.
from collections import defaultdict

frequency = defaultdict(int)
# Walk the corpus as one flat token stream, in document order.
for word in (token for tokens in texts for token in tokens):
    frequency[word] += 1
frequency

defaultdict(int,
            {'human': 2,
             'machine': 1,
             'interface': 2,
             'lab': 1,
             'abc': 1,
             'computer': 2,
             'applications': 1,
             'survey': 2,
             'user': 3,
             'opinion': 1,
             'system': 4,
             'response': 2,
             'time': 2,
             'eps': 2,
             'management': 1,
             'engineering': 1,
             'testing': 1,
             'relation': 1,
             'perceived': 1,
             'error': 1,
             'measurement': 1,
             'generation': 1,
             'random': 1,
             'binary': 1,
             'unordered': 1,
             'trees': 3,
             'intersection': 1,
             'graph': 3,
             'paths': 1,
             'minors': 2,
             'iv': 1,
             'widths': 1,
             'well': 1,
             'quasi': 1,
             'ordering': 1})

### word frequency filter

In [19]:
# Drop every token that occurs only once in the whole corpus.
processed_corpus = [
    [word for word in tokens if frequency[word] > 1]
    for tokens in texts
]
processed_corpus

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

### dictionary

In [45]:
from gensim import corpora
# Build a gensim Dictionary from the tokens of the processed corpus.
dictionary = corpora.Dictionary(processed_corpus)
# Optional: persist the dictionary to disk for later reuse.
# dictionary.save("./saved_results/deerwester.dict")
print(dictionary)

Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>


## vector

#### tokenizer

In [31]:
import pprint

# Show the token -> integer id mapping held by the dictionary.
# (The name was previously misspelled as `dictonary`, which raises NameError
# on a fresh kernel.)
pprint.pprint(dictionary.token2id)

{'computer': 0,
 'eps': 8,
 'graph': 10,
 'human': 1,
 'interface': 2,
 'minors': 11,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'trees': 9,
 'user': 7}


#### bag-of-words

In [35]:
# Tokenise a new, unseen document and convert it with the dictionary's doc2bow.
new_doc = "Human computer interaction".lower().split()
new_vec = dictionary.doc2bow(new_doc)

print(new_doc)
print(new_vec)

['human', 'computer', 'interaction']
[(0, 1), (1, 1)]


In [34]:
from gensim import corpora

# Convert every preprocessed document with the dictionary's doc2bow.
bow_corpus = list(map(dictionary.doc2bow, processed_corpus))

# Optional: serialize the corpus to disk in one of the supported formats.
# corpora.MmCorpus.serialize("./saved_results/deerwester.mm", bow_corpus)
# corpora.SvmLightCorpus.serialize("./saved_results/deerwester.svmlight", bow_corpus)
# corpora.BleiCorpus.serialize("./saved_results/deerwester.lda-c", bow_corpus)
# corpora.LowCorpus.serialize("./saved_results/deerwester.low", bow_corpus)

pprint.pprint(bow_corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]


## corpus 和 vector Streaming

In [48]:
from smart_open import open

# Corpus
class MyCorpus:
    """Stream the remote sample corpus as bag-of-words vectors, one per line.

    Lines are read one at a time from the opened stream rather than loaded
    up front. Each line is lowercased, split on whitespace and converted
    with the module-level ``dictionary``, which is looked up when the corpus
    is iterated (not when it is constructed).
    """

    def __iter__(self):
        # The context manager closes the remote stream once iteration
        # finishes (or the generator is closed early) instead of leaking it.
        with open("https://radimrehurek.com/mycorpus.txt") as corpus_file:
            for line in corpus_file:
                yield dictionary.doc2bow(line.lower().split())

# Creating the corpus object does not read any data yet.
corpus_memory_friendly = MyCorpus()
print(corpus_memory_friendly)
# Iterating streams the documents one vector at a time.
for vector in corpus_memory_friendly:
    print(vector)


# Dictionary: build it in one streaming pass over the remote file.
# The context manager makes sure the stream is closed afterwards.
with open("https://radimrehurek.com/mycorpus.txt") as corpus_file:
    dictionary = corpora.Dictionary(
        line.lower().split()
        for line in corpus_file
    )
# Stop words
stoplist = set("for a of the and to in".split(" "))
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in stoplist
    if stopword in dictionary.token2id
]
# Frequency statistics: ids of tokens whose `dictionary.dfs` entry is 1
once_ids = [
    token_id
    for token_id, doc_freq in dictionary.dfs.items()
    if doc_freq == 1
]
# Remove stop words and one-off tokens, then compact the dictionary.
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()
print(dictionary)

<__main__.MyCorpus object at 0x30101de90>
[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]
Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>


## model

### TF-IDF

In [36]:
from gensim import models

# Fit a TF-IDF model on the bag-of-words corpus.
tf_idf = models.TfidfModel(bow_corpus)

In [37]:
# Tokenise the query string the same way as the corpus: lowercase, then split.
query_text = "system minors"
words = query_text.lower().split()
print(words)

['system', 'minors']


In [38]:
# Convert the query tokens to bag-of-words, then apply the TF-IDF model.
words_bow = dictionary.doc2bow(words)
res = tf_idf[words_bow]
print(res)

[(5, 0.5898341626740045), (11, 0.8075244024440723)]


### 相似性查询

In [44]:
from gensim import similarities

# Index the TF-IDF-weighted corpus for similarity queries.
# The feature count comes from the dictionary instead of a hard-coded 12, so
# the index stays correct if the corpus or the token filtering changes.
index = similarities.SparseMatrixSimilarity(
    tf_idf[bow_corpus], num_features=len(dictionary)
)
print(index)

# Tokenise the query the same way as the corpus.
query_document = "system engineering".lower().split()
print(query_document)

query_bow = dictionary.doc2bow(query_document)
print(query_bow)

# Score the query against every indexed document and list the best matches first.
sims = index[tf_idf[query_bow]]
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)

<gensim.similarities.docsim.SparseMatrixSimilarity object at 0x300a49710>
['system', 'engineering']
[(5, 1)]
3 0.7184812
2 0.41707572
1 0.32448703
0 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0


# Word2Vec

In [3]:
import gensim.downloader as api

# Load the "word2vec-google-news-300" vectors through the gensim downloader.
# NOTE(review): the failed run recorded below this cell reports a
# 1743563840-byte (~1.7 GB) download, so a stable connection is needed; the
# cells that follow all require `wv`.
wv = api.load("word2vec-google-news-300")



ContentTooShortError: <urlopen error retrieval incomplete: got only 524288000 out of 1743563840 bytes>

In [None]:
# Show the first 10 words of the vocabulary.
# The counter is deliberately not called `index`, so it does not overwrite the
# similarity index that an earlier cell stores under that name.
vocab_size = len(wv.index_to_key)
for position, word in enumerate(wv.index_to_key[:10]):
    print(f"word #{position}/{vocab_size} is {word}")

In [None]:
# Look up the vector stored for the word "king" and display it.
vec_king = wv["king"]
vec_king

In [None]:
# A lookup for a word that is not in the model raises KeyError; report it
# instead of letting the cell fail.
try:
    vec_cameroon = wv["cameroon"]
except KeyError:
    print("The word 'cameroon' does not appear in this model")

In [None]:
# Word2Vec supports several word-similarity tasks out of the box
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
for first, second in pairs:
    score = wv.similarity(first, second)
    print(f"{first!r}\t{second!r}\t{score:.2f}")

In [None]:
# Print the 5 words most similar to "car" or "minivan"
print(wv.most_similar(positive=['car', 'minivan'], topn=5))

In [None]:
# Which of the following does not belong in the sequence?
print(wv.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'car']))

# FastText