In [None]:
%matplotlib inline

In [1]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# From Strings to Vectors

In [2]:
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [3]:
from pprint import pprint  # pretty-printer
from collections import defaultdict

# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# remove words that appear only once
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [4]:
from gensim import corpora
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference
print(dictionary)

2023-12-29 17:37:06,691 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2023-12-29 17:37:06,692 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)
2023-12-29 17:37:06,692 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)", 'datetime': '2023-12-29T17:37:06.692632', 'gensim': '4.3.2', 'python': '3.10.11 | packaged by conda-forge | (main, May 10 2023, 19:07:22) [Clang 14.0.6 ]', 'platform': 'macOS-14.1.2-x86_64-i386-64bit', 'event': 'created'}
2023-12-29 17:37:06,693 : INFO : Dictionary lifecycle event {'fname_or_handle': '/tmp/deerwester.dict', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-12-29T17:37:06.693409', 'gensim': '4.3.2', 'python': '3.10.11 | packaged by conda-forge | (main, May 10 2023, 19:07

Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>


In [5]:
print(dictionary.token2id)

{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [6]:
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored

[(0, 1), (1, 1)]


In [7]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store to disk, for later use
print(corpus)

2023-12-29 17:41:10,650 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm
2023-12-29 17:41:10,651 : INFO : saving sparse matrix to /tmp/deerwester.mm
2023-12-29 17:41:10,652 : INFO : PROGRESS: saving document #0
2023-12-29 17:41:10,653 : INFO : saved 9x12 matrix, density=25.926% (28/108)
2023-12-29 17:41:10,654 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index


[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]


# Corpus Streaming – One Document at a Time

In [8]:
from smart_open import open  # for transparently opening remote files


class MyCorpus:
    def __iter__(self):
        for line in open('https://radimrehurek.com/mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())

In [None]:
# This flexibility allows you to create your own corpus classes that stream the documents directly from disk, network, database, dataframes... 
# The models in Gensim are implemented such that they don't require all vectors to reside in RAM at once.
# You can even create the documents on the fly!

In [9]:
corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
print(corpus_memory_friendly)

<__main__.MyCorpus object at 0x1a3a01300>


In [10]:
for vector in corpus_memory_friendly:  # load one vector into memory at a time
    print(vector)

[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
[(2, 1), (5, 1), (7, 1), (8, 1)]
[(1, 1), (5, 2), (8, 1)]
[(3, 1), (6, 1), (7, 1)]
[(9, 1)]
[(9, 1), (10, 1)]
[(9, 1), (10, 1), (11, 1)]
[(4, 1), (10, 1), (11, 1)]


In [11]:
# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/mycorpus.txt'))
# remove stop words and words that appear only once
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in stoplist
    if stopword in dictionary.token2id
]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and words that appear only once
dictionary.compactify()  # remove gaps in id sequence after words that were removed
print(dictionary)

2023-12-29 17:50:02,665 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2023-12-29 17:50:02,666 : INFO : built Dictionary<42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...> from 9 documents (total 69 corpus positions)
2023-12-29 17:50:02,666 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...> from 9 documents (total 69 corpus positions)", 'datetime': '2023-12-29T17:50:02.666799', 'gensim': '4.3.2', 'python': '3.10.11 | packaged by conda-forge | (main, May 10 2023, 19:07:22) [Clang 14.0.6 ]', 'platform': 'macOS-14.1.2-x86_64-i386-64bit', 'event': 'created'}


Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>


In [13]:
dictionary.token2id

{'computer': 0,
 'human': 1,
 'interface': 2,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'user': 7,
 'eps': 8,
 'trees': 9,
 'graph': 10,
 'minors': 11}

In [16]:
dictionary.dfs

{1: 2, 2: 2, 0: 2, 4: 2, 7: 3, 5: 3, 3: 2, 6: 2, 8: 2, 9: 3, 10: 3, 11: 2}

## Corpus Formats

In [17]:
corpus = [[(1, 0.5)], []]  # make one document empty, for the heck of it

corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)

2023-12-29 18:02:29,056 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm
2023-12-29 18:02:29,058 : INFO : saving sparse matrix to /tmp/corpus.mm
2023-12-29 18:02:29,058 : INFO : PROGRESS: saving document #0
2023-12-29 18:02:29,059 : INFO : saved 2x2 matrix, density=25.000% (1/4)
2023-12-29 18:02:29,060 : INFO : saving MmCorpus index to /tmp/corpus.mm.index


In [19]:
# 其他格式包括 Joachim 的 SVMlight 格式、Blei 的 LDA-C 格式和 GibbsLDA++ 格式
corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)

2023-12-29 18:08:17,785 : INFO : converting corpus to SVMlight format: /tmp/corpus.svmlight
2023-12-29 18:08:17,787 : INFO : saving SvmLightCorpus index to /tmp/corpus.svmlight.index
2023-12-29 18:08:17,788 : INFO : no word id mapping provided; initializing from corpus
2023-12-29 18:08:17,790 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c
2023-12-29 18:08:17,792 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab
2023-12-29 18:08:17,793 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index
2023-12-29 18:08:17,794 : INFO : no word id mapping provided; initializing from corpus
2023-12-29 18:08:17,795 : INFO : storing corpus in List-Of-Words format into /tmp/corpus.low
2023-12-29 18:08:17,797 : INFO : saving LowCorpus index to /tmp/corpus.low.index


In [18]:
corpus = corpora.MmCorpus('/tmp/corpus.mm')

2023-12-29 18:08:14,580 : INFO : loaded corpus index from /tmp/corpus.mm.index
2023-12-29 18:08:14,581 : INFO : initializing cython corpus reader from /tmp/corpus.mm
2023-12-29 18:08:14,583 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries


In [22]:
# Corpus objects are streams, so typically you won’t be able to print them directly
print(corpus)

MmCorpus(2 documents, 2 features, 1 non-zero entries)


In [23]:
# one way of printing a corpus: load it entirely into memory
print(list(corpus))  # calling list() will convert any sequence to a plain Python list

[[(1, 0.5)], []]


or



In [24]:
# another way of doing it: print one document at a time, making use of the streaming interface
for doc in corpus:
    print(doc)

[(1, 0.5)]
[]


In [None]:
corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)

## Compatibility with NumPy and SciPy
#### NumPy示例
- Gensim also contains [efficient utility functions](https://radimrehurek.com/gensim/matutils.html) to help converting from/to numpy matrices


In [30]:
import gensim
import numpy as np
numpy_matrix = np.random.randint(10, size=[5, 2])  # random matrix as an example(创建一个矩阵)
corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
# numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)

In [32]:
print(numpy_matrix)
print(list(corpus))

[[1 2]
 [8 3]
 [9 9]
 [0 2]
 [1 9]]
[[(0, 1.0), (1, 8.0), (2, 9.0), (4, 1.0)], [(0, 2.0), (1, 3.0), (2, 9.0), (3, 2.0), (4, 9.0)]]


#### SciPy示例

In [3]:
import gensim
import scipy.sparse
scipy_sparse_matrix = scipy.sparse.random(5, 2)  # random sparse matrix as example(创建一个稀疏矩阵)
corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)

In [4]:
print(scipy_sparse_matrix)
print(list(corpus))


[[], []]


In [12]:
import scipy.sparse
# density：指定非零元素的密度
# format：指定返回的矩阵的格式，这里选择压缩稀疏行（CSR）格式
# random_state：为了可重复性，可以设置随机数生成器的种子
sparse_matrix=scipy.sparse.random(5, 2, density=0.25, format='csr', random_state=42)
print("稀疏矩阵的内容：")
print(sparse_matrix)
dense_matrix = sparse_matrix.toarray()
print("密集矩阵的内容：")
print(dense_matrix)


  (1, 0)	0.05808361216819946
  (3, 1)	0.15599452033620265
密集矩阵的内容：
[[0.         0.        ]
 [0.05808361 0.        ]
 [0.         0.        ]
 [0.         0.15599452]
 [0.         0.        ]]
