In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [9]:
pip install testfixtures

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting testfixtures
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/eb/ee/21b092192a5c3c01ca8e4bf4afdacc6cd1be2da7bc0a37bffa59212b2843/testfixtures-7.2.2-py3-none-any.whl (103 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.1/103.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: testfixtures
Successfully installed testfixtures-7.2.2
Note: you may need to restart the kernel to use updated packages.



How to reproduce the doc2vec 'Paragraph Vector' paper
=====================================================

- Shows how to reproduce results of the "Distributed Representation of Sentences and Documents" paper by Le and Mikolov using Gensim.
- 脚本总运行时间：（52 分 12.903 秒）
- 估计内存使用量：3494 MB


In [1]:
import logging
# Configure the root logger to emit INFO-and-above records, each prefixed with a
# timestamp and the level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
import collections

SentimentDocument = collections.namedtuple('SentimentDocument', 'words tags split sentiment')
# words: content of the movie review (list of tokens)
# tags: unique identifier (sequence number of the review)
# split: dataset partition the review belongs to (test, train)
# sentiment: opinion label (positive, negative, other)

- 下载数据集，并按需解压其中的相关文件
- 之后为每条评论构建一个上面定义的 SentimentDocument 命名元组（namedtuple）

In [4]:
import io
import re
import tarfile
import os.path

import smart_open
import gensim.utils

def download_dataset(url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'):
    """Download ``url`` into the current directory unless it is already there.

    The local file name is the last ``/``-separated component of ``url``. If a
    file with that name already exists it is treated as a complete earlier
    download and returned without any network access.

    The payload is streamed to a temporary sibling file and only renamed to
    the final name once the copy has finished, so an interrupted download is
    never mistaken for a cached archive by a later call.

    Returns:
        The local file name.

    Raises:
        Whatever ``smart_open`` or the OS raises while copying; the temporary
        file is removed before the exception propagates.
    """
    fname = url.split('/')[-1]

    if os.path.isfile(fname):
        return fname

    # A prefix (not a suffix) keeps the file extension intact: smart_open chooses
    # transparent (de)compression from the extension, so the temporary file must
    # be written exactly the way the final file would have been.
    partial_fname = '.partial-' + fname

    try:
        # Download the file to local storage first.
        with smart_open.open(url, "rb") as fin:
            with smart_open.open(partial_fname, 'wb') as fout:
                while True:
                    buf = fin.read(io.DEFAULT_BUFFER_SIZE)
                    if not buf:
                        break
                    fout.write(buf)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt) so no truncated file survives.
        if os.path.exists(partial_fname):
            os.remove(partial_fname)
        raise

    os.replace(partial_fname, fname)
    return fname

def create_sentiment_document(name, text, index):
    """Build a SentimentDocument for one archive member.

    ``name`` must consist of exactly four ``/``-separated parts; the second is
    the dataset split and the third the label directory (``pos``, ``neg`` or
    ``unsup``). Unlabeled (``unsup``) reviews are moved to the ``'extra'``
    split. ``text`` is split on whitespace into tokens and ``index`` becomes
    the document's only tag.
    """
    _root, split, label_dir, _filename = name.split('/')

    label_to_sentiment = {'pos': 1.0, 'neg': 0.0, 'unsup': None}
    sentiment = label_to_sentiment[label_dir]

    # Reviews without a sentiment label belong to neither the train nor the test set.
    split = 'extra' if sentiment is None else split

    words = gensim.utils.to_unicode(text).split()
    return SentimentDocument(words=words, tags=[index], split=split, sentiment=sentiment)

def extract_documents():
    """Yield one document per review file found in the downloaded archive.

    The archive is fetched (or reused) via ``download_dataset()``. Every member
    whose path looks like ``aclImdb/<train|test>/<pos|neg|unsup>/<n>_<m>.txt``
    is read, decoded as UTF-8 (undecodable bytes are replaced) and passed to
    ``create_sentiment_document`` together with a running 0-based index.

    Raises:
        AssertionError: if a review unexpectedly contains a newline.
    """
    fname = download_dataset()

    # The dot before "txt" is escaped so that only a literal ".txt" suffix matches.
    review_path = re.compile(r'aclImdb/(train|test)/(pos|neg|unsup)/\d+_\d+\.txt$')

    index = 0

    with tarfile.open(fname, mode='r:gz') as tar:
        for member in tar.getmembers():
            if not review_path.match(member.name):
                continue
            # Close the per-member file object as soon as its bytes are read.
            with tar.extractfile(member) as member_file:
                member_bytes = member_file.read()
            member_text = member_bytes.decode('utf-8', errors='replace')
            assert member_text.count('\n') == 0
            yield create_sentiment_document(member.name, member_text, index)
            index += 1

# Materialize the generator once; later cells index into and iterate over this list.
alldocs = list(extract_documents())

In [22]:
# Sanity check: the container type and the type of a single element.
print(type(alldocs))
print(type(alldocs[0]))

<class 'list'>
<class '__main__.SentimentDocument'>


Here's what a single document looks like.



In [5]:
# Show one sample document (index 27 is presumably an arbitrary pick).
print(alldocs[27])

SentimentDocument(words=['I', 'was', 'looking', 'forward', 'to', 'this', 'movie.', 'Trustworthy', 'actors,', 'interesting', 'plot.', 'Great', 'atmosphere', 'then', '?????', 'IF', 'you', 'are', 'going', 'to', 'attempt', 'something', 'that', 'is', 'meant', 'to', 'encapsulate', 'the', 'meaning', 'of', 'life.', 'First.', 'Know', 'it.', 'OK', 'I', 'did', 'not', 'expect', 'the', 'directors', 'or', 'writers', 'to', 'actually', 'know', 'the', 'meaning', 'but', 'I', 'thought', 'they', 'may', 'have', 'offered', 'crumbs', 'to', 'peck', 'at', 'and', 'treats', 'to', 'add', 'fuel', 'to', 'the', 'fire-Which!', 'they', 'almost', 'did.', 'Things', 'I', "didn't", 'get.', 'A', 'woman', 'wandering', 'around', 'in', 'dark', 'places', 'and', 'lonely', 'car', 'parks', 'alone-oblivious', 'to', 'the', 'consequences.', 'Great', 'riddles', 'that', 'fell', 'by', 'the', 'wayside.', 'The', 'promise', 'of', 'the', 'knowledge', 'therein', 'contained', 'by', 'the', 'original', 'so-called', 'criminal.', 'I', 'had', 'no

Extract our documents and split into training/test sets.



In [6]:
# Partition the labeled documents by their `split` field in a single pass;
# documents from any other split (e.g. 'extra') land in neither list.
train_docs, test_docs = [], []
for doc in alldocs:
    if doc.split == 'train':
        train_docs.append(doc)
    elif doc.split == 'test':
        test_docs.append(doc)
print(f'{len(alldocs)} docs: {len(train_docs)} train-sentiment, {len(test_docs)} test-sentiment')

100000 docs: 25000 train-sentiment, 25000 test-sentiment


Set-up Doc2Vec Training & Evaluation Models
-------------------------------------------

In [7]:
import multiprocessing
from collections import OrderedDict

import gensim.models.doc2vec
# Fail fast if the check below does not pass; the message explains that training would be very slow.
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

from gensim.models.doc2vec import Doc2Vec

# 1. Use 100-dimensional vectors: the paper's 400 dimensions need a lot of memory and, in our
#    tests on this task, did not seem to offer much benefit.
# 2. min_count=2 saves a considerable amount of model memory by discarding words that appear in
#    only a single document; such words add no expressive power beyond each document's own
#    unique vector.
common_kwargs = dict(
    vector_size=100, epochs=20, min_count=2,
    sample=0, workers=multiprocessing.cpu_count(), negative=5, hs=0,
)

# The three model variants compared in this notebook; all share `common_kwargs`.
simple_models = [
    # PV-DBOW plain
    Doc2Vec(dm=0, **common_kwargs),
    # PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
    Doc2Vec(dm=1, window=10, alpha=0.05, comment='alpha=0.05', **common_kwargs),
    # PV-DM w/ concatenation - big, slow, experimental mode
    # window=5 (both sides) approximates paper's apparent 10-word total window size
    Doc2Vec(dm=1, dm_concat=1, window=5, **common_kwargs),
]

# Each model scans the full corpus (train, test and unlabeled documents) to build its vocabulary.
for model in simple_models:
    model.build_vocab(alldocs)
    print(f"{model} vocabulary scanned & state initialized")

# Lookup table keyed by each model's string representation, in creation order.
models_by_name = OrderedDict((str(model), model) for model in simple_models)

2024-01-18 16:04:25,044 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dbow,d100,n5,mc2,t16>', 'datetime': '2024-01-18T16:04:25.043955', 'gensim': '4.3.2', 'python': '3.10.11 | packaged by conda-forge | (main, May 10 2023, 19:07:22) [Clang 14.0.6 ]', 'platform': 'macOS-14.1.2-x86_64-i386-64bit', 'event': 'created'}
2024-01-18 16:04:25,045 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d100,n5,w10,mc2,t16>', 'datetime': '2024-01-18T16:04:25.045839', 'gensim': '4.3.2', 'python': '3.10.11 | packaged by conda-forge | (main, May 10 2023, 19:07:22) [Clang 14.0.6 ]', 'platform': 'macOS-14.1.2-x86_64-i386-64bit', 'event': 'created'}
2024-01-18 16:04:25,050 : INFO : using concatenative 1100-dimensional layer1
2024-01-18 16:04:25,052 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/c,d100,n5,w5,mc2,t16>', 'datetime': '2024-01-18T16:04:25.052406', 'gensim': '4.3.2', 'python': '3.10.11 | packaged by conda-forge | (main, May 10 2023, 19:07:22) [Clang 14.0.6 ]', 'platform': '

Doc2Vec<dbow,d100,n5,mc2,t16> vocabulary scanned & state initialized


2024-01-18 16:04:40,292 : INFO : PROGRESS: at example #10000, processed 2292381 words (3742551 words/s), 150816 word types, 0 tags
2024-01-18 16:04:40,925 : INFO : PROGRESS: at example #20000, processed 4573645 words (3606258 words/s), 238497 word types, 0 tags
2024-01-18 16:04:41,585 : INFO : PROGRESS: at example #30000, processed 6865575 words (3480488 words/s), 312348 word types, 0 tags
2024-01-18 16:04:42,302 : INFO : PROGRESS: at example #40000, processed 9190019 words (3245780 words/s), 377231 word types, 0 tags
2024-01-18 16:04:42,968 : INFO : PROGRESS: at example #50000, processed 11557847 words (3559929 words/s), 438729 word types, 0 tags
2024-01-18 16:04:43,652 : INFO : PROGRESS: at example #60000, processed 13899883 words (3426442 words/s), 493913 word types, 0 tags
2024-01-18 16:04:44,322 : INFO : PROGRESS: at example #70000, processed 16270094 words (3542988 words/s), 548474 word types, 0 tags
2024-01-18 16:04:45,043 : INFO : PROGRESS: at example #80000, processed 18598876

Doc2Vec<dm/m,d100,n5,w10,mc2,t16> vocabulary scanned & state initialized


2024-01-18 16:04:54,111 : INFO : PROGRESS: at example #10000, processed 2292381 words (3679072 words/s), 150816 word types, 0 tags
2024-01-18 16:04:54,751 : INFO : PROGRESS: at example #20000, processed 4573645 words (3565284 words/s), 238497 word types, 0 tags
2024-01-18 16:04:55,385 : INFO : PROGRESS: at example #30000, processed 6865575 words (3620880 words/s), 312348 word types, 0 tags
2024-01-18 16:04:56,074 : INFO : PROGRESS: at example #40000, processed 9190019 words (3381919 words/s), 377231 word types, 0 tags
2024-01-18 16:04:56,751 : INFO : PROGRESS: at example #50000, processed 11557847 words (3498992 words/s), 438729 word types, 0 tags
2024-01-18 16:04:57,403 : INFO : PROGRESS: at example #60000, processed 13899883 words (3599934 words/s), 493913 word types, 0 tags
2024-01-18 16:04:58,051 : INFO : PROGRESS: at example #70000, processed 16270094 words (3661341 words/s), 548474 word types, 0 tags
2024-01-18 16:04:58,709 : INFO : PROGRESS: at example #80000, processed 18598876

Doc2Vec<dm/c,d100,n5,w5,mc2,t16> vocabulary scanned & state initialized


In [10]:
# ConcatenatedDoc2Vec lives in gensim's test package; it wraps several models so they can be
# evaluated as one (presumably by concatenating their doc-vectors -- TODO confirm against gensim).
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
# Pair the PV-DBOW model with each of the two PV-DM variants.
models_by_name['dbow+dmm'] = ConcatenatedDoc2Vec([simple_models[0], simple_models[1]])
models_by_name['dbow+dmc'] = ConcatenatedDoc2Vec([simple_models[0], simple_models[2]])

2024-01-18 16:07:45,764 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-01-18 16:07:45,765 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)
2024-01-18 16:07:45,767 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)", 'datetime': '2024-01-18T16:07:45.767213', 'gensim': '4.3.2', 'python': '3.10.11 | packaged by conda-forge | (main, May 10 2023, 19:07:22) [Clang 14.0.6 ]', 'platform': 'macOS-14.1.2-x86_64-i386-64bit', 'event': 'created'}


Predictive Evaluation Methods
-----------------------------


In [11]:
import numpy as np
import statsmodels.api as sm
from random import sample

def logistic_predictor_from_data(train_targets, train_regressors):
    """Fit a statsmodels logistic regression and return the fitted result.

    ``train_targets`` are the dependent values and ``train_regressors`` the
    explanatory rows. The fit runs with ``disp=0``.
    """
    return sm.Logit(train_targets, train_regressors).fit(disp=0)

def error_rate_for_model(test_model, train_set, test_set):
    """Measure sentiment-prediction error of ``test_model`` on ``test_set``.

    A logistic predictor is fitted on the doc-vectors and sentiments of
    ``train_set`` and then asked to predict the sentiment of every document
    in ``test_set``; predictions are rounded to the nearest integer before
    being compared with the true labels.

    Returns:
        A tuple ``(error_rate, error_count, test_count, predictor)``.
    """

    def regressors_for(docs):
        # One row per document: the model's vector for the document's first tag,
        # with a constant column added by statsmodels.
        return sm.add_constant([test_model.dv[doc.tags[0]] for doc in docs])

    # Fit on the training documents.
    train_targets = [doc.sentiment for doc in train_set]
    predictor = logistic_predictor_from_data(train_targets, regressors_for(train_set))

    # Predict & evaluate on the test documents.
    test_predictions = predictor.predict(regressors_for(test_set))
    expected = [doc.sentiment for doc in test_set]
    total = len(test_predictions)
    corrects = sum(np.rint(test_predictions) == expected)
    errors = total - corrects
    return (float(errors) / total, errors, total, predictor)

2024-01-18 16:09:13,166 : INFO : Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2024-01-18 16:09:13,167 : INFO : NumExpr defaulting to 8 threads.


Bulk Training & Per-Model Evaluation
------------------------------------

- On a 4-core 2.6GHz Intel Core i7, training the 3 main models for 20 passes each and evaluating them takes about an hour.

In [12]:
from collections import defaultdict
# Maps model name -> error rate; names that were never evaluated read as 1.0.
error_rates = defaultdict(lambda: 1.0)  # To selectively print only best errors achieved

In [13]:
from random import shuffle
# NOTE(review): the shuffle is unseeded, so document order (and thus results) differs between runs.
shuffled_alldocs = alldocs[:]
shuffle(shuffled_alldocs)   # The original order groups similar-sentiment documents in large chunks (not ideal for training), so we train on a once-shuffled copy.

# Train each base model on the full shuffled corpus, then score it on the labeled train/test split.
for model in simple_models:
    print(f"Training {model}")
    model.train(shuffled_alldocs, total_examples=len(shuffled_alldocs), epochs=model.epochs)

    print(f"\nEvaluating {model}")
    err_rate, err_count, test_count, predictor = error_rate_for_model(model, train_docs, test_docs)
    error_rates[str(model)] = err_rate
    print("\n%f %s\n" % (err_rate, model))

# The concatenated models reuse the already-trained base models, so they are only evaluated here.
for model in [models_by_name['dbow+dmm'], models_by_name['dbow+dmc']]:
    print(f"\nEvaluating {model}")
    err_rate, err_count, test_count, predictor = error_rate_for_model(model, train_docs, test_docs)
    error_rates[str(model)] = err_rate
    print(f"\n{err_rate} {model}\n")

2024-01-18 16:09:26,899 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 16 workers on 265408 vocabulary and 100 features, using sg=1 hs=0 sample=0 negative=5 window=5 shrink_windows=True', 'datetime': '2024-01-18T16:09:26.899962', 'gensim': '4.3.2', 'python': '3.10.11 | packaged by conda-forge | (main, May 10 2023, 19:07:22) [Clang 14.0.6 ]', 'platform': 'macOS-14.1.2-x86_64-i386-64bit', 'event': 'train'}


Training Doc2Vec<dbow,d100,n5,mc2,t16>


2024-01-18 16:09:27,936 : INFO : EPOCH 0 - PROGRESS: at 3.98% examples, 898854 words/s, in_qsize 32, out_qsize 0
2024-01-18 16:09:28,952 : INFO : EPOCH 0 - PROGRESS: at 8.80% examples, 987604 words/s, in_qsize 31, out_qsize 0
2024-01-18 16:09:29,958 : INFO : EPOCH 0 - PROGRESS: at 14.13% examples, 1065623 words/s, in_qsize 32, out_qsize 0
2024-01-18 16:09:30,958 : INFO : EPOCH 0 - PROGRESS: at 19.20% examples, 1096083 words/s, in_qsize 31, out_qsize 0
2024-01-18 16:09:31,966 : INFO : EPOCH 0 - PROGRESS: at 24.65% examples, 1128543 words/s, in_qsize 29, out_qsize 2
2024-01-18 16:09:32,975 : INFO : EPOCH 0 - PROGRESS: at 29.89% examples, 1135654 words/s, in_qsize 31, out_qsize 0
2024-01-18 16:09:33,982 : INFO : EPOCH 0 - PROGRESS: at 35.01% examples, 1142305 words/s, in_qsize 31, out_qsize 0
2024-01-18 16:09:34,998 : INFO : EPOCH 0 - PROGRESS: at 40.18% examples, 1145136 words/s, in_qsize 31, out_qsize 0
2024-01-18 16:09:36,007 : INFO : EPOCH 0 - PROGRESS: at 45.34% examples, 1149858 wor


Evaluating Doc2Vec<dbow,d100,n5,mc2,t16>


2024-01-18 16:16:22,911 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 16 workers on 265408 vocabulary and 100 features, using sg=0 hs=0 sample=0 negative=5 window=10 shrink_windows=True', 'datetime': '2024-01-18T16:16:22.911232', 'gensim': '4.3.2', 'python': '3.10.11 | packaged by conda-forge | (main, May 10 2023, 19:07:22) [Clang 14.0.6 ]', 'platform': 'macOS-14.1.2-x86_64-i386-64bit', 'event': 'train'}



0.104640 Doc2Vec<dbow,d100,n5,mc2,t16>

Training Doc2Vec<dm/m,d100,n5,w10,mc2,t16>


2024-01-18 16:16:23,969 : INFO : EPOCH 0 - PROGRESS: at 2.08% examples, 454951 words/s, in_qsize 31, out_qsize 0
2024-01-18 16:16:24,970 : INFO : EPOCH 0 - PROGRESS: at 5.14% examples, 577052 words/s, in_qsize 31, out_qsize 0
2024-01-18 16:16:25,981 : INFO : EPOCH 0 - PROGRESS: at 8.08% examples, 604690 words/s, in_qsize 32, out_qsize 0
2024-01-18 16:16:26,991 : INFO : EPOCH 0 - PROGRESS: at 10.75% examples, 602431 words/s, in_qsize 31, out_qsize 0
2024-01-18 16:16:28,014 : INFO : EPOCH 0 - PROGRESS: at 13.69% examples, 616415 words/s, in_qsize 32, out_qsize 0
2024-01-18 16:16:29,019 : INFO : EPOCH 0 - PROGRESS: at 16.72% examples, 631924 words/s, in_qsize 31, out_qsize 0
2024-01-18 16:16:30,026 : INFO : EPOCH 0 - PROGRESS: at 19.59% examples, 636550 words/s, in_qsize 32, out_qsize 0
2024-01-18 16:16:31,029 : INFO : EPOCH 0 - PROGRESS: at 22.51% examples, 640196 words/s, in_qsize 31, out_qsize 0
2024-01-18 16:16:32,067 : INFO : EPOCH 0 - PROGRESS: at 25.48% examples, 643574 words/s, in


Evaluating Doc2Vec<dm/m,d100,n5,w10,mc2,t16>


2024-01-18 16:27:53,448 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 16 workers on 265409 vocabulary and 1100 features, using sg=0 hs=0 sample=0 negative=5 window=5 shrink_windows=True', 'datetime': '2024-01-18T16:27:53.448076', 'gensim': '4.3.2', 'python': '3.10.11 | packaged by conda-forge | (main, May 10 2023, 19:07:22) [Clang 14.0.6 ]', 'platform': 'macOS-14.1.2-x86_64-i386-64bit', 'event': 'train'}



0.170840 Doc2Vec<dm/m,d100,n5,w10,mc2,t16>

Training Doc2Vec<dm/c,d100,n5,w5,mc2,t16>


2024-01-18 16:27:54,701 : INFO : EPOCH 0 - PROGRESS: at 0.71% examples, 132941 words/s, in_qsize 31, out_qsize 0
2024-01-18 16:27:55,806 : INFO : EPOCH 0 - PROGRESS: at 3.44% examples, 334418 words/s, in_qsize 31, out_qsize 0
2024-01-18 16:27:56,826 : INFO : EPOCH 0 - PROGRESS: at 6.30% examples, 428417 words/s, in_qsize 31, out_qsize 0
2024-01-18 16:27:57,827 : INFO : EPOCH 0 - PROGRESS: at 9.18% examples, 479008 words/s, in_qsize 32, out_qsize 1
2024-01-18 16:27:58,940 : INFO : EPOCH 0 - PROGRESS: at 12.27% examples, 510800 words/s, in_qsize 31, out_qsize 0
2024-01-18 16:27:59,943 : INFO : EPOCH 0 - PROGRESS: at 15.00% examples, 528880 words/s, in_qsize 31, out_qsize 0
2024-01-18 16:28:00,950 : INFO : EPOCH 0 - PROGRESS: at 17.74% examples, 545422 words/s, in_qsize 30, out_qsize 1
2024-01-18 16:28:01,959 : INFO : EPOCH 0 - PROGRESS: at 20.39% examples, 553449 words/s, in_qsize 32, out_qsize 0
2024-01-18 16:28:02,962 : INFO : EPOCH 0 - PROGRESS: at 23.05% examples, 559474 words/s, in_


Evaluating Doc2Vec<dm/c,d100,n5,w5,mc2,t16>

0.294480 Doc2Vec<dm/c,d100,n5,w5,mc2,t16>


Evaluating Doc2Vec<dbow,d100,n5,mc2,t16>+Doc2Vec<dm/m,d100,n5,w10,mc2,t16>

0.10436 Doc2Vec<dbow,d100,n5,mc2,t16>+Doc2Vec<dm/m,d100,n5,w10,mc2,t16>


Evaluating Doc2Vec<dbow,d100,n5,mc2,t16>+Doc2Vec<dm/c,d100,n5,w5,mc2,t16>

0.10468 Doc2Vec<dbow,d100,n5,mc2,t16>+Doc2Vec<dm/c,d100,n5,w5,mc2,t16>



Achieved Sentiment-Prediction Accuracy
--------------------------------------
Compare error rates achieved, best-to-worst



In [14]:
# List every evaluated model from lowest to highest error rate (ties broken by name).
print("Err_rate Model")
for name, rate in sorted(error_rates.items(), key=lambda item: (item[1], item[0])):
    print(f"{rate} {name}")

Err_rate Model
0.10436 Doc2Vec<dbow,d100,n5,mc2,t16>+Doc2Vec<dm/m,d100,n5,w10,mc2,t16>
0.10464 Doc2Vec<dbow,d100,n5,mc2,t16>
0.10468 Doc2Vec<dbow,d100,n5,mc2,t16>+Doc2Vec<dm/c,d100,n5,w5,mc2,t16>
0.17084 Doc2Vec<dm/m,d100,n5,w10,mc2,t16>
0.29448 Doc2Vec<dm/c,d100,n5,w5,mc2,t16>


Examining Results
======================




Are inferred vectors close to the precalculated ones?
-----------------------------------------------------
答案是：yes

chatgpt:
```
推断向量是否接近预先计算的向量：

对于给定文档，通过模型的 infer_vector 方法生成推断向量。
使用 model.dv.most_similar 比较推断向量与预先计算向量（在模型训练期间学到的向量）的相似性。
分析相似性度量，例如余弦相似度，以确定它们是否接近。
```


In [15]:
# For one randomly chosen training document, re-infer a vector from its words and list the
# 3 trained doc-vectors most similar to it, once per model.
doc_id = np.random.randint(len(simple_models[0].dv))  # Pick random doc; re-run cell for more examples
print(f'for doc {doc_id}...')
for model in simple_models:
    # alldocs[doc_id] is the document whose tag is doc_id: tags were assigned as list positions.
    inferred_docvec = model.infer_vector(alldocs[doc_id].words)
    print(f'{model}:\n {model.dv.most_similar([inferred_docvec], topn=3)}')

for doc 22045...
Doc2Vec<dbow,d100,n5,mc2,t16>:
 [(22045, 0.983492910861969), (14967, 0.6748805642127991), (44450, 0.6319491267204285)]
Doc2Vec<dm/m,d100,n5,w10,mc2,t16>:
 [(22045, 0.9121325612068176), (20179, 0.6432890295982361), (17084, 0.6375844478607178)]
Doc2Vec<dm/c,d100,n5,w5,mc2,t16>:
 [(22045, 0.851923942565918), (41172, 0.44848307967185974), (73935, 0.43554893136024475)]


In [25]:
# NOTE(review): this cell depends on hidden kernel state -- `doc_id` and `model` are whatever
# the most recently executed cell left behind (the recorded output shows a doc id set by a
# cell further down). Under "Restart & Run All" they come from the loop in the cell above.
print(doc_id)
model.infer_vector(alldocs[doc_id].words)

4120


array([-0.31993604,  0.289178  ,  0.14579888,  0.00182639,  0.11769766,
       -0.07497452, -0.07197658, -0.23473649, -0.05723703,  0.11302054,
        0.11737183,  0.07000908,  0.14287762, -0.12425712,  0.05049792,
        0.24176294,  0.11396959,  0.08082096,  0.18286124,  0.06378017,
       -0.05264067, -0.0252564 ,  0.1564372 ,  0.07018003, -0.01600431,
        0.07235863,  0.072377  , -0.3053269 ,  0.00692569,  0.34639913,
        0.02783467, -0.02809503, -0.00299475,  0.29456362,  0.05935742,
        0.21410444, -0.08218914, -0.0369949 , -0.1395597 ,  0.08618931,
       -0.30340457,  0.1142082 , -0.1401991 ,  0.11388415, -0.14379087,
       -0.11195017, -0.189112  ,  0.10105586,  0.4137812 ,  0.2162641 ,
       -0.16204152,  0.17210808, -0.23493302,  0.11430741, -0.12606007,
       -0.1291185 ,  0.19591467, -0.13090394,  0.07826701, -0.02584916,
        0.05703186,  0.3836018 , -0.11259355, -0.1252029 , -0.04769507,
        0.04754781,  0.32207566,  0.10975969,  0.37091225,  0.13

Do close documents seem more related than distant ones?
-------------------------------------------------------

- 答案：yes
- 最高余弦相似的文档通常看起来更像目标，而不是中值或最小…… 特别是当最相似的文档的余弦相似度 > 0.5 时
- 注：余弦相似度是一种衡量两个向量方向一致程度的方法，范围在 [-1, 1] 之间，其中 1 表示方向完全相同，0 表示正交（不相关），-1 表示方向完全相反。
- 注意：这儿的doc_id应该是对应的tag

<div class="alert alert-danger"><h4>疑问</h4>
<p>这儿没弄明白的是：怎么算相距近，怎么算相距远</p>
</div>

##### 输出结果说明
- 下面是gpt说的（如果没有重新执行的话）
```
目标文档 (4120)：这是一条对电影的负面评论，评论内容表达了对一部电影的强烈不满，指责其在演员表演、剧本、摄影等方面都表现糟糕，甚至质疑这是否算是一部电影。

相似 / 不相似文档 (来自模型 Doc2Vec<dbow,d100,n5,mc2,t16>)：
    最相似 (3961, 相似度 0.68)：一条对另一部电影的负面评论，强烈建议不要浪费时间和金钱观看。
    中间相似 (14646, 相似度 0.32)：一条对某个版本《哈姆雷特》的正面评论，赞扬了演员 Derek Jacobi 的出色表演，认为这是对莎士比亚故事的真实演绎。
    最不相似 (67885, 相似度 -0.04)：一条对电影《王子与贫儿》的评论，描述了该电影基于马克·吐温的小说，对演员表现、故事情节等方面进行了评价。
这段代码最后的问题 "Do close documents seem more related than distant ones?" 是在问接近的文档是否比远离的文档更相关。
根据输出的结果，可以看出相似度较高的文档确实在主题上有一定的相似性，而相似度较低的文档在主题上可能差异较大。
```

In [16]:
import random

# Pick a random document and a random model, rank *all* documents by similarity to the
# target, and show the most similar, the median and the least similar one.
doc_id = np.random.randint(len(simple_models[0].dv))  # pick random doc, re-run cell for more examples
model = random.choice(simple_models)  # and a random model
sims = model.dv.most_similar(doc_id, topn=len(model.dv))  # get *all* similar documents
print(f'TARGET ({doc_id}): «{" ".join(alldocs[doc_id].words)}»\n')
# The header previously ended in a stray '%s' that was printed literally.
print(f'SIMILAR/DISSIMILAR DOCS PER MODEL {model}:\n')
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    sim_entry = sims[index]  # (document tag, similarity)
    words = ' '.join(alldocs[sim_entry[0]].words)
    print(f'{label} {sim_entry}: «{words}»\n')

TARGET (4120): «No. Just NO. That's all that needs to be said.<br /><br />Summary: A random guy is in a cornfield. For some reason, I'm not sure, but it's his duty to run around inside. The next great thriller?<br /><br />A five year old could make a better movie just filming an anthill, or even just grass growing. Seriously.....<br /><br />You can't say it has bad acting, because there is NO acting. You can't say it has bad writing, because it has NO writing. You can't say it has bad cinematography, because there is NO cinematography. You can't say it's a bad movie, BECAUSE THERE IS NO MOVIE! If you don't believe me, go watch it. Just don't say I never warned you.....»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dbow,d100,n5,mc2,t16>%s:

MOST (3961, 0.680057168006897): «This is a terrible movie, don't waste your money on it. Don't even watch it for free. That's all I have to say.»

MEDIAN (14646, 0.3156801164150238): «I saw this version of Hamlet on television many years ago, and have 

In [27]:
# Peek at the three highest-ranked (tag, similarity) pairs from the ranking computed above.
sims[:3]

[(3961, 0.680057168006897),
 (94327, 0.6712723970413208),
 (6554, 0.6663431525230408)]

Do the word vectors show useful similarities?
---------------------------------------------

- DBOW 模型给出的“相似词”其实并不相似
    - 这是因为 gensim 的 DBOW 模型默认不训练词向量
    - 除非在初始化模型时指定参数 dbow_words=1
    - 并发单词训练会显着减慢 DBOW 模式，并且对 IMDB 情感预测任务的错误率几乎没有改善（有时甚至会稍微恶化）
    - 但可能适合其他任务，或者如果您还需要单词向量。
- DM 模式本质上都涉及与文档向量训练同时进行的词向量训练
    - 所以，当训练数据中有很多示例时（例如 “情节” 或 “演员”），DM 模型中的单词往往会显示有意义的相似单词


In [17]:
import random

# Shallow copy: the same model objects, in a list that can be changed independently of simple_models.
word_models = simple_models[:]

def pick_random_word(model, threshold=10):
    """Return a random vocabulary word that occurs more than ``threshold`` times.

    Every word in ``model.wv.index_to_key`` whose ``"count"`` attribute is
    strictly greater than ``threshold`` is equally likely to be chosen.

    Raises:
        ValueError: if no word is frequent enough. (A rejection-sampling loop
            would never terminate in that situation.)
    """
    # One pass over the vocabulary; this trades a little time for guaranteed termination.
    candidates = [
        word for word in model.wv.index_to_key
        if model.wv.get_vecattr(word, "count") > threshold
    ]
    if not candidates:
        raise ValueError(f"no word in the vocabulary occurs more than {threshold} times")
    return random.choice(candidates)

# Draw the probe word from the first model's vocabulary, then query every model with it.
target_word = pick_random_word(word_models[0])
# or uncomment below line, to just pick a word from the relevant domain:
# target_word = 'comedy/drama'

for model in word_models:
    print(f'target_word: {target_word!r} model: {model} similar words:')
    neighbours = model.wv.most_similar(target_word, topn=10)
    for rank, (word, sim) in enumerate(neighbours, start=1):
        print(f'    {rank}. {sim:.2f} {word!r}')
    print()

target_word: '"Zombi' model: Doc2Vec<dbow,d100,n5,mc2,t16> similar words:
    1. 0.49 'networks,'
    2. 0.46 'infamy,'
    3. 0.43 'biologically'
    4. 0.40 'vampires?'
    5. 0.40 'Mo-Wan'
    6. 0.39 'subculture,'
    7. 0.39 'Entertainment.'
    8. 0.38 'crossroads,'
    9. 0.38 '("Sea'
    10. 0.38 'contribution).'

target_word: '"Zombi' model: Doc2Vec<dm/m,d100,n5,w10,mc2,t16> similar words:
    1. 0.59 '"Jaws'
    2. 0.58 '"Exorcist'
    3. 0.58 '"Ring'
    4. 0.55 '(2004).'
    5. 0.55 '(1961).'
    6. 0.54 '"Deathstalker'
    7. 0.53 '(1976)'
    8. 0.53 'Monster".'
    9. 0.53 '"Robocop'
    10. 0.52 'Jaws:'

target_word: '"Zombi' model: Doc2Vec<dm/c,d100,n5,w5,mc2,t16> similar words:
    1. 0.56 'Demoni'
    2. 0.54 'way!",'
    3. 0.52 'aspect!'
    4. 0.51 'regard),'
    5. 0.50 'Avenging'
    6. 0.50 '"Between'
    7. 0.49 'Altair'
    8. 0.49 'Brigham'
    9. 0.49 '"Terminator'
    10. 0.49 "Seek.'"



Are the word vectors from this dataset any good at analogies?
-------------------------------------------------------------

- DBOW 模型的未经训练的随机初始化词当然会惨败(成功率: 0%)
- 对于实际训练单词向量的 DM/mean 和 DM/concat 模型
    - 尽管这是一个很小的、特定领域的数据集，但它在一般单词类比方面显示出一些微薄的能力
    - 成功率都在17%左右

In [18]:
from gensim.test.utils import datapath
# Analogy question set that ships with gensim's test data.
questions_filename = datapath('questions-words.txt')

# Note: this analysis takes many minutes
for model in word_models:
    score, sections = model.wv.evaluate_word_analogies(questions_filename)
    # The last section presumably holds the overall totals -- verify against the gensim docs.
    correct, incorrect = len(sections[-1]['correct']), len(sections[-1]['incorrect'])
    total = correct + incorrect
    # Guard against an empty evaluation (no question answerable with this vocabulary).
    pct_correct = 100.0 * correct / total if total else 0.0
    # In an f-string a single '%' is literal; the old '%%' printed two percent signs,
    # and the closing parenthesis was missing.
    print(f'{model}: {pct_correct:0.2f}% correct ({correct} of {total})')

2024-01-18 16:43:31,133 : INFO : Evaluating word analogies for top 300000 words in the model on /Users/zhaoweiguo/9tool/miniconda3/envs/chatgpt/lib/python3.10/site-packages/gensim/test/test_data/questions-words.txt
2024-01-18 16:43:35,282 : INFO : capital-common-countries: 0.0% (0/420)
2024-01-18 16:43:43,705 : INFO : capital-world: 0.0% (0/902)
2024-01-18 16:43:44,517 : INFO : currency: 0.0% (0/86)
2024-01-18 16:43:58,647 : INFO : city-in-state: 0.0% (0/1510)
2024-01-18 16:44:03,387 : INFO : family: 0.0% (0/506)
2024-01-18 16:44:12,659 : INFO : gram1-adjective-to-adverb: 0.0% (0/992)
2024-01-18 16:44:19,701 : INFO : gram2-opposite: 0.0% (0/756)
2024-01-18 16:44:32,003 : INFO : gram3-comparative: 0.0% (0/1332)
2024-01-18 16:44:41,857 : INFO : gram4-superlative: 0.0% (0/1056)
2024-01-18 16:44:51,079 : INFO : gram5-present-participle: 0.0% (0/992)
2024-01-18 16:45:04,798 : INFO : gram6-nationality-adjective: 0.0% (0/1445)
2024-01-18 16:45:19,334 : INFO : gram7-past-tense: 0.0% (0/1560)
2

Doc2Vec<dbow,d100,n5,mc2,t16>: 0.00%% correct (0 of 13617


2024-01-18 16:45:38,834 : INFO : Evaluating word analogies for top 300000 words in the model on /Users/zhaoweiguo/9tool/miniconda3/envs/chatgpt/lib/python3.10/site-packages/gensim/test/test_data/questions-words.txt
2024-01-18 16:45:42,828 : INFO : capital-common-countries: 5.0% (21/420)
2024-01-18 16:45:51,385 : INFO : capital-world: 0.9% (8/902)
2024-01-18 16:45:52,188 : INFO : currency: 0.0% (0/86)
2024-01-18 16:46:06,575 : INFO : city-in-state: 0.3% (4/1510)
2024-01-18 16:46:11,485 : INFO : family: 37.5% (190/506)
2024-01-18 16:46:20,619 : INFO : gram1-adjective-to-adverb: 3.1% (31/992)
2024-01-18 16:46:27,571 : INFO : gram2-opposite: 5.6% (42/756)
2024-01-18 16:46:40,485 : INFO : gram3-comparative: 42.9% (572/1332)
2024-01-18 16:46:50,885 : INFO : gram4-superlative: 22.2% (234/1056)
2024-01-18 16:47:00,487 : INFO : gram5-present-participle: 21.9% (217/992)
2024-01-18 16:47:14,427 : INFO : gram6-nationality-adjective: 4.4% (63/1445)
2024-01-18 16:47:29,698 : INFO : gram7-past-tense:

Doc2Vec<dm/m,d100,n5,w10,mc2,t16>: 17.66%% correct (2405 of 13617


2024-01-18 16:47:53,687 : INFO : capital-common-countries: 2.9% (12/420)
2024-01-18 16:48:02,085 : INFO : capital-world: 0.6% (5/902)
2024-01-18 16:48:02,889 : INFO : currency: 0.0% (0/86)
2024-01-18 16:48:16,534 : INFO : city-in-state: 0.1% (1/1510)
2024-01-18 16:48:21,346 : INFO : family: 37.7% (191/506)
2024-01-18 16:48:30,619 : INFO : gram1-adjective-to-adverb: 5.8% (58/992)
2024-01-18 16:48:37,614 : INFO : gram2-opposite: 4.2% (32/756)
2024-01-18 16:48:50,275 : INFO : gram3-comparative: 35.4% (471/1332)
2024-01-18 16:49:00,059 : INFO : gram4-superlative: 23.9% (252/1056)
2024-01-18 16:49:09,328 : INFO : gram5-present-participle: 35.8% (355/992)
2024-01-18 16:49:21,914 : INFO : gram6-nationality-adjective: 1.7% (24/1445)
2024-01-18 16:49:36,461 : INFO : gram7-past-tense: 27.0% (421/1560)
2024-01-18 16:49:47,903 : INFO : gram8-plural: 7.6% (91/1190)
2024-01-18 16:49:56,107 : INFO : gram9-plural-verbs: 49.8% (433/870)
2024-01-18 16:49:56,109 : INFO : Quadruplets with out-of-vocabular

Doc2Vec<dm/c,d100,n5,w5,mc2,t16>: 17.23%% correct (2346 of 13617
