In [None]:
%matplotlib inline

In [1]:
!pip install gensim

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple



Word2Vec Model
==============

#### 说明

- Total running time of the script: ( 11 minutes 26.674 seconds)
- Estimated memory usage: 7177 MB


In [9]:
import logging
# Show INFO-level (and above) log records, formatted as "time : level : message"
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## 注意
该模型大约为 2GB，因此您需要良好的网络连接才能继续

In [None]:
import gensim.downloader as api
# Load the pre-trained Word2Vec vectors.
# This model was trained on the large Google News text corpus and carries rich semantic information.
# In essence: download first, then load with KeyedVectors.load_word2vec_format(path, ...).
# NOTE(review): the Google News vectors are distributed in the binary word2vec format,
# so the underlying call presumably uses binary=True (not binary=False) -- confirm against the gensim-data loader.
wv = api.load('word2vec-google-news-300')
wv

- 模拟输出
```
<gensim.models.keyedvectors.KeyedVectors at 0x7f257f225ea0>
```

In [None]:
# Display only the first entries; the full vocabulary list is far too long to render in a cell
wv.index_to_key[:10]

- 模拟输出
```
['</s>',
 'in',
 'for',
 'that',
 ...
]
```

In [None]:
# Common operation: walk through the model vocabulary (first 10 entries only)
for index, word in enumerate(wv.index_to_key[:10]):
    print(f"word #{index}/{len(wv.index_to_key)} is {word}")

- 模拟输出
```
word #0/3000000 is </s>
word #1/3000000 is in
word #2/3000000 is for
word #3/3000000 is that
word #4/3000000 is is
word #5/3000000 is on
word #6/3000000 is ##
word #7/3000000 is The
word #8/3000000 is with
word #9/3000000 is said
```

In [None]:
# Look up the vector the model stores for a term
vec_king = wv['king']   # vector representation of the word "king"
vec_king

- 模拟输出
```
array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
        ... ...
       -2.79296875e-01, -8.59375000e-02,  9.13085938e-02,  2.51953125e-01],
      dtype=float32)
```

In [None]:
vec_king.shape   # (300,)

In [None]:
# The model cannot infer vectors for words it is unfamiliar with; this is a limitation of Word2Vec.
# If this limitation matters to you, have a look at the FastText model.
try:
    vec_cameroon = wv['cameroon']
except KeyError:
    print("The word 'cameroon' does not appear in this model")

In [None]:
# Word2Vec supports several word-similarity tasks out of the box.
# The similarity score drops intuitively as the two words become less alike.
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
for w1, w2 in pairs:
    score = wv.similarity(w1, w2)
    print(f'{w1!r}\t{w2!r}\t{score:.2f}')

- 模拟输出
```
'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06
```

In [None]:
# Print the 5 words most similar to "car" or "minivan" (both are passed as positive examples)
print(wv.most_similar(positive=['car', 'minivan'], topn=5))
# Output:
# [('SUV', 0.8532192707061768), ('vehicle', 0.8175783753395081), ('pickup_truck', 0.7763688564300537), ('Jeep', 0.7567334175109863), ('Ford_Explorer', 0.7565720081329346)]

In [None]:
# Which item does not belong in the sequence?
print(wv.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'car']))
# Output:
# car

Training Your Own Model
-----------------------

- 使用 Lee 评估语料库（如果您安装了 Gensim，则已经拥有该语料库）
- 这个语料库足够小，可以完全放入内存中
- 但我们将实现一个内存友好的迭代器，它可以逐行读取它，以演示如何处理更大的语料库。

In [3]:
from gensim.test.utils import datapath
# Resolve the location of the Lee background corpus inside gensim's bundled test data
datapath('lee_background.cor')
# Output on Linux:
# /usr/local/lib/python3.10/dist-packages/gensim/test/test_data/lee_background.cor

'/Users/zhaoweiguo/9tool/miniconda3/envs/chatgpt/lib/python3.10/site-packages/gensim/test/test_data/lee_background.cor'

In [4]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus:
    """An iterable that yields sentences (lists of str) from the Lee background corpus.

    The corpus file is re-opened on every call to ``__iter__``, so the same
    object can be iterated more than once.
    """

    def __iter__(self):
        corpus_path = datapath('lee_background.cor')
        # The context manager closes the file handle once iteration finishes,
        # fails, or the generator is closed, instead of leaving it to the
        # garbage collector.
        with open(corpus_path) as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield utils.simple_preprocess(line)

In [None]:
import gensim.models

# Train a Word2Vec model on the corpus iterator with default parameters
sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences)
print("==========================")
# NOTE(review): print() uses str(), so these two lines likely show gensim's summary strings
# (e.g. "Word2Vec<vocab=..., vector_size=..., alpha=...>") rather than the default
# "<... object at 0x...>" reprs -- confirm against an actual run.
print(model)       # the trained gensim.models.word2vec.Word2Vec instance
print(model.wv)    # its gensim.models.keyedvectors.KeyedVectors, the same type as `wv` above

In [15]:
vec_king = model.wv['king'] #  “wv” stands for “word vectors”
# This model was trained with the default vector_size (100), so the shape is (100,), not (300,).
# A bare expression in the middle of a cell is not displayed; only the last line is.
vec_king.shape
vec_king

array([-0.02174288,  0.06308484,  0.01441088,  0.01603901,  0.01316035,
       -0.11271596,  0.04764793,  0.11911392, -0.00727676, -0.02178196,
       -0.00759635, -0.07258018,  0.00744819,  0.03911889,  0.00879618,
        0.01496025, -0.00348121, -0.00139711, -0.02534691, -0.08332758,
        0.04878865,  0.01100308,  0.01575529, -0.00158456, -0.02796627,
        0.0264222 , -0.02473329, -0.01437838, -0.03901039,  0.01756936,
        0.04521987, -0.05508058,  0.04803083, -0.04291599, -0.01334529,
        0.06593297,  0.02051216,  0.01240187, -0.02542784, -0.04047228,
       -0.01836985,  0.00515179, -0.01495974,  0.01715535,  0.03786959,
       -0.02594631, -0.03560946, -0.00415782,  0.011525  ,  0.04263784,
        0.02222399, -0.03364095, -0.02248785,  0.00086933, -0.02121293,
        0.02388584, -0.00050786,  0.00049492, -0.03180176,  0.00444909,
       -0.01861753,  0.0002589 ,  0.01320993, -0.00435303, -0.03973737,
        0.08123787,  0.02082963,  0.04370801, -0.05350183,  0.06

In [16]:
# Same vocabulary walk as for the pre-trained vectors above (first 10 entries only)
for index, word in enumerate(model.wv.index_to_key[:10]):
    print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

word #0/1750 is the
word #1/1750 is to
word #2/1750 is of
word #3/1750 is in
word #4/1750 is and
word #5/1750 is he
word #6/1750 is is
word #7/1750 is for
word #8/1750 is on
word #9/1750 is said


Storing and loading models
--------------------------

- 使用标准 gensim 方法存储 / 加载模型：

In [None]:
import tempfile

# delete=False: the file is not removed when the handle is closed, so the
# saved model stays available under `temporary_filepath` for later cells.
with tempfile.NamedTemporaryFile(prefix='gensim-model-', delete=False) as tmp_handle:
    temporary_filepath = tmp_handle.name

    # Store the model at that path; from here it can be copied to other
    # machines, shared with others, etc.
    model.save(temporary_filepath)

    # Load the saved model back (gensim uses pickle internally for this).
    new_model = gensim.models.Word2Vec.load(temporary_filepath)


- 加载由原始 C 工具创建的模型(格式说明，不能运行)

In [None]:
# Load vectors created by the original C tool, in either its text or its binary format.
# The results are KeyedVectors objects, so they are bound to their own names
# instead of overwriting `model` (a Word2Vec instance used by later cells).
wv_from_text = gensim.models.KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False)
# using gzipped/bz2 input works too, no need to unzip
wv_from_bin = gensim.models.KeyedVectors.load_word2vec_format('/tmp/vectors.bin.gz', binary=True)

Training Parameters
-------------------

- ``Word2Vec`` accepts several parameters that affect both training speed and quality.
- ``Word2Vec`` 是用于训练 Word Embedding 模型的类

#### min_count

- 用于修剪内部字典。在十亿个单词的语料库中只出现一两次的单词可能是无趣的拼写错误和垃圾。此外，没有足够的数据来对这些单词进行任何有意义的训练，因此最好忽略它们：
- default value of min_count=5
- A reasonable value for min_count is between 0-100, depending on the size of your dataset.

In [None]:
# Retrain with a higher pruning threshold than the default min_count
model = gensim.models.Word2Vec(sentences=sentences, min_count=10)

#### vector_size

- vector_size 是 gensim Word2Vec 将单词映射到的 N 维空间的维数 (N)
- 较大的尺寸值需要更多的训练数据，但可以产生更好（更准确）的模型
- 合理的值在数十到数百之间。

In [None]:
# vector_size defaults to 100; here the embeddings get 200 dimensions instead.
model = gensim.models.Word2Vec(sentences=sentences, vector_size=200)

#### workers

- 用于训练并行化，以加快训练速度
- workers 参数仅在安装了 [Cython](http://cython.org/) 时才有效。如果没有 Cython，由于 GIL（Global Interpreter Lock，全局解释器锁）的限制，您将只能使用一个核心

In [None]:
# workers defaults to 3 (the tutorial text says 1...)
model = gensim.models.Word2Vec(sentences=sentences, workers=4)

Memory
------

- word2vec 模型参数存储为矩阵（NumPy 数组）
- 每个数组包含 #vocabulary（唯一单词数）× vector_size（向量维数）个浮点数（单精度，即每个 4 字节）
- #vocabulary(唯一单词数)可以通过 min_count 参数控制
- RAM 中保存了三个这样的矩阵（正在努力将该数字减少到两个，甚至一个）
- 因此，如果您的输入包含 100,000 个唯一单词，并且您要求层 vector_size=200 ，则模型将需要大约
```
100,000*200*4*3 bytes = ~229MB
```



Evaluating
----------

- Word2Vec 训练是一项无监督任务，没有好的方法来客观评估结果。评估取决于您的最终应用。
- 谷歌已经发布了大约 20,000 个句法和语义测试示例的测试集，遵循 “A is to B as C is to D” 任务。它在 “数据集” 文件夹中提供。
- 比较类型的句法类比是 ``bad:worse;good:?`` 。数据集中共有 9 种句法比较，如复数名词和相反含义的名词。
- 语义问题包含五种类型的语义类比，例如首都 ( Paris:France;Tokyo:? ) 或家庭成员 ( brother:sister;dad:? )


questions-words.txt 文件中可能包含的一些语法和语义比较类型的示例：
```
名词的复数形式：
    例如：boy boys, girl girls

动词的现在分词形式：
    例如：go going, walk walking

国家与其首都：
    例如：France Paris, Japan Tokyo

形容词的比较级：
    例如：good better, bad worse

名词的所有格：
    例如：man's men, woman's women

名词与其相关的形容词：
    例如：fish fishy, water watery

动词的时态：
    例如：run running, eat eating

相似关系：
    例如：cat kitten, dog puppy

反义词：
    例如：happy sad, hot cold
```

Gensim supports the same evaluation set, in exactly the same format:




In [None]:
# Evaluate the model on the word-analogy task.
# questions-words.txt is a text file of analogy questions, commonly used to test how well
# a model captures relations between words.
# Each question has the form "a is to b as c is to ?", and the model has to predict the fourth word.
# NOTE(review): the gensim API reference describes the return value as a
# (overall score, per-section results) tuple rather than a dict -- confirm for the installed version.
model.wv.evaluate_word_analogies(datapath('questions-words.txt'))

- 2016年12月发布的 Gensim 中，我们添加了一种更好的方法来评估语义相似性
- 默认情况下，它使用学术数据集 WS-353，但您可以基于它创建特定于您的业务的数据集
- 它包含单词对以及人工指定的相似性判断。它衡量两个单词的相关性或共现性。

In [None]:
# Evaluate the model on the word-similarity task.
# wordsim353.tsv holds word-similarity evaluation data, typically word pairs with
# human-assigned similarity scores.
model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))

#### 注意
在 Google 或 WS-353 测试集上的良好性能并不意味着 word2vec 在您的应用程序中运行良好，反之亦然。最好直接评估您的预期任务。

Online training / Resuming training
-----------------------------------

Advanced users can load a model and continue training it with more sentences
and [new vocabulary words](online_w2v_tutorial.ipynb):




In [None]:
# Reload the model that was saved to the temporary file earlier
model = gensim.models.Word2Vec.load(temporary_filepath)
more_sentences = [
    ['Advanced', 'users', 'can', 'load', 'a', 'model',
     'and', 'continue', 'training', 'it', 'with', 'more', 'sentences'],
]
# update=True: presumably extends the existing vocabulary instead of rebuilding it -- see the gensim docs
model.build_vocab(more_sentences, update=True)
model.train(more_sentences, total_examples=model.corpus_count, epochs=model.epochs)

# cleaning up temporary file
# NOTE: once the file is removed, re-running this cell fails at the load above
# until the save cell has been run again.
import os
os.remove(temporary_filepath)

Training Loss Computation
-------------------------

- 参数 compute_loss 可用于在训练 Word2Vec 模型时进行损失计算

In [None]:
# instantiating and training the Word2Vec model
model_with_loss = gensim.models.Word2Vec(
    sentences,          # the input sentences
    min_count=1,        # ignore words occurring fewer than 1 time (i.e. keep every word)
    compute_loss=True,  # track the loss value during training
    hs=0,               # use negative sampling instead of hierarchical softmax
    sg=1,               # use the skip-gram model
    seed=42,            # random seed, for reproducibility
)

# getting the training loss value
training_loss = model_with_loss.get_latest_training_loss()
print(training_loss)    # 1357486.875
# The loss measures how well the model learned during training; it usually expresses
# how far the model's predictions are from the actual values.
# For Word2Vec, a lower loss means the embeddings fit the training data better.
# The loss is one indicator of model quality and should not be read in isolation: combine it
# with other evaluation metrics, the requirements of the task, and possible overfitting.

Benchmarks
----------

我们将使用以下数据进行基准测试：
```
1. Lee Background corpus: included in gensim’s test data
2. Text8 corpus. To demonstrate the effect of corpus size, we’ll look at the first 1MB, 10MB, 50MB of the corpus, as well as the entire thing.
```

In [None]:
import io
import os

import gensim.models.word2vec
import gensim.downloader as api
import smart_open


def head(path, size):
    """Return an in-memory text buffer holding the result of reading ``size`` from ``path``.

    The file is opened with ``smart_open.open`` and read with a single
    ``read(size)`` call; the text read is wrapped in an ``io.StringIO``.
    """
    with smart_open.open(path) as source:
        leading_text = source.read(size)
    return io.StringIO(leading_text)


def generate_input_data():
    """Yield the benchmark corpora as ``LineSentence`` objects carrying a ``name`` label.

    The Lee background corpus comes first (labelled '25kB'), followed by
    prefixes of the text8 corpus labelled '1MB', '10MB', '50MB' and '100MB'.
    """
    lee_corpus = gensim.models.word2vec.LineSentence(datapath('lee_background.cor'))
    lee_corpus.name = '25kB'
    yield lee_corpus

    text8_path = api.load('text8').fn
    mebibyte = 1024 ** 2
    prefixes = (
        ('1MB', mebibyte),
        ('10MB', 10 * mebibyte),
        ('50MB', 50 * mebibyte),
        ('100MB', 100 * mebibyte),
    )
    for label, size in prefixes:
        prefix_corpus = gensim.models.word2vec.LineSentence(head(text8_path, size))
        prefix_corpus.name = label
        yield prefix_corpus


# Materialise all benchmark corpora up front so they can be sliced and reused
input_data = list(generate_input_data())
# Output:
# [<gensim.models.word2vec.LineSentence at 0x7f257f4bf250>,
#  <gensim.models.word2vec.LineSentence at 0x7f248dce6f80>,
#  <gensim.models.word2vec.LineSentence at 0x7f248dce7940>,
#  <gensim.models.word2vec.LineSentence at 0x7f248dce5180>,
#  <gensim.models.word2vec.LineSentence at 0x7f248dce6680>]


- 比较输入数据和模型训练参数（例如 hs 和 sg ）的不同组合所需的训练时间
- 对于每个组合，我们重复测试几次以获得测试持续时间的平均值和标准差

In [None]:
import time
import numpy as np
import pandas as pd

# Temporarily reduce logging verbosity.
# Logger.setLevel() is used instead of assigning `.level` directly, so the logging
# module's own level bookkeeping is kept in sync; the previous level is restored
# once the benchmark is done (see the `finally` below).
previous_log_level = logging.root.level
logging.root.setLevel(logging.ERROR)

train_time_values = []
seed_val = 42
sg_values = [0, 1]
hs_values = [0, 1]

# fast=True benchmarks only the three smallest corpora
fast = True
if fast:
    input_data_subset = input_data[:3]
else:
    input_data_subset = input_data

try:
    for data in input_data_subset:
        for sg_val in sg_values:
            for hs_val in hs_values:
                for loss_flag in [True, False]:
                    # Train each configuration three times to get a mean and a spread.
                    time_taken_list = []
                    for _ in range(3):
                        # perf_counter() is a monotonic, high-resolution clock meant for timing
                        start_time = time.perf_counter()
                        w2v_model = gensim.models.Word2Vec(
                            data,
                            compute_loss=loss_flag,
                            sg=sg_val,
                            hs=hs_val,
                            seed=seed_val,
                        )
                        time_taken_list.append(time.perf_counter() - start_time)

                    time_taken_list = np.array(time_taken_list)
                    time_mean = np.mean(time_taken_list)
                    time_std = np.std(time_taken_list)

                    model_result = {
                        'train_data': data.name,
                        'compute_loss': loss_flag,
                        'sg': sg_val,
                        'hs': hs_val,
                        'train_time_mean': time_mean,
                        'train_time_std': time_std,
                    }
                    print("Word2vec model #%i: %s" % (len(train_time_values), model_result))
                    train_time_values.append(model_result)
finally:
    # Undo the temporary verbosity change even if a training run fails.
    logging.root.setLevel(previous_log_level)

train_times_table = pd.DataFrame(train_time_values)
# NOTE: 'train_data' holds string labels, so it is sorted lexicographically, not by corpus size.
train_times_table = train_times_table.sort_values(
    by=['train_data', 'sg', 'hs', 'compute_loss'],
    ascending=[False, False, True, False],
)
print(train_times_table)


#### 模拟输出
```
    compute_loss  hs  sg train_data  train_time_mean  train_time_std
4           True   0   1       25kB         0.472116        0.015137
5          False   0   1       25kB         0.469522        0.003345
6           True   1   1       25kB         0.950259        0.005153
7          False   1   1       25kB         0.942416        0.009776
0           True   0   0       25kB         0.252174        0.020227
1          False   0   0       25kB         0.258985        0.026276
2           True   1   0       25kB         0.419408        0.002198
3          False   1   0       25kB         0.430876        0.001000
12          True   0   1        1MB         1.506507        0.036966
13         False   0   1        1MB         1.537814        0.010207
14          True   1   1        1MB         3.302257        0.045232
15         False   1   1        1MB         3.492871        0.193276
8           True   0   0        1MB         0.644114        0.009346
9          False   0   0        1MB         0.656217        0.027036
10          True   1   0        1MB         1.315072        0.094572
11         False   1   0        1MB         1.205833        0.005159
20          True   0   1       10MB        20.357308        0.412410
21         False   0   1       10MB        21.380845        1.690947
22          True   1   1       10MB        44.487718        1.131427
23         False   1   1       10MB        44.517535        1.447279
16          True   0   0       10MB         7.446084        0.789432
17         False   0   0       10MB         7.060013        0.213669
18          True   1   0       10MB        14.277136        0.744163
19         False   1   0       10MB        13.758649        0.373940

```

Visualising Word Embeddings
---------------------------

- 可视化可用于注意到数据中的语义和句法趋势:
```
* Semantic: words like cat, dog, cow, etc. have a tendency to lie close by
* Syntactic: words like run, running or cut, cutting lie close together.
```

Vector relations like vKing - vMan = vQueen - vWoman can also be noticed.

#### 注意
**Important:** The model used for the visualisation is trained on a small corpus. Thus
some of the relations might not be so clear.


In [None]:
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model):
    """Project the model's word vectors down to two dimensions with t-SNE.

    Returns a ``(x_vals, y_vals, labels)`` tuple: two lists with the projected
    coordinates and a numpy array built from ``model.wv.index_to_key``.
    """
    target_dims = 2  # dimensionality of the projection (2D, 3D, etc)

    # pull the vectors and their words out of the model as numpy arrays
    word_vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # t-SNE with random_state pinned to 0
    embedded = TSNE(n_components=target_dims, random_state=0).fit_transform(word_vectors)

    x_vals = list(embedded[:, 0])
    y_vals = list(embedded[:, 1])
    return x_vals, y_vals, labels


x_vals, y_vals, labels = reduce_dimensions(model)

In [None]:

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the labelled points as a plotly text scatter.

    With ``plot_in_notebook`` true the figure is shown through ``iplot`` after
    ``init_notebook_mode``; otherwise ``plot`` is called with the file name
    'word-embedding-plot.html'.
    """
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels):
    """Scatter-plot the points with matplotlib and annotate a random subset with their labels.

    Args:
        x_vals: x coordinates, one per point.
        y_vals: y coordinates, one per point.
        labels: label for each point, indexable in the same order as the coordinates.

    At most 25 points are annotated; when fewer than 25 labels are given, all of
    them are annotated.
    """
    import matplotlib.pyplot as plt
    import random

    max_annotations = 25

    # fixed seed so the same subset of points is labelled on every run
    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label a randomly subsampled set of data points.
    # The sample size is capped at the number of points, because random.sample()
    # raises ValueError when asked for more items than the population holds.
    #
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, min(max_annotations, len(indices)))
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# Pick the plotting backend: plotly when get_ipython() is available, matplotlib otherwise.
try:
    get_ipython()
except NameError:
    # The name is not defined outside an IPython session, so the lookup itself fails;
    # catching only NameError keeps unrelated errors visible.
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

plot_function(x_vals, y_vals, labels)


<img src="https://img.zhaoweiguo.com/uPic/2024/01/QK9i3z.png" width="30%">


In [1]:
from gensim.test.utils import datapath
from gensim import utils

class MyCorpus:
    """An iterable that yields sentences (lists of str) from the Lee background corpus.

    The corpus file is re-opened on every call to ``__iter__``, so the same
    object can be iterated more than once.
    """

    def __iter__(self):
        corpus_path = datapath('lee_background.cor')
        # The context manager closes the file handle once iteration finishes,
        # fails, or the generator is closed, instead of leaving it to the
        # garbage collector.
        with open(corpus_path) as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield utils.simple_preprocess(line)


In [2]:
import gensim.models

# Rebuild the corpus iterator and train a Word2Vec model with default parameters
sentences = MyCorpus()
model = gensim.models.Word2Vec(sentences=sentences)

In [8]:
model.wv.vectors

array([[-0.23801143,  0.7230952 ,  0.17190379, ..., -0.11578286,
         0.081784  , -0.08193262],
       [-0.28703138,  0.82561386,  0.20189007, ..., -0.11783595,
         0.0756435 , -0.09346155],
       [-0.2604952 ,  0.7762954 ,  0.16527511, ..., -0.11252926,
         0.0964959 , -0.10583613],
       ...,
       [-0.03199477,  0.07584364,  0.02609465, ..., -0.00832574,
        -0.00141254, -0.00856047],
       [-0.01083308,  0.0550551 ,  0.02093291, ..., -0.01879125,
         0.00293344, -0.00378424],
       [-0.03690437,  0.10409687,  0.02872949, ..., -0.01226656,
         0.00522907, -0.01340222]], dtype=float32)

In [18]:
import numpy as np
# The model's word-vector matrix as a NumPy array
vectors_raw = np.asarray(model.wv.vectors)
vectors_raw

array([[-0.23801143,  0.7230952 ,  0.17190379, ..., -0.11578286,
         0.081784  , -0.08193262],
       [-0.28703138,  0.82561386,  0.20189007, ..., -0.11783595,
         0.0756435 , -0.09346155],
       [-0.2604952 ,  0.7762954 ,  0.16527511, ..., -0.11252926,
         0.0964959 , -0.10583613],
       ...,
       [-0.03199477,  0.07584364,  0.02609465, ..., -0.00832574,
        -0.00141254, -0.00856047],
       [-0.01083308,  0.0550551 ,  0.02093291, ..., -0.01879125,
         0.00293344, -0.00378424],
       [-0.03690437,  0.10409687,  0.02872949, ..., -0.01226656,
         0.00522907, -0.01340222]], dtype=float32)

In [15]:
model.wv.index_to_key[:10]

['the', 'to', 'of', 'in', 'and', 'he', 'is', 'for', 'on', 'said']

In [23]:
# The vocabulary words as a NumPy array
labels = np.asarray(model.wv.index_to_key)
print(len(labels))
labels

1750


array(['the', 'to', 'of', ..., 'finally', 'separate', 'owen'],
      dtype='<U14')

In [25]:
from sklearn.manifold import TSNE                   # final reduction
# Project the raw word vectors to 2 dimensions with t-SNE (random_state pinned to 0)
num_dimensions=2
tsne = TSNE(n_components=num_dimensions, random_state=0)
vectors = tsne.fit_transform(vectors_raw)
print(len(vectors))
vectors

1750


array([[ 71.41756  ,  -6.135798 ],
       [ 74.32616  ,  -1.0158837],
       [ 72.787445 ,  -2.61733  ],
       ...,
       [-45.747684 , -13.382935 ],
       [-57.922096 ,  -5.313386 ],
       [-11.694438 ,  18.641695 ]], dtype=float32)