# 10.3 word2vec的实现

In [256]:
import collections
import math
import random
import sys
import time
import os
import numpy as np

import tensorflow as tf 

sys.path.append("..") 

# Config 

In [258]:
# When True, only the first `debug_num` sentences of the corpus are kept.
debug = True
debug_num = 30

# Keep only tokens that occur at least `min_count` times.
min_count = 1
# Dimensionality of the word vectors.
vector_size = 100
# Maximum context-window radius.
windows = 5
# Number of noise words drawn per context word.
negative = 5
# Number of training epochs.
epoches = 10

# 1 selects the skip-gram algorithm (forwarded to gensim as `sg`).
sg = 1

initial_lr, min_lr = 0.01, 0.01

# Exponent applied to raw token counts to obtain negative-sampling weights.
ns_exponent = 0.75

# Subsampling threshold t used by `discard`.
sample = 1e-4

seed = 1

np.random.seed(seed)
# Subsampling, window sizes and negative sampling all draw from the stdlib
# `random` module, so it must be seeded as well for `seed` to have any effect.
# TensorFlow's own generator (embedding initialisation, dataset shuffling) is
# not seeded here.
random.seed(seed)

# Read Data

PTB（Penn Tree Bank）是一个常用的小型语料库 [1]。它采样自《华尔街日报》的文章，包括训练集、验证集和测试集。我们将在PTB训练集上训练词嵌入模型。该数据集的每一行作为一个句子。句子中的每个词由空格隔开。

In [161]:
assert 'ptb.train.txt' in os.listdir("../../data/ptb")

In [242]:
# Each line of the PTB training file is one sentence whose tokens are separated
# by whitespace. The encoding is given explicitly so that parsing does not
# depend on the platform's default locale.
with open('../../data/ptb/ptb.train.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    # `st` is short for sentence
    raw_dataset = [st.split() for st in lines]

'# sentences: %d' % len(raw_dataset)

'# sentences: 42068'

In [243]:
if debug:
    raw_dataset = raw_dataset[:debug_num]
#     raw_dataset = [
#         'you are good'.split(' ')
#         , 'you are good'.split(' ')
#     ]
    

In [244]:
len(raw_dataset)

42068

对于数据集的前5个句子，打印每个句子的词数和前5个词。这个数据集中句尾符为“<eos>”，生僻词全用“<unk>”表示，数字则被替换成了“N”。

In [245]:
for st in raw_dataset[:5]:
    print('# tokens:', len(st), st[:5])

# tokens: 24 ['aer', 'banknote', 'berlitz', 'calloway', 'centrust']
# tokens: 15 ['pierre', '<unk>', 'N', 'years', 'old']
# tokens: 11 ['mr.', '<unk>', 'is', 'chairman', 'of']
# tokens: 23 ['rudolph', '<unk>', 'N', 'years', 'old']
# tokens: 34 ['a', 'form', 'of', 'asbestos', 'once']


# Data Process

### 10.3.1.1 建立词语索引

In [246]:
# `tk` is short for token.
# Count every token in the corpus, then keep only those seen at least
# `min_count` times (insertion order of the counter is preserved).
counter = collections.Counter(tk for st in raw_dataset for tk in st)
counter = {tk: freq for tk, freq in counter.items() if freq >= min_count}

In [247]:
vocab_size = len(counter)

In [248]:
len(counter)

9999

In [251]:
# raw_dataset

In [252]:
max([len(sen) for sen in raw_dataset])

82

然后将词映射到整数索引。

In [249]:
# Vocabulary in counter order; a token's position in the list is its integer id.
idx_to_token = list(counter)
token_to_idx = {tk: idx for idx, tk in enumerate(idx_to_token)}
# Re-encode every sentence as ids, dropping tokens that are not in the vocabulary.
dataset = [
    [token_to_idx[tk] for tk in st if tk in token_to_idx]
    for st in raw_dataset
]
num_tokens = sum(len(st) for st in dataset)
'# tokens: %d' % num_tokens

'# tokens: 887521'

In [173]:
# idx_to_token
# token_to_idx

### 10.3.1.2 二次采样

文本数据中一般会出现一些高频词，如英文中的“the”“a”和“in”。通常来说，在一个背景窗口中，一个词（如“chip”）和较低频词（如“microprocessor”）同时出现比和较高频词（如“the”）同时出现对训练词嵌入模型更有益。因此，训练词嵌入模型时可以对词进行二次采样 [2]。
具体来说，数据集中每个被索引词$w_i$将有一定概率被丢弃，该丢弃概率为

$$ P(w_i) = \max\left(1 - \sqrt{\frac{t}{f(w_i)}}, 0\right),$$ 

其中 $f(w_i)$ 是数据集中词$w_i$的个数与总词数之比，常数$t$是一个超参数（实验中设为$10^{-4}$）。可见，只有当$f(w_i) > t$时，我们才有可能在二次采样中丢弃词$w_i$，并且越高频的词被丢弃的概率越大。

In [174]:
def discard(idx):
    """Randomly decide whether the token with id `idx` is dropped by subsampling.

    Returns True with probability max(1 - sqrt(t / f), 0), where t is the
    global `sample` threshold and f is the token's relative frequency
    (`counter[token] / num_tokens`).
    """
    draw = random.uniform(0, 1)
    keep_ratio = math.sqrt(sample / counter[idx_to_token[idx]] * num_tokens)
    return draw < 1 - keep_ratio


# if debug:
#     subsampled_dataset = dataset
# else:
subsampled_dataset = [[tk for tk in st if not discard(tk)] for st in dataset]
'# tokens: %d' % sum([len(st) for st in subsampled_dataset])

'# tokens: 376095'

可以看到，二次采样后我们去掉了一半左右的词。下面比较一个词在二次采样前后出现在数据集中的次数。可见高频词“the”的采样率不足1/20。

In [175]:
def compare_counts(token):
    """Report how often `token` occurs before and after subsampling.

    Counts occurrences of the token's id in the global `dataset` and
    `subsampled_dataset` and returns them in a formatted string.
    """
    idx = token_to_idx[token]
    before = sum(st.count(idx) for st in dataset)
    after = sum(st.count(idx) for st in subsampled_dataset)
    return '# %s: before=%d, after=%d' % (token, before, after)

compare_counts('the')

'# the: before=50770, after=2110'

但低频词“join”则完整地保留了下来。

### 10.3.1.3 提取中心词和背景词


我们将与中心词距离不超过背景窗口大小的词作为它的背景词。下面定义函数提取出所有中心词和它们的背景词。它每次在整数1和`max_window_size`（最大背景窗口）之间随机均匀采样一个整数作为背景窗口大小。

In [176]:
def get_centers_and_contexts(dataset, max_window_size):
    """Extract every centre word together with the list of its context words.

    Args:
        dataset: iterable of sentences, each a list of token ids.
        max_window_size: upper bound for the window radius; for every centre
            word a radius is drawn uniformly from 1..max_window_size.

    Returns:
        (centers, contexts): `centers[i]` is a token id and `contexts[i]` the
        list of ids within the sampled radius around it, excluding the centre
        itself. Sentences with fewer than two tokens are skipped.
    """
    centers, contexts = [], []
    for sentence in dataset:
        # A sentence needs at least two tokens to form a centre/context pair.
        if len(sentence) < 2:
            continue
        centers.extend(sentence)
        for pos in range(len(sentence)):
            radius = random.randint(1, max_window_size)
            left = list(sentence[max(0, pos - radius):pos])
            right = list(sentence[pos + 1:pos + 1 + radius])
            contexts.append(left + right)
    return centers, contexts

下面我们创建一个人工数据集，其中含有词数分别为7和3的两个句子。设最大背景窗口为2，打印所有中心词和它们的背景词。

In [177]:
# tiny_dataset = [list(range(7)), list(range(7, 10))]
# print('dataset', tiny_dataset)

In [178]:
# for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
#     print('center', center, 'has contexts', context)

实验中，我们设最大背景窗口大小为5。下面提取数据集中所有的中心词及其背景词。

In [179]:
# subsampled_dataset

In [180]:
windows

5

In [181]:
# subsampled_dataset

In [182]:
def get_single_centers_and_contexts(dataset, max_window_size):
    """Extract (centre, single context word) pairs.

    Works like `get_centers_and_contexts`, except that each context word gets
    its own entry: the centre id is repeated once per context word and every
    element of `contexts` is a one-element list.

    Args:
        dataset: iterable of sentences, each a list of token ids.
        max_window_size: upper bound for the window radius; for every centre
            word a radius is drawn uniformly from 1..max_window_size.

    Returns:
        (centers, contexts) of equal length.
    """
    centers, contexts = [], []
    for sentence in dataset:
        # A sentence needs at least two tokens to form a centre/context pair.
        if len(sentence) < 2:
            continue
        for pos, center in enumerate(sentence):
            radius = random.randint(1, max_window_size)
            lo = max(0, pos - radius)
            hi = min(len(sentence), pos + 1 + radius)
            for neighbour in range(lo, hi):
                if neighbour == pos:  # the centre is not its own context
                    continue
                centers.append(center)
                contexts.append([sentence[neighbour]])
    return centers, contexts

In [221]:
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, max_window_size=windows)

In [183]:
all_centers_tf, all_contexts_tf = get_single_centers_and_contexts(subsampled_dataset, max_window_size=windows)

In [94]:
len(all_centers_tf)

289

In [95]:
len(all_contexts_tf)

289

In [102]:
# all_contexts_tf

In [103]:
len(all_centers)

91

In [104]:
len(all_contexts)

91

In [106]:
# all_contexts

In [107]:
# all_centers

In [108]:
# all_centers

In [109]:
# all_contexts

In [110]:
# for center, context in zip(all_centers, all_contexts):
#     print('center', center, 'has contexts', context)

## 10.3.2 负采样

我们使用负采样来进行近似训练。对于一对中心词和背景词，我们随机采样$K$个噪声词（实验中设$K=5$）。根据word2vec论文的建议，噪声词采样概率$P(w)$设为$w$词频与总词频之比的0.75次方 [2]。

In [184]:
# all_contexts

In [185]:
def get_negatives(all_contexts, sampling_weights, K):
    """Draw K noise words for every context word of every centre word.

    Args:
        all_contexts: list of context-id lists, one per centre word.
        sampling_weights: unnormalised sampling weight per token id; ids are
            the positions in this list.
        K: number of noise words per context word.

    Returns:
        A list parallel to `all_contexts`; entry i holds
        `len(all_contexts[i]) * K` token ids, none of which occurs in
        `all_contexts[i]`.

    Note:
        Candidates are rejected until enough valid ones are found, so the
        function does not terminate if every id with non-zero weight occurs
        in some context list.
    """
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        # Build the exclusion set once per centre word rather than once per
        # candidate draw.
        excluded = set(contexts)
        num_needed = len(contexts) * K
        negatives = []
        while len(negatives) < num_needed:
            if i == len(neg_candidates):
                # Candidates are drawn in large batches because a single
                # weighted draw per noise word would be slow.
                i, neg_candidates = 0, random.choices(
                    population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            # A noise word must not be one of the context words.
            if neg not in excluded:
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

In [113]:
sampling_weights = [counter[w]**ns_exponent for w in idx_to_token]
# sampling_weights

In [222]:
all_negatives = get_negatives(all_contexts, sampling_weights, K=negative)

In [223]:
len(all_negatives)

375168

In [186]:
all_negatives_tf = get_negatives(all_contexts_tf, sampling_weights, K=negative)

In [118]:
len(all_negatives_tf)

289

In [121]:
# [len(ele) for ele in (all_negatives)]

## 10.3.3 读取数据

我们从数据集中提取所有中心词`all_centers`，以及每个中心词对应的背景词`all_contexts`和噪声词`all_negatives`。我们将通过随机小批量来读取它们。

在一个小批量数据中，第$i$个样本包括一个中心词以及它所对应的$n_i$个背景词和$m_i$个噪声词。由于每个样本的背景窗口大小可能不一样，其中背景词与噪声词个数之和$n_i+m_i$也会不同。在构造小批量时，我们将每个样本的背景词和噪声词连结在一起，并添加填充项0直至连结后的长度相同，即长度均为$\max_i n_i+m_i$（`max_len`变量）。为了避免填充项对损失函数计算的影响，我们构造了掩码变量`masks`，其每一个元素分别与连结后的背景词和噪声词`contexts_negatives`中的元素一一对应。当`contexts_negatives`变量中的某个元素为填充项时，相同位置的掩码变量`masks`中的元素取0，否则取1。为了区分正类和负类，我们还需要将`contexts_negatives`变量中的背景词和噪声词区分开来。依据掩码变量的构造思路，我们只需创建与`contexts_negatives`变量形状相同的标签变量`labels`，并将与背景词（正类）对应的元素设1，其余清0。

下面我们实现这个小批量读取函数`batchify`。它的小批量输入`data`是一个长度为批量大小的列表，其中每个元素分别包含中心词`center`、背景词`context`和噪声词`negative`。该函数返回的小批量数据符合我们需要的格式，例如，包含了掩码变量。

In [185]:
def batchify(data):
    """Pad (center, context, negative) samples to one length and build a Dataset.

    Args:
        data: iterable of (center, context, negative) triples whose elements
            expose `.numpy()`; it is iterated twice (once to find the maximum
            length, once to build the rows).

    Returns:
        A `tf.data.Dataset` of (center, contexts_negatives, mask, label)
        slices. `contexts_negatives` is context + negative padded with 0 up to
        the longest sample; `mask` is 1 on real entries and 0 on padding;
        `label` is 1 on context positions and 0 elsewhere.
    """
    max_len = max(len(c) + len(n) for _, c, n in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center_t, context_t, negative_t in data:
        ctx = context_t.numpy().tolist()
        neg = negative_t.numpy().tolist()
        used = len(ctx) + len(neg)
        padding = [0] * (max_len - used)
        centers.append(center_t.numpy().tolist())
        contexts_negatives.append(ctx + neg + padding)
        masks.append([1] * used + padding)
        labels.append([1] * len(ctx) + [0] * (max_len - len(ctx)))
    center_column = tf.reshape(tf.convert_to_tensor(centers), shape=(-1, 1))
    return tf.data.Dataset.from_tensor_slices((
        center_column,
        tf.convert_to_tensor(contexts_negatives),
        tf.convert_to_tensor(masks),
        tf.convert_to_tensor(labels),
    ))

我们用刚刚定义的`batchify`函数指定`tf.data.Dataset`中小批量的读取方式，然后打印读取的第一个批量中各个变量的形状。

In [186]:
def generator():
    """Yield (center, contexts, negatives) triples from the global sample lists."""
    yield from zip(all_centers, all_contexts, all_negatives)

In [187]:
batch_size = 512
dataset=tf.data.Dataset.from_generator(generator=generator,output_types=(tf.int32,tf.int32, tf.int32))
dataset = dataset.apply(batchify).shuffle(len(all_centers)).batch(batch_size)

In [193]:
for batch in dataset:
    for name, data in zip(['centers', 'contexts_negatives', 'masks',
                           'labels'], batch):
        print(name, 'shape:', data.shape)
    break

centers shape: (101, 1)
contexts_negatives shape: (101, 42)
masks shape: (101, 42)
labels shape: (101, 42)


## 10.3.4 跳字模型
### 10.3.4.1 嵌入层

获取词嵌入的层称为嵌入层，在Keras中可以通过创建`layers.Embedding`实例得到。嵌入层的权重是一个矩阵，其行数为词典大小（`input_dim`），列数为每个词向量的维度（`output_dim`）。我们设词典大小为20，词向量的维度为4。

In [195]:
# embed = tf.keras.layers.Embedding(input_dim=20, output_dim=4)
# embed.build(input_shape=(1,20))
# embed.get_weights()

嵌入层的输入为词的索引。输入一个词的索引$i$，嵌入层返回权重矩阵的第$i$行作为它的词向量。下面我们将形状为(2, 3)的索引输入进嵌入层，由于词向量的维度为4，我们得到形状为(2, 3, 4)的词向量。

In [197]:
# x = tf.convert_to_tensor([[1, 2, 3], [4, 5, 6]], dtype=tf.float32)
# embed(x)

### 10.3.4.2 小批量乘法

我们可以使用`tf.matmul`对两个小批量中的矩阵一一做乘法（小批量乘法）。假设第一个小批量中包含$n$个形状为$a\times b$的矩阵$\boldsymbol{X}_1, \ldots, \boldsymbol{X}_n$，第二个小批量中包含$n$个形状为$b\times c$的矩阵$\boldsymbol{Y}_1, \ldots, \boldsymbol{Y}_n$。这两个小批量的矩阵乘法输出为$n$个形状为$a\times c$的矩阵$\boldsymbol{X}_1\boldsymbol{Y}_1, \ldots, \boldsymbol{X}_n\boldsymbol{Y}_n$。因此，给定两个形状分别为($n$, $a$, $b$)和($n$, $b$, $c$)的张量（`Tensor`），小批量乘法输出的形状为($n$, $a$, $c$)。

In [199]:
# X = tf.ones((2, 1, 4))
# Y = tf.ones((2, 4, 6))
# tf.matmul(X, Y).shape

### 10.3.4.3 跳字模型前向计算

在前向计算中，跳字模型的输入包含中心词索引`center`以及连结的背景词与噪声词索引`contexts_and_negatives`。其中`center`变量的形状为(批量大小, 1)，而`contexts_and_negatives`变量的形状为(批量大小, `max_len`)。这两个变量先通过词嵌入层分别由词索引变换为词向量，再通过小批量乘法得到形状为(批量大小, 1, `max_len`)的输出。输出中的每个元素是中心词向量与背景词向量或噪声词向量的内积。

In [200]:
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    """Forward pass of the skip-gram model.

    Args:
        center: centre-word ids, shape (batch, 1).
        contexts_and_negatives: concatenated context and noise ids,
            shape (batch, max_len).
        embed_v: embedding layer applied to the centre words.
        embed_u: embedding layer applied to the context/noise words.

    Returns:
        Tensor of shape (batch, 1, max_len) holding the inner product of each
        centre vector with each of its context/noise vectors.
    """
    center_vecs = embed_v(center)
    other_vecs = embed_u(contexts_and_negatives)
    # Batched product of v with u transposed over its last two axes.
    return tf.matmul(center_vecs, other_vecs, transpose_b=True)

# TF low level API


### 10.3.5.1 二元交叉熵损失函数
根据负采样中损失函数的定义，我们基于`tf.nn.sigmoid_cross_entropy_with_logits`实现一个支持掩码的二元交叉熵损失函数`SigmoidBinaryCrossEntropyLoss`。

In [66]:
class SigmoidBinaryCrossEntropyLoss(tf.keras.losses.Loss):
    """Sigmoid binary cross-entropy with an optional element-wise mask.

    `__call__` is overridden directly, so the reduction configured on the
    Keras base class is not applied; the result is the per-row mean over
    axis 1.
    """

    def __init__(self):
        super(SigmoidBinaryCrossEntropyLoss, self).__init__()

    def __call__(self, inputs, targets, mask=None):
        """Compute the per-sample masked loss.

        Args:
            inputs: the 0/1 labels (despite the name), shape (batch, n).
            targets: the raw logits (despite the name), shape (batch, n).
            mask: optional 0/1 tensor of shape (batch, n); positions with 0
                contribute nothing to the loss. None means no masking.

        Returns:
            Tensor of shape (batch,): the mean over axis 1 of the (masked)
            element-wise loss. Masked positions still count in the
            denominator, so callers rescale by n / sum(mask) when they want
            the mean over valid positions only.
        """
        labels = tf.cast(inputs, dtype=tf.float32)
        logits = tf.cast(targets, dtype=tf.float32)
        # Keyword arguments make the labels/logits roles explicit.
        res = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
        if mask is not None:
            # Multiplying by the mask zeroes the loss wherever mask == 0.
            res = res * tf.cast(mask, dtype=tf.float32)
        return tf.reduce_mean(res, axis=1)

loss = SigmoidBinaryCrossEntropyLoss()

值得一提的是，我们可以通过掩码变量指定小批量中参与损失函数计算的部分预测值和标签：当掩码为1时，相应位置的预测值和标签将参与损失函数的计算；当掩码为0时，相应位置的预测值和标签则不参与损失函数的计算。我们之前提到，掩码变量可用于避免填充项对损失函数计算的影响。

In [202]:
pred = tf.convert_to_tensor([[1.5, 0.3, -1, 2], [1.1, -0.6, 2.2, 0.4]],dtype=tf.float32)
# 标签变量label中的1和0分别代表背景词和噪声词
label = tf.convert_to_tensor([[1, 0, 0, 0], [1, 1, 0, 0]],dtype=tf.float32)
mask = tf.convert_to_tensor([[1, 1, 1, 1], [1, 1, 1, 0]],dtype=tf.float32)  # 掩码变量
loss(label, pred, mask) * mask.shape[1] / tf.reduce_sum(mask,axis=1)

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.8739895, 1.2099689], dtype=float32)>

作为比较，下面将从零开始实现二元交叉熵损失函数的计算，并根据掩码变量mask计算掩码为1的预测值和标签的损失。

In [203]:
# def sigmd(x):
#     return - math.log(1 / (1 + math.exp(-x)))

# print('%.4f' % ((sigmd(1.5) + sigmd(-0.3) + sigmd(1) + sigmd(-2)) / 4)) # 注意1-sigmoid(x) = sigmoid(-x)
# print('%.4f' % ((sigmd(1.1) + sigmd(-0.6) + sigmd(-2.2)) / 3))

### 10.3.5.2 初始化模型参数

我们分别构造中心词和背景词的嵌入层，并将超参数词向量维度`embed_size`设置成100

In [204]:
embed_size = vector_size
net = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(idx_to_token), output_dim=embed_size),
    tf.keras.layers.Embedding(input_dim=len(idx_to_token), output_dim=embed_size)
])
net.get_layer(index=0)

<keras.layers.core.embedding.Embedding at 0x7ff63fe443d0>

In [205]:
net

<keras.engine.sequential.Sequential at 0x7ff63fe44bb0>

In [209]:
net.get_layer(index=1)

<keras.layers.core.embedding.Embedding at 0x7ff63fe44610>

### 10.3.5.3 定义训练函数
下面定义训练函数。由于填充项的存在，与之前的训练函数相比，损失函数的计算稍有不同。

In [210]:
def train(net, lr, num_epochs, data=None):
    """Train the two embedding layers of `net` with Adam on the skip-gram loss.

    Args:
        net: model whose layer 0 embeds centre words and whose layer 1 embeds
            context/noise words.
        lr: learning rate for the Adam optimizer.
        num_epochs: number of passes over the data.
        data: iterable of (center, context_negative, mask, label) batches.
            Defaults to the global `dataset`; pass it explicitly when that
            name has been rebound to a different dataset later in the
            notebook.

    Prints the mean batch loss and the elapsed time after every epoch.
    """
    batches = dataset if data is None else data
    embed_v, embed_u = net.get_layer(index=0), net.get_layer(index=1)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    for epoch in range(num_epochs):
        start, l_sum, n = time.time(), 0.0, 0
        for center, context_negative, mask, label in batches:
            mask = tf.cast(mask, dtype=tf.float32)
            # The gradient is taken exactly once, so a non-persistent tape
            # suffices and releases its resources right after `gradient()`.
            with tf.GradientTape() as tape:
                pred = skip_gram(center, context_negative, embed_v, embed_u)
                # The loss averages over all max_len positions; rescale so
                # that each row is the mean over its unmasked positions only.
                l = (loss(label, tf.reshape(pred, label.shape), mask) *
                     mask.shape[1] / tf.reduce_sum(mask, axis=1))
                l = tf.reduce_mean(l)  # mean loss of the batch

            params = net.trainable_variables
            grads = tape.gradient(l, params)
            optimizer.apply_gradients(zip(grads, params))
            l_sum += float(l)
            n += 1
        # Guard against an empty dataset instead of dividing by zero.
        mean_loss = l_sum / n if n else float('nan')
        print('epoch %d, loss %.2f, time %.2fs'
              % (epoch + 1, mean_loss, time.time() - start))

In [211]:
train(net, lr=initial_lr, num_epochs=epoches)

epoch 1, loss 0.69, time 0.04s
epoch 2, loss 0.69, time 0.02s
epoch 3, loss 0.68, time 0.01s
epoch 4, loss 0.67, time 0.02s
epoch 5, loss 0.65, time 0.02s
epoch 6, loss 0.64, time 0.02s
epoch 7, loss 0.62, time 0.01s
epoch 8, loss 0.60, time 0.02s
epoch 9, loss 0.57, time 0.01s
epoch 10, loss 0.55, time 0.02s


## 10.3.6 应用词嵌入模型

In [212]:
def get_similar_tokens(query_token, k, embed):
    """Print the `k` tokens most cosine-similar to `query_token`.

    Args:
        query_token: token looked up in the global `token_to_idx`.
        k: number of neighbours to print.
        embed: embedding layer; its first weight array is the embedding matrix.

    The top k+1 hits are computed and the first one is skipped, on the
    expectation that it is the query token itself.
    """
    weights = tf.convert_to_tensor(embed.get_weights()[0])
    query = tf.reshape(weights[token_to_idx[query_token]], shape=[-1, 1])
    dots = tf.reshape(tf.matmul(weights, query), shape=[-1])
    # 1e-9 is added for numerical stability.
    norms = tf.sqrt(
        tf.reduce_sum(weights * weights, axis=1) * tf.reduce_sum(query * query) + 1e-9)
    cos = dots / norms
    _, topk = tf.math.top_k(cos, k=k+1)
    for i in topk.numpy().tolist()[1:]:  # skip the query token itself
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))
        


In [213]:
W = net.get_layer(index=0).get_weights()[0]
W.shape


(295, 100)

In [141]:
# token_to_idx

In [142]:
# W[token_to_idx['you']]

<!-- get_similar_tokens('chip', 3, net.get_layer(index=0)) -->

# TF High Level API

In [124]:
# min([len(con)+len(neg) for con, neg in zip(all_contexts, all_negatives)])


In [187]:
# len(all_centers)

In [188]:
embedding_dim = vector_size
context_negative_num = negative + 1
context_negative_num

6

In [189]:
from tensorflow.keras import layers
class Word2Vec(tf.keras.Model):
    """Skip-gram model that scores each target word against its candidates.

    Holds one embedding table for target (centre) words and one for
    context/noise words; the output is the dot product between the target
    vector and every candidate vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Named so the trained table can be fetched later via get_layer().
        self.target_embedding = layers.Embedding(
            vocab_size, embedding_dim, input_length=1, name="w2v_embedding")
        # `context_negative_num` is a notebook-level global.
        self.context_embedding = layers.Embedding(
            vocab_size, embedding_dim, input_length=context_negative_num)

    def call(self, pair):
        """Return dot-product logits of shape (batch, context).

        Args:
            pair: (target, context); target is (batch,) or (batch, 1),
                context is (batch, context).
        """
        target, context = pair
        # Drop the dummy axis when present (it doesn't exist in TF2.7+).
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        word_emb = self.target_embedding(target)        # (batch, embed)
        context_emb = self.context_embedding(context)   # (batch, context, embed)
        return tf.einsum('be,bce->bc', word_emb, context_emb)

In [190]:
f"{vocab_size}, {embedding_dim}"

'9999, 100'

In [191]:
all_centers[:2]

[10, 15]

In [192]:
# all_contexts[:2]

In [193]:
# all_negatives[:2]

In [194]:
# targets[:2]

In [195]:
# contexts[:2]

In [196]:
# /labels[:2]

In [197]:
def generate_training_data(all_centers, all_contexts, all_negatives,
                           max_positives=3, row_len=None):
    """Build fixed-width (target, candidates, labels) rows for Keras training.

    Args:
        all_centers: centre-word ids.
        all_contexts: list of context-id lists, parallel to `all_centers`.
        all_negatives: list of noise-id lists, parallel to `all_centers`.
        max_positives: at most this many context words are kept per row.
        row_len: total number of candidates per row; defaults to the
            notebook-level `context_negative_num`.

    Returns:
        (centers, contexts_negatives, labels): `centers[i]` is `[center]`,
        `contexts_negatives[i]` holds the kept context ids followed by noise
        ids up to `row_len` entries, and `labels[i]` is 1 on the context
        positions and 0 on the noise positions.

    Raises:
        AssertionError: if a sample has too few noise words to fill its row.
    """
    if row_len is None:
        row_len = context_negative_num
    centers, contexts_negatives, labels = [], [], []
    for center, context, negative in zip(all_centers, all_contexts, all_negatives):
        context_len = min(max_positives, len(context))
        row = context[:context_len] + negative[:row_len - context_len]
        # Report the offending row's own length, not the number of rows so far.
        assert len(row) == row_len, (
            f"row has {len(row)} entries, expected {row_len}; "
            f"{len(context)} contexts; {len(negative)} negatives")
        centers.append([center])
        contexts_negatives.append(row)
        labels.append([1] * context_len + [0] * (row_len - context_len))
    return centers, contexts_negatives, labels

In [198]:
targets, contexts, labels = generate_training_data(all_centers_tf, all_contexts_tf, all_negatives_tf)


targets = np.array(targets).reshape(len(targets), )
contexts = np.array(contexts)
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")




targets.shape: (1688665,)
contexts.shape: (1688665, 6)
labels.shape: (1688665, 6)


In [228]:
# all_centers[:3]

In [229]:
# all_contexts[:3]

In [216]:
# labels[:3]

array([[1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0]])

In [230]:
# targets[:100]

In [231]:
# contexts[:30]

In [199]:
# targets

In [200]:
# labels

In [201]:
# for center, context, negative in zip(all_centers, all_contexts, all_negatives):
#     print(center)
#     print(context)
#     print(negative)
#     break

In [202]:
# batch_size = 512
# dataset=tf.data.Dataset.from_generator(generator=generator,output_types=(tf.int32,tf.int32, tf.int32))
# dataset = dataset.apply(batchify).shuffle(len(all_centers)).batch(batch_size)

In [203]:
BATCH_SIZE = 512
# The pipeline is built from `targets` (one row per centre/context pair), so
# the shuffle buffer must cover that many elements; sizing it from
# `all_centers` (one entry per centre word) would only shuffle within a
# sliding window.
BUFFER_SIZE = len(targets)
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
print(dataset)

<BatchDataset element_spec=((TensorSpec(shape=(None,), dtype=tf.int64, name=None), TensorSpec(shape=(None, 6), dtype=tf.int64, name=None)), TensorSpec(shape=(None, 6), dtype=tf.int64, name=None))>


In [204]:
# print(dataset)

In [205]:
# for ele in dataset:
#     print(ele)

In [206]:
# with tf.Session() as session:
#     print(session.run(dataset))

In [207]:
# loss

In [254]:


word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [255]:
word2vec.fit(dataset, 
             epochs=5,
#              validation_split=0.3
#              callbacks=[tensorboard_callback]
            )

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (42068,) + inhomogeneous part.

In [234]:
w2vec_tf = word2vec.get_layer('w2v_embedding')

In [235]:
def get_similar_token_TF(query_token, k, embed):
    """Print the `k` tokens most cosine-similar to `query_token`.

    Thin alias of `get_similar_tokens`, defined earlier in this notebook with
    an identical implementation; kept so existing calls keep working.

    Args:
        query_token: token looked up in the global `token_to_idx`.
        k: number of neighbours to print.
        embed: embedding layer whose first weight array is the embedding matrix.
    """
    get_similar_tokens(query_token, k, embed)
        


In [236]:
# all_centers

In [237]:
get_similar_token_TF('university', 10, w2vec_tf)

cosine sim=0.699: associate
cosine sim=0.673: barnett
cosine sim=0.639: graduate
cosine sim=0.603: fusion
cosine sim=0.578: devised
cosine sim=0.566: shake
cosine sim=0.553: sherman
cosine sim=0.549: hopkins
cosine sim=0.540: hyman
cosine sim=0.534: musical


In [241]:
get_similar_token_TF('amsterdam', 10, w2vec_tf)

cosine sim=0.753: zurich
cosine sim=0.750: stockholm
cosine sim=0.632: exchange-rate
cosine sim=0.627: sydney
cosine sim=0.619: wellington
cosine sim=0.617: reviews
cosine sim=0.612: prices
cosine sim=0.612: pork
cosine sim=0.611: taipei
cosine sim=0.607: shook


# Gensim High-level API 

In [259]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    '''Callback that prints the loss accumulated during each epoch.'''

    def __init__(self):
        # Index of the epoch about to be reported.
        self.epoch = 0
        # Loss total seen at the end of the previous epoch.
        self.loss_to_be_subed = 0

    def on_epoch_end(self, model):
        # The reported value is treated as a running total, so the epoch's own
        # loss is the difference from the previous reading.
        running_total = model.get_latest_training_loss()
        epoch_loss = running_total - self.loss_to_be_subed
        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
        self.loss_to_be_subed = running_total
        self.epoch += 1
# `Word2Vec` here is gensim's class (imported at the top of this cell), which
# rebinds the name of the Keras `Word2Vec` model defined earlier in the
# notebook. Training uses the same hyperparameters as the TF implementations.
w2v = Word2Vec(
    sentences=raw_dataset,
    vector_size=vector_size,
    window=windows,
    min_count=min_count,
    alpha=initial_lr,
    min_alpha=min_lr,
    negative=negative,
    epochs=epoches,
    sg=sg,
    sample=sample,
    compute_loss=True,  # needed for the per-epoch loss callback
    seed=seed,
    callbacks=[callback()],
)

Loss after epoch 0: 2260088.0
Loss after epoch 1: 1892368.75
Loss after epoch 2: 1560652.25
Loss after epoch 3: 1495446.0
Loss after epoch 4: 1381980.0
Loss after epoch 5: 1182334.0
Loss after epoch 6: 1174722.0
Loss after epoch 7: 1164838.0
Loss after epoch 8: 1193226.0
Loss after epoch 9: 1148993.0


# Result compare

#  TF implementation VS gensim

In [143]:
# idx_to_token

In [146]:
get_similar_tokens('university', 10, net.get_layer(index=0))

cosine sim=0.688: professor
cosine sim=0.536: stanford
cosine sim=0.485: college
cosine sim=0.478: anti-nuclear
cosine sim=0.461: school
cosine sim=0.448: michigan
cosine sim=0.445: mural
cosine sim=0.443: researcher
cosine sim=0.441: legg
cosine sim=0.436: therapy


In [147]:
get_similar_tokens('amsterdam', 10, net.get_layer(index=0))

cosine sim=0.605: manila
cosine sim=0.584: stockholm
cosine sim=0.563: sydney
cosine sim=0.551: milan
cosine sim=0.526: wellington
cosine sim=0.513: paris
cosine sim=0.508: zurich
cosine sim=0.506: elsewhere
cosine sim=0.488: killed
cosine sim=0.480: taipei


In [148]:
get_similar_tokens('china', 10, net.get_layer(index=0))

cosine sim=0.442: kgb
cosine sim=0.434: establishment
cosine sim=0.430: crisis
cosine sim=0.416: curb
cosine sim=0.415: u.s.s.r.
cosine sim=0.411: censorship
cosine sim=0.408: conditional
cosine sim=0.404: beijing
cosine sim=0.403: administration
cosine sim=0.400: chinese


In [149]:
get_similar_tokens('beijing', 10, net.get_layer(index=0))

cosine sim=0.479: mideast
cosine sim=0.460: sovereignty
cosine sim=0.457: freeze
cosine sim=0.455: talks
cosine sim=0.441: reform
cosine sim=0.438: brady
cosine sim=0.426: nelson
cosine sim=0.426: defeated
cosine sim=0.419: two-day
cosine sim=0.418: mansion


In [96]:
w2v.wv.most_similar('university', topn=10) 

[('professor', 0.8840214014053345),
 ('graduate', 0.8317833542823792),
 ('school', 0.8139637112617493),
 ('laboratory', 0.8013948202133179),
 ('kentucky', 0.796154260635376),
 ('harvard', 0.7923484444618225),
 ('maryland', 0.7897931337356567),
 ('researcher', 0.7841968536376953),
 ('associate', 0.7811244130134583),
 ('science', 0.7800496220588684)]

In [99]:
w2v.wv.most_similar('china', topn=10) 

[('beijing', 0.8760640621185303),
 ('chinese', 0.8031082153320312),
 ('sovereignty', 0.7945157885551453),
 ('taiwan', 0.7892906665802002),
 ('nations', 0.763671338558197),
 ('pro-democracy', 0.7571192979812622),
 ('colony', 0.7522749900817871),
 ('communist', 0.7520454525947571),
 ('diplomatic', 0.7403035759925842),
 ('kong', 0.7398679256439209)]

In [103]:
w2v.wv.most_similar('amsterdam', topn=10) 

[('stockholm', 0.9868001937866211),
 ('zurich', 0.9825881719589233),
 ('milan', 0.9605680108070374),
 ('brussels', 0.9582435488700867),
 ('moderately', 0.9368679523468018),
 ('mixed', 0.9228634834289551),
 ('sydney', 0.9123021364212036),
 ('firmer', 0.9113779067993164),
 ('taipei', 0.9100077152252197),
 ('frankfurt', 0.9095544815063477)]

In [68]:
def compare_vector(word):
    """Print the vector of `word` from the global `W`, then from the gensim model."""
    own_vector = W[token_to_idx[word]]
    print(own_vector)
    gensim_vector = w2v.wv[word]
    print(gensim_vector)

In [69]:
compare_vector('university')

[-0.22740479 -1.4507823 ]
[0.52161103 1.4714768 ]


In [70]:
compare_vector('cars')

[ 0.7373704  -0.25241235]
[ 1.1872251  -0.02579357]


In [71]:
compare_vector('good')

[ 0.19601762 -0.85054344]
[0.8336281  0.61083436]
