# 训练词向量



## 利用`gensim.models.word2vec`训练词向量
- 原始语料为[中文维基](https://dumps.wikimedia.org/zhwiki/)

0. 原始语料为 `xml` 格式，需要提取出正文，使用 `WikiExtractor` 包
    0. 命令行提取正文：`python WikiExtractor.py -b 500M -o wiki zhwiki-20190720-pages-articles-multistream.xml.bz2`
    0. 获得的文件中，正文被包含在 `<doc></doc>` 标签内
0. 或者使用 `gensim.corpora.WikiCorpus` 直接处理 `xml.bz2` 文件
0. 由上两步，获得的文本先经过预处理，**每一行一句话，单词间用空格隔开**

In [None]:
!python WikiExtractor.py -b 500M -o datasets/wiki datasets/zhwiki-20190720.xml.bz2

In [None]:
import re

def preprocess_zhwiki_v1():
    """Turn WikiExtractor output into one space-separated sentence per line.

    Reads the module-level ``input_file_path`` and writes to the module-level
    ``output_file_path``. Lines that are blank or consist only of a
    ``<doc ...>`` / ``</doc>`` tag are skipped. Every other line is split into
    sentences on the Chinese full stop, exclamation mark and question mark;
    each sentence is converted with ``zhconv.convert(..., 'zh-cn')``,
    segmented with ``jieba.cut`` and written as words joined by spaces.

    Requires ``zhconv`` and ``jieba`` to be imported by the caller.
    """
    doc_tag = re.compile("(^<doc.*>$)|(^</doc>$)")
    sent_spliter = re.compile("。|！|？")

    # Context managers guarantee both files are closed even if conversion or
    # segmentation raises part-way through the corpus.
    with open(input_file_path, 'r', encoding='utf-8') as input_file, \
            open(output_file_path, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            if not line.strip() or doc_tag.match(line):
                continue
            for s in sent_spliter.split(line):
                s = zhconv.convert(s, 'zh-cn').strip()
                if not s:
                    # The fragment after the last terminator is usually just
                    # the newline; do not emit an empty sentence for it.
                    continue
                output_file.write(' '.join(jieba.cut(s)) + '\n')

In [None]:
from gensim.corpora import WikiCorpus


def preprocess_zhwiki_v2():
    """Convert a Wikipedia dump into one space-separated article per line.

    Reads the module-level ``input_file_path`` through ``WikiCorpus`` and
    writes to the module-level ``output_file_path``. Each token yielded by
    ``WikiCorpus.get_texts()`` is converted with
    ``zhconv.convert(..., 'zh-cn')`` and segmented with ``jieba.cut``; all
    resulting words of an article are joined by spaces on a single line.
    Progress is printed every 200 articles.

    According to the original author's note, WikiCorpus strips punctuation,
    so no sentence splitting is possible here.

    Requires ``zhconv`` and ``jieba`` to be imported by the caller.
    """
    # NOTE(review): `lemmatize` is a gensim 3.x argument; confirm the installed
    # gensim version still accepts it.
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        wiki = WikiCorpus(input_file_path, lemmatize=False, dictionary={})
        for i, text in enumerate(wiki.get_texts(), start=1):
            terms = []
            for token in text:
                # Convert the token itself (the original referenced an
                # undefined name `s` here).
                token = zhconv.convert(token, 'zh-cn')
                terms.extend(jieba.cut(token))
            output_file.write(' '.join(terms) + '\n')

            if i % 200 == 0:
                print('Saved ' + str(i) + ' articles')


# NOTE(review): this input path looks like WikiExtractor output (plain text
# wrapped in <doc> tags), which is what preprocess_zhwiki_v1 parses, whereas
# preprocess_zhwiki_v2 hands it to WikiCorpus, which the notes above describe
# as processing the xml.bz2 dump directly -- confirm which variant is intended.
input_file_path = r'datasets/wiki/AA/wiki_00'
output_file_path = r'datasets/wiki/AA/wiki_corpus'
preprocess_zhwiki_v2()

- 利用上一步生成的处理后的满足 `LineSentence` 格式的文本，创建模型

In [None]:
from gensim.models import word2vec

corpus_path = output_file_path
model_path = r"models/wiki_corpus.model"


def build_model(corpus_path):
    """Train a word2vec model on a LineSentence corpus and save it.

    Args:
        corpus_path: path to a text file with one sentence per line and
            words separated by spaces.

    Returns:
        The trained ``Word2Vec`` model, which is also saved to the
        module-level ``model_path``.
    """
    training_options = dict(
        sg=0,         # model type: CBOW
        size=50,      # dimensionality of the word vectors
        window=5,     # context window size
        min_count=5,  # ignore words that occur fewer than 5 times
        workers=9,
    )
    sentences = word2vec.LineSentence(corpus_path)
    trained = word2vec.Word2Vec(sentences, **training_options)
    trained.save(model_path)
    return trained


# build_model requires the corpus path; calling it without arguments raises TypeError.
build_model(corpus_path)

- 验证训练得到的模型

In [None]:
from gensim.models import word2vec
# NOTE(review): the training cell saves to "models/wiki_corpus.model"; this
# loads a different file -- presumably a separately trained 50-d model; confirm.
model_path = r"models/zhwiki.50d.word2vec"
model = word2vec.Word2Vec.load(model_path)

In [None]:
model.wv.most_similar('数学')

In [None]:
model.wv.most_similar('哲学')

In [None]:
model.wv.most_similar(positive=['女人', '国王'], negative=['男人'])

In [None]:
two_corpus = ["腾讯","阿里巴巴"]
res = model.wv.similarity(two_corpus[0],two_corpus[1])
print("similarity:%.4f"%res)

- 将词向量降维后进行可视化

In [None]:
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
import numpy as np

plt.rcParams['font.sans-serif'] = ['SimHei']
%matplotlib inline

word_vectors = model.wv


def get_model_matrix(word_vectors, required_words):
    """Stack vectors for a random vocabulary sample plus the required words.

    Up to 10000 words are drawn at random from ``word_vectors.vocab``; the
    vectors of ``required_words`` are appended after them. Words whose lookup
    raises ``KeyError`` are skipped silently.

    Args:
        word_vectors: object exposing ``vocab`` and ``word_vec(word)``.
        required_words: iterable of words that must be included when present.

    Returns:
        ``(M, word2Ind)`` where ``M`` is the stacked matrix of vectors and
        ``word2Ind`` maps each word to its row index in ``M`` (a required word
        that was also sampled points at its later, appended row).
    """
    import random

    sampled = list(word_vectors.vocab.keys())
    random.shuffle(sampled)
    del sampled[10000:]
    print("Putting %i words into word2Ind and matrix M..." % len(sampled))

    rows = []
    word2Ind = {}
    for group in (sampled, required_words):
        for token in group:
            try:
                vector = word_vectors.word_vec(token)
            except KeyError:
                continue
            # The next free row index is simply the number of rows so far.
            word2Ind[token] = len(rows)
            rows.append(vector)

    M = np.stack(rows)
    print("Done.")
    return M, word2Ind


words = [
    '数学', '算术', '公理', '积分', '统计', '善恶', '哲学', '伦理', '中国政府', '美国国会', '武侠小说',
    '风靡', '海内外', '受欢迎', '通俗小说', '中华人民共和国', '文化大革命', '反思', '伤痕', '一批', '白话文',
    '诗人', '古诗', '欢迎', '中华民国', '撤退', '台湾', '区别', '思潮', '过渡时期', '通称', '文献', '兴趣',
    '钻研', '语言学', '神秘主义', '更加', '经典', '历史学', '文学', '学术界', '享有', '前所未有', '趋势',
    '受到', '人文主义者', '巨量', '规则', '机器人', '精准', '身躯', '脑', '视频', '确保', '高质量', '适中',
    '价格', '软件设计', '构成', '互补', '并行', '系统分析', '程序设计', '支持', '高级', '课程', '训练',
    '工业', '技能', '羧酸', '柠檬酸', '高效率', '肽键', '细胞骨架', '细胞周期', '氯仿', '甘油', '变型',
    '鞘', '类固醇', '醛', '酮', '糖原', '单糖', '半乳糖', '葡萄糖', '糖苷键', '含氮', '杂环', '嘌呤',
    '辅酶', '底物', '化学能', '磷酸化', '哈康', '延斯', '挪威海', '捕鲸', '挪威政府', '成人礼', '巴伦支海',
    '哥德堡', '区域规划', '润州', '邳州市', '东海县', '丹阳市', '武进区', '临河', '嘈杂', '霰弹枪', '讲席',
    '一滴', '调换', '香港金融管理局', '美圆', '金管局', '毫', '大额', '铜币', '一圆', '镍币', '爆竹',
    '管理科', '中区', '收兑', '财政司'
]

M, word2Ind = get_model_matrix(word_vectors, words)

In [None]:
# 利用 svd 算法进行降维
def reduce_to_k_dim(M, k=2, n_iters=10, random_state=None):
    """Reduce the rows of ``M`` to ``k`` dimensions with truncated SVD.

    Args:
        M: matrix with one row per word (``M.shape[0]`` is reported as the
            number of words).
        k: number of output components.
        n_iters: value passed to ``TruncatedSVD`` as ``n_iter``; defaults to
            the previously hard-coded 10.
        random_state: optional seed forwarded to ``TruncatedSVD`` so the
            projection can be reproduced; ``None`` keeps the old behaviour.

    Returns:
        The result of ``TruncatedSVD.fit_transform(M)``.
    """
    print("Running Truncated SVD over %i words..." % (M.shape[0]))
    svd = TruncatedSVD(n_components=k, n_iter=n_iters,
                       random_state=random_state)
    M_reduced = svd.fit_transform(M)
    print("Done.")
    return M_reduced

M_reduced = reduce_to_k_dim(M)

In [None]:
def plot_embeddings(M_reduced, word2Ind, words):
    """Scatter-plot the 2-D embeddings of ``words`` with text labels.

    Args:
        M_reduced: array indexed by row; each used row unpacks into ``x, y``.
        word2Ind: mapping from word to row index in ``M_reduced``.
        words: words to plot. Words missing from ``word2Ind`` are skipped,
            because ``get_model_matrix`` silently drops out-of-vocabulary
            words and a plain lookup would raise ``KeyError``.
    """
    fig, ax = plt.subplots(1, 1, figsize=(24, 24))
    for word in words:
        index = word2Ind.get(word)
        if index is None:
            continue
        x, y = M_reduced[index]
        # Draw on the axes created above, consistent with tsne_plot.
        ax.scatter(x, y, marker='o', color='red')
        ax.text(x, y, word, fontsize=9)
        
plot_embeddings(M_reduced, word2Ind, words)

In [None]:
# 利用 TSNE 算法进行降维
from sklearn.manifold import TSNE

def tsne_plot(M, word2Ind, words):
    """Project ``M`` to 2-D with t-SNE and plot the selected ``words``.

    Args:
        M: matrix of word vectors, one row per word.
        word2Ind: mapping from word to row index in ``M``.
        words: words to plot. Words missing from ``word2Ind`` are skipped,
            because ``get_model_matrix`` silently drops out-of-vocabulary
            words and a plain lookup would raise ``KeyError``.
    """
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    M_reduced = tsne_model.fit_transform(M)

    fig, ax = plt.subplots(1, 1, figsize=(32, 32))
    for word in words:
        index = word2Ind.get(word)
        if index is None:
            continue
        x, y = M_reduced[index]
        ax.scatter(x, y, marker='o', color='red')
        ax.text(x, y, word, fontsize=9)
        
tsne_plot(M, word2Ind, words)

# TSNE 降维效果比 SVD 要好，但效率更低

- 关键词提取，从 `wv.most_similar()` 出发获取给定单词的相关单词
    - 词向量 `wv.most_similar()` 获得的为出现在相似上下文中的同类词，并不是通常语义含义上的相似词

In [None]:
from collections import defaultdict


def get_related_words(initial_words, model, max_size=500, topn=20):
    """Breadth-first expansion of a seed word through ``most_similar``.

    Starting from ``initial_words`` (a single word), repeatedly takes the next
    queued word and enqueues its ``topn`` most similar words, until
    ``max_size`` distinct words have been collected or the queue is empty.

    Each word is expanded only the first time it is dequeued. Re-expanding
    words that were already seen (as the previous version did) only re-adds
    neighbours that are already queued, and made the loop run forever when
    fewer than ``max_size`` words were reachable.

    Args:
        initial_words: the seed word.
        model: object exposing ``most_similar(word, topn=...)`` returning
            ``(word, score)`` pairs.
        max_size: stop once this many distinct words were collected.
        topn: number of neighbours requested per expansion.

    Returns:
        ``defaultdict(int)`` mapping each collected word to the number of
        times it was dequeued.
    """
    from collections import deque

    unseen = deque([initial_words])  # deque gives O(1) pops from the left
    seen = defaultdict(int)
    reported = -1

    while unseen and len(seen) < max_size:
        # Report progress once per multiple of 50, not on every iteration
        # during which the size happens to stay at that multiple.
        if len(seen) % 50 == 0 and len(seen) != reported:
            reported = len(seen)
            print('search length: {}'.format(len(seen)))

        node = unseen.popleft()
        if node not in seen:
            unseen.extend(w for w, _ in model.most_similar(node, topn=topn))

        seen[node] += 1
    return seen


actions = get_related_words("说", word_vectors)
actions

- `wordcloud` 实现词云

In [None]:
# 培根散文集的词云
data_path = r'datasets/Bacon Francis - Essays.txt'

import os

from os import path
from wordcloud import WordCloud

# Read through a context manager so the file handle is closed right away.
with open(data_path) as f:
    text = f.read()

wordcloud = WordCloud().generate(text)

import matplotlib.pyplot as plt
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")

wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

## `tensorflow.nn.sampled_softmax_loss` 训练词向量

- 利用上述预处理后的文件
- 需要根据语料创建词汇表
- 需要将句子分词后的词语列表，转变成 中心词-上下文词 组成的词对

In [None]:
import zipfile
import os
from collections import Counter
import random
import bz2

import random
import math
from six.moves import xrange

import zipfile
import tensorflow as tf

import collections

import numpy as np

tf.test.is_gpu_available()

In [None]:
# 将原始文件转化成单词列表
# 大段的文本直接利用空格拆分
def read_data(file_path):
    """Return the whitespace-separated words of the first file in a zip archive.

    Args:
        file_path: path to the zip archive.

    Returns:
        List of words obtained by decoding the first archive member with
        ``tf.compat.as_text`` and splitting on whitespace.
    """
    with zipfile.ZipFile(file_path) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    return tf.compat.as_text(raw_bytes).split()


filename = r'datasets/text8.zip'
vocabulary = read_data(filename)

# 文本中总单词量
print('Data size', len(vocabulary))

# 词汇表大小
len(set(vocabulary))

In [None]:
# 创建词典，替换稀有词
vocabulary_size = 10000


def build_dataset(words, n_words):
    """Build a vocabulary of the most common words and index-encode the text.

    Args:
        words: sequence of tokens.
        n_words: vocabulary size including the ``'UNK'`` placeholder.

    Returns:
        ``(data, count, dictionary, reversed_dictionary)`` where ``data`` is
        ``words`` encoded as indices (0 for out-of-vocabulary tokens),
        ``count`` is ``[['UNK', n_unknown], (word, freq), ...]``,
        ``dictionary`` maps word to index and ``reversed_dictionary`` maps
        index to word.
    """
    # Word/frequency pairs, with the placeholder for rare words first.
    count = [['UNK', -1]] + collections.Counter(words).most_common(n_words - 1)

    # Word -> index, assigned in order of appearance in `count`.
    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    # Encode the text; anything outside the vocabulary becomes index 0.
    data = [dictionary.get(token, 0) for token in words]
    count[0][1] = data.count(0)

    # Index -> word.
    reversed_dictionary = {index: token for token, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary


data, count, dictionary, reverse_dictionary = build_dataset(
    vocabulary, vocabulary_size)
del vocabulary

print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

- 生成批处理数据
    - 将文本`the quick brown fox jumped over the lazy dog`转换为数组 $[123,6,53,24,5,12,89,8,11]$
    - $skip\_window = 2$，对于中心词 `53`，其上下文-中心词对为 $[(123,53),(6,53),(24,53),(5,53)]$
    - $num\_skips = 3 $，从每个窗口词对中选择 3 个，如 $[(123,53),(24,53),(5,53)]$
    - 遍历所有中心词，将词对解包成两个数组，如 batch 为中心词数组 $[53,53,53]$，labels 为上下文词数组 $[123, 24, 5]$

In [None]:
# 生成用于 skip-gram 模型的训练数据
data_index = 0


def generate_batch(batch_size, num_skips, skip_window):
    """Produce one skip-gram training batch from the module-level ``data``.

    Reads the global list ``data`` and advances the global cursor
    ``data_index``, so successive calls walk through the corpus.

    Args:
        batch_size: number of (center, context) pairs; must be a multiple of
            ``num_skips``.
        num_skips: how many context words to sample for each center word; must
            not exceed ``2 * skip_window``.
        skip_window: number of words considered on each side of the center.

    Returns:
        ``(batch, labels)``: ``batch`` is an int32 array of shape
        ``(batch_size,)`` holding center-word indices and ``labels`` an int32
        array of shape ``(batch_size, 1)`` holding the sampled context words.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # Window layout: [skip_window words] center [skip_window words]
    span = 2 * skip_window + 1
    # Sliding window; appending to a full deque drops the oldest element
    buffer = collections.deque(maxlen=span)
    # Restart from the beginning when a full window no longer fits
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        # Positions inside the window other than the center
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j] = buffer[context_word]
        if data_index == len(data):
            # Reached the end of the corpus: refill the window from the start
            buffer.extend(data[0:span])
            data_index = span
        else:
            # Slide the window one word to the right
            buffer.append(data[data_index])
            data_index += 1
    # Move the cursor back by one window (modulo the corpus length);
    # presumably so the next batch does not skip the words at the end of this one
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels


batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=2)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
          reverse_dictionary[labels[i, 0]])

In [None]:
batch

In [None]:
[reverse_dictionary[i] for i in batch]

In [None]:
# Model hyperparameters

batch_size = 128
embedding_size = 64  # dimensionality of the word vectors
skip_window = 1  # window size
num_skips = 2  # how many word pairs to pick from each window
num_sampled = 64  # number of negative samples drawn for the sampled loss

valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [None]:
# Build and train the model with tf.nn.sampled_softmax_loss
initializer_softmax = tf.keras.initializers.GlorotUniform()
# Variables:
# input embeddings, uniformly initialised in [-1, 1)
embeddings_weight = tf.Variable(
    tf.random.uniform([vocabulary_size, embedding_size], -1.0, 1.0))
# output-layer weights and biases used by the sampled softmax
softmax_weight = tf.Variable(
    initializer_softmax([vocabulary_size, embedding_size]))
softmax_bias = tf.Variable(initializer_softmax([vocabulary_size]))

optimizer = tf.keras.optimizers.Adam()

num_step = 100001
for step in range(num_step):
    # Each step draws a fresh skip-gram batch from the corpus
    batch_inputs, batch_labels = generate_batch(batch_size, num_skips,
                                                skip_window)
    batch_inputs = tf.cast(batch_inputs, tf.int32)
    batch_labels = tf.cast(batch_labels, tf.int32)

    with tf.GradientTape() as tape:
        # Look up the embeddings of the center words
        embed = tf.nn.embedding_lookup(embeddings_weight, batch_inputs)
        # Mean sampled-softmax loss over the batch
        loss = tf.reduce_mean(
            tf.nn.sampled_softmax_loss(weights=softmax_weight,
                                       biases=softmax_bias,
                                       inputs=embed,
                                       labels=batch_labels,
                                       num_sampled=num_sampled,
                                       num_classes=vocabulary_size))
    variables = [embeddings_weight, softmax_weight, softmax_bias]
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

In [None]:
embeddings_weight

## `tensorflow.nn.nce_loss` 训练词向量

In [None]:
# Build the model for NCE training
batch_size = 64
# The training indices come from `data`, which build_dataset encoded with
# `dictionary`, so the vocabulary size is taken from that mapping (the
# previously referenced name `word2ind` is not defined anywhere).
# NOTE(review): confirm `dictionary` is the intended vocabulary.
vocabulary_size = len(dictionary)
embedding_dimension = 5
negative_samples = 8
LOG_DIR = 'logs/word2vec'

embeddings = tf.Variable(
    tf.random.uniform([vocabulary_size, embedding_dimension], -1.0, 1.0))
nce_weights = tf.Variable(
    tf.random.truncated_normal([vocabulary_size, embedding_dimension],
                               stddev=1.0 / math.sqrt(embedding_dimension)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Every trainable tensor exactly once; the previous list repeated nce_weights
# and left nce_biases out, so the biases were never updated.
variables = [embeddings, nce_weights, nce_biases]


def loss_fn(embed, labels):
    """Return the mean NCE loss for a batch.

    Args:
        embed: embedded inputs passed to ``tf.nn.nce_loss`` as ``inputs``.
        labels: target indices passed to ``tf.nn.nce_loss`` as ``labels``.

    Uses the module-level ``nce_weights``, ``nce_biases``,
    ``negative_samples`` and ``vocabulary_size``.
    """
    per_example = tf.nn.nce_loss(weights=nce_weights,
                                 biases=nce_biases,
                                 labels=labels,
                                 inputs=embed,
                                 num_sampled=negative_samples,
                                 num_classes=vocabulary_size)
    return tf.reduce_mean(per_example)


optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

for i in range(1000):
    # generate_batch requires num_skips and skip_window as well; reuse the
    # values from the hyperparameter cell (calling it with batch_size alone
    # raises TypeError).
    x_batch, y_batch = generate_batch(batch_size, num_skips, skip_window)
    x_batch = tf.constant(x_batch, dtype=tf.int32)
    y_batch = tf.constant(y_batch, dtype=tf.int32)
    with tf.GradientTape() as tape:
        embed = tf.nn.embedding_lookup(embeddings, x_batch)
        loss = loss_fn(embed, y_batch)
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))

In [None]:
embeddings

## `tensorflow`负采样训练词向量

In [1]:
import collections
import math
import random
import sys
import time
import os
import numpy as np
import tensorflow as tf

### 处理数据集
华尔街日报的文章，每一行一个句子，句子中每个词由空格隔开

In [2]:
# Fail early if the PTB training file is not in the expected directory
assert 'ptb.train.txt' in os.listdir('datasets/ptb')

# List of sentences; each sentence is the list of its whitespace-separated tokens
with open('datasets/ptb/ptb.train.txt', 'r') as f:
    lines = f.readlines()
    raw_dataset = [st.split() for st in lines]

print(len(raw_dataset))
raw_dataset[0]

42068


['aer',
 'banknote',
 'berlitz',
 'calloway',
 'centrust',
 'cluett',
 'fromstein',
 'gitano',
 'guterman',
 'hydro-quebec',
 'ipo',
 'kia',
 'memotec',
 'mlx',
 'nahb',
 'punts',
 'rake',
 'regatta',
 'rubens',
 'sim',
 'snack-food',
 'ssangyong',
 'swapo',
 'wachter']

In [3]:
for st in raw_dataset[:3]:
    print('# tokens:', len(st), st[:5])

# tokens: 24 ['aer', 'banknote', 'berlitz', 'calloway', 'centrust']
# tokens: 15 ['pierre', '<unk>', 'N', 'years', 'old']
# tokens: 11 ['mr.', '<unk>', 'is', 'chairman', 'of']


### 建立词语索引

In [4]:
# Count how often each token occurs, then drop tokens seen fewer than 5 times
counter = collections.Counter([tk for st in raw_dataset for tk in st])
counter = dict(filter(lambda x: x[1] >= 5, counter.items()))
len(counter), counter['learn']

(9858, 25)

In [5]:
# Build the token <-> index mappings
idx2token = [tk for tk, _ in counter.items()]
token2idx = {tk: idx for idx, tk in enumerate(idx2token)}

# Convert the dataset to lists of indices, dropping tokens outside the vocabulary
dataset = [[token2idx[tk] for tk in st if tk in token2idx]
           for st in raw_dataset]

# Total number of tokens left in the corpus
num_tokens = sum([len(st) for st in dataset])
num_tokens

887100

### 二次采样
文本数据中一般会出现一些高频词，如英文中的“the”“a”和“in”，在一个背景窗口中，一个词（如“chip”）和较低频词（如“microprocessor”）同时出现比和较高频词（如“the”）同时出现对训练词嵌入模型更有益。因此，训练词嵌入模型时可以对词进行二次采样。 具体来说，数据集中每个被索引词$w_i$将有一定概率被丢弃，该丢弃概率为
$$ P(w_i) = \max\left(1 - \sqrt{\frac{t}{f(w_i)}}, 0\right)$$
其中 $f(w_i)$ 是数据集中词$w_i$的个数与总词数之比，常数$t$是一个超参数（实验中设为$10^{-4}$）。可见，只有当$f(w_i) > t$时，我们才有可能在二次采样中丢弃词$w_i$，并且越高频的词被丢弃的概率越大。

In [6]:
# 是否丢弃单词，该单词在语料中出现的总次数越高，越可能被丢弃
def discard(idx):
    """Decide at random whether the token with index ``idx`` is dropped.

    The drop probability is ``1 - sqrt(1e-4 / count * num_tokens)`` where
    ``count`` is the token's frequency in the module-level ``counter``, so
    more frequent tokens are dropped more often.

    Returns:
        True when the token should be discarded.
    """
    draw = random.uniform(0, 1)
    token_count = counter[idx2token[idx]]
    drop_probability = 1 - math.sqrt(1e-4 / token_count * num_tokens)
    return draw < drop_probability

# 重采样后数据集
subsampled_dataset = [[tk for tk in st if not discard(tk)] for st in dataset]

# 语料总单词数大大减少
'# tokens: %d' % sum([len(st) for st in subsampled_dataset])

'# tokens: 375402'

In [7]:
# 二次采样后，单词被保留的次数
def compare_counts(token):
    """Report how often ``token`` occurs before and after subsampling.

    Counts occurrences of the token's index in the module-level ``dataset``
    and ``subsampled_dataset`` and returns them in a formatted string.
    """
    token_index = token2idx[token]
    occurrences_before = sum(
        sentence.count(token_index) for sentence in dataset)
    occurrences_after = sum(
        sentence.count(token_index) for sentence in subsampled_dataset)
    return '# %s: before=%d, after=%d' % (token, occurrences_before,
                                          occurrences_after)


print(compare_counts('the'))
# 高频词the被保留的次数从 50770 降低到 2153
print(compare_counts('join'))
# 低频词join基本都保留

# the: before=50770, after=2114
# join: before=45, after=45


### 提取中心词和背景词
每次在整数1和max_window_size（最大背景窗口）之间随机均匀采样一个整数作为背景窗口大小。每个中心词，对应一个背景词列表

In [8]:
# 每个中心词对应一个背景词列表
def get_centers_and_contexts(dataset, max_window_size):
    """Pair every center word with the words inside a random-size window.

    Args:
        dataset: iterable of sentences, each a list of token indices.
        max_window_size: upper bound of the window radius; the radius is drawn
            uniformly from ``1..max_window_size`` for every center word.

    Returns:
        ``(centers, contexts)``: ``centers`` lists every token of every
        sentence with at least two tokens, and ``contexts[i]`` is the list of
        tokens surrounding ``centers[i]``.
    """
    centers, contexts = [], []
    for sentence in dataset:
        length = len(sentence)
        # A sentence needs at least two words to form a center/context pair.
        if length < 2:
            continue
        centers.extend(sentence)
        for position in range(length):
            radius = random.randint(1, max_window_size)
            left = max(0, position - radius)
            right = min(length, position + radius + 1)
            contexts.append([
                sentence[j] for j in range(left, right) if j != position
            ])
    return centers, contexts

In [9]:
tiny_dataset = [list(range(5)), list(range(8, 10))]
print('dataset', tiny_dataset)
for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
    print('center', center, 'has contexts', context)

dataset [[0, 1, 2, 3, 4], [8, 9]]
center 0 has contexts [1, 2]
center 1 has contexts [0, 2, 3]
center 2 has contexts [1, 3]
center 3 has contexts [1, 2, 4]
center 4 has contexts [3]
center 8 has contexts [9]
center 9 has contexts [8]


In [10]:
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)

### 负采样
- 常规的$softmax$会输出整个词汇表的概率分布，计算量巨大
- 使用负采样来进行近似训练：对于一对中心词和背景词，我们随机采样$K$个噪声词（实验中设$K=5$）。根据word2vec论文的建议，噪声词采样概率$P(w)$设为$w$词频与总词频之比的0.75次方。
- 采样之后，一个中心词对应一个背景词列表，还对应一个噪声词列表，噪声词列表的长度是背景词列表长度的$K$倍，此时$softmax$输出为背景词+噪声词列表的概率分布

In [11]:
# 以词频的0.75次方作为权重进行采样
sampling_weights = [counter[w]**0.75 for w in idx2token]


def get_negatives(all_contexts, sampling_weights, K):
    """Sample ``K`` noise words for every context word.

    Args:
        all_contexts: list of context-word lists, one per center word.
        sampling_weights: sampling weight of each vocabulary index.
        K: number of noise words drawn per context word.

    Returns:
        List parallel to ``all_contexts``; entry ``i`` holds
        ``len(all_contexts[i]) * K`` indices, none of which appears in
        ``all_contexts[i]``.
    """
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        # Build the lookup set once per center word instead of once per
        # candidate inside the loop.
        context_set = set(contexts)
        needed = len(contexts) * K
        negatives = []
        while len(negatives) < needed:
            if i == len(neg_candidates):
                # Candidates are drawn in large batches because one call to
                # random.choices per word would be much slower.
                i, neg_candidates = 0, random.choices(population,
                                                      sampling_weights,
                                                      k=int(1e5))
            # Consume the candidates in order.
            neg, i = neg_candidates[i], i + 1
            # A noise word must not be one of the context words.
            if neg not in context_set:
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives


all_negatives = get_negatives(all_contexts, sampling_weights, 5)

In [12]:
# 每个中心词对应的噪声词列表的长度，是背景词列表长度的𝐾倍
assert all([
    len(neg) == len(window) * 5
    for window, neg in zip(all_contexts, all_negatives)
])

### 数据管道
- 每个单词，对应一个**相同长度**的上下文词列表：上下文由背景词+噪声词+填充组成；
- masks 表明上下文中单词是否是填充；
- labels 表明上下文中的每个单词是不是背景词

In [19]:
def preprocess(centers, contexts, negatives):
    """Pad context and noise words to equal length and build masks and labels.

    Args:
        centers: list of center-word indices.
        contexts: list of context-word lists, one per center word.
        negatives: list of noise-word lists, one per center word.

    Returns:
        Tuple of float32 arrays ``(centers, targets, masks, labels)``:
        ``centers`` has shape ``(n, 1)``; each row of ``targets`` is the
        context words followed by the noise words, zero-padded to the longest
        row; ``masks`` is 1 for real entries and 0 for padding; ``labels`` is
        1 for context words and 0 for noise words and padding.
    """
    max_len = max(len(c) + len(n) for c, n in zip(contexts, negatives))

    def padded(values, fill):
        # Right-pad `values` with `fill` up to the common row length.
        return values + [fill] * (max_len - len(values))

    targets, masks, labels = [], [], []
    for cont, neg in zip(contexts, negatives):
        real = cont + neg
        targets.append(padded(real, 0))
        masks.append(padded([1] * len(real), 0))
        labels.append(padded([1] * len(cont), 0))

    def as_float(rows):
        return np.array(rows, dtype=np.float32)

    return (as_float(centers).reshape(-1, 1), as_float(targets),
            as_float(masks), as_float(labels))

In [20]:
%%time
# NOTE: this rebinds `dataset`, which until here was the list of index-encoded
# sentences, to a tf.data.Dataset of (centers, targets, masks, labels) slices.
dataset = tf.data.Dataset.from_tensor_slices(preprocess(
    all_centers, all_contexts, all_negatives))

batch_size = 512
# Shuffle with a buffer as large as the number of examples, then batch
dataset = dataset.shuffle(len(all_centers)).batch(batch_size)

CPU times: user 4.82 s, sys: 136 ms, total: 4.95 s
Wall time: 4.95 s


In [21]:
for batch in dataset.take(1):
    for name, data in zip(['centers', 'contexts_negatives', 'masks', 'labels'],
                          batch):
        print(name, 'shape:', data.shape)

centers shape: (512, 1)
contexts_negatives shape: (512, 60)
masks shape: (512, 60)
labels shape: (512, 60)


In [22]:
for batch in dataset.take(1):
    for name, data in zip(['centers', 'contexts_negatives', 'masks', 'labels'],
                          batch):
        print(name, 'shape:', data.dtype)

centers shape: <dtype: 'float32'>
contexts_negatives shape: <dtype: 'float32'>
masks shape: <dtype: 'float32'>
labels shape: <dtype: 'float32'>


### `skip-gram`模型
利用中心词预测(背景词+噪声词)的概率分布

In [23]:
# 嵌入层
embed = tf.keras.layers.Embedding(input_dim=6, output_dim=4)
embed.build(input_shape=(1, 6))
embed.get_weights()

[array([[ 0.00248704,  0.04588217,  0.03903292,  0.02340596],
        [ 0.01726497, -0.04898253,  0.03221582,  0.02792713],
        [-0.03564789,  0.04589545, -0.04923713, -0.00122647],
        [ 0.04300623,  0.03403026,  0.03677107,  0.02419565],
        [ 0.01740289, -0.03229471,  0.00015046,  0.00863054],
        [ 0.04152843,  0.04772225,  0.02492184,  0.01654983]],
       dtype=float32)]

In [24]:
x = tf.constant([[1, 2, 3], [4, 5, 0]], dtype=tf.int32)
embed(x)

<tf.Tensor: id=77, shape=(2, 3, 4), dtype=float32, numpy=
array([[[ 0.01726497, -0.04898253,  0.03221582,  0.02792713],
        [-0.03564789,  0.04589545, -0.04923713, -0.00122647],
        [ 0.04300623,  0.03403026,  0.03677107,  0.02419565]],

       [[ 0.01740289, -0.03229471,  0.00015046,  0.00863054],
        [ 0.04152843,  0.04772225,  0.02492184,  0.01654983],
        [ 0.00248704,  0.04588217,  0.03903292,  0.02340596]]],
      dtype=float32)>

In [25]:
# 跳字模型，批量数据，输出权重向量
@tf.function
def skip_gram(centers, contexts, embed_v, embed_u):
    """Score every context/noise word against its center word.

    Args:
        centers: center-word indices; ``(batch, 1)`` in this notebook's
            data pipeline.
        contexts: context and noise word indices; ``(batch, L)`` in this
            notebook's data pipeline.
        embed_v: embedding layer applied to ``centers``.
        embed_u: embedding layer applied to ``contexts``.

    Returns:
        Batched inner products of the center vectors with the context
        vectors, i.e. ``v @ u^T`` per batch element.
    """
    center_vectors = embed_v(centers)
    context_vectors = embed_u(contexts)
    return tf.matmul(center_vectors, context_vectors, transpose_b=True)

### 训练模型

#### 损失函数
负采样后的数据中引入了`mask`，填充的数据在训练时应该被遮蔽掉，对应的损失不应该计入模型损失函数。损失函数可以使用二元交叉熵损失函数，下面定义`SigmoidBinaryCrossEntropy`:`x = logits`, `z = labels`

$$loss = z\times\left(-\log(\mathrm{sigmoid}(x))\right) + (1 - z)\times\left(-\log(1 - \mathrm{sigmoid}(x))\right)$$

In [26]:
class SigmoidBinaryCrossEntropy(tf.keras.losses.Loss):
    """Sigmoid binary cross-entropy with an optional padding mask."""

    def __init__(self):
        super().__init__()

    def __call__(self, labels, logits, mask=None):
        """Return the per-row mean of the (optionally masked) element losses.

        Args:
            labels: target tensor, same shape as ``logits``.
            logits: unnormalised scores.
            mask: optional tensor multiplied element-wise into the losses so
                that padded positions contribute zero. When omitted, every
                position counts (the previous version multiplied by ``None``
                and raised).

        Returns:
            Tensor of losses averaged over axis 1. The mean divides by the
            full row length, including masked positions.
        """
        res = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                      logits=logits)
        if mask is not None:
            res = res * mask
        return tf.reduce_mean(res, axis=1)


loss = SigmoidBinaryCrossEntropy()

In [27]:
pred = tf.constant([[1.5, 0.3, -1, 2], [1.1, -0.6, 2.2, 0.4]],
                   dtype=tf.float32)

# 标签变量label中的1和0分别代表背景词和噪声词
label = tf.constant([[1, 0, 0, 0], [1, 1, 0, 0]], dtype=tf.float32)

# 掩码变量
mask = tf.constant([[1, 1, 1, 1], [1, 1, 1, 0]], dtype=tf.float32)

loss(label, pred, mask) * mask.shape[1] / tf.reduce_sum(mask, axis=1)

<tf.Tensor: id=98, shape=(2,), dtype=float32, numpy=array([0.8739896, 1.2099689], dtype=float32)>

#### 训练模型

In [28]:
# The first embedding layer holds the word vectors we want to keep;
# the second embedding layer is discarded after training
embed_size = 100
net = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(idx2token),
                              output_dim=embed_size,
                              name='word2vec'),
    tf.keras.layers.Embedding(input_dim=len(idx2token),
                              output_dim=embed_size,
                              name='output')
])
net.get_layer(name='word2vec')

<tensorflow.python.keras.layers.embeddings.Embedding at 0x7fd5165667d0>

In [29]:
# 训练过程
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

def train(num_epochs):
    """Train the two embedding layers of ``net`` with negative sampling.

    Iterates over the module-level ``dataset`` for ``num_epochs`` epochs,
    using the module-level ``net``, ``loss``, ``optimizer`` and
    ``skip_gram``. After each epoch the mean batch loss and the elapsed time
    are printed.

    Args:
        num_epochs: number of passes over ``dataset``.
    """
    # The layers do not change between batches, so look them up once.
    embed_v = net.get_layer(index=0)
    embed_u = net.get_layer(index=1)

    for epoch in range(num_epochs):
        start, cur_loss, n = time.time(), 0.0, 0
        for center, target, mask, label in dataset:
            # Gradients are taken exactly once per batch, so a
            # non-persistent tape is sufficient.
            with tf.GradientTape() as tape:
                pred = skip_gram(center, target, embed_v, embed_u)
                # The loss averages over the padded row length; rescale by
                # row_length / number_of_real_entries so padding does not
                # dilute the loss.
                l = (loss(label, tf.reshape(pred, label.shape), mask) *
                     mask.shape[1] / tf.reduce_sum(mask, axis=1))
                l = tf.reduce_mean(l)  # mean loss of the batch

            grads = tape.gradient(l, net.variables)
            optimizer.apply_gradients(zip(grads, net.variables))
            cur_loss += l.numpy().item()
            n += 1
        # max(n, 1) avoids ZeroDivisionError when the dataset is empty.
        print('epoch %d, loss %.2f, time %.2fs' %
              (epoch + 1, cur_loss / max(n, 1), time.time() - start))

In [30]:
train(10)

epoch 1, loss 0.45, time 21.91s
epoch 2, loss 0.39, time 21.23s
epoch 3, loss 0.35, time 21.08s
epoch 4, loss 0.32, time 20.95s
epoch 5, loss 0.31, time 20.93s
epoch 6, loss 0.30, time 20.90s
epoch 7, loss 0.30, time 20.95s
epoch 8, loss 0.29, time 20.91s
epoch 9, loss 0.29, time 21.21s
epoch 10, loss 0.28, time 20.91s


#### 验证模型

In [37]:
def get_similar_tokens(query_token, k, embed):
    """Print the ``k`` tokens most cosine-similar to ``query_token``.

    Args:
        query_token: token looked up in the module-level ``token2idx``.
        k: number of neighbours to print.
        embed: layer whose first weight array holds one vector per token.

    The top ``k + 1`` matches are computed and the first one is dropped, on
    the assumption that it is the query token itself.
    """
    weights = tf.convert_to_tensor(embed.get_weights()[0])
    query = tf.reshape(weights[token2idx[query_token]], shape=[-1, 1])

    dots = tf.reshape(tf.matmul(weights, query), shape=[-1])
    # 1e-9 is added for numerical stability.
    norms = tf.sqrt(
        tf.reduce_sum(weights * weights, axis=1) * tf.reduce_sum(query * query)
        + 1e-9)
    cos = dots / norms

    _, best = tf.math.top_k(cos, k=k + 1)
    for i in best.numpy().tolist()[1:]:  # skip the query token
        print('cosine sim=%.3f: %s' % (cos[i], (idx2token[i])))


get_similar_tokens('chip', 5, net.get_layer(index=0))

cosine sim=0.534: microprocessors
cosine sim=0.505: dell
cosine sim=0.498: micro
cosine sim=0.457: folk
cosine sim=0.447: intel
