### 基于Tensorflow实现word2vec

In [18]:
import collections
import os
import random
import urllib
import zipfile

import numpy as np
import tensorflow as tf

In [19]:
# Training parameters
learning_rate = 0.1  # step size handed to the SGD optimizer
batch_size = 128  # number of (center word, context word) pairs per training step
num_steps = 3000000  # total number of training steps
display_step = 10000  # print the loss every `display_step` steps
eval_step = 200000  # print nearest neighbours every `eval_step` steps

# Probe words whose nearest neighbours are printed during evaluation
eval_words = ['nine', 'of', 'going', 'hardware', 'american', 'britain']

# Word2Vec parameters
embedding_size = 200 # dimensionality of each word vector
max_vocabulary_size = 50000 # upper bound on the vocabulary size, 'UNK' included
min_occurrence = 10 # words seen fewer times than this are dropped from the vocabulary
skip_window = 3 # number of context words considered on each side of the center word
num_skips = 2 # number of context words sampled per center word
num_sampled = 64 # number of negative samples passed to the NCE loss

In [20]:
# Load the training corpus (any whitespace-separated text would do).
# Only the first member of the archive is read; it is lower-cased and split on
# whitespace, so every token in `text_words` is a bytes object.
data_path = 'text8.zip'
with zipfile.ZipFile(data_path) as archive:
    corpus_member = archive.namelist()[0]
    text_words = archive.read(corpus_member).lower().split()

In [21]:
# Total number of tokens in the corpus.
len(text_words)

17005207

In [22]:
# Frequency table for the vocabulary.
# Slot 0 is a placeholder for unknown words (its count is filled in once the
# corpus has been encoded); it is followed by the `max_vocabulary_size - 1`
# most frequent tokens in descending order of frequency.
count = [
    ('UNK', -1),
    *collections.Counter(text_words).most_common(max_vocabulary_size - 1),
]

`Counter.most_common` 返回的结果已按词频从高到低排好序，因此 `count` 中除首位的 'UNK' 占位符外，后面的词频依次递减。

In [23]:
# Peek at the 'UNK' placeholder and the most frequent words.
count[0:10]

[('UNK', -1),
 (b'the', 1061396),
 (b'of', 593677),
 (b'and', 416629),
 (b'one', 411764),
 (b'in', 372201),
 (b'a', 325873),
 (b'to', 316376),
 (b'zero', 264975),
 (b'nine', 250430)]

别忘了，咱们还设置了min_occurrence参数，需要判断每一个词是否满足给定条件

In [24]:
# Drop every word that occurs fewer than `min_occurrence` times.
# After the 'UNK' placeholder, `count` is sorted by descending frequency, so the
# rare words form a suffix of the list and can simply be popped from the end.
# The `len(count) > 1` guard keeps the 'UNK' entry at index 0 even when no word
# reaches the threshold: its placeholder count of -1 is itself below
# `min_occurrence`, and later cells rely on index 0 being 'UNK'.
while len(count) > 1 and count[-1][1] < min_occurrence:
    count.pop()

### 词-ID映射

In [25]:
# Vocabulary size is the number of entries left in `count` ('UNK' included).
vocabulary_size = len(count)
# A word's ID is its position in `count`: 'UNK' gets 0 and IDs grow as
# frequency falls.
word2id = {token: idx for idx, (token, _) in enumerate(count)}

In [26]:
# NOTE(review): this displays the whole mapping, one line per vocabulary entry;
# consider showing only a small slice instead.
word2id

{'UNK': 0,
 b'the': 1,
 b'of': 2,
 b'and': 3,
 b'one': 4,
 b'in': 5,
 b'a': 6,
 b'to': 7,
 b'zero': 8,
 b'nine': 9,
 b'two': 10,
 b'is': 11,
 b'as': 12,
 b'eight': 13,
 b'for': 14,
 b's': 15,
 b'five': 16,
 b'three': 17,
 b'was': 18,
 b'by': 19,
 b'that': 20,
 b'four': 21,
 b'six': 22,
 b'seven': 23,
 b'with': 24,
 b'on': 25,
 b'are': 26,
 b'it': 27,
 b'from': 28,
 b'or': 29,
 b'his': 30,
 b'an': 31,
 b'be': 32,
 b'this': 33,
 b'which': 34,
 b'at': 35,
 b'he': 36,
 b'also': 37,
 b'not': 38,
 b'have': 39,
 b'were': 40,
 b'has': 41,
 b'but': 42,
 b'other': 43,
 b'their': 44,
 b'its': 45,
 b'first': 46,
 b'they': 47,
 b'some': 48,
 b'had': 49,
 b'all': 50,
 b'more': 51,
 b'most': 52,
 b'can': 53,
 b'been': 54,
 b'such': 55,
 b'many': 56,
 b'who': 57,
 b'new': 58,
 b'used': 59,
 b'there': 60,
 b'after': 61,
 b'when': 62,
 b'into': 63,
 b'american': 64,
 b'time': 65,
 b'these': 66,
 b'only': 67,
 b'see': 68,
 b'may': 69,
 b'than': 70,
 b'world': 71,
 b'i': 72,
 b'b': 73,
 b'would': 74,
 b'd

### 所有词转换成ID

In [27]:
# Encode the whole corpus as word IDs; tokens missing from the vocabulary fall
# back to ID 0, which is the 'UNK' slot.
data = [word2id.get(token, 0) for token in text_words]
# Every 0 in `data` is an occurrence counted as unknown; store the total in the
# 'UNK' entry of the frequency table.
unk_count = data.count(0)
count[0] = ('UNK', unk_count)
# Reverse lookup: ID -> word.
id2word = {idx: token for token, idx in word2id.items()}

print("Words count:", len(text_words))
print("Unique words:", len(set(text_words)))
print("Vocabulary size:", vocabulary_size)
print("Most common words:", count[:10])

Words count: 17005207
Unique words: 253854
Vocabulary size: 47135
Most common words: [('UNK', 444176), (b'the', 1061396), (b'of', 593677), (b'and', 416629), (b'one', 411764), (b'in', 372201), (b'a', 325873), (b'to', 316376), (b'zero', 264975), (b'nine', 250430)]


### 构建所需训练数据

In [35]:
data_index = 0


def next_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram training batch from the global ID sequence `data`.

    A window of ``2 * skip_window + 1`` consecutive IDs slides over `data`,
    starting at the global cursor `data_index`. For every window position the
    middle ID is the input and `num_skips` distinct context positions, drawn
    with `random.sample`, supply the labels.

    Args:
        batch_size: number of (input, label) pairs; must be a multiple of
            `num_skips`.
        num_skips: context words sampled per center word; at most
            ``2 * skip_window``.
        skip_window: number of context words on each side of the center word.

    Returns:
        A tuple ``(batch, labels)`` of int32 arrays with shapes
        ``(batch_size,)`` and ``(batch_size, 1)``.

    Side effects:
        Advances the global `data_index`, wrapping around at the end of `data`.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    centers = np.empty(batch_size, dtype=np.int32)
    targets = np.empty((batch_size, 1), dtype=np.int32)

    # Window = skip_window words left + center word + skip_window words right.
    span = 2 * skip_window + 1
    # Positions inside the window that hold context words (all but the middle).
    context_positions = [pos for pos in range(span) if pos != skip_window]

    # Start over when a full window no longer fits before the end of the data.
    if data_index + span > len(data):
        data_index = 0
    window = collections.deque(data[data_index:data_index + span], maxlen=span)
    data_index += span

    for group in range(batch_size // num_skips):
        row = group * num_skips
        chosen = random.sample(context_positions, num_skips)
        for offset, pos in enumerate(chosen):
            centers[row + offset] = window[skip_window]
            targets[row + offset, 0] = window[pos]

        if data_index == len(data):
            # Ran off the end: refill the window from the start of the data.
            window.extend(data[0:span])
            data_index = span
        else:
            # Slide one step right; maxlen makes the deque drop its oldest ID.
            window.append(data[data_index])
            data_index += 1

    # Step back by one window so the next call does not skip the words that
    # were loaded into the window but never used as a center word.
    data_index = (data_index + len(data) - span) % len(data)
    return centers, targets

In [29]:
# Trainable parameters, created on the CPU.
with tf.device('/cpu:0'):
    # Input embeddings: one `embedding_size`-dimensional row per vocabulary word.
    embedding = tf.Variable(
        initial_value=tf.random.normal(shape=[vocabulary_size, embedding_size]))
    # Weights and biases handed to the NCE loss, one row / entry per word.
    nce_weights = tf.Variable(
        initial_value=tf.random.normal(shape=[vocabulary_size, embedding_size]))
    nce_biases = tf.Variable(initial_value=tf.zeros(shape=[vocabulary_size]))

### 通过tf.nn.embedding_lookup函数将索引转换成词向量

In [30]:
def get_embedding(x):
    """Look up the embedding vectors for the word IDs in `x`.

    Args:
        x: integer word IDs indexing rows of the global `embedding` matrix.

    Returns:
        The rows of `embedding` selected by `x`, as returned by
        `tf.nn.embedding_lookup`.
    """
    with tf.device('/cpu:0'):
        return tf.nn.embedding_lookup(embedding, x)

### 损失函数定义
- 先分别计算出正样本和采样出的负样本对应的output和label
- 再通过 sigmoid cross entropy来计算output和label的loss

In [31]:
def nce_loss(x_embed, y):
    """Return the batch-mean noise-contrastive-estimation (NCE) loss.

    Args:
        x_embed: embedding vectors of the input (center) words.
        y: IDs of the target (context) words; cast to int64 before use.

    Returns:
        A scalar tensor: the mean of `tf.nn.nce_loss` over the batch, computed
        with `num_sampled` negative samples out of `vocabulary_size` classes.
    """
    with tf.device('/cpu:0'):
        per_example_loss = tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=tf.cast(y, tf.int64),
            inputs=x_embed,
            num_sampled=num_sampled,
            num_classes=vocabulary_size,
        )
        return tf.reduce_mean(per_example_loss)

### 测试观察模块

In [32]:
# Evaluation.
def evaluate(x_embed):
    """Cosine similarity between each query embedding and every word embedding.

    Args:
        x_embed: 2-D tensor with one query embedding per row (for example the
            output of `get_embedding` for a vector of word IDs).

    Returns:
        A float32 tensor with one row per query and one column per vocabulary
        word; entry ``[i, j]`` is the cosine similarity between query ``i`` and
        row ``j`` of the global `embedding` matrix.
    """
    with tf.device('/cpu:0'):
        x_embed = tf.cast(x_embed, tf.float32)
        # Normalise every query row by its own L2 norm. Reducing over the whole
        # matrix would divide all rows by one shared constant, which is not a
        # cosine similarity and makes scores depend on the other queries.
        x_embed_norm = x_embed / tf.sqrt(
            tf.reduce_sum(tf.square(x_embed), 1, keepdims=True))
        # Same row-wise normalisation for the full embedding matrix.
        embedding_norm = embedding / tf.sqrt(
            tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
        # Dot products of unit vectors are cosine similarities.
        cosine_sim_op = tf.matmul(x_embed_norm, embedding_norm, transpose_b=True)
        return cosine_sim_op

# Stochastic gradient descent optimizer using the configured `learning_rate`.
optimizer = tf.optimizers.SGD(learning_rate)

In [33]:
# One optimization step.
def run_optimization(x, y):
    """Apply one SGD update for a batch of (center word, context word) IDs.

    Args:
        x: IDs of the input (center) words.
        y: IDs of the target (context) words.

    Side effects:
        Updates the global `embedding`, `nce_weights` and `nce_biases`
        variables in place through the global `optimizer`.
    """
    trainable = [embedding, nce_weights, nce_biases]
    with tf.device('/cpu:0'):
        # Record the forward pass so the loss can be differentiated.
        with tf.GradientTape() as tape:
            batch_loss = nce_loss(get_embedding(x), y)

        # Gradients of the loss with respect to every trainable variable.
        grads = tape.gradient(batch_loss, trainable)

        # Apply the update.
        optimizer.apply_gradients(zip(grads, trainable))

In [36]:
# IDs of the probe words. The vocabulary keys are bytes, hence the encode().
x_test = np.array([word2id[w.encode('utf-8')] for w in eval_words])
top_k = 8  # number of nearest neighbours reported per probe word

# Training
for step in range(1, num_steps + 1):
    batch_x, batch_y = next_batch(batch_size, num_skips, skip_window)
    run_optimization(batch_x, batch_y)

    first_step = step == 1

    # Report the loss on the batch that was just trained on.
    if first_step or step % display_step == 0:
        loss = nce_loss(get_embedding(batch_x), batch_y)
        print("step: %i, loss: %f" % (step, loss))

    # Evaluation.
    if first_step or step % eval_step == 0:
        print("Evaluation...")
        sim = evaluate(get_embedding(x_test)).numpy()
        for probe, scores in zip(eval_words, sim):
            # Sort by descending similarity and skip rank 0, which is expected
            # to be the probe word itself.
            nearest = (-scores).argsort()[1:top_k + 1]
            log_str = '"%s" nearest neighbors:' % probe
            log_str += ''.join(
                ' %s,' % (id2word[nearest[k]],) for k in range(top_k))
            print(log_str)

step: 1, loss: 518.769836
Evaluation...
"nine" nearest neighbors: b'duckworth', b'dyson', b'colspan', b'transporting', b'johns', b'metastability', b'webelements', b'cashier',
"of" nearest neighbors: b'antiderivative', b'diagnostics', b'reduces', b'lucrative', b'oils', b'astray', b'rosicrucians', b'dependable',
"going" nearest neighbors: b'enrolled', b'indecisive', b'overfishing', b'partake', b'giambattista', b'foci', b'descendant', b'mcnally',
"hardware" nearest neighbors: b'pickled', b'hein', b'tome', b'subclasses', b'cuyp', b'embryo', b'atkinson', b'murmansk',
"american" nearest neighbors: b'schwann', b'bassoons', b'macedonia', b'doubled', b'shattered', b'hide', b'floating', b'rader',
"britain" nearest neighbors: b'fennel', b'childless', b'traffic', b'survivals', b'herbivorous', b'mammalia', b'informational', b'bastille',
step: 10000, loss: 88.785706
step: 20000, loss: 63.120361
step: 30000, loss: 48.168465
step: 40000, loss: 27.085285
step: 50000, loss: 40.907837
step: 60000, loss: 

step: 1240000, loss: 6.008477
step: 1250000, loss: 5.636484
step: 1260000, loss: 6.839955
step: 1270000, loss: 5.529184
step: 1280000, loss: 7.769947
step: 1290000, loss: 5.620362
step: 1300000, loss: 7.210409
step: 1310000, loss: 5.749698
step: 1320000, loss: 5.996649
step: 1330000, loss: 5.485536
step: 1340000, loss: 6.106355
step: 1350000, loss: 6.140479
step: 1360000, loss: 8.180002
step: 1370000, loss: 5.219302
step: 1380000, loss: 5.387230
step: 1390000, loss: 5.244849
step: 1400000, loss: 7.595073
Evaluation...
"nine" nearest neighbors: b'eight', b'seven', b'six', b'four', b'five', b'three', b'one', b'two',
"of" nearest neighbors: b'and', b'the', b'in', b'including', b'with', b'while', b'its', b'modern',
"going" nearest neighbors: b'how', b'little', b'them', b'way', b'without', b'good', b'a', b'will',
"hardware" nearest neighbors: b'information', b'particular', b'development', b'usually', b'all', b'then', b'developed', b'without',
"american" nearest neighbors: b'english', b'fren

step: 2610000, loss: 8.241434
step: 2620000, loss: 5.883458
step: 2630000, loss: 5.183064
step: 2640000, loss: 4.867446
step: 2650000, loss: 5.183759
step: 2660000, loss: 7.073436
step: 2670000, loss: 5.824453
step: 2680000, loss: 5.526841
step: 2690000, loss: 6.368338
step: 2700000, loss: 6.172505
step: 2710000, loss: 5.272530
step: 2720000, loss: 5.366050
step: 2730000, loss: 5.566924
step: 2740000, loss: 6.107131
step: 2750000, loss: 4.969476
step: 2760000, loss: 5.911208
step: 2770000, loss: 6.151473
step: 2780000, loss: 5.911109
step: 2790000, loss: 4.900894
step: 2800000, loss: 5.164389
Evaluation...
"nine" nearest neighbors: b'eight', b'seven', b'six', b'five', b'four', b'three', b'one', b'two',
"of" nearest neighbors: b'the', b'in', b'and', b'under', b'following', b'including', b'from', b'part',
"going" nearest neighbors: b'little', b'put', b'so', b'started', b'continued', b'without', b'once', b'our',
"hardware" nearest neighbors: b'system', b'program', b'source', b'systems', b