以下是本文参考的全部资料，留待以后细读：

[自己动手写word2vec (一):主要概念和流程](http://blog.csdn.net/u014595019/article/details/51884529)

[word2vec 中的数学原理详解（一）目录和前言](http://blog.csdn.net/itplus/article/details/37969519)

[Pycon 2016 tensorflow 研讨会总结 — tensorflow 手把手入门 #第二讲 word2vec](http://nooverfit.com/wp/pycon-2016-tensorflow-%E7%A0%94%E8%AE%A8%E4%BC%9A%E6%80%BB%E7%BB%93-tensorflow-%E6%89%8B%E6%8A%8A%E6%89%8B%E5%85%A5%E9%97%A8-%E7%AC%AC%E4%BA%8C%E8%AE%B2-word2vec/)

[视频：1 导读：word2vec中的数学原理之介绍](https://www.youtube.com/watch?v=HpryEHGKkpY)

[word2vec学习小记](http://ginobefunny.com/post/learning_word2vec/index.html)

[word2vec原理推导与代码分析](http://www.hankcs.com/nlp/word2vec.html)


In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import collections
import math
import numpy as np
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

本次目标：基于 skip-gram 模型训练一个 word2vec 词向量模型，使用的数据集是 [Text8](http://mattmahoney.net/dc/textdata)。
## 拉取数据

In [3]:
# Base URL that maybe_download() prepends to a file name when fetching it.
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Fetch `filename` from `url` unless it already exists, then check its size.

  Args:
    filename: local path of the file; also appended to the module-level
      `url` to form the download address when the file is missing.
    expected_bytes: exact size in bytes the file must have.

  Returns:
    The local path of the verified file.

  Raises:
    Exception: if the size on disk differs from `expected_bytes`.
  """
  already_present = os.path.exists(filename)
  if not already_present:
    # urlretrieve returns (local_path, headers); only the path is needed.
    filename = urlretrieve(url + filename, filename)[0]

  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    # Show the size that was actually found before failing.
    print(actual_bytes)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')

  print('Found and verified %s' % filename)
  return filename

# Download text8.zip (or reuse a local copy) and require it to be exactly 31344016 bytes.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


下载完数据后，进行处理

In [22]:
# Read the corpus into a list in which every element is a single word.
def read_data(filename):
  """Return the words of the first file stored in the zip archive `filename`.

  The member's bytes are converted with tf.compat.as_str and then split on
  whitespace.
  """
  with zipfile.ZipFile(filename) as archive:
    first_member = archive.namelist()[0]
    raw_bytes = archive.read(first_member)
  text = tf.compat.as_str(raw_bytes)
  return text.split()
  
# Load the whole corpus into memory as a list of words and report how many there are.
words = read_data(filename)
print('Data size %d' % len(words))

Data size 17005207


In [23]:
len(words) # 17005207, i.e. about seventeen million words
words[0] # the first word is 'anarchism'

'anarchism'

下一步，我们将进一步地处理这些数据

In [9]:
vocabulary_size = 50000
# Step 2: build the vocabulary and replace very rare words with "UNK"
# (the unknown-word token).
def build_dataset(words, max_vocab=None):
  """Encode a word sequence as integer ids from a frequency-ranked vocabulary.

  Id 0 is reserved for the 'UNK' token; ids 1..max_vocab-1 go to the most
  frequent words in decreasing order of frequency. Every other word is
  encoded as 0.

  Args:
    words: sequence of word strings. It is iterated more than once, so it
      must be re-iterable (e.g. a list).
    max_vocab: total vocabulary size including the 'UNK' slot. Defaults to
      the module-level `vocabulary_size`.

  Returns:
    A tuple (data, count, dictionary, reverse_dictionary):
      data: list of ids, one per input word, in input order.
      count: list whose entry i describes the word with id i. Entry 0 is the
        list ['UNK', n_unknown]; the others are (word, occurrences) tuples.
      dictionary: dict mapping word -> id.
      reverse_dictionary: dict mapping id -> word.
  """
  if max_vocab is None:
    max_vocab = vocabulary_size

  frequencies = collections.Counter(words)
  # A literal 'UNK' token in the corpus must not get a ranked slot of its own:
  # it would overwrite id 0 and make the following word share its id. It is
  # treated like any other unknown word instead.
  frequencies.pop('UNK', None)

  count = [['UNK', -1]]
  count.extend(frequencies.most_common(max_vocab - 1))

  # Position in `count` is the word's id, so count[i] and
  # reverse_dictionary[i] always refer to the same word.
  dictionary = {word: index for index, (word, _) in enumerate(count)}

  data = [dictionary.get(word, 0) for word in words]
  # Id 0 is only ever produced for unknown words, so counting zeros gives
  # the number of words replaced by 'UNK'.
  count[0][1] = data.count(0)

  reverse_dictionary = {index: word for word, index in dictionary.items()}
  return data, count, dictionary, reverse_dictionary


# data: a list holding, in corpus order, the id of every word; a word's id is
#   its index in `dictionary`, i.e. its frequency rank.
# count: a list of [word, occurrences] entries ordered by id (not a dict);
#   entry 0 is the 'UNK' entry.
# dictionary: maps each word to its frequency rank (key: word, value: rank).
# reverse_dictionary: `dictionary` with keys and values swapped
#   (key: rank, value: word).
data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
del words  # Hint to reduce memory.

Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]
Sample data [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]


In [24]:
# The first word of the corpus has id (rank) 5234; look up which word that is.
reverse_dictionary[5234]

'anarchism'

In [25]:
# The number of times that word occurs in the corpus:
count[5234]

('anarchism', 303)

In [18]:
# The first 8 words of the corpus, decoded from their ids:
print('data:', [reverse_dictionary[di] for di in data[:8]])

data: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first']


In [36]:
data_index = 0  # read cursor into `data`; generate_batch advances it with wrap-around


# Step 3: function that generates training batches for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
  """Produce one batch of (center word, context word) id pairs from `data`.

  A window of 2 * skip_window + 1 consecutive ids slides over the module-level
  `data`, starting at the module-level `data_index`. For every window position
  the center id is emitted `num_skips` times, each time paired with a
  different randomly chosen other position of the window.

  Args:
    batch_size: number of pairs to produce; must be a multiple of num_skips.
    num_skips: pairs generated per center word; at most 2 * skip_window.
    skip_window: number of words considered on each side of the center.

  Returns:
    (batch, labels): int32 arrays of shape (batch_size,) and (batch_size, 1)
    holding the center ids and the matching context ids.

  Side effects:
    Advances the global `data_index`, wrapping at len(data).
  """
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window

  window_len = 2 * skip_window + 1  # [ skip_window | center | skip_window ]
  centers = np.ndarray(shape=(batch_size,), dtype=np.int32)
  contexts = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

  # Prime the sliding window with the next `window_len` ids.
  window = collections.deque(maxlen=window_len)
  for _ in range(window_len):
    window.append(data[data_index])
    data_index = (data_index + 1) % len(data)

  for group in range(batch_size // num_skips):
    first_row = group * num_skips
    pick = skip_window       # starts on the center, which is always excluded
    used = [skip_window]     # window positions already paired with this center
    for offset in range(num_skips):
      # Redraw until the position is neither the center nor already used.
      while pick in used:
        pick = random.randint(0, window_len - 1)
      used.append(pick)
      centers[first_row + offset] = window[skip_window]
      contexts[first_row + offset, 0] = window[pick]
    # Slide the window one word forward; the deque drops the oldest id.
    window.append(data[data_index])
    data_index = (data_index + 1) % len(data)
  return centers, contexts

可以这样直观地理解上面定义的函数：

上下文/目标文字组合, 双向窗口大小为1:

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first'] … →

([anarchism, as], originated), ([originated, a], as), ([as, term], a), …

输入/输出组合:

(originated, anarchism), (originated, as), (as, originated), (as, a), …

In [37]:
# Demo: print one 8-pair batch for each of two (num_skips, skip_window) settings.
for num_skips, skip_window in [(2, 1), (4, 2)]:
    # Rewind the corpus cursor so both settings start from the first word.
    data_index = 0
    batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window))
    # Decode ids back into words so the pairs are readable.
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    # labels has shape (8, 1); flatten it before decoding.
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])


with num_skips = 2 and skip_window = 1:
    batch: ['originated', 'originated', 'as', 'as', 'a', 'a', 'term', 'term']
    labels: ['anarchism', 'as', 'originated', 'a', 'term', 'as', 'a', 'of']

with num_skips = 4 and skip_window = 2:
    batch: ['as', 'as', 'as', 'as', 'a', 'a', 'a', 'a']
    labels: ['originated', 'term', 'a', 'anarchism', 'of', 'as', 'originated', 'term']


In [None]:
# Step 4: build and train the skip-gram model.
batch_size = 128
embedding_size = 128 # dimensionality of the embedding vectors, i.e. the hidden-layer width
skip_window = 1 # how many words to consider to the left and to the right (here: one on each side)
num_skips = 2 # how many times an input word is reused to generate a label
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16 # size of the random set of words used to evaluate similarity
valid_window = 100 # only draw validation samples from the head of the distribution (ids below this)
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 64 # number of negative examples to sample for the sampled softmax loss

graph = tf.Graph()
# Pin the ops to the CPU (the option to use when no GPU is available).
with graph.as_default(), tf.device('/cpu:0'):

  # Input data.
  # One center-word id per example and one context-word id per example,
  # matching the shapes returned by generate_batch.
  train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  # Fixed ids of the words whose nearest neighbours are inspected.
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Variables.
  # `embeddings` is the lookup table that plays the role of the hidden layer:
  # one row of embedding_size values per vocabulary word, initialised
  # uniformly between -1.0 and 1.0.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  # Output-layer parameters, one weight row and one bias per vocabulary word.
  softmax_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Model.
  # Look up embeddings for inputs.
  embed = tf.nn.embedding_lookup(embeddings, train_dataset)
  # Compute the softmax loss, using a sample of the negative labels each time.
  loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=embed,
                               labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))

  # Optimizer.
  # Note: The optimizer will optimize the softmax_weights AND the embeddings.
  # This is because the embeddings are defined as a variable quantity and the
  # optimizer's `minimize` method will by default modify all variable quantities
  # that contribute to the tensor it is passed.
  # See docs on `tf.train.Optimizer.minimize()` for more details.
  optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

  # Compute the similarity between the validation words and all embeddings.
  # We use the cosine similarity: every row is scaled to unit L2 norm, so the
  # dot product of two rows is the cosine of the angle between them.
  # NOTE(review): `keep_dims` is the older spelling of this argument; newer
  # TensorFlow 1.x releases call it `keepdims` - confirm against the installed version.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
    normalized_embeddings, valid_dataset)
  # Row i holds the similarity of validation word i to every vocabulary word.
  similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))