### Dependencies

In [10]:
import tensorflow as tf
import os
import collections
import math
import numpy as np
import time

fname = 'word_embedding'

### Helpers

In [2]:
def make_dicts(words, vocabulary_size):
  most_common = collections.Counter(words).most_common(vocabulary_size - 1)
  most_common.insert(0, ('<unk>', None))
  
  assert len(most_common) == vocabulary_size
  
  word2id = dict()
  id2word = dict()
  
  for i, (word, _count) in enumerate(most_common):
    word2id[word] = i
    id2word[i] = word
    
  ids = list(map(lambda word: word2id.get(word, word2id['<unk>']), words))

  return ids, word2id, id2word

def skip_gram(i, words, skip_size):
  left = max(0, i - skip_size)
  right = min(len(words), i + 1 + skip_size)
  context_words = words[left:i] + words[i + 1:right] 
  return np.random.choice(context_words)

### Load dataset

In [3]:
dataset_fname = os.path.expanduser('~/Datasets/text8')

with open(dataset_fname) as f:
  words = f.read().split()
  
print('%d words, first %d: %s' % (len(words), 5, words[:5]))

17005207 words, first 5: ['anarchism', 'originated', 'as', 'a', 'term']


### Preprocess dataset

In [4]:
vocabulary_size = 10000 
skip_size = 2
ids, word2id, id2word = make_dicts(words, vocabulary_size)

print('words: %s' % words[:10])
print('ids: %s' % ids[:10])

assert [id2word[word2id[words[i]]] for i in range(10)] == words[:10]

targets = [skip_gram(i, ids, skip_size) for i in range(len(ids))]

words: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
ids: [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]


### Build iterator

In [22]:
batch_size = 32

tf.reset_default_graph()

with tf.name_scope('input_pipeline'):
  x_placeholder = tf.placeholder(tf.int32, [None], name='x_placeholder')
  y_placeholder = tf.placeholder(tf.int32, [None], name='y_placeholder')
  dataset = (tf.contrib.data.Dataset.from_tensor_slices((x_placeholder, y_placeholder))
             .map(lambda x, y: (x, tf.reshape(y, [1])))
             .shuffle(1000)
             .batch(batch_size))

  iterator = dataset.make_initializable_iterator()
  x, y_ = iterator.get_next()

  # dataset = (tf.contrib.data.Dataset.from_tensor_slices((ids, targets))
  #            .map(lambda x, y: (x, tf.reshape(y, [1])))
  #            .shuffle(1000)
  #            .batch(batch_size))

  # iterator = dataset.make_one_shot_iterator()
  # x, y_ = iterator.get_next()

### Build a graph

In [23]:
embedding_size = 300
num_sampled = 64
learning_rate_initial = 1.0

global_step = tf.Variable(0, name='global_step', trainable=False)
learning_rate = tf.train.exponential_decay(
    learning_rate_initial,
    global_step,
    1000, 
    0.96,
    # staircase=False,
    name='learning_rate')

x = tf.identity(x, name='x')
y_ = tf.identity(y_, name='y_')

embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0), 
                         name='word_embeddings')

gathered = tf.nn.embedding_lookup(embeddings, x)
w = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)), name='w')
b = tf.Variable(tf.zeros([vocabulary_size]), name='b')
# y = tf.matmul(gathered, w) + b

with tf.name_scope('loss'):
  loss = tf.nn.sampled_softmax_loss(weights=w,
                                    biases=b,
                                    labels=y_,
                                    inputs=gathered,
                                    num_sampled=num_sampled,
                                    num_classes=vocabulary_size)
  loss = tf.reduce_mean(loss)
  
train = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss, global_step=global_step)

init = tf.group(
  tf.global_variables_initializer(), 
  iterator.initializer, 
  name='init')

with tf.name_scope('summary'):
  tf.summary.scalar('loss', loss)
  tf.summary.scalar('learning_rate', learning_rate)
  
merged = tf.summary.merge_all()

### Train

In [24]:
log_dir = '/tmp/tf_log/%s' % fname
model_path = 'model/%s.ckpt' % fname
restore = False
log_interval = 200

saver = tf.train.Saver()

with tf.Session() as sess:
  if restore:
    saver.restore(sess, model_path)
  
  writer = tf.summary.FileWriter(log_dir, sess.graph)
  
  t = time.time()
  sess.run(init, {x_placeholder: ids, y_placeholder: targets})
  print('init: %f' % (time.time() - t))
  
  for i in range(10000):
    sess.run(train)
    
    if i % log_interval == 0:
      l, lr, summary = sess.run([loss, learning_rate, merged])
      print('iteration: %d, loss: %f, learning_rate: %f' % (i, l, lr))
      writer.add_summary(summary, i)
      writer.flush()
    
  writer.close()

init: 3.469874
iteration: 0, loss: 6.521284, learning_rate: 0.999959
iteration: 200, loss: 4.721942, learning_rate: 0.991828
iteration: 400, loss: 5.181370, learning_rate: 0.983764
iteration: 600, loss: 4.446598, learning_rate: 0.975764
iteration: 800, loss: 4.555428, learning_rate: 0.967830
iteration: 1000, loss: 3.190428, learning_rate: 0.959961
iteration: 1200, loss: 4.875598, learning_rate: 0.952155
iteration: 1400, loss: 4.258330, learning_rate: 0.944413
iteration: 1600, loss: 3.729623, learning_rate: 0.936734
iteration: 1800, loss: 4.379132, learning_rate: 0.929117
iteration: 2000, loss: 3.759236, learning_rate: 0.921562
iteration: 2200, loss: 4.315496, learning_rate: 0.914069
iteration: 2400, loss: 3.830971, learning_rate: 0.906637
iteration: 2600, loss: 4.420321, learning_rate: 0.899265
iteration: 2800, loss: 4.419066, learning_rate: 0.891952
iteration: 3000, loss: 4.227913, learning_rate: 0.884700
iteration: 3200, loss: 4.039743, learning_rate: 0.877506
iteration: 3400, loss: 