### Dependencies

In [12]:
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
import os
import collections
import math
import numpy as np
import time

# Run name; the training cell uses it as the sub-directory of /tmp/tf_log
# that holds the summaries, the projector metadata and the checkpoints.
fname = 'word_embedding'

### Helpers

In [13]:
def make_dicts(words, vocabulary_size):
  """Build word <-> id lookup tables and encode ``words`` as ids.

  Id 0 is reserved for the ``'<unk>'`` token. Ids 1..vocabulary_size-1 go to
  the most frequent words, in descending order of frequency. Every word that
  is not part of that vocabulary is encoded as id 0.

  Args:
    words: sequence of string tokens (the corpus).
    vocabulary_size: total number of ids, the ``'<unk>'`` slot included.

  Returns:
    A tuple ``(ids, word2id, id2word)``: the corpus encoded as a list of ids,
    the word -> id dict and the id -> word dict.

  Raises:
    AssertionError: if the corpus does not contain enough distinct words to
      fill ``vocabulary_size`` ids.
  """
  unk = '<unk>'
  counts = collections.Counter(words)
  # A literal '<unk>' in the corpus must share the reserved id 0. Counting it
  # as a regular word would register the token twice, overwrite its entry in
  # word2id and leave id 0 unused.
  counts.pop(unk, None)

  vocabulary = [unk]
  vocabulary.extend(word for word, _count in counts.most_common(vocabulary_size - 1))

  assert len(vocabulary) == vocabulary_size, (
    'expected %d vocabulary entries, got %d' % (vocabulary_size, len(vocabulary)))

  word2id = {word: i for i, word in enumerate(vocabulary)}
  id2word = dict(enumerate(vocabulary))

  unk_id = word2id[unk]
  ids = [word2id.get(word, unk_id) for word in words]

  return ids, word2id, id2word

def skip_gram(i, words, skip_size):
  """Draw one random context element for the element at position ``i``.

  The context is made of up to ``skip_size`` elements on each side of
  position ``i`` (clipped at the sequence boundaries); the element at ``i``
  itself is excluded.

  Args:
    i: index of the centre element in ``words``.
    words: list of elements (the slices are joined with ``+``).
    skip_size: maximum distance between the centre and a context element.

  Returns:
    One element of the context, picked with ``np.random.choice``.
  """
  window_start = max(0, i - skip_size)
  window_stop = min(len(words), i + 1 + skip_size)
  before = words[window_start:i]
  after = words[i + 1:window_stop]
  return np.random.choice(before + after)

### Load dataset

In [14]:
# Location of the corpus file; '~' is expanded to the current user's home.
dataset_fname = os.path.expanduser('~/Datasets/text8')

# Decode explicitly so the result does not depend on the platform's default
# encoding. The whole file is read and split on whitespace into tokens.
with open(dataset_fname, encoding='utf-8') as f:
  words = f.read().split()

print('%d words, first %d: %s' % (len(words), 5, words[:5]))

17005207 words, first 5: ['anarchism', 'originated', 'as', 'a', 'term']


### Preprocess dataset

In [15]:
# Size of the vocabulary (the '<unk>' slot included) and the context radius,
# i.e. how many positions on each side of a word may supply its target.
vocabulary_size = 50000
skip_size = 2

ids, word2id, id2word = make_dicts(words, vocabulary_size)

print('words: %s' % words[:10])
print('ids: %s' % ids[:10])

# Sanity check: the first ten words must survive a word -> id -> word round trip.
assert words[:10] == [id2word[word2id[words[k]]] for k in range(10)]

# One randomly drawn context id per corpus position.
targets = [skip_gram(position, ids, skip_size) for position in range(len(ids))]

words: ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
ids: [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]


### Build iterator

In [46]:
batch_size = 256

# Start from an empty default graph.
tf.reset_default_graph()

with tf.name_scope('input_pipeline'):
  # The (input id, target id) pairs are fed through placeholders when the
  # iterator is initialised. The alternative is to build the dataset directly
  # from (ids, targets) and use a one-shot iterator.
  x_placeholder = tf.placeholder(tf.int32, shape=[None], name='x_placeholder')
  y_placeholder = tf.placeholder(tf.int32, shape=[None], name='y_placeholder')

  dataset = tf.contrib.data.Dataset.from_tensor_slices((x_placeholder, y_placeholder))
  # Loop over the data indefinitely.
  dataset = dataset.repeat()
  # Reshape every target from a scalar to shape [1]; inputs stay scalars.
  dataset = dataset.map(lambda center, context: (center, tf.reshape(context, [1])))
  dataset = dataset.shuffle(1000)
  dataset = dataset.batch(batch_size)

  iterator = dataset.make_initializable_iterator()
  x, y_ = iterator.get_next()

### Build a graph

In [47]:
embedding_size = 256
num_sampled = 64
learning_rate_initial = 1.0

# Step counter; incremented by the optimizer on every training step and
# excluded from the trainable variables.
global_step = tf.Variable(0, name='global_step', trainable=False)

# Constant learning rate, taken from `learning_rate_initial` so that editing
# the constant above actually changes the rate. A decaying schedule can be
# swapped in here instead, e.g.
# tf.train.exponential_decay(learning_rate_initial, global_step, 10000, 0.96, staircase=True).
learning_rate = tf.constant(learning_rate_initial, name='learning_rate')

# Give the batch tensors produced by the iterator explicit names in the graph.
x = tf.identity(x, name='x')
y_ = tf.identity(y_, name='y_')

# Embedding table, one row per vocabulary id, initialised uniformly in [-1, 1).
embeddings = tf.Variable(
  tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0),
  name='word_embeddings')

# Embedding rows of the input ids of the current batch.
gathered = tf.nn.embedding_lookup(embeddings, x)

# Weights and biases of the sampled softmax, one row / entry per vocabulary id.
w = tf.Variable(
  tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)),
  name='w')

b = tf.Variable(
  tf.zeros([vocabulary_size]),
  name='b')

with tf.name_scope('loss'):
  # Sampled softmax draws `num_sampled` negative classes per batch instead of
  # evaluating all `vocabulary_size` classes; the result is averaged.
  loss = tf.nn.sampled_softmax_loss(weights=w,
                                    biases=b,
                                    labels=y_,
                                    inputs=gathered,
                                    num_sampled=num_sampled,
                                    num_classes=vocabulary_size)
  loss = tf.reduce_mean(loss)

# tf.train.GradientDescentOptimizer and tf.train.AdamOptimizer accept the same
# arguments and can be substituted for Adagrad here.
train = tf.train.AdagradOptimizer(learning_rate=learning_rate).minimize(loss, global_step=global_step)

init = tf.global_variables_initializer()

with tf.name_scope('summary'):
  tf.summary.scalar('loss', loss)
  tf.summary.scalar('learning_rate', learning_rate)

merged = tf.summary.merge_all()
saver = tf.train.Saver()

### Train

In [None]:
log_dir = '/tmp/tf_log/%s' % fname
model_path = os.path.join(log_dir, 'model.ckpt')
restore = True  # resume from the checkpoint at `model_path` when one exists
log_interval = 1000
save_interval = log_interval * 5
steps = 500001

# Projector configuration: attach the word list (metadata.tsv) to the
# embedding variable.
config = projector.ProjectorConfig()
emb_conf = config.embeddings.add()
emb_conf.tensor_name = embeddings.name
emb_conf.metadata_path = os.path.join(log_dir, 'metadata.tsv')

with tf.Session() as sess:
  writer = tf.summary.FileWriter(log_dir, sess.graph)

  # try/finally so the event file is closed even if training is interrupted.
  try:
    # One word per line, written in ascending id order so that line k labels
    # row k of the embedding matrix regardless of dict iteration order.
    with open(emb_conf.metadata_path, 'w', encoding='utf-8') as f:
      for word_id in sorted(id2word):
        print(id2word[word_id], file=f)

    # The projector config never changes during training; write it once
    # instead of on every logging step.
    projector.visualize_embeddings(writer, config)

    # Restoring a missing checkpoint raises, which would break a first run on
    # a clean machine; fall back to a fresh initialisation in that case.
    if restore and tf.train.checkpoint_exists(model_path):
      saver.restore(sess, model_path)
    else:
      if restore:
        print('no checkpoint found at %s, initializing from scratch' % model_path)
      sess.run(init)

    t = time.time()
    sess.run(iterator.initializer, {x_placeholder: ids, y_placeholder: targets})
    print('init: %f' % (time.time() - t))

    # Resume at the restored global step (0 after a fresh initialisation).
    for i in range(sess.run(global_step), steps):
      sess.run(train)

      if i % log_interval == 0:
        # This is a separate run, so the reported loss is computed on the next
        # batch pulled from the iterator, not on the batch just trained on.
        loss_value, lr, summary = sess.run([loss, learning_rate, merged])
        print('iteration: %d, loss: %f, learning_rate: %f' % (i, loss_value, lr))

        writer.add_summary(summary, i)
        writer.flush()

      if i > 0 and i % save_interval == 0:
        save_path = saver.save(sess, model_path)
        print('model saved: %s' % save_path)
  finally:
    writer.close()

init: 3.348607
iteration: 0, loss: 7.679859, learning_rate: 1.000000
