Code adapted from [Stanford CS20SI](http://web.stanford.edu/class/cs20si/syllabus.html)

The code for word2vec is pretty short:

In [None]:
import numpy as np
import tensorflow as tf
from process_data import process_data

# Hyperparameters for the skip-gram model.
VOCAB_SIZE = 50000 # Size constraint on our vocab
# BATCH_SIZE has to exist before the placeholders below are built, because it is
# baked into their static shapes. Keep it equal to the value used in the
# training cell when batches are generated.
BATCH_SIZE = 128 # number of (center, target) pairs fed per training step
EMBED_SIZE = 128 # dimension of the word embedding vectors
SKIP_WINDOW = 1 # the context window
NUM_SAMPLED = 64 # Number of negative examples to sample.

# Step 1: define the placeholders for input and output
# center_words holds one word id per example; target_words holds one label id
# per example in a trailing dimension of size 1, which is the layout
# tf.nn.nce_loss expects for `labels` when there is one true class per example.
center_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE], name='center_words')
target_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1], name='target_words')

# Step 2: define weights. In word2vec, it's actually the weights that we care about
# One EMBED_SIZE-dimensional row per vocabulary entry, initialised uniformly in [-1, 1).
embed_matrix = tf.Variable(tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0),
                        name='embed_matrix')

# Step 3: define the inference
# Select the embedding rows for the center words of the current batch.
embed = tf.nn.embedding_lookup(embed_matrix, center_words, name='embed')

# Step 4: construct variables for NCE loss
# Output-side weights and biases, one row / entry per vocabulary word.
# The weights are drawn with stddev 1/sqrt(EMBED_SIZE).
nce_weight = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE],
                                            stddev=1.0 / (EMBED_SIZE ** 0.5)),
                                            name='nce_weight')
nce_bias = tf.Variable(tf.zeros([VOCAB_SIZE]), name='nce_bias')

# define loss function to be NCE loss function
# nce_loss yields one value per example; the training objective is their mean.
loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                    biases=nce_bias,
                                    labels=target_words,
                                    inputs=embed,
                                    num_sampled=NUM_SAMPLED,
                                    num_classes=VOCAB_SIZE), name='loss')

## Training and Results

In [None]:
# Setup Optimizer
# NOTE(review): 1.0 is three orders of magnitude above AdamOptimizer's default
# learning rate (0.001). A rate this large is typical for plain gradient
# descent; confirm it is intentional for Adam before relying on the results.
LEARNING_RATE = 1.0
optimizer = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss)

SKIP_STEP = 2000 # how many steps to skip before reporting the loss
# BATCH_SIZE must match the batch dimension fixed in the placeholder shapes.
BATCH_SIZE, NUM_TRAIN_STEPS = 128, 10000
# batch_gen is consumed with next() below and yields (centers, targets) pairs.
batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    total_loss = 0.0 # running sum used to compute the average loss over the last SKIP_STEP steps
    for index in range(NUM_TRAIN_STEPS):
        centers, targets = next(batch_gen)
        # Run one optimisation step and fetch the batch loss in the same call.
        loss_batch, _ = sess.run([loss, optimizer],
                                feed_dict={center_words: centers, target_words: targets})
        total_loss += loss_batch
        if (index + 1) % SKIP_STEP == 0:
            # index is zero-based, so index + 1 is the number of steps completed;
            # report that so the label matches the SKIP_STEP-step window averaged.
            print('Average loss at step {}: {:5.1f}'.format(index + 1, total_loss / SKIP_STEP))
            total_loss = 0.0