In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

from process_data import process_data

In [None]:
VOCAB_SIZE = 50000
BATCH_SIZE = 128
EMBED_SIZE = 128 # dimension of the word embedding vectors
SKIP_WINDOW = 1 # the context window
NUM_SAMPLED = 64    # Number of negative examples to sample.
LEARNING_RATE = 1.0
NUM_TRAIN_STEPS = 10000
SKIP_STEP = 2000 # how many steps to skip before reporting the loss

def word2vec(batch_gen):
    """ Build the graph for word2vec model and train it """
    # Step 1: define the placeholders for input and output
    # center_words have to be int to work on embedding lookup

    with tf.variable_scope("data"):
        train_input = tf.placeholder(tf.int32, [BATCH_SIZE])
        train_labels = tf.placeholder(tf.int32, [BATCH_SIZE, 1])

    # Step 2: define weights. In word2vec, it's actually the weights that we care about
    # vocab size x embed size
    # initialized to random uniform -1 to 1

    with tf.variable_scope("inference"):
        embeddings = tf.Variable(tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0))

    # Step 3: define the inference
    # get the embed of input words using tf.nn.embedding_lookup
    # embed = tf.nn.embedding_lookup(embed_matrix, center_words, name='embed')

        embed = tf.nn.embedding_lookup(embeddings, train_input)

    # Step 4: construct variables for NCE loss
    # tf.nn.nce_loss(weights, biases, labels, inputs, num_sampled, num_classes, ...)
    # nce_weight (vocab size x embed size), intialized to truncated_normal stddev=1.0 / (EMBED_SIZE ** 0.5)
    # bias: vocab size, initialized to 0

    with tf.variable_scope("loss"):
        nce_weight = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE], stddev=1.0 / (EMBED_SIZE ** 0.5)))
        nce_bias = tf.Variable(tf.zeros([VOCAB_SIZE]))

    # define loss function to be NCE loss function
    # tf.nn.nce_loss(weights, biases, labels, inputs, num_sampled, num_classes, ...)
    # need to get the mean accross the batch

        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                      biases=nce_bias,
                                      labels=train_labels,
                                      inputs=embed,
                                      num_sampled=NUM_SAMPLED,
                                      num_classes=VOCAB_SIZE))

    # Step 5: define optimizer
    
    with tf.variable_scope("optimizer"):
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)

    with tf.Session() as sess:
        
        init_op = tf.global_variables_initializer()
        sess.run(init_op)

        total_loss = 0.0 # we use this to calculate the average loss in the last SKIP_STEP steps
        writer = tf.summary.FileWriter('./my_graph/no_frills/', sess.graph)
        for index in range(NUM_TRAIN_STEPS):
            centers, targets = next(batch_gen)
            # TO DO: create feed_dict, run optimizer, fetch loss_batch
            feed_dict = {train_input: centers, train_labels: targets}
            _, loss_batch = sess.run([optimizer, loss], feed_dict=feed_dict)
            total_loss += loss_batch
            if (index + 1) % SKIP_STEP == 0:
                print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                total_loss = 0.0
        writer.close()

In [None]:
batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
word2vec(batch_gen)

In [17]:
import os

class SkipGramModel:
    """ Build the graph for word2vec model """
    def __init__(self, **kwargs):
        self.vocab_size = kwargs.get('vocab_size', 50000)
        self.batch_size = kwargs.get('batch_size', 128)
        self.embed_size = kwargs.get('embed_size', 128)
        self.skip_window = kwargs.get('skip_window', 1)
        self.num_sampled = kwargs.get('num_sampled', 64)
        self.learning_rate = kwargs.get('learning_rate', 1.0)
        self.num_train_steps = kwargs.get('num_train_steps', 10000)
        self.skip_step = kwargs.get('skip_step', 2000)
        
        self._create_placeholders()
        self._create_embedding()
        self._create_loss()
        self._create_optimizer()
        self._create_summaries()
    
    def _create_placeholders(self):
        """ Step 1: define the placeholders for input and output """
        with tf.variable_scope("data"):
            self.train_input = tf.placeholder(tf.int32, [self.batch_size])
            self.train_labels = tf.placeholder(tf.int32, [self.batch_size, 1])
            
    def _create_embedding(self):
        """ Step 2: define weights. In word2vec, it's actually the weights that we care about """
        with tf.variable_scope("inference"):
            self.embeddings = tf.Variable(tf.random_uniform([self.vocab_size, self.embed_size], -1.0, 1.0))
    
    def _create_loss(self):
        """ Step 3 + 4: define the inference + the loss function """
        with tf.variable_scope("loss"):
            embed = tf.nn.embedding_lookup(self.embeddings, self.train_input)
            nce_weight = tf.Variable(tf.truncated_normal([self.vocab_size, self.embed_size], stddev=1.0 / (self.embed_size ** 0.5)))
            nce_bias = tf.Variable(tf.zeros([self.vocab_size]))
            self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                      biases=nce_bias,
                                      labels=self.train_labels,
                                      inputs=embed,
                                      num_sampled=self.num_sampled,
                                      num_classes=self.vocab_size))
    
    def _create_optimizer(self):
        """ Step 5: define optimizer """
        with tf.variable_scope("optimizer"):
            self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(self.loss)
    
    def _create_summaries(self):
        with tf.name_scope("summaries"):
            tf.summary.scalar("loss", self.loss)
            tf.summary.histogram("embedding", self.embeddings)
            # merge them all
            self.summary_op = tf.summary.merge_all()
    
    def visualize_embeddings(self, sess, writer, logdir):
        config = projector.ProjectorConfig()
        final_embed_matrix = sess.run(self.embeddings)
        
        # it has to variable. constants don't work here. you can't reuse model.embed_matrix
        embedding_var = tf.Variable(final_embed_matrix[:1000], name='embedding')
        sess.run(embedding_var.initializer)

        # add embedding to the config file
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name
        embedding.metadata_path = 'processed/vocab_1000.tsv'

        # saves a configuration file that TensorBoard will read during startup.
        projector.visualize_embeddings(summary_writer, config)
        saver_embed = tf.train.Saver([embedding_var])
        saver_embed.save(sess, logdir + 'embed.ckpt', 1)
    
    def fit(self, batch_gen):
        logdir = './my_graph/lr{}/'.format(self.learning_rate)
        
        saver = tf.train.Saver()
        with tf.Session() as sess:
        
            init_op = tf.global_variables_initializer()
            sess.run(init_op)

            total_loss = 0.0 # we use this to calculate the average loss in the last SKIP_STEP steps
            writer = tf.summary.FileWriter(logdir, sess.graph)
            for index in range(self.num_train_steps):
                centers, targets = next(batch_gen)
                # TO DO: create feed_dict, run optimizer, fetch loss_batch
                feed_dict = {self.train_input: centers, self.train_labels: targets}
                _, loss_batch = sess.run([self.optimizer, self.loss], feed_dict=feed_dict)
                total_loss += loss_batch
                if (index + 1) % self.skip_step == 0:
                    summary = sess.run(self.summary_op, feed_dict=feed_dict)
                    writer.add_summary(summary, global_step=index)
                    saver.save(sess, logdir + "model.ckpt", global_step=index)
                    print('Average loss at step {}: {:5.1f}'.format(index, total_loss / self.skip_step))
                    total_loss = 0.0
            self.visualize_embeddings(sess, writer, logdir)
            writer.close()

In [18]:
VOCAB_SIZE = 50000
BATCH_SIZE = 128
SKIP_WINDOW = 1

tf.reset_default_graph()
batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
model = SkipGramModel(vocab_size=VOCAB_SIZE, batch_gen=BATCH_SIZE, skip_window=SKIP_WINDOW, learning_rate=0.5)
model.fit(batch_gen)

Dataset ready
Average loss at step 1999: 114.0
Average loss at step 3999:  52.7
Average loss at step 5999:  33.4
Average loss at step 7999:  23.6
Average loss at step 9999:  17.6


NameError: name 'summary_writer' is not defined

In [None]:
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)