In [11]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
from tensorflow.contrib.tensorboard.plugins import projector

from process_data import process_data

VOCAB_SIZE = 50000
BATCH_SIZE = 128
EMBED_SIZE = 128
SKIP_WINDOW = 1
NUM_SAMPLED = 64
LEARNING_RATE = 1.0
NUM_TRAIN_STEPS = 10000
SKIP_STEP = 2000

tf.reset_default_graph()

def word2vec(batch_gen):
    with tf.name_scope('data'):
        # placeholders
        center_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE], name="center_words")
        target_words = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1], name="target_words")
    
    with tf.name_scope('embed'):
        # weights
        W = tf.Variable(tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0), name='W')
    
    with tf.name_scope('loss'):
        # inference
        embed = tf.nn.embedding_lookup(W, center_words, name='embed')

        # NCE weights
        nce_weight = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE],
                                                    stddev=1.0 / (EMBED_SIZE ** 0.5)),
                                name='nce_weight')
        nce_bias = tf.Variable(tf.zeros([VOCAB_SIZE]), name='nce_bias')
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                             biases=nce_bias,
                                             labels=target_words,
                                             inputs=embed,
                                             num_sampled=NUM_SAMPLED,
                                             num_classes=VOCAB_SIZE))
    
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE).minimize(loss)
    
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        
        total_loss = 0.0
        writer = tf.summary.FileWriter('./n4_log_dir', sess.graph)
        for index in xrange(NUM_TRAIN_STEPS):
            centers, targets = batch_gen.next()
            _, loss_batch = sess.run([optimizer, loss], feed_dict={center_words: centers, \
                                                                   target_words: targets})
            total_loss += loss_batch
            if (index + 1) % SKIP_STEP == 0:
                print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                total_loss = 0.0
        writer.close()
        
def main():
    batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    word2vec(batch_gen)
    
if __name__ == '__main__':
    main()

Dataset ready
Average loss at step 1999: 113.7
Average loss at step 3999:  52.6
Average loss at step 5999:  33.1
Average loss at step 7999:  23.6
Average loss at step 9999:  17.8


In [29]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
from tensorflow.contrib.tensorboard.plugins import projector
import os

from process_data import process_data

tf.reset_default_graph()

VOCAB_SIZE = 50000
BATCH_SIZE = 128
EMBED_SIZE = 128
SKIP_WINDOW = 1
NUM_SAMPLED = 64
LEARNING_RATE = 1.0
NUM_TRAIN_STEPS = 10000
WEIGHTS_FLD = './processed'
LOG_DIR = './n4_log_dir/'
SKIP_STEP = 2000

class SkipGramModel:
    def __init__(self, vocab_size, batch_size, embed_size, num_sampled, learning_rate):
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.embed_size = embed_size
        self.num_sampled = num_sampled
        self.lr = learning_rate
        self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
    
    def _create_placeholders(self):
        with tf.name_scope('data'):
            self.center_words = tf.placeholder(tf.int32, shape=[self.batch_size], name="center_words")
            self.target_words = tf.placeholder(tf.int32, shape=[self.batch_size, 1], name="target_words")
    
    def _create_embedding(self):
        with tf.device('/cpu:0'):
            with tf.name_scope('embed'):
                self.W = tf.Variable(tf.random_uniform([self.vocab_size, self.embed_size], -1.0, 1.0), name='W')
    
    def _create_loss(self):
        with tf.device('/cpu:0'):
            with tf.name_scope('loss'):
                # inference
                self.embed = tf.nn.embedding_lookup(self.W, self.center_words, name='embed')

                # NCE weights
                self.nce_weight = tf.Variable(tf.truncated_normal([self.vocab_size, self.embed_size],
                                                            stddev=1.0 / (self.embed_size ** 0.5)),
                                        name='nce_weight')
                self.nce_bias = tf.Variable(tf.zeros([self.vocab_size]), name='nce_bias')
                self.loss = tf.reduce_mean(tf.nn.nce_loss(weights=self.nce_weight,
                                                     biases=self.nce_bias,
                                                     labels=self.target_words,
                                                     inputs=self.embed,
                                                     num_sampled=self.num_sampled,
                                                     num_classes=self.vocab_size), name='loss')
    
    def _create_optimizer(self):
        with tf.device('/cpu:0'):
            self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.lr).minimize(self.loss)

    def _create_summaries(self):
        with tf.name_scope("summaries"):
            tf.summary.scalar("loss", self.loss)
            tf.summary.histogram("histogram_loss", self.loss)
            self.summary_op = tf.summary.merge_all()
            
    def build_graph(self):
        self._create_placeholders()
        self._create_embedding()
        self._create_loss()
        self._create_optimizer()
        self._create_summaries()
        
def train_model(model, batch_gen, num_train_steps, weights_fld):
    saver = tf.train.Saver()
    
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        
        initial_step = 0
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('./checkpoints/checkpoint'))
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            
        total_loss = 0.0
        
        # Remove previous events files
        for the_file in os.listdir(LOG_DIR):
            file_path = os.path.join(LOG_DIR, the_file)
            if os.path.isfile(file_path) and the_file[:6] == "events":
                os.unlink(file_path)

        writer = tf.summary.FileWriter('./n4_log_dir', sess.graph)
        for index in xrange(initial_step, initial_step + num_train_steps):
            centers, targets = batch_gen.next()
            feed_dict={model.center_words: centers, model.target_words: targets}
            loss_batch, _, summary = sess.run([model.loss, model.optimizer, model.summary_op],
                                             feed_dict=feed_dict)
            writer.add_summary(summary, global_step=index)
            total_loss += loss_batch
            if (index) % SKIP_STEP == 0:
                print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                total_loss = 0.0
                saver.save(sess, 'checkpoints/skip-gram', index)
                
        writer.close()
        
        final_embed_matrix = sess.run(model.W)

        embedding_var = tf.Variable(final_embed_matrix[:500], name='embedding')
        sess.run(embedding_var.initializer)
        config = projector.ProjectorConfig()
        
        summary_writer = tf.summary.FileWriter(LOG_DIR)
        embedding = config.embeddings.add()
        embedding.tensor_name = embedding_var.name
        embedding.metadata_path = os.path.join(LOG_DIR, 'skip-gram')

        projector.visualize_embeddings(summary_writer, config)
        saver_embed = tf.train.Saver([embedding_var])
        saver_embed.save(sess, embedding.metadata_path, 1)
    
    
def main():
    model = SkipGramModel(VOCAB_SIZE, EMBED_SIZE, BATCH_SIZE, NUM_SAMPLED, LEARNING_RATE)
    model.build_graph()
    batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    train_model(model, batch_gen, NUM_TRAIN_STEPS, WEIGHTS_FLD)
    

if __name__ == '__main__':
    main()

Dataset ready
Average loss at step 0:   0.0
Average loss at step 2000:   4.4
Average loss at step 4000:   4.6
Average loss at step 6000:   4.5
Average loss at step 8000:   4.5
