# Credits
https://github.com/chiphuyen/stanford-tensorflow-tutorials/blob/master/examples/word2vec_utils.py  
https://github.com/chiphuyen/stanford-tensorflow-tutorials/blob/master/examples/04_word2vec.py   
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/word2vec/word2vec_basic.py  


In [1]:
import os 
import urllib
import tensorflow as tf
URL = 'http://mattmahoney.net/dc/text8.zip'

def download_data(local_file, expected_bytes=None, url=URL):
    if os.path.exists(local_file):
        print('%s already exists' % local_file)
    else:
        local_file, _ = urllib.request.urlretrieve(url, local_file)    
        statinfo = os.stat(local_file)
        if expected_bytes or statinfo.st_size == expected_bytes:
            print('Downloaded file: %s has the expected size : %s' % (local_file, expected_bytes))
        else:
            raise Exception('Downloaded file: %s (%d) bytes. DO NOT have the expected size : %d' % (local_file, statinfo.st_size, expected_bytes))

  from ._conv import register_converters as _register_converters


In [2]:
download_data('./text8.zip', 31344016)

./text8.zip already exists


In [3]:
import zipfile
# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        # tf.compat.as_str helps to convert both bytes and unicode strings to unicode strings.
        # Python 3 and Python 2 independent
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

In [4]:
words = read_data('./text8.zip')

In [5]:
# define the max vocab size 
VOCAB_SIZE = 50000

In [6]:
import collections
def build_vocab(words, vocab_size):
    """Replace rare words with UNK"""
    counts = [['UNK', -1]]
    counts.extend(collections.Counter(words).most_common(vocab_size - 1))
    dictionary = dict()
    for word, _ in counts:
        dictionary[word] = len(dictionary)
    index_words = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:  # dictionary['UNK']
            unk_count += 1
        index_words.append(index)
    counts[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return index_words, counts, dictionary, reversed_dictionary

In [7]:
index_words, counts, dictionary, reversed_dictionary = build_vocab(words, VOCAB_SIZE)

In [8]:
import random
def generate_sample(index_words, context_window_size):
    """ Form training pairs according to the skip-gram model. """
    for index, center in enumerate(index_words):
        context = random.randint(1, context_window_size)
        # get a random target before the center word
        for target in index_words[max(0, index - context): index]:
            yield center, target
        # get a random target after the center wrod
        for target in index_words[index + 1: index + context + 1]:
            yield center, target

In [9]:
import numpy as np
def batch_gen(index_words, batch_size, skip_window):
    single_gen = generate_sample(index_words, skip_window)
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros([batch_size, 1])
        for index in range(batch_size):
            center_batch[index], target_batch[index] = next(single_gen)
        yield center_batch, target_batch

In [10]:
for center_batch, target_batch in batch_gen(index_words, 10, 4):
    print(center_batch)
    print(target_batch)
    break

[5234 3081 3081 3081   12   12    6    6    6    6]
[[3081.]
 [5234.]
 [  12.]
 [   6.]
 [3081.]
 [   6.]
 [5234.]
 [3081.]
 [  12.]
 [ 195.]]


In [11]:
BATCH_SIZE = 128
SKIP_WINDOW = 1             # the context window
def gen():
    yield from batch_gen(index_words, BATCH_SIZE, SKIP_WINDOW)

In [19]:
# TO RESET THE TENSORFLOW GRAPH
tf.reset_default_graph()

In [20]:
dataset = tf.data.Dataset.from_generator(gen, 
                                (tf.int32, tf.int32), 
                                (tf.TensorShape([BATCH_SIZE]), tf.TensorShape([BATCH_SIZE, 1])))

In [21]:
def word2vec(dataset): 

    """ Build the graph for word2vec model and train it """
    # Step 1: get input, output from the dataset
    with tf.name_scope('data'):
        iterator = dataset.make_initializable_iterator()
        center_words, target_words = iterator.get_next()
    
    """ Step 2 + 3: define weights and embedding lookup.
        In word2vec, it's actually the weights that we care about
        """
    with tf.name_scope('embed'):
        embed_matrix = tf.get_variable('embed_matrix',
                                       shape=[VOCAB_SIZE, EMBED_SIZE],
                                       initializer=tf.random_uniform_initializer())
        embed = tf.nn.embedding_lookup(embed_matrix, center_words, name='embedding')

    # Step 4: construct variables for NCE loss and define loss function
    with tf.name_scope('loss'):
        nce_weight = tf.get_variable('nce_weight', shape=[VOCAB_SIZE, EMBED_SIZE],
                                     initializer=tf.truncated_normal_initializer(stddev=1.0 / (EMBED_SIZE ** 0.5)))
        tf.summary.histogram("nce_weight", nce_weight)
        
        nce_bias = tf.get_variable('nce_bias', initializer=tf.zeros([VOCAB_SIZE]))
        tf.summary.histogram("nce_bias", nce_bias)                                        
        
        # define loss function to be NCE loss function
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                            biases=nce_bias,
                                            labels=target_words,
                                            inputs=embed,
                                            num_sampled=NUM_SAMPLED,
                                            num_classes=VOCAB_SIZE), name='loss')
        tf.summary.scalar("loss", loss)

    # Step 5: define optimizer
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)

    #########################
    # defaults to saving all variables
    #########################
    saver = tf.train.Saver() 
    
    with tf.Session() as sess:
        sess.run(iterator.initializer)
        sess.run(tf.global_variables_initializer())

        #########################
        # Restore the checkpoint if it exists
        #########################
        ckpt = tf.train.get_checkpoint_state(os.path.dirname('./checkpoints/skip-gram'))
        # if that checkpoint exists, restore from checkpoint
        if ckpt and ckpt.model_checkpoint_path:
            print("restoring the checkpoint %s" % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
            
        
        total_loss = 0.0 # we use this to calculate late average loss in the last SKIP_STEP steps
        merged_summary = tf.summary.merge_all()
        writer = tf.summary.FileWriter('graphs/word2vec_simple', sess.graph)

        for index in range(NUM_TRAIN_STEPS):
            try:
                loss_batch, _ = sess.run([loss, optimizer])
                total_loss += loss_batch
                if (index + 1) % SKIP_STEP == 0:
                    [s] = sess.run([merged_summary])
                    writer.add_summary(s, index)
                    print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                    total_loss = 0.0
                    ####################
                    # Save the checkpoint
                    ####################
                    saver.save(sess, 'checkpoints/skip-gram', index)
                    
            except tf.errors.OutOfRangeError:
                sess.run(iterator.initializer)
        writer.close()

In [22]:
# Model hyperparameters
VOCAB_SIZE = 50000
NUM_SAMPLED = 64            # number of negative examples to sample
EMBED_SIZE = 128            
LEARNING_RATE = 1.0
NUM_TRAIN_STEPS = 30000
SKIP_STEP = 5000

In [23]:
word2vec(dataset)

restoring the checkpoint ./checkpoints/skip-gram-99999
INFO:tensorflow:Restoring parameters from ./checkpoints/skip-gram-99999
Average loss at step 4999:   4.5
Average loss at step 9999:   4.5


KeyboardInterrupt: 

In [24]:
# TO RESET THE TENSORFLOW GRAPH
tf.reset_default_graph()

In [26]:
dataset = tf.data.Dataset.from_generator(gen, 
                                (tf.int32, tf.int32), 
                                (tf.TensorShape([BATCH_SIZE]), tf.TensorShape([BATCH_SIZE, 1])))

In [27]:
word2vec(dataset)

restoring the checkpoint ./checkpoints/skip-gram-9999
INFO:tensorflow:Restoring parameters from ./checkpoints/skip-gram-9999
Average loss at step 4999:   4.4
Average loss at step 9999:   4.4
Average loss at step 14999:   4.5
Average loss at step 19999:   4.4
Average loss at step 24999:   4.5
Average loss at step 29999:   4.5
