# Credits
https://github.com/chiphuyen/stanford-tensorflow-tutorials/blob/master/examples/word2vec_utils.py  
https://github.com/chiphuyen/stanford-tensorflow-tutorials/blob/master/examples/04_word2vec.py   
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/tutorials/word2vec/word2vec_basic.py  


In [18]:
import os 
import urllib
import tensorflow as tf
URL = 'http://mattmahoney.net/dc/text8.zip'

def download_data(local_file, expected_bytes=None, url=URL):
    """Download `url` to `local_file` unless that path already exists.

    Args:
        local_file: Destination path. If it already exists, nothing is
            downloaded and no size check is performed.
        expected_bytes: Expected size of the downloaded file in bytes, or
            None to skip the size check.
        url: Location to download from.

    Raises:
        Exception: If `expected_bytes` is given and the downloaded file has a
            different size. The downloaded file is left on disk.
    """
    # A bare `import urllib` does not guarantee that the `request` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    if os.path.exists(local_file):
        print('%s already exists' % local_file)
        return

    local_file, _ = urllib.request.urlretrieve(url, local_file)
    actual_bytes = os.stat(local_file).st_size
    if expected_bytes is None:
        # No expectation supplied: report the size instead of validating it.
        print('Downloaded file: %s (%d bytes)' % (local_file, actual_bytes))
    elif actual_bytes == expected_bytes:
        print('Downloaded file: %s has the expected size : %s' % (local_file, expected_bytes))
    else:
        raise Exception('Downloaded file: %s (%d) bytes. DO NOT have the expected size : %d' % (local_file, actual_bytes, expected_bytes))

In [19]:
# Fetch the text8 archive into the working directory; the second argument is
# passed as download_data's `expected_bytes`.
download_data('./text8.zip', 31344016)

./text8.zip already exists


In [20]:
import zipfile
# Read the data into a list of strings.
def read_data(filename):
    """Return the whitespace-separated tokens of the first member of a zip archive.

    The member's raw bytes are converted to a native string with
    `tf.compat.as_str` and then split on whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    text = tf.compat.as_str(raw_bytes)
    return text.split()

In [21]:
# Load the corpus as a flat list of word tokens.
words = read_data('./text8.zip')

In [24]:
words

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against',
 'early',
 'working',
 'class',
 'radicals',
 'including',
 'the',
 'diggers',
 'of',
 'the',
 'english',
 'revolution',
 'and',
 'the',
 'sans',
 'culottes',
 'of',
 'the',
 'french',
 'revolution',
 'whilst',
 'the',
 'term',
 'is',
 'still',
 'used',
 'in',
 'a',
 'pejorative',
 'way',
 'to',
 'describe',
 'any',
 'act',
 'that',
 'used',
 'violent',
 'means',
 'to',
 'destroy',
 'the',
 'organization',
 'of',
 'society',
 'it',
 'has',
 'also',
 'been',
 'taken',
 'up',
 'as',
 'a',
 'positive',
 'label',
 'by',
 'self',
 'defined',
 'anarchists',
 'the',
 'word',
 'anarchism',
 'is',
 'derived',
 'from',
 'the',
 'greek',
 'without',
 'archons',
 'ruler',
 'chief',
 'king',
 'anarchism',
 'as',
 'a',
 'political',
 'philosophy',
 'is',
 'the',
 'belief',
 'that',
 'rulers',
 'are',
 'unnecessary',
 'and',
 'should',
 'be',
 'abolished',
 'although',
 'there',
 'are',
 'differing',
 '

In [6]:
# Maximum vocabulary size, passed to build_vocab as `vocab_size`; that count
# includes the 'UNK' entry at index 0.
VOCAB_SIZE = 50000

In [7]:
import collections
def build_vocab(words, vocab_size):
    """Map words to integer ids, replacing rare words with 'UNK' (id 0).

    Args:
        words: Re-iterable sequence of word tokens (it is traversed twice).
        vocab_size: Maximum vocabulary size including the 'UNK' entry; the
            `vocab_size - 1` most frequent words get their own id.

    Returns:
        A tuple `(index_words, counts, dictionary, reversed_dictionary)`:
        index_words: list with the id of every token in `words`, in order.
        counts: `[['UNK', unk_count], (word, count), ...]` by decreasing count.
        dictionary: word -> id, with ids assigned in `counts` order.
        reversed_dictionary: id -> word.
    """
    frequencies = collections.Counter(words)
    # A literal 'UNK' token would otherwise be inserted a second time, which
    # overwrites id 0 and makes the following word share an id with it. Fold
    # such tokens into the unknown bucket instead.
    frequencies.pop('UNK', None)

    counts = [['UNK', -1]]
    counts.extend(frequencies.most_common(vocab_size - 1))

    dictionary = {word: index for index, (word, _) in enumerate(counts)}
    # Anything outside the vocabulary falls back to id 0 ('UNK').
    index_words = [dictionary.get(word, 0) for word in words]
    counts[0][1] = index_words.count(0)

    reversed_dictionary = {index: word for word, index in dictionary.items()}
    return index_words, counts, dictionary, reversed_dictionary

In [25]:
# Encode the corpus as integer ids and keep the lookup tables for both directions.
index_words, counts, dictionary, reversed_dictionary = build_vocab(words, VOCAB_SIZE)

In [29]:
reversed_dictionary

{0: 'UNK',
 1: 'the',
 2: 'of',
 3: 'and',
 4: 'one',
 5: 'in',
 6: 'a',
 7: 'to',
 8: 'zero',
 9: 'nine',
 10: 'two',
 11: 'is',
 12: 'as',
 13: 'eight',
 14: 'for',
 15: 's',
 16: 'five',
 17: 'three',
 18: 'was',
 19: 'by',
 20: 'that',
 21: 'four',
 22: 'six',
 23: 'seven',
 24: 'with',
 25: 'on',
 26: 'are',
 27: 'it',
 28: 'from',
 29: 'or',
 30: 'his',
 31: 'an',
 32: 'be',
 33: 'this',
 34: 'which',
 35: 'at',
 36: 'he',
 37: 'also',
 38: 'not',
 39: 'have',
 40: 'were',
 41: 'has',
 42: 'but',
 43: 'other',
 44: 'their',
 45: 'its',
 46: 'first',
 47: 'they',
 48: 'some',
 49: 'had',
 50: 'all',
 51: 'more',
 52: 'most',
 53: 'can',
 54: 'been',
 55: 'such',
 56: 'many',
 57: 'who',
 58: 'new',
 59: 'used',
 60: 'there',
 61: 'after',
 62: 'when',
 63: 'into',
 64: 'american',
 65: 'time',
 66: 'these',
 67: 'only',
 68: 'see',
 69: 'may',
 70: 'than',
 71: 'world',
 72: 'i',
 73: 'b',
 74: 'would',
 75: 'd',
 76: 'no',
 77: 'however',
 78: 'between',
 79: 'about',
 80: 'over'

In [9]:
import random
def generate_sample(index_words, context_window_size):
    """Yield (center, target) skip-gram training pairs.

    For every position a window radius is drawn uniformly from
    1..context_window_size; each word within that radius is paired with the
    center word, first those before it and then those after it.
    """
    for position, center_word in enumerate(index_words):
        radius = random.randint(1, context_window_size)
        # Targets preceding the center word (clipped at the start of the sequence).
        preceding = slice(max(0, position - radius), position)
        # Targets following the center word.
        following = slice(position + 1, position + radius + 1)
        for span in (preceding, following):
            for target_word in index_words[span]:
                yield center_word, target_word

In [10]:
import numpy as np
def batch_gen(index_words, batch_size, skip_window):
    """Group skip-gram pairs from generate_sample into fixed-size numpy batches.

    Args:
        index_words: Sequence of word ids, forwarded to generate_sample.
        batch_size: Number of (center, target) pairs per yielded batch.
        skip_window: Maximum context window radius, forwarded to generate_sample.

    Yields:
        `(center_batch, target_batch)`: an int32 array of shape `(batch_size,)`
        and an array of shape `(batch_size, 1)`. The stream ends when the
        pairs run out; a trailing partial batch is dropped.
    """
    single_gen = generate_sample(index_words, skip_window)
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        # NOTE(review): no dtype is given, so targets use numpy's default
        # float dtype; left unchanged to keep the yielded values identical.
        target_batch = np.zeros([batch_size, 1])
        try:
            for index in range(batch_size):
                center_batch[index], target_batch[index] = next(single_gen)
        except StopIteration:
            # Under PEP 479 (Python 3.7+) a StopIteration escaping a generator
            # body becomes RuntimeError, so end the stream explicitly.
            return
        yield center_batch, target_batch

In [30]:
# Preview: pull one batch of 10 pairs (window radius up to 4), print it, and stop.
for center_batch, target_batch in batch_gen(index_words, 10, 4):
    print(center_batch)
    print(target_batch)
    break

[5234 3081 3081 3081 3081 3081   12   12    6    6]
[[3.081e+03]
 [5.234e+03]
 [1.200e+01]
 [6.000e+00]
 [1.950e+02]
 [2.000e+00]
 [3.081e+03]
 [6.000e+00]
 [1.200e+01]
 [1.950e+02]]


In [31]:
BATCH_SIZE = 128            # number of (center, target) pairs per batch
SKIP_WINDOW = 4             # the context window (passed to batch_gen as skip_window)
def gen():
    """Zero-argument generator delegating to batch_gen over the module-level index_words."""
    yield from batch_gen(index_words, BATCH_SIZE, SKIP_WINDOW)

In [32]:
# Reset the global default TensorFlow graph before the model is built below.
tf.reset_default_graph()

In [33]:
# Wrap the batch generator in a tf.data.Dataset. Each element is declared as a
# pair of int32 tensors: centers of shape [BATCH_SIZE] and targets of shape
# [BATCH_SIZE, 1].
dataset = tf.data.Dataset.from_generator(gen, 
                                (tf.int32, tf.int32), 
                                (tf.TensorShape([BATCH_SIZE]), tf.TensorShape([BATCH_SIZE, 1])))

In [34]:
def word2vec(dataset):
    """Build the graph for the word2vec (skip-gram, NCE loss) model and train it.

    Reads the module-level settings VOCAB_SIZE, EMBED_SIZE, NUM_SAMPLED,
    LEARNING_RATE, NUM_TRAIN_STEPS and SKIP_STEP when called. Prints the
    average loss every SKIP_STEP steps and writes the graph and summaries to
    'graphs/word2vec_simple'.

    Args:
        dataset: tf.data.Dataset whose elements are (center_words,
            target_words) pairs. Presumably int32 tensors of shape [batch]
            and [batch, 1]; confirm against the dataset construction.
    """
    # Step 1: get input, output from the dataset
    with tf.name_scope('data'):
        iterator = dataset.make_initializable_iterator()
        center_words, target_words = iterator.get_next()

    # Step 2 + 3: define weights and embedding lookup.
    # In word2vec, it's actually the weights that we care about.
    with tf.name_scope('embed'):
        embed_matrix = tf.get_variable('embed_matrix',
                                       shape=[VOCAB_SIZE, EMBED_SIZE],
                                       initializer=tf.random_uniform_initializer())
        embed = tf.nn.embedding_lookup(embed_matrix, center_words, name='embedding')

    # Step 4: construct variables for NCE loss and define loss function
    with tf.name_scope('loss'):
        nce_weight = tf.get_variable('nce_weight', shape=[VOCAB_SIZE, EMBED_SIZE],
                                     initializer=tf.truncated_normal_initializer(stddev=1.0 / (EMBED_SIZE ** 0.5)))
        tf.summary.histogram("nce_weight", nce_weight)

        nce_bias = tf.get_variable('nce_bias', initializer=tf.zeros([VOCAB_SIZE]))
        tf.summary.histogram("nce_bias", nce_bias)

        # define loss function to be NCE loss function
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                            biases=nce_bias,
                                            labels=target_words,
                                            inputs=embed,
                                            num_sampled=NUM_SAMPLED,
                                            num_classes=VOCAB_SIZE), name='loss')
        tf.summary.scalar("loss", loss)

    # Step 5: define optimizer
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)

    with tf.Session() as sess:
        sess.run(iterator.initializer)
        sess.run(tf.global_variables_initializer())

        total_loss = 0.0  # accumulated loss over the last SKIP_STEP steps
        merged_summary = tf.summary.merge_all()
        writer = tf.summary.FileWriter('graphs/word2vec_simple', sess.graph)

        # try/finally so the event file is flushed and closed even if training
        # is interrupted or raises.
        try:
            for index in range(NUM_TRAIN_STEPS):
                report_step = (index + 1) % SKIP_STEP == 0
                try:
                    if report_step:
                        # Fetch the summaries in the same run as the training
                        # step: a separate run would pull another batch from
                        # the iterator without training on it.
                        loss_batch, _, summary = sess.run([loss, optimizer, merged_summary])
                    else:
                        loss_batch, _ = sess.run([loss, optimizer])
                except tf.errors.OutOfRangeError:
                    # Dataset exhausted: restart it and move on to the next step.
                    sess.run(iterator.initializer)
                    continue

                total_loss += loss_batch
                if report_step:
                    writer.add_summary(summary, index)
                    print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                    total_loss = 0.0
        finally:
            writer.close()

In [35]:
# Model hyperparameters (read as module-level globals by word2vec)
VOCAB_SIZE = 50000          # rows of the embedding and NCE weight matrices
NUM_SAMPLED = 64            # number of negative examples to sample
EMBED_SIZE = 128            # columns of the embedding and NCE weight matrices
LEARNING_RATE = 1.0         # learning rate given to GradientDescentOptimizer
NUM_TRAIN_STEPS = 100000    # number of iterations of the training loop
SKIP_STEP = 5000            # report the average loss every SKIP_STEP steps

In [36]:
# Build the model graph and run the training loop on the batched dataset.
word2vec(dataset)

Average loss at step 4999:  65.6
Average loss at step 9999:  18.6
Average loss at step 14999:   9.9
Average loss at step 19999:   7.6
Average loss at step 24999:   6.2
Average loss at step 29999:   5.6
Average loss at step 34999:   5.4
Average loss at step 39999:   5.2
Average loss at step 44999:   5.1
Average loss at step 49999:   5.2
Average loss at step 54999:   5.1
Average loss at step 59999:   5.1
Average loss at step 64999:   5.1
Average loss at step 69999:   5.1
Average loss at step 74999:   5.0
Average loss at step 79999:   5.1
Average loss at step 84999:   5.1
Average loss at step 89999:   5.1
Average loss at step 94999:   5.0
Average loss at step 99999:   5.0


In [1]:
x

NameError: name 'x' is not defined