### Word2Vec

In [64]:
# necessary imports
from __future__ import division, print_function, absolute_import
import collections
import os
import random
# to download from a particular website
import urllib
import zipfile
import numpy as np
import tensorflow as tf

In [65]:
# download the text corpus
url = 'http://mattmahoney.net/dc/text8.zip'
data_path = 'text8.zip'
if not os.path.exists(data_path):
    print("Downloading the dataset...")
    filename, _ = urllib.request.urlretrieve(url, data_path)
    print("Done!")
# unzip the file and read the dataset mfile by file and append inside a list 
with zipfile.ZipFile(data_path) as f:
    # f.namelist() reads full filepath of files and first element is the filename
    text_words = f.read(f.namelist()[0]).lower().split()

In [66]:
# count reverse by steps and endpoint
# for i in range(16 - 1, -1, -1):
#     print(i)
# example = dict()
# example['mouse'] = 1
# example['keyboard'] = 2
# example['CPU'] = 3
# example.get('CPUS', 0)

In [67]:
# we need a max vocab size to limit the vocab
max_vocab_size = 50000
# extend adds to a list as it is described and append decouples individual elements and adds to the element
count = [('UNK', -1)]
# https://www.hackerrank.com/challenges/collections-counter/problem to understand how counter works
count.extend(collections.Counter(text_words).most_common(max_vocab_size - 1))

In [68]:
str_count = list()
for i in range(len(count) - 1):
    if i !=0 :
        str_count.append((count[i][0].decode("utf-8"), count[i][1]))
    else:
        str_count.append((count[i][0], count[i][1]))

In [69]:
# We need a min count of individual words so that we don't have to work with rare words
min_occurrence = 10
from nltk.corpus import stopwords
stops = set(stopwords.words('english'))
# we are popping out infrequent words out of the vocab
for i in range(len(str_count) -1, -1, -1):
    if str_count[i][1] < min_occurrence:
        str_count.pop(i)
    elif str_count[i][0] in stops:
        str_count.pop(i)
    else:
        pass
# compute vocab size
vocab_size = len(str_count)
# assign ids to individual words
word2id = dict()
for i, (word, _) in enumerate(str_count):
    word2id[word] = i

data = list()
unk_count = 0
# retrieve a word id or assign it to index 0 ('UNK') if it's not in the dictionary
for word in text_words:
    # either you get the id of the word; if not found then return 0
    index = word2id.get(word, 0)
    if index == 0:
        unk_count += 1
    data.append(index)
# changing the count of unknown words in the dictionary
str_count[0] = ('UNK', unk_count)
id2word = dict(zip(word2id.values(), word2id.keys()))

print("word count: ", len(text_words))
print("unique word count: ", len(set(text_words)))
print("vocab size: ", vocab_size)
print("top 10 words: ", str_count[:10])

word count:  17005207
unique word count:  253854
vocab size:  46984
top 10 words:  [('UNK', 17005207), ('zero', 264975), ('nine', 250430), ('two', 192644), ('eight', 125285), ('five', 115789), ('three', 114775), ('four', 108182), ('six', 102145), ('seven', 99683)]


In [70]:
batch_size = 128
skip_window = 3 # how many words to consider on left and right
num_skips = 2 # how many times to consider an input to generate labels
neg_samples = 64 # Number of negative examples to sample
# to understand the parameters
# https://stackoverflow.com/questions/47302947/understanding-input-and-labels-in-word2vec-tensorflow
def next_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size,1), dtype=np.int32)
    span = 2 * skip_window + 1
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span

#### Example 1 - num_skips=4
batch, labels = generate_batch(batch_size=8, num_skips=4, skip_window=2)  
It generates 4 labels for each word, i.e. uses the whole context; since batch_size=8 only 2 words are processed in this batch (12 and 6), the rest will go into the next batch:  

data = [5239, 3084, 12, 6, 195, 2, 3137, 46, 59, 156, 128, 742, 477, 10572, ...]  
batch = [12 12 12 12  6  6  6  6]  
labels = [[6 3084 5239 195 195 3084 12 2]]  

#### Example 2 - num_skips=2  
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=2)  
Here you would expect each word appear twice in the batch sequence; the 2 labels are randomly sampled from 4 possible words:  

data = [5239, 3084, 12, 6, 195, 2, 3137, 46, 59, 156, 128, 742, 477, 10572, ...]  
batch = [ 12  12   6   6 195 195   2   2]  
labels = [[ 195 3084   12  195 3137   12   46  195]]  

#### Example 3 - num_skips=1  
batch, labels = generate_batch(batch_size=8, num_skips=1, skip_window=2)  
Finally, this setting, same as yours, produces exactly one label per each word; each label is drawn randomly from the 4-word context:  

data = [5239, 3084, 12, 6, 195, 2, 3137, 46, 59, 156, 128, 742, 477, 10572, ...]  
batch = [  12    6  195    2 3137   46   59  156]  
labels = [[  6  12  12 195  59 156  46  46]]  

In [74]:
# Word2Vec global Parameters
embedding_size = 200 # dimension of an embedding vector
max_vocabulary_size = 50000 # total number of different words that should be in the vocabulary
min_occurrence = 10 # remove all words that does not appears at least n=10 times
num_sampled = 64 # number of negative examples to sample

In [72]:
data_index = 0 # initialize
# Generate training batch for the skip-gram model
def next_batch(batch_size, num_skips, skip_window):
    global data_index # pointing to the out of scope variable
    assert batch_size % num_skips == 0 # batchsize should be multiple of num_skips; see above example
    assert num_skips <= 2 * skip_window # so one can get proper sampled labels from both side of the input
    batch = np.ndarray(shape=(batch_size), dtype=np.int32) # see above input
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) # see above input
    # get window size (words left and right + current one)
    span = 2 * skip_window + 1 # verbose
    buffer = collections.deque(maxlen=span) # double ended queue to hold the set (w_-2, w_-1, w, w_+1, w_+2)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    """
    setting up ip, label parallelly 
    batch = [12 12 12 12 6 6 6 6]
    labels = [[6 3084 5239 195 195 3084 12 2]]
    """ 
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window] # choosing all context words
        words_to_use = random.sample(context_words, num_skips) # randomly sample context words
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window] # choosing 12, 12, 12, 12 iteratively
            labels[i * num_skips + j, 0] = buffer[context_word] # choosing 6, 3084, 5239, 195
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data) # so that we get all the words for training
    return batch, labels

#### Understand 'None' shape  
https://stackoverflow.com/questions/46940857/what-is-the-difference-between-none-none-and-for-the-shape-of-a-placeh  
TensorFlow uses arrays rather than tuples. It converts tuples to arrays. Therefore [] and () are equivalent.  

Now, consider this code example:  

x = tf.placeholder(dtype=tf.int32, shape=[], name="foo1")  
y = tf.placeholder(dtype=tf.int32, shape=[None], name="foo2")  
z = tf.placeholder(dtype=tf.int32, shape=None, name="foo3")  

val1 = np.array((1, 2, 3))  
val2 = 45  

with tf.Session() as sess:  
    sess.run(tf.global_variables_initializer())  

    #print(sess.run(x, feed_dict = {x: val1}))  # Fails  
    print(sess.run(y, feed_dict = {y: val1}))  
    print(sess.run(z, feed_dict = {z: val1}))  
  
    print(sess.run(x, feed_dict = {x: val2}))  
    #print(sess.run(y, feed_dict = {y: val2}))  # Fails  
    print(sess.run(z, feed_dict = {z: val2}))  
As can be seen, placeholder with [] shape takes a single scalar value directly. Placeholder with [None] shape   takes a 1-dimensional array and placeholder with None shape can take in any value while computation takes place.

#### NCE Loss: https://mk-minchul.github.io/NCE/

#### keep_dims tutorial
- a  = array([[0, 0, 0],[0, 1, 0],[0, 2, 0],[1, 0, 0],[1, 1, 0]])  
- np.sum(a, keepdims=True) = array([[6]])  
- np.sum(a, keepdims=False) = 6  
- np.sum(a, axis=1, keepdims=True) = array([[0],[1],[2],[1],[2]])  
- np.sum(a, axis=1, keepdims=False) = array([0, 1, 2, 1, 2])     
- np.sum(a, axis=0, keepdims=True) = array([[2, 4, 0]])    
- np.sum(a, axis=0, keepdims=False) = array([2, 4, 0])  

#### transpose_b = True tutorial
x = tf.constant([1.,2.,3.], shape = (3,2,4))  
y = tf.constant([1.,2.,3.], shape = (3,21,4))  
tf.matmul(x,y)                     # Doesn't work.   
tf.matmul(x,y,transpose_b = True)  # This works. Shape is (3,2,21)  
tf.matmul(x,tf.transpose(y))       # Doesn't work.  

In [79]:
# training parameters
learning_rate = 0.1
batch_size = 128
num_steps = 3000000
display_step = 10000
eval_step = 200000

In [80]:
# setting up inputs
X = tf.placeholder(tf.int32, shape=[None])
# setting up labels
Y = tf.placeholder(tf.int32, shape=[None, 1])
with tf.device('/cpu:0'):
    # embedding layer variable (each row represent a word embedding vector)
    embedding = tf.Variable(tf.random_normal([vocab_size, embedding_size]))
    # lookup for corresponding embedding
    X_embed = tf.nn.embedding_lookup(embedding, X)

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(tf.random_normal([vocab_size, embedding_size]))
    nce_biases = tf.Variable(tf.zeros([vocab_size]))
    
# computation of average NCE loss for a batch
loss_op = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights,
                   biases=nce_biases,
                   labels=Y,
                   inputs=X_embed,
                   num_sampled=num_sampled,
                   num_classes=vocab_size))

# optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.minimize(loss_op)

# computing similarity between input and embeddings
X_embed_norm = X_embed / tf.sqrt(tf.reduce_sum(tf.square(X_embed)))
embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
cosine_sim_op = tf.matmul(X_embed_norm, embedding_norm, transpose_b=True)

In [84]:
# evaluation parameters
eval_words = ['five', 'man', 'going', 'hardware', 'american', 'britain']

In [None]:
# initializing the variables
init = tf.global_variables_initializer()

with tf.Session() as sess:
    # Run the initializer
    sess.run(init)
    # Testing data
    x_test = np.array([word2id[w] for w in eval_words])
    average_loss = 0
    for step in range(1, num_steps + 1):
        # Get a new batch of data
        batch_x, batch_y = next_batch(batch_size, num_skips, skip_window)
        # Run training op
        _, loss = sess.run([train_op, loss_op], feed_dict={X: batch_x, Y: batch_y})
        average_loss += loss
        if step % display_step == 0 or step == 1:
            if step > 1:
                average_loss /= display_step
            print("Step " + str(step) + ", Average Loss= " + \
                  "{:.4f}".format(average_loss))
            average_loss = 0
        # evaluation
        if step % eval_step == 0 or step == 1:
            print("Evaluation...")
            sim = sess.run(cosine_sim_op, feed_dict={X: x_test})
            for i in range(len(eval_words)):
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = '"%s" nearest neighbors:' % eval_words[i]
                for k in range(top_k):
                    log_str = '%s %s,' % (log_str, id2word[nearest[k]])
                print(log_str)