# Word2Vec
As described in [A Word2Vec Keras tutorial](https://adventuresinmachinelearning.com/word2vec-keras-tutorial/)


![](https://i0.wp.com/adventuresinmachinelearning.com/wp-content/uploads/2017/08/Negative-sampling-architecture-1.jpg)

In [1]:
from keras.models import Model
from keras.layers import Input, Dense, Reshape, Lambda, dot
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence

import urllib
import urllib.request
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf


Using TensorFlow backend.


In [2]:
def maybe_download(filename, url, expected_bytes):
    """Fetch ``url + filename`` unless it already exists locally, then size-check it.

    Args:
        filename: Local path of the file; also appended to ``url`` to form
            the download address.
        url: Base URL the file is fetched from when it is not on disk.
        expected_bytes: Exact size in bytes the file must have.

    Returns:
        The path of the verified file.

    Raises:
        Exception: If the size on disk differs from ``expected_bytes``.
    """
    already_present = os.path.exists(filename)
    if not already_present:
        filename, _headers = urllib.request.urlretrieve(url + filename, filename)

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        # Show the size that was actually found before failing.
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return filename


# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words.

    Args:
        filename: Path to a zip archive.

    Returns:
        The whitespace-separated tokens of the archive's first member,
        decoded as UTF-8.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    # Plain bytes -> str decoding needs only the standard library, so this
    # helper does not depend on TensorFlow being importable.
    return raw_bytes.decode('utf-8').split()

In [3]:
# Read the data into a list of strings.
# NOTE(review): this re-defines read_data from the previous cell; being the
# later definition, it is the one in effect after a top-to-bottom run.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words.

    Args:
        filename: Path to a zip archive.

    Returns:
        The whitespace-separated tokens of the archive's first member,
        decoded as UTF-8.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    # Plain bytes -> str decoding needs only the standard library, so this
    # helper does not depend on TensorFlow being importable.
    return raw_bytes.decode('utf-8').split()


def build_dataset(words, n_words):
    """Encode a token list as integer ids over a fixed-size vocabulary.

    Args:
        words: Sequence of word tokens.
        n_words: Vocabulary size, including the ``'UNK'`` slot at id 0.

    Returns:
        A 4-tuple ``(data, count, dictionary, reversed_dictionary)``:
        ``data`` holds one integer id per input word (0 for words outside
        the vocabulary), ``count`` starts with ``['UNK', <number of unknown
        words>]`` followed by the ``n_words - 1`` most common
        ``(word, occurrences)`` pairs, ``dictionary`` maps word -> id and
        ``reversed_dictionary`` maps id -> word.
    """
    count = [['UNK', -1]] + collections.Counter(words).most_common(n_words - 1)

    # Ids are handed out in frequency order, with 'UNK' taking id 0.
    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    # Out-of-vocabulary words fall back to id 0 (dictionary['UNK']).
    data = [dictionary.get(token, 0) for token in words]
    count[0][1] = sum(1 for token in words if token not in dictionary)

    reversed_dictionary = {idx: token for token, idx in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

def collect_data(vocabulary_size=10000):
    """Download text8 if necessary and encode it as integer word ids.

    Args:
        vocabulary_size: Number of vocabulary entries handed to
            ``build_dataset``.

    Returns:
        The ``(data, count, dictionary, reverse_dictionary)`` tuple
        produced by ``build_dataset``.
    """
    base_url = 'http://mattmahoney.net/dc/'
    archive_path = maybe_download('text8.zip', base_url, 31344016)
    words = read_data(archive_path)
    print(words[:7])
    encoded = build_dataset(words, vocabulary_size)
    del words  # Hint to reduce memory.
    data, count, dictionary, reverse_dictionary = encoded
    return data, count, dictionary, reverse_dictionary


### Comments on above

#####  `data` is a vector of integers representing the text.
For example

    data[:10]
    [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]
    
##### `count` is a list of word/count pairs
where the count is the number of times that word appeared in the text. The first entry, `'UNK'`, holds the number of words that fall outside the vocabulary.

    [['UNK', 1737307],
     ('the', 1061396),
     ('of', 593677),
     ('and', 416629),
     ('one', 411764),
     ('in', 372201),
     
##### `dictionary` is the mapping (a Python dictionary) from words to their associated integers

    {'UNK': 0,
     'the': 1,
     'of': 2,
     'and': 3,
     'one': 4,
     'in': 5,
     'a': 6,
     
     
##### `reverse_dictionary` is the mapping (a Python dictionary) from integers to their associated words

     reverse_dictionary[5234]
     'anarchism'

In [4]:
# Build the integer-encoded corpus and the vocabulary lookups.
vocab_size = 10000
data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)
print(data[:7])

# Hyper-parameters read by the later cells:
#   window_size - window passed to skipgrams() below
#   vector_dim  - output dimension of the Embedding layer
#   epochs      - number of iterations of the training loop; each iteration
#                 trains on a single (target, context, label) sample
window_size = 3
vector_dim = 300
epochs = 200000

# NOTE(review): no random seed is set, so the validation words drawn here
# will differ from run to run.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

# Generate (target, context) word-id pairs together with their labels.
sampling_table = sequence.make_sampling_table(vocab_size)
print("Making skipgrams")
couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
print("Skipgrams complete")
# Split the pairs into two parallel int32 arrays: targets and contexts.
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

print(couples[:10], labels[:10])
print("Done")

Found and verified text8.zip
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']
[5234, 3081, 12, 6, 195, 2, 3134]
Making skipgrams
Skipgrams complete
[[4146, 7443], [319, 1129], [2125, 5114], [8736, 9584], [7562, 74], [7035, 3553], [4374, 198], [4025, 8934], [3223, 1796], [2518, 8192]] [0, 1, 1, 0, 1, 0, 1, 1, 1, 0]
Done


In [5]:
import keras.layers
# create some input variables: one word id per sample for each input
input_target = Input((1,))
input_context = Input((1,))

# A single Embedding layer is shared by the target and the context word, so
# both are looked up in the same embedding matrix.
embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
target = embedding(input_target)
# Reshape each embedded word to a (vector_dim, 1) column per sample.
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)





## Set up the cosine similarity operation
The similarity is output by a secondary model.

```similarity = keras.layers.merge([target, context], mode='cos', dot_axes=0)```

In [6]:
from keras import backend as K

def cosine_distance(vests):
    """Return the negative cosine similarity of two batches of vectors.

    Args:
        vests: A pair ``[x, y]`` of tensors with the batch on axis 0. Every
            other axis is flattened, so both ``(batch, dim)`` and the
            ``(batch, vector_dim, 1)`` tensors produced by the Reshape
            layers in this notebook are accepted.

    Returns:
        A ``(batch, 1)`` tensor holding ``-cos(x, y)`` for each sample.
    """
    x, y = vests
    # Flatten to (batch, dim) first: the embeddings arrive as
    # (batch, vector_dim, 1), and normalising/reducing over their last axis
    # (size 1) would never combine the embedding components.
    x = K.l2_normalize(K.batch_flatten(x), axis=-1)
    y = K.l2_normalize(K.batch_flatten(y), axis=-1)
    # The sum of the element-wise product of unit vectors is their cosine.
    return -K.sum(x * y, axis=-1, keepdims=True)


def cos_dist_output_shape(shapes):
    """Output shape of the cosine layer: one value per sample."""
    shape1, shape2 = shapes
    return (shape1[0], 1)


def cosine_similarity(vests):
    """Return the cosine similarity of two batches of vectors, shape (batch, 1)."""
    return -cosine_distance(vests)


# The nearest-neighbour lookup sorts by descending score, so the validation
# output must be a similarity (higher = closer), not a distance.
similarity = Lambda(cosine_similarity, output_shape=cos_dist_output_shape)([target, context])

In [7]:

# now perform the dot product operation to get a similarity measure
# (contracts the embedding axis of the two (vector_dim, 1) tensors)
dot_product = dot([target, context], axes=1)
dot_product = Reshape((1,))(dot_product)
# add the sigmoid output layer
output = Dense(1, activation='sigmoid')(dot_product)
# create the primary training model
# (Keras 2 spells the keyword arguments `inputs` / `outputs`)
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

# create a secondary validation model to run our similarity checks during training
validation_model = Model(inputs=[input_target, input_context], outputs=similarity)

  import sys
  # This is added back by InteractiveShellApp.init_path()


In [8]:
class SimilarityCallback:
    """Prints the nearest vocabulary words for each validation word.

    Reads the notebook-level names ``valid_size``, ``valid_examples``,
    ``reverse_dictionary``, ``vocab_size`` and ``validation_model``.
    """

    def run_sim(self):
        """Print one 'Nearest to <word>: ...' line per validation word."""
        top_k = 8  # number of nearest neighbors
        for i in range(valid_size):
            valid_idx = valid_examples[i]
            valid_word = reverse_dictionary[valid_idx]
            sim = self._get_sim(valid_idx)
            # Highest score first; position 0 is skipped on the assumption
            # that it is the query word itself.
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            # Iterate over what is actually available so that a vocabulary
            # with fewer than top_k + 1 words does not raise IndexError.
            for neighbour_idx in nearest:
                close_word = reverse_dictionary[neighbour_idx]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        """Score ``valid_word_idx`` against every word id in the vocabulary.

        Args:
            valid_word_idx: Integer id of the query word.

        Returns:
            A 1-D float array of length ``vocab_size`` whose entry ``i`` is
            the validation model's output for the pair (query, ``i``).

        Raises:
            ValueError: If the validation model does not return exactly one
                value per vocabulary word.
        """
        # Score all pairs in one batch instead of one predict call per word.
        target_ids = np.full((vocab_size,), valid_word_idx, dtype=np.float64)
        context_ids = np.arange(vocab_size, dtype=np.float64)
        out = validation_model.predict_on_batch([target_ids, context_ids])
        sim = np.asarray(out, dtype=np.float64).reshape(-1)
        if sim.shape[0] != vocab_size:
            raise ValueError(
                'validation_model returned %d values for %d word pairs'
                % (sim.shape[0], vocab_size))
        return sim
sim_cb = SimilarityCallback()

In [9]:
# One-element buffers reused for every single-sample training step.
arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
for cnt in range(epochs):
    # randint's upper bound is exclusive, so len(labels) lets every sample,
    # including the last one, be drawn.
    idx = np.random.randint(0, len(labels))
    arr_1[0,] = word_target[idx]
    arr_2[0,] = word_context[idx]
    arr_3[0,] = labels[idx]
    loss = model.train_on_batch([arr_1, arr_2], arr_3)
    if cnt % 100 == 0:
        print("Iteration {}, loss={}".format(cnt, loss))
    #if cnt % 10000 == 0:
    #    sim_cb.run_sim()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Iteration 0, loss=0.6941439509391785
Iteration 100, loss=0.6880890727043152
Iteration 200, loss=0.6850840449333191
Iteration 300, loss=0.6926901936531067
Iteration 400, loss=0.6923497319221497
Iteration 500, loss=0.7010322213172913
Iteration 600, loss=0.6914136409759521
Iteration 700, loss=0.6848492622375488
Iteration 800, loss=0.694166898727417
Iteration 900, loss=0.6878600120544434
Iteration 1000, loss=0.7022818326950073
Iteration 1100, loss=0.7062225341796875
Iteration 1200, loss=0.6784406304359436
Iteration 1300, loss=0.6816210150718689
Iteration 1400, loss=0.6672200560569763
Iteration 1500, loss=0.7087903022766113
Iteration 1600, loss=0.7182415127754211
Iteration 1700, loss=0.6701890826225281
Iteration 1800, loss=0.6695064306259155
Iteration 1900, loss=0.7361277341842651
Iteration 2000, loss=0.6835259199142456
Iteration 2100, loss=0.6775702238082886
Iteration 2200, loss=0.7121732831001282
Iteration 2300, loss=0.6769914031028748
Iteration 2400, loss=0.6826419830322266
Iteration 250

Iteration 20400, loss=0.6622669100761414
Iteration 20500, loss=0.7170363664627075
Iteration 20600, loss=0.7304093241691589
Iteration 20700, loss=0.7161978483200073
Iteration 20800, loss=0.7165038585662842
Iteration 20900, loss=0.7226304411888123
Iteration 21000, loss=0.7572735548019409
Iteration 21100, loss=0.7154245376586914
Iteration 21200, loss=0.6586565375328064
Iteration 21300, loss=0.6768648028373718
Iteration 21400, loss=0.7105293273925781
Iteration 21500, loss=0.7291004657745361
Iteration 21600, loss=0.7143198847770691
Iteration 21700, loss=0.6677255630493164
Iteration 21800, loss=0.7091987133026123
Iteration 21900, loss=0.6813116669654846
Iteration 22000, loss=0.7038812637329102
Iteration 22100, loss=0.6754049062728882
Iteration 22200, loss=0.7089195847511292
Iteration 22300, loss=0.697666585445404
Iteration 22400, loss=0.6989480257034302
Iteration 22500, loss=0.7041927576065063
Iteration 22600, loss=0.692986786365509
Iteration 22700, loss=0.7045753002166748
Iteration 22800, l

KeyboardInterrupt: 

In [10]:
# Inspect the table built by sequence.make_sampling_table(vocab_size).
sampling_table

array([0.00315225, 0.00315225, 0.00547597, ..., 0.98914884, 0.98920336,
       0.98925789])

In [11]:
# Inspect the word -> integer id mapping returned by collect_data.
# NOTE(review): this displays the whole mapping; consider showing a slice.
dictionary

{'UNK': 0,
 'the': 1,
 'of': 2,
 'and': 3,
 'one': 4,
 'in': 5,
 'a': 6,
 'to': 7,
 'zero': 8,
 'nine': 9,
 'two': 10,
 'is': 11,
 'as': 12,
 'eight': 13,
 'for': 14,
 's': 15,
 'five': 16,
 'three': 17,
 'was': 18,
 'by': 19,
 'that': 20,
 'four': 21,
 'six': 22,
 'seven': 23,
 'with': 24,
 'on': 25,
 'are': 26,
 'it': 27,
 'from': 28,
 'or': 29,
 'his': 30,
 'an': 31,
 'be': 32,
 'this': 33,
 'which': 34,
 'at': 35,
 'he': 36,
 'also': 37,
 'not': 38,
 'have': 39,
 'were': 40,
 'has': 41,
 'but': 42,
 'other': 43,
 'their': 44,
 'its': 45,
 'first': 46,
 'they': 47,
 'some': 48,
 'had': 49,
 'all': 50,
 'more': 51,
 'most': 52,
 'can': 53,
 'been': 54,
 'such': 55,
 'many': 56,
 'who': 57,
 'new': 58,
 'used': 59,
 'there': 60,
 'after': 61,
 'when': 62,
 'into': 63,
 'american': 64,
 'time': 65,
 'these': 66,
 'only': 67,
 'see': 68,
 'may': 69,
 'than': 70,
 'world': 71,
 'i': 72,
 'b': 73,
 'would': 74,
 'd': 75,
 'no': 76,
 'however': 77,
 'between': 78,
 'about': 79,
 'over': 80

In [12]:
# First ten integer word ids of the encoded corpus.
data[:10]

[5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]

In [13]:
# Look up the word stored under id 5234 in the id -> word mapping.
reverse_dictionary[5234]

'anarchism'

In [14]:
# Inspect the word/occurrence entries; the first entry is the 'UNK' bucket.
# NOTE(review): this displays every entry; consider showing a slice.
count

[['UNK', 1737307],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430),
 ('two', 192644),
 ('is', 183153),
 ('as', 131815),
 ('eight', 125285),
 ('for', 118445),
 ('s', 116710),
 ('five', 115789),
 ('three', 114775),
 ('was', 112807),
 ('by', 111831),
 ('that', 109510),
 ('four', 108182),
 ('six', 102145),
 ('seven', 99683),
 ('with', 95603),
 ('on', 91250),
 ('are', 76527),
 ('it', 73334),
 ('from', 72871),
 ('or', 68945),
 ('his', 62603),
 ('an', 61925),
 ('be', 61281),
 ('this', 58832),
 ('which', 54788),
 ('at', 54576),
 ('he', 53573),
 ('also', 44358),
 ('not', 44033),
 ('have', 39712),
 ('were', 39086),
 ('has', 37866),
 ('but', 35358),
 ('other', 32433),
 ('their', 31523),
 ('its', 29567),
 ('first', 28810),
 ('they', 28553),
 ('some', 28161),
 ('had', 28100),
 ('all', 26229),
 ('more', 26223),
 ('most', 25563),
 ('can', 25519),
 ('been', 25383),
 ('such', 24413),
 ('many',