# Word2Vec
As described in [A Word2Vec Keras tutorial](https://adventuresinmachinelearning.com/word2vec-keras-tutorial/)


![](https://i0.wp.com/adventuresinmachinelearning.com/wp-content/uploads/2017/08/Negative-sampling-architecture-1.jpg)

In [1]:
from keras.models import Model
from keras.layers import Input, Dense, Reshape, Lambda, dot
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence

import urllib
import urllib.request
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf


Using TensorFlow backend.


In [2]:
def maybe_download(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes || 1 == 1:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename


# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

In [4]:
# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data


def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

def collect_data(vocabulary_size=10000):
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', url, 31344016)
    vocabulary = read_data("data1.zip")
    print(vocabulary[:7])
    data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                                vocabulary_size)
    del vocabulary  # Hint to reduce memory.
    return data, count, dictionary, reverse_dictionary


### Comments on above

#####  `data` is a vector of integers representing the text.
For example

    data[:10]
    [5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]
    
##### `count`     is an array of word count tuples
Where the count is the number of times that word appeared in the text

    [['UNK', 1737307],
     ('the', 1061396),
     ('of', 593677),
     ('and', 416629),
     ('one', 411764),
     ('in', 372201),
     
##### `dictionary` is the mapping (a Python dictionary) from words to their associated integers

    {'UNK': 0,
     'the': 1,
     'of': 2,
     'and': 3,
     'one': 4,
     'in': 5,
     'a': 6,
     
     
##### `reverse_dictionary` is the mapping (a Python dictionary) from integers to their associated words

     reverse_dictionary[5234]
     'anarchism'

In [46]:
vocab_size = 25
data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)
print(data[:7])

window_size = 1
vector_dim = 4
epochs = 200000

valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

sampling_table = sequence.make_sampling_table(vocab_size)
print("Making skipgrams")
couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
print("Skipgrams complete")
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

print(couples[:10], labels[:10])
print("Done")

Found and verified text8.zip
['the', 'lazy', 'old', 'dog', 'sat', 'on', 'the']
[1, 2, 3, 11, 4, 5, 1]
Making skipgrams
Skipgrams complete
[[12, 8], [5, 9], [17, 4], [17, 3], [12, 14], [20, 15], [20, 3], [12, 4], [14, 21], [11, 4]] [0, 0, 1, 1, 0, 0, 1, 1, 0, 1]
Done


In [45]:
#len(couples)
def printPair(e):
    pair = e[0]
    label = e[1]
    print(reverse_dictionary[pair[0]], reverse_dictionary[pair[1]], label)
    
for couple in zip(couples, labels):    
    printPair(couple)
#for w in data:
#    print(reverse_dictionary[w])
max(data)


sat cow 0
chair. the 1
sat on 1
on the 1
chair. lazy 0
sat cat 0
on sat 1
the lazy 1
on the 1
on sat 1
on toad 0
the poodle 0
sat pig 1
on bear 0
on frog 0
chair. the 1
the bird 0
chair. lazy 0
the couch. 1
on ant 0


23

In [48]:
import keras.layers
# create some input variables
input_target = Input((1,))
input_context = Input((1,))

embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)





## Setup Cosine Similarity Operation
 which will be output in a secondary model
 
 ```similarity = keras.layers.merge([target, context], mode='cos', dot_axes=0)```

In [49]:
from keras import backend as K

def cosine_distance(vests):
    x, y = vests
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return -K.mean(x * y, axis=-1, keepdims=True)

def cos_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0],1)

similarity = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([target, context])

In [51]:

# now perform the dot product operation to get a similarity measure
dot_product = dot([target, context],  1)
dot_product = Reshape((1,))(dot_product)
# add the sigmoid output layer
output = Dense(1, activation='sigmoid')(dot_product)
# create the primary training model
model = Model(input=[input_target, input_context], output=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

# create a secondary validation model to run our similarity checks during training
validation_model = Model(input=[input_target, input_context], output=similarity)

  
  if sys.path[0] == '':


In [53]:
class SimilarityCallback:
    def run_sim(self):
        for i in range(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 8  # number of nearest neighbors
            sim = self._get_sim(valid_examples[i])
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        in_arr1[0,] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = validation_model.predict_on_batch([in_arr1, in_arr2])
            sim[i] = out
        return sim
sim_cb = SimilarityCallback()

In [58]:
epochs = 5000

arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
for cnt in range(epochs):
    idx = np.random.randint(0, len(labels)-1)
    arr_1[0,] = word_target[idx]
    arr_2[0,] = word_context[idx]
    arr_3[0,] = labels[idx]
    loss = model.train_on_batch([arr_1, arr_2], arr_3)
    if cnt % 5 == 0:
        print("Iteration {}, loss={}".format(cnt, loss))
    #if cnt % 10000 == 0:
    #    sim_cb.run_sim()

Iteration 0, loss=0.6981566548347473
Iteration 5, loss=0.6829057335853577
Iteration 10, loss=0.6819024682044983
Iteration 15, loss=0.6837921738624573
Iteration 20, loss=0.6953096985816956
Iteration 25, loss=0.6962173581123352
Iteration 30, loss=0.6916033029556274
Iteration 35, loss=0.6869698166847229
Iteration 40, loss=0.6945284605026245
Iteration 45, loss=0.6833014488220215
Iteration 50, loss=0.6877935528755188
Iteration 55, loss=0.6887046098709106
Iteration 60, loss=0.6937412619590759
Iteration 65, loss=0.6854725480079651
Iteration 70, loss=0.6796717643737793
Iteration 75, loss=0.6833245158195496
Iteration 80, loss=0.6820929646492004
Iteration 85, loss=0.6778215765953064
Iteration 90, loss=0.6936081647872925
Iteration 95, loss=0.6771029829978943
Iteration 100, loss=0.6859501004219055
Iteration 105, loss=0.6809224486351013
Iteration 110, loss=0.6759130358695984
Iteration 115, loss=0.6847410202026367
Iteration 120, loss=0.6749975681304932
Iteration 125, loss=0.6808058619499207
Iteratio

Iteration 1100, loss=0.06314966827630997
Iteration 1105, loss=0.19261281192302704
Iteration 1110, loss=0.1804337203502655
Iteration 1115, loss=0.05840502679347992
Iteration 1120, loss=0.040040940046310425
Iteration 1125, loss=0.16821308434009552
Iteration 1130, loss=0.1982746124267578
Iteration 1135, loss=0.1749218851327896
Iteration 1140, loss=0.42108631134033203
Iteration 1145, loss=0.19781191647052765
Iteration 1150, loss=0.6968172192573547
Iteration 1155, loss=0.1991548389196396
Iteration 1160, loss=0.1355084478855133
Iteration 1165, loss=0.1879572570323944
Iteration 1170, loss=0.37981826066970825
Iteration 1175, loss=0.38915306329727173
Iteration 1180, loss=0.15832969546318054
Iteration 1185, loss=0.18492698669433594
Iteration 1190, loss=0.38033878803253174
Iteration 1195, loss=0.6772533655166626
Iteration 1200, loss=0.1465597152709961
Iteration 1205, loss=0.3516482412815094
Iteration 1210, loss=0.1426699012517929
Iteration 1215, loss=0.060782693326473236
Iteration 1220, loss=0.03

Iteration 2185, loss=0.031060341745615005
Iteration 2190, loss=0.003262300742790103
Iteration 2195, loss=0.043989118188619614
Iteration 2200, loss=0.005497150123119354
Iteration 2205, loss=0.0029377532191574574
Iteration 2210, loss=0.00014151146751828492
Iteration 2215, loss=0.0007971721352078021
Iteration 2220, loss=0.04144613817334175
Iteration 2225, loss=0.008924012072384357
Iteration 2230, loss=2.7776157367043197e-05
Iteration 2235, loss=0.0011793774319812655
Iteration 2240, loss=7.033374004095094e-06
Iteration 2245, loss=0.028556592762470245
Iteration 2250, loss=0.026632016524672508
Iteration 2255, loss=0.13539950549602509
Iteration 2260, loss=0.0038316843565553427
Iteration 2265, loss=0.0002870375174097717
Iteration 2270, loss=5.006800165574532e-06
Iteration 2275, loss=1.919287387863733e-05
Iteration 2280, loss=0.12691479921340942
Iteration 2285, loss=0.6022212505340576
Iteration 2290, loss=0.02034369483590126
Iteration 2295, loss=0.0059008789248764515
Iteration 2300, loss=2.6458

Iteration 3195, loss=4.291543518775143e-06
Iteration 3200, loss=0.0025599044747650623
Iteration 3205, loss=0.5932435393333435
Iteration 3210, loss=3.1312392820836976e-05
Iteration 3215, loss=1.192093321833454e-07
Iteration 3220, loss=0.0010980465449392796
Iteration 3225, loss=0.0023342773783951998
Iteration 3230, loss=1.0848104466276709e-05
Iteration 3235, loss=2.794676402118057e-05
Iteration 3240, loss=0.006219996605068445
Iteration 3245, loss=8.344653110725631e-07
Iteration 3250, loss=1.192093321833454e-07
Iteration 3255, loss=2.6226073259749683e-06
Iteration 3260, loss=1.0000001537946446e-07
Iteration 3265, loss=0.0009512025862932205
Iteration 3270, loss=7.394205749733374e-05
Iteration 3275, loss=1.192093321833454e-07
Iteration 3280, loss=0.0009583652135916054
Iteration 3285, loss=1.192093321833454e-07
Iteration 3290, loss=0.005477644968777895
Iteration 3295, loss=0.49990761280059814
Iteration 3300, loss=6.186535028973594e-05
Iteration 3305, loss=0.488267183303833
Iteration 3310, lo

Iteration 4255, loss=1.0000001537946446e-07
Iteration 4260, loss=4.424536746228114e-05
Iteration 4265, loss=2.0472942196647637e-05
Iteration 4270, loss=1.192093321833454e-07
Iteration 4275, loss=0.0004995472263544798
Iteration 4280, loss=0.00047669396735727787
Iteration 4285, loss=4.768372718899627e-07
Iteration 4290, loss=1.4029757267053355e-07
Iteration 4295, loss=4.768372718899627e-07
Iteration 4300, loss=0.00045224526547826827
Iteration 4305, loss=1.192093321833454e-07
Iteration 4310, loss=1.192093321833454e-07
Iteration 4315, loss=1.192093321833454e-07
Iteration 4320, loss=0.0043168035335838795
Iteration 4325, loss=3.425140312174335e-05
Iteration 4330, loss=1.192093321833454e-07
Iteration 4335, loss=1.192093321833454e-07
Iteration 4340, loss=4.907952188659692e-06
Iteration 4345, loss=3.074116466450505e-05
Iteration 4350, loss=3.576278118089249e-07
Iteration 4355, loss=9.785660950001329e-05
Iteration 4360, loss=1.192093321833454e-07
Iteration 4365, loss=9.565762593410909e-05
Iterat

In [70]:
sim_cb.run_sim()

ValueError: setting an array element with a sequence.

In [69]:
valid_examples = np.array( [6, 7, 8, 11, 12, 13, 14])

In [12]:
data[:10]

[5234, 3081, 12, 6, 195, 2, 3134, 46, 59, 156]

In [13]:
reverse_dictionary[5234]

'anarchism'

In [67]:
valid_examples

array([ 6,  7,  8, 11, 12, 13, 14])