[source](https://adventuresinmachinelearning.com/word2vec-keras-tutorial/)

## step 1: get the online data (based on tensorflow tutorial on word2vec)

In [1]:
import tensorflow as tf
import os
import urllib
import zipfile
from tempfile import gettempdir
import collections


def maybe_download(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    local_filename = os.path.join(gettempdir(), filename)
    if not os.path.exists(local_filename):
        local_filename, _ = urllib.request.urlretrieve(url + filename,
                                                       local_filename)
    statinfo = os.stat(local_filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + local_filename +
                      '. Can you get to it with a browser?')
    return local_filename

def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        print(f.namelist())
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data


def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    ## only consider top n_words in the file
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1)) # get top 50k words in frequency
    dictionary = {}
    for word, _ in count:
        dictionary[word] = len(dictionary) # index those top words by frequency ranking
    data = []
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:  # dictionary['UNK']
            unk_count += 1
        data.append(index) # collect indices of all words
    count[0][1] = unk_count # update the UNK count, which is everything else except top 50k-1
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

def collect_data(vocabulary_size=10000):
    url = 'http://mattmahoney.net/dc/'
    filename = maybe_download('text8.zip', url, 31344016)
    vocabulary = read_data(filename)
    print(vocabulary[:7])
    data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                                vocabulary_size)
    del vocabulary  # Hint to reduce memory.
    return data, count, dictionary, reverse_dictionary

#### pull the online data, and transform it
vocab_size = 10000
data, count, dictionary, reverse_dictionary = collect_data(vocabulary_size=vocab_size)
print(data[:7])


Found and verified text8.zip
['text8']
['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse']
[5234, 3081, 12, 6, 195, 2, 3134]


## step 2: constants and the validation set

In [2]:
import numpy as np

In [3]:
window_size = 3
vector_dim = 300
epochs = 200000

valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
valid_examples

array([80,  8, 22, 74,  7, 70, 25,  1, 95, 51, 82, 45, 67,  5, 40, 30])

## step 3: the skip-gram function in Keras

In [4]:
from keras.preprocessing import sequence
from keras.preprocessing.sequence import skipgrams

Using TensorFlow backend.


In [5]:
sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)
word_target, word_context = zip(*couples)
word_target = np.array(word_target, dtype="int32")
word_context = np.array(word_context, dtype="int32")

print(couples[:10], labels[:10])

[[2575, 8045], [168, 501], [28, 11], [5753, 3438], [4135, 6277], [1903, 707], [174, 303], [7110, 1036], [4670, 5610], [6255, 1792]] [0, 0, 1, 0, 0, 1, 1, 0, 0, 1]


In [6]:
len(word_target),len(word_context)

(30031868, 30031868)

In [7]:
len(data), len(count), len(dictionary)

(17005207, 10000, 10000)

In [8]:
len(labels), sum(labels)

(30031868, 15015934)

In [9]:
15000290/30000580 # numbers of postive and negative samples are the same

0.5

## step 4: create the embedding layer

In [10]:
from keras.models import Model
from keras.layers import Input, Dense, Reshape, merge
from keras.layers.embeddings import Embedding

In [11]:
# create some input variables
input_target = Input((1,))
input_context = Input((1,))

embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
target = embedding(input_target)
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)

W0717 11:00:02.348072 4713334208 deprecation_wrapper.py:119] From /anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0717 11:00:02.377457 4713334208 deprecation_wrapper.py:119] From /anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0717 11:00:02.387816 4713334208 deprecation_wrapper.py:119] From /anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



## step 5: finish the network architecture

In [12]:
#### based on old keras modules
# # setup a cosine similarity operation which will be output in a secondary model
# similarity = merge([target, context], mode='cos', dot_axes=0)

# # now perform the dot product operation to get a similarity measure
# dot_product = merge([target, context], mode='dot', dot_axes=1)
# dot_product = Reshape((1,))(dot_product)
# # add the sigmoid output layer
# output = Dense(1, activation='sigmoid')(dot_product)
# # create the primary training model
# model = Model(input=[input_target, input_context], output=output)
# model.compile(loss='binary_crossentropy', optimizer='rmsprop')

# # create a secondary validation model to run our similarity checks during training
# validation_model = Model(input=[input_target, input_context], output=similarity)

In [13]:
import keras

In [14]:
# setup a cosine similarity operation which will be output in a secondary model
similarity = keras.layers.Dot(axes=1, normalize=True)([target, context])

# now perform the dot product operation to get a similarity measure
dot_product = keras.layers.Dot(axes=1, normalize=False)([target, context])
dot_product = Reshape((1,))(dot_product)
# add the sigmoid output layer
output = Dense(1, activation='sigmoid')(dot_product)
# create the primary training model
model = Model(input=[input_target, input_context], output=output)
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

# create a secondary validation model to run our similarity checks during training
validation_model = Model(input=[input_target, input_context], output=similarity)

  # Remove the CWD from sys.path while we load stuff.
W0717 14:46:34.783025 4713334208 deprecation_wrapper.py:119] From /anaconda3/envs/py36/lib/python3.6/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0717 14:46:34.798624 4713334208 deprecation_wrapper.py:119] From /anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.

W0717 14:46:34.802967 4713334208 deprecation.py:323] From /anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
  


## step 6: create the similarity callback

In [15]:
reverse_dictionary[80]

'over'

In [16]:
vocab_size

10000

In [17]:
class SimilarityCallback:
    def run_sim(self):
        for i in range(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 8  # number of nearest neighbors
            sim = self._get_sim(valid_examples[i])
            nearest = (-sim).argsort()[1:top_k + 1] ## get the indices of words with maximum similarities
            log_str = 'Nearest to %s:' % valid_word
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        sim = np.zeros((vocab_size,))
        in_arr1 = np.zeros((1,))
        in_arr2 = np.zeros((1,))
        in_arr1[0,] = valid_word_idx
        for i in range(vocab_size):
            in_arr2[0,] = i
            out = validation_model.predict_on_batch([in_arr1, in_arr2])
            sim[i] = out
        return sim
sim_cb = SimilarityCallback()

In [18]:
arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))
for cnt in range(epochs):
    idx = np.random.randint(0, len(labels)-1)
    arr_1[0,] = word_target[idx]
    arr_2[0,] = word_context[idx]
    arr_3[0,] = labels[idx]
    loss = model.train_on_batch([arr_1, arr_2], arr_3)
    if cnt % 100 == 0:
        print("Iteration {}, loss={}".format(cnt, loss))
    if cnt % 10000 == 0:
        sim_cb.run_sim()

W0717 15:25:55.007726 4713334208 deprecation_wrapper.py:119] From /anaconda3/envs/py36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:986: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.



Iteration 0, loss=0.7009079456329346
Nearest to over: significance, sultan, subdivided, mazda, and, editor, direction, style,
Nearest to zero: treatments, murray, fungi, coverage, schemes, frontier, escaped, ceo,
Nearest to six: hymn, scenes, recordings, frankfurt, comoros, honda, episodes, allowed,
Nearest to would: transmit, sanctuary, lisa, weaker, hub, volumes, pedro, ancestry,
Nearest to to: squad, bo, instruments, subsidiary, programmes, sovereign, manpower, homeland,
Nearest to than: genesis, afc, arabic, hebrew, rebuilt, guerrilla, seemed, cold,
Nearest to on: died, attempt, pioneers, carol, strand, commodore, shuttle, intention,
Nearest to the: apache, radiation, byte, agricultural, heat, drunk, fourth, managing,
Nearest to so: nuclei, collapsed, canterbury, perry, bart, discovering, release, census,
Nearest to more: wto, india, impressed, legal, boom, landscape, hull, russians,
Nearest to states: barnes, drinking, thinking, croatian, generic, black, cancer, descriptive,
Neare

Iteration 13700, loss=0.6868282556533813
Iteration 13800, loss=0.7031811475753784
Iteration 13900, loss=0.7020870447158813
Iteration 14000, loss=0.7105399966239929
Iteration 14100, loss=0.694953441619873
Iteration 14200, loss=0.6881732940673828
Iteration 14300, loss=0.6862485408782959
Iteration 14400, loss=0.6981500387191772
Iteration 14500, loss=0.686954140663147
Iteration 14600, loss=0.7026594877243042
Iteration 14700, loss=0.7102077603340149
Iteration 14800, loss=0.6885406374931335
Iteration 14900, loss=0.6868906617164612
Iteration 15000, loss=0.6880958676338196
Iteration 15100, loss=0.6919358372688293
Iteration 15200, loss=0.7240684032440186
Iteration 15300, loss=0.7001110315322876
Iteration 15400, loss=0.6927030086517334
Iteration 15500, loss=0.7109410762786865
Iteration 15600, loss=0.7077505588531494
Iteration 15700, loss=0.6788397431373596
Iteration 15800, loss=0.7084572911262512
Iteration 15900, loss=0.742889940738678
Iteration 16000, loss=0.714372456073761
Iteration 16100, los

Nearest to to: with, the, of, head, a, in, be, and,
Nearest to than: more, sleep, longer, weapon, five, be, civilizations, across,
Nearest to on: surgery, brands, transmitted, guitar, philosophical, indeed, sort, totally,
Nearest to the: of, in, four, one, to, and, nine, seven,
Nearest to so: refined, free, tension, inserted, doing, min, market, fl,
Nearest to more: than, finish, longer, gregory, training, eu, farther, shall,
Nearest to states: black, crash, participated, from, conscious, ammonia, wife, franco,
Nearest to its: knights, enemies, characterized, humanism, territories, because, mainland, screen,
Nearest to only: white, after, axioms, times, fully, design, protocol, eve,
Nearest to in: the, one, russia, but, a, of, rail, seven,
Nearest to were: removed, service, fire, rated, surroundings, agents, placing, clergy,
Nearest to his: of, takes, protagonist, kai, he, the, tribes, foster,
Iteration 30100, loss=0.6883077621459961
Iteration 30200, loss=0.7079841494560242
Iteration 3

Iteration 45200, loss=0.7020692825317383
Iteration 45300, loss=0.6833928227424622
Iteration 45400, loss=0.6811521053314209
Iteration 45500, loss=0.6023934483528137
Iteration 45600, loss=0.6793574690818787
Iteration 45700, loss=0.7900888323783875
Iteration 45800, loss=0.6824355721473694
Iteration 45900, loss=0.6988028883934021
Iteration 46000, loss=0.6467723250389099
Iteration 46100, loss=0.6169424057006836
Iteration 46200, loss=0.6276608109474182
Iteration 46300, loss=0.27480804920196533
Iteration 46400, loss=0.7071689367294312
Iteration 46500, loss=0.6608577370643616
Iteration 46600, loss=0.6563248038291931
Iteration 46700, loss=0.7581914067268372
Iteration 46800, loss=0.8004084229469299
Iteration 46900, loss=0.6578397750854492
Iteration 47000, loss=0.5806893706321716
Iteration 47100, loss=0.7536967396736145
Iteration 47200, loss=0.7847437262535095
Iteration 47300, loss=0.733802318572998
Iteration 47400, loss=0.8913204073905945
Iteration 47500, loss=0.6621750593185425
Iteration 47600,

KeyboardInterrupt: 