In [1]:
from keras.models import Model
from keras.layers import Input, Dense, Reshape, concatenate
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence

import urllib
import collections
import os
import zipfile

import numpy as np
import tensorflow as tf

Using TensorFlow backend.


In [2]:
def maybe_download(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size.

    Args:
        filename: Local path of the file; also appended to `url` to form the
            download address when the file does not exist yet.
        url: Base URL the file is fetched from (concatenated with `filename`).
        expected_bytes: Exact size in bytes the local file must have.

    Returns:
        The path of the verified local file.

    Raises:
        Exception: If the size of the local file differs from `expected_bytes`.
    """
    # `import urllib` on its own does not load the `request` submodule, so it
    # is imported explicitly here rather than relying on some other package
    # having imported it as a side effect.
    import urllib.request

    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        # Show the size actually found to help diagnose a truncated download.
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

def read_data(filename):
    """Return the first member of a zip archive as a list of words.

    The member's bytes are converted to `str` with `tf.compat.as_str` and then
    split on whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    text = tf.compat.as_str(raw_bytes)
    return text.split()

def build_dataset(words, n_words):
    """Turn a list of words into integer ids over a fixed-size vocabulary.

    Args:
        words: Sequence of tokens (iterated more than once).
        n_words: Vocabulary size, counting the 'UNK' entry at id 0.

    Returns:
        A tuple `(data, count, dictionary, reversed_dictionary)`:
        data: the id of every token in `words`, 0 for out-of-vocabulary tokens.
        count: `['UNK', n_unknown]` followed by `(word, frequency)` pairs for
            the `n_words - 1` most common words.
        dictionary: word -> id.
        reversed_dictionary: id -> word.
    """
    count = [['UNK', -1]]
    count += collections.Counter(words).most_common(n_words - 1)

    # Ids are handed out in order of appearance in `count`, so 'UNK' gets 0.
    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    data = []
    unk_count = 0
    for token in words:
        try:
            token_id = dictionary[token]
        except KeyError:
            token_id = 0  # dictionary['UNK']
            unk_count += 1
        data.append(token_id)

    # Replace the -1 placeholder with the real number of unknown tokens.
    count[0][1] = unk_count
    reversed_dictionary = {token_id: token for token, token_id in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

def collect_data(vocabulary_size=10000):
    """Download text8.zip if needed and convert it into an id dataset.

    Returns the `(data, count, dictionary, reverse_dictionary)` tuple produced
    by `build_dataset` for a vocabulary of `vocabulary_size` entries.
    """
    archive_path = maybe_download('text8.zip', 'http://mattmahoney.net/dc/', 31344016)
    corpus_words = read_data(archive_path)
    dataset = build_dataset(corpus_words, vocabulary_size)
    del corpus_words  # Hint to reduce memory.
    data, count, dictionary, reverse_dictionary = dataset
    return data, count, dictionary, reverse_dictionary






# Size of the vocabulary, including the 'UNK' entry that build_dataset puts at id 0.
vocab_size = 10000
data, count, dictionary, reverse_dictionary = collect_data(vocab_size)
# Peek at the ids of the first few tokens of the corpus.
print(data[:7])

Found and verified text8.zip
[5234, 3081, 12, 6, 195, 2, 3134]


In [3]:
# Skip-gram window, embedding width and number of training iterations.
window_size, vector_dim = 3, 300
epochs = 10 ** 6

# Random set of words to evaluate similarity on.
valid_size = 16
# Only pick dev samples in the head of the distribution.
valid_window = 100

# Draw `valid_size` distinct ids from range(valid_window).
valid_examples = np.random.choice(valid_window, size=valid_size, replace=False)
valid_examples


array([66, 56, 95, 57, 45, 14, 41,  8, 49, 26, 16, 88, 15, 12, 85, 61])

In [4]:
# Sampling table handed to skipgrams() below.
sampling_table = sequence.make_sampling_table(vocab_size)

# Build the [target, context] couples and their 0/1 labels from the corpus ids.
couples, labels = skipgrams(
    data,
    vocab_size,
    window_size=window_size,
    sampling_table=sampling_table,
)

# Split the couples column-wise into two int32 arrays.
word_target, word_context = (
    np.array(column, dtype="int32") for column in zip(*couples)
)

print(couples[:10], labels[:10])

[[1171, 3], [868, 7882], [7174, 6995], [3647, 2636], [6818, 5126], [1772, 7891], [41, 1], [2688, 1922], [2411, 885], [2244, 55]] [1, 1, 0, 0, 0, 0, 1, 0, 1, 1]


In [5]:
# One word id per sample for the target word and for the context word.
input_target = Input(shape=(1,))
input_context = Input(shape=(1,))

# A single embedding layer is applied to both inputs, so they share weights.
embedding = Embedding(
    input_dim=vocab_size,
    output_dim=vector_dim,
    input_length=1,
    name='embedding',
)

target = embedding(input_target)
print(target)
# Lay the looked-up vector out as a (vector_dim, 1) column.
target = Reshape(target_shape=(vector_dim, 1))(target)

context = embedding(input_context)
print(context)
context = Reshape(target_shape=(vector_dim, 1))(context)

Tensor("embedding/Identity:0", shape=(None, 1, 300), dtype=float32)
Tensor("embedding_1/Identity:0", shape=(None, 1, 300), dtype=float32)


In [6]:
# Cosine similarity between the target and context vectors; this tensor is
# the output of the secondary (validation) model.
from keras.layers import dot

similarity = dot(
    [target, context],
    axes=1,
    normalize=True,  # L2-normalise along the dot axis
)

In [7]:
# Similarity measure between target and context that feeds the classifier.
# NOTE(review): normalize=True makes this the same normalised dot product as
# `similarity`; confirm that an un-normalised dot product was not intended.
dot_product = dot([target, context], axes=1, normalize=True)
dot_product = Reshape(target_shape=(1,))(dot_product)

# Sigmoid unit that turns the similarity into a 0/1 label prediction.
output = Dense(units=1, activation='sigmoid')(dot_product)

# Primary model: the one that is actually trained.
model = Model(inputs=[input_target, input_context], outputs=output)
model.compile(optimizer='rmsprop', loss='binary_crossentropy')

In [8]:
# Secondary model over the same inputs; outputs the `similarity` tensor and is
# only used for prediction.
validation_model = Model(
    inputs=[input_target, input_context],
    outputs=similarity,
)

In [9]:
class SimilarityCallback:
    """Prints the most similar vocabulary words for each validation word.

    Relies on the module-level `valid_size`, `valid_examples`,
    `reverse_dictionary`, `vocab_size` and `validation_model`.
    """

    def run_sim(self):
        """Print up to 8 nearest neighbours for every id in `valid_examples`."""
        top_k = 8  # number of nearest neighbors
        for i in range(valid_size):
            word_idx = valid_examples[i]
            valid_word = reverse_dictionary[word_idx]
            sim = self._get_sim(word_idx)
            # Rank by descending similarity and drop the first entry, as the
            # original did (presumably the query word itself).
            nearest = (-sim).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            # Iterate over what is actually available so a vocabulary smaller
            # than top_k + 1 does not raise IndexError.
            for neighbour_idx in nearest:
                log_str = '%s %s,' % (log_str, reverse_dictionary[neighbour_idx])
            print(log_str)

    @staticmethod
    def _get_sim(valid_word_idx):
        """Return the similarity of `valid_word_idx` to every vocabulary id.

        Args:
            valid_word_idx: Id of the query word.

        Returns:
            A float array of shape `(vocab_size,)` whose i-th entry is the
            `validation_model` output for the pair `(valid_word_idx, i)`.
        """
        # Score the query against the whole vocabulary in one batched call
        # instead of `vocab_size` separate single-sample predictions.
        query_ids = np.full((vocab_size,), valid_word_idx, dtype=float)
        candidate_ids = np.arange(vocab_size, dtype=float)
        out = validation_model.predict_on_batch([query_ids, candidate_ids])
        # The model emits one value per sample with trailing singleton axes;
        # flatten explicitly rather than assigning an array to a scalar slot.
        return np.asarray(out, dtype=float).reshape(-1)
sim_cb = SimilarityCallback()

# Single-sample buffers reused on every iteration: target id, context id, label.
arr_1 = np.zeros((1,))
arr_2 = np.zeros((1,))
arr_3 = np.zeros((1,))

n_samples = len(labels)
for cnt in range(epochs):
    # np.random.randint excludes its upper bound, so the bound must be
    # n_samples (not n_samples - 1) for the last training pair to be drawable.
    idx = np.random.randint(0, n_samples)
    arr_1[0,] = word_target[idx]
    arr_2[0,] = word_context[idx]
    arr_3[0,] = labels[idx]
    loss = model.train_on_batch([arr_1, arr_2], arr_3)
    if cnt % 100 == 0:
        print("Iteration {}, loss={}".format(cnt, loss))
    # Every 10000 iterations, show nearest neighbours of the validation words.
    if cnt % 10000 == 0:
        sim_cb.run_sim()

Iteration 0, loss=0.6237655282020569
Nearest to these: ants, ally, resolution, fiction, strongly, evangelical, bavaria, tiny,
Nearest to many: entitled, alaska, mary, wagner, remainder, terminals, differential, displacement,
Nearest to so: tends, shell, merit, methods, era, friendly, spencer, condition,
Nearest to who: bwv, primitive, trial, parade, popularly, qualified, gradually, reliable,
Nearest to its: hadith, impose, allegedly, pbs, ambrose, necessary, join, neighbours,
Nearest to for: frank, franklin, tribal, memories, writes, classify, yahoo, outcome,
Nearest to has: battles, pp, portland, chemicals, stone, video, dolphin, anger,
Nearest to zero: northwestern, precious, pronunciation, treatment, theorists, monopoly, weight, gardens,
Nearest to had: codex, floating, unification, boys, franklin, richard, identification, indonesian,
Nearest to are: synonym, goths, james, telling, experiments, ants, addresses, indonesia,
Nearest to five: domains, quest, beautiful, emperors, asimov,

Iteration 13400, loss=0.7108515501022339
Iteration 13500, loss=0.6933519840240479
Iteration 13600, loss=0.6776277422904968
Iteration 13700, loss=0.6722545027732849
Iteration 13800, loss=0.7045323252677917
Iteration 13900, loss=0.6901738047599792
Iteration 14000, loss=0.6697689890861511
Iteration 14100, loss=0.6435079574584961
Iteration 14200, loss=0.6717710494995117
Iteration 14300, loss=0.7310406565666199
Iteration 14400, loss=0.6139265298843384
Iteration 14500, loss=0.6975826025009155
Iteration 14600, loss=0.7188666462898254
Iteration 14700, loss=0.7240556478500366
Iteration 14800, loss=0.7065849304199219
Iteration 14900, loss=0.6840465068817139
Iteration 15000, loss=0.6966611742973328
Iteration 15100, loss=0.6271365880966187
Iteration 15200, loss=0.7349530458450317
Iteration 15300, loss=0.678281843662262
Iteration 15400, loss=0.7253926992416382
Iteration 15500, loss=0.6726604700088501
Iteration 15600, loss=0.686711847782135
Iteration 15700, loss=0.7136297821998596
Iteration 15800, l

KeyboardInterrupt: 