In [1]:
import tensorflow as tf
from tensorflow.keras import layers,datasets,preprocessing
import urllib
import collections
import os
import zipfile
from keras.preprocessing import sequence
import numpy as np

Using TensorFlow backend.


In [2]:
def maybe_download(filename, url, expected_bytes):
    """Download a file if not present, and make sure it's the right size.

    Args:
        filename: Local path of the file. It is also appended to ``url`` to
            form the address the file is fetched from.
        url: Base URL the file name is appended to (should end with ``/``).
        expected_bytes: Exact size in bytes the local file must have.

    Returns:
        The local path of the verified file.

    Raises:
        Exception: If the size of the local file differs from
            ``expected_bytes``.
    """
    # ``import urllib`` alone does not load the ``request`` submodule, so import
    # it explicitly instead of relying on another package having done so.
    import urllib.request

    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        # Show the actual size to help diagnose truncated downloads.
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

def read_data(filename):
    """Return the whitespace-separated tokens of the first member of a zip file.

    Args:
        filename: Path to the zip archive.

    Returns:
        A list of strings obtained by decoding the first archive member and
        splitting it on whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    text = tf.compat.as_str(raw_bytes)
    return text.split()

def build_dataset(words, n_words):
    """Process raw inputs into a dataset.

    The ``n_words - 1`` most frequent words get ids ``1 .. n_words - 1`` in
    order of decreasing frequency; every other word is mapped to id 0
    (``'UNK'``).

    Args:
        words: Sequence of word strings (it is iterated more than once).
        n_words: Vocabulary size including the ``'UNK'`` entry.

    Returns:
        A tuple ``(data, count, dictionary, reversed_dictionary)`` where
        ``data`` is the list of word ids for ``words``, ``count`` is a list
        whose first entry is ``['UNK', <number of ids equal to 0>]`` followed by
        ``(word, frequency)`` pairs, ``dictionary`` maps word -> id and
        ``reversed_dictionary`` maps id -> word.
    """
    unk_token = 'UNK'
    frequencies = collections.Counter(words)
    # A literal 'UNK' token in the corpus would otherwise be inserted a second
    # time, overwriting id 0 and making the following word share its id.
    # Dropping it here folds such tokens into the unknown bucket instead.
    frequencies.pop(unk_token, None)

    count = [[unk_token, -1]]
    count.extend(frequencies.most_common(n_words - 1))

    # Ids follow the position in ``count``, so 'UNK' is always id 0.
    dictionary = {word: index for index, (word, _) in enumerate(count)}
    data = [dictionary.get(word, 0) for word in words]

    # Everything that landed on id 0 is an unknown word.
    count[0][1] = data.count(0)
    reversed_dictionary = {index: word for word, index in dictionary.items()}
    return data, count, dictionary, reversed_dictionary

def collect_data(vocabulary_size=10000):
    """Fetch the text8 corpus and encode it as word ids.

    Args:
        vocabulary_size: Vocabulary size passed on to ``build_dataset``.

    Returns:
        The ``(data, count, dictionary, reverse_dictionary)`` tuple produced by
        ``build_dataset`` for the downloaded corpus.
    """
    archive_path = maybe_download('text8.zip', 'http://mattmahoney.net/dc/', 31344016)
    corpus_words = read_data(archive_path)
    encoded, frequencies, word_to_id, id_to_word = build_dataset(
        corpus_words, vocabulary_size)
    # Drop the reference to the raw token list; only the encoded form is returned.
    del corpus_words
    return encoded, frequencies, word_to_id, id_to_word




# Vocabulary size (including the 'UNK' entry) used for the whole notebook.
vocab_size = 10000

# Download (if needed) and encode the corpus, then peek at the first few word ids.
data, count, dictionary, reverse_dictionary = collect_data(vocab_size)
print(data[:7])

Found and verified text8.zip
[5234, 3081, 12, 6, 195, 2, 3134]


In [3]:
import random

from keras.preprocessing.sequence import skipgrams

# Seed both RNGs so the validation sample and the generated pairs are
# reproducible across kernel restarts. NumPy drives the validation sample;
# Keras' skipgrams draws from Python's ``random`` module.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

window_size = 5
# Despite the name this is the number of training iterations; the training
# loop processes a single (target, context) pair per iteration.
epochs = 1000000

valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

# Sub-sample frequent words when generating (target, context) pairs.
sampling_table = sequence.make_sampling_table(vocab_size)
couples, labels = skipgrams(data, vocab_size, window_size=window_size, sampling_table=sampling_table)

# Split the pairs column-wise with NumPy instead of ``zip(*couples)``, which
# would expand every pair into a separate call argument. The reshape keeps the
# array two-dimensional even when no pairs were generated.
couples_array = np.asarray(couples, dtype="int32").reshape(-1, 2)
word_target = couples_array[:, 0]
word_context = couples_array[:, 1]

print(couples[:10], labels[:10])

[[2229, 316], [5544, 4281], [1302, 831], [72, 370], [413, 2], [90, 5886], [3985, 527], [3954, 5], [3705, 2634], [630, 30]] [1, 0, 1, 1, 1, 0, 1, 1, 0, 1]


In [78]:
from keras.layers import dot,Input,Reshape
class embedding2(tf.keras.Model):
    """Lookup table that maps integer word ids to dense word vectors.

    The original implementation fed the raw integer id through a ``Dense``
    layer, which makes every vector a linear function of the id rather than an
    independent, learnable vector per word. An ``Embedding`` layer provides
    the intended per-word lookup.

    Args:
        vocabulary_size: Number of distinct word ids. Defaults to the
            notebook-level ``vocab_size``.
        embedding_dim: Length of each word vector.
    """

    def __init__(self, vocabulary_size=None, embedding_dim=300):
        super().__init__()
        if vocabulary_size is None:
            # Fall back to the notebook-wide setting so ``embedding2()`` keeps working.
            vocabulary_size = vocab_size
        self.embedding_dim = embedding_dim
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocabulary_size, output_dim=embedding_dim)

    def call(self, inputs):
        """Return word vectors of shape ``(batch, embedding_dim)``.

        ``inputs`` is expected to hold one integer word id per example, shaped
        ``(batch, 1)`` or ``(batch,)``.
        """
        word_vec = self.embedding(inputs)
        # Drop the length-1 sequence axis so downstream layers get (batch, dim).
        return tf.reshape(word_vec, (-1, self.embedding_dim))
    
class model2(tf.keras.Model):
    """Scores a (target, context) pair of word vectors.

    The score is the cosine similarity of the two vectors passed through a
    single sigmoid unit, giving a value between 0 and 1 for each pair.
    """

    def __init__(self):
        super().__init__()
        # Create the similarity layer once and from tf.keras. The functional
        # ``keras.layers.dot`` belongs to the standalone Keras package and
        # builds a new layer on every forward pass.
        self.similarity = tf.keras.layers.Dot(axes=1, normalize=True)
        self.finalDense = tf.keras.layers.Dense(1, activation="sigmoid")

    def call(self, inputs):
        """Return the predicted score for each pair.

        ``inputs`` is a pair ``[target_vectors, context_vectors]``, each shaped
        ``(batch, dim)``; the result has shape ``(batch, 1)``.
        """
        dot_product = self.similarity([inputs[0], inputs[1]])
        return self.finalDense(dot_product)

In [79]:
# Optimiser configuration.
learning_rate = 0.001
optimizer = tf.keras.optimizers.Adam(learning_rate)

# Embedding model plus the pair-scoring head that consumes its output.
w2vModel = embedding2()
finalmodel = model2()

In [None]:
# Train on one randomly chosen (target, context, label) triple per iteration.
for cnt in range(epochs):
    # The upper bound of randint is exclusive, so ``len(labels)`` is needed for
    # the last pair to be reachable.
    idx = np.random.randint(0, len(labels))

    # Shape everything as a batch of one: (1, 1). Building the inputs does not
    # need to be recorded by the gradient tape.
    arr_1 = tf.reshape(tf.convert_to_tensor(word_target[idx]), (-1, 1))
    arr_2 = tf.reshape(tf.convert_to_tensor(word_context[idx]), (-1, 1))
    arr_3 = tf.reshape(
        tf.cast(tf.convert_to_tensor(labels[idx]), tf.float32), (-1, 1))

    with tf.GradientTape() as tape:
        word_vec = w2vModel(arr_1)
        context_vec = w2vModel(arr_2)

        y_pred = finalmodel([word_vec, context_vec])

        # Reduce to a scalar so the logged value is a plain number.
        loss = tf.reduce_mean(tf.keras.losses.binary_crossentropy(arr_3, y_pred))

    # Update the embedding model as well as the scoring head; differentiating
    # only ``finalmodel`` would leave the word vectors at their initial values.
    # The variable lists are read after the forward pass because subclassed
    # models create their weights on first call.
    trainable = w2vModel.trainable_variables + finalmodel.trainable_variables
    grads = tape.gradient(loss, trainable)
    optimizer.apply_gradients(grads_and_vars=zip(grads, trainable))
    if cnt % 1000 == 0:
        print("Iteration {}, loss={}".format(cnt, float(loss)))
        
    