# Word2Vec: How to do text embedding

In a SkipGram Model, the NN attempts to predict the context (or neighbors) of a word, given the list itself. <br>
The training data consists of skipgrams, which are so called n-grams that allow tokens to be skipped. <br>
A word context can be visualized as a set of skipgram pairs of (target_word, context_word), context_word appears in the neighboring context of target_word <br>

<strong>Example: The wide road shimmered in the hot sun</strong><br>

<strong>Target:</strong> wide <br>
<strong>Predictions:</strong> The, road, shimmered <br>

In [11]:
import tensorflow as tf
import keras
import numpy as np
import io
import tqdm
import re
import string

from keras import layers, utils, Model
from keras.preprocessing.sequence import skipgrams

In [4]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [5]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

### Vectorization of an sample Sentence

In [13]:
sentence = "The wide road shimmered in the hot sun"
tokens = list(sentence.lower().split())
print(len(tokens))

vocab, index = {}, 1
vocab["<pad>"] = 0

for token in tokens: 
    if token not in vocab:
        vocab[token] = index
        index += 1

vocab_size = len(vocab)
print(vocab)

inverse_vocab = { index: token for token, index in vocab.items() }
print(inverse_vocab)

8
{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}
{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}


In [9]:
sequence = [vocab[word] for word in tokens]
print(sequence)

[1, 2, 3, 4, 5, 1, 6, 7]


### Generate Skipgrams for the sample sentence

In [15]:
WINDOW_SIZE = 2

positive_skipgrams, _ = skipgrams(sequence=sequence, vocabulary_size=vocab_size, window_size=WINDOW_SIZE, negative_samples=0)
print(len(positive_skipgrams))

for target, context in positive_skipgrams:
    print(f"({target}, {context}): ({inverse_vocab[target]} {inverse_vocab[context]})")

26
(5, 1): (in the)
(3, 5): (road in)
(5, 6): (in hot)
(1, 5): (the in)
(1, 3): (the road)
(2, 4): (wide shimmered)
(5, 3): (in road)
(1, 6): (the hot)
(2, 1): (wide the)
(7, 6): (sun hot)
(1, 2): (the wide)
(3, 2): (road wide)
(4, 1): (shimmered the)
(3, 4): (road shimmered)
(2, 3): (wide road)
(6, 7): (hot sun)
(7, 1): (sun the)
(4, 2): (shimmered wide)
(4, 3): (shimmered road)
(1, 4): (the shimmered)
(6, 5): (hot in)
(6, 1): (hot the)
(4, 5): (shimmered in)
(5, 4): (in shimmered)
(3, 1): (road the)
(1, 7): (the sun)


### Generate Negative Skipgrams

In [22]:
target, context = positive_skipgrams[0]
num_ns = 4
context_class = tf.reshape(tf.constant(context, dtype="int64"), (1, 1))

negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,
    num_true=1,
    num_sampled=num_ns,
    unique=True,
    range_max=vocab_size,
    seed=SEED,
    name="negative_sampling"
)

print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])

tf.Tensor([6 3 2 4], shape=(4,), dtype=int64)
['hot', 'road', 'wide', 'shimmered']


### Construction of 1 Training Set <br>
Now, the positive skip grams along with the negative skip grams are being pushed into one Tensor, which will form a single training set. 

In [28]:
squeezed_context_class = tf.squeeze(context_class, 1)
full_context = tf.concat([squeezed_context_class, negative_sampling_candidates], 0)
label = tf.constant([1] + [0] * num_ns, dtype="int64")

print(f"Target Index: {target}")
print(f"Target Word: {inverse_vocab[target]}")
print(f"Context Indices: {context}")
print(f"Context Words: {[inverse_vocab[c.numpy()] for c in full_context]}")
print(f"Label: {label}")

Target Index: 5
Target Word: in
Context Indices: 1
Context Words: ['the', 'hot', 'road', 'wide', 'shimmered']
Label: [1 0 0 0 0]


=> Tuple of (target, context, label) makes one training data

In [26]:
single_training_data = (target, full_context, label)
print(single_training_data)

(5, <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 6, 3, 2, 4], dtype=int64)>, <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>)


### Get the Dataset

In [29]:
filepath = utils.get_file("shakespeare.txt", "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt")

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [30]:
with open(filepath) as f:
    lines = f.read().splitlines()

for line in lines[:20]:
    print(line)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.


In [31]:
text_dataset = tf.data.TextLineDataset(filepath).filter(lambda x: tf.cast(tf.strings.length(x), bool))

### Generate training data 

In [44]:
from keras.preprocessing.sequence import make_sampling_table

def GenerateTrainingData(sequences, window_size, num_ns, vocab_size, seed):
    targets, contexts, labels = [], [], []

    # Generate sampling table
    sampling_table = make_sampling_table(vocab_size)

    # Go through all sequences in the dataset
    for sequence in tqdm.tqdm(sequences):
        # Get the positive skipgrams
        positive_skipgrams, _ = skipgrams(sequence, vocabulary_size=vocab_size, sampling_table=sampling_table, window_size=window_size, negative_samples=0)

        # get negative samples aswel
        for target, context in positive_skipgrams:
            context_class = tf.expand_dims(tf.constant([context], dtype="int64"), 1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name="negative_sampling"
            )

            # construct training data
            full_context = tf.concat([tf.squeeze(context_class, 1), negative_sampling_candidates], 0)
            label = tf.constant([1] + [0] * num_ns, dtype="int64")

            # return training data
            targets.append(target)
            contexts.append(full_context)
            labels.append(label)
    
    return targets, contexts, labels

### Sentence Vectorization

In [32]:
def CustomStandardization(input_data):
    lowercase = tf.strings.lower(input_data)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(string.punctuation), "")

VOCAB_SIZE = 4096
SEQUENCE_LENGTH = 10

vectorization_layer = layers.TextVectorization(standardize=CustomStandardization, max_tokens=VOCAB_SIZE, output_mode="int", output_sequence_length=SEQUENCE_LENGTH)
vectorization_layer.adapt(text_dataset.batch(1024))

In [33]:
inverse_vocab = vectorization_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'the', 'and', 'to', 'i', 'of', 'you', 'my', 'a', 'that', 'in', 'is', 'not', 'for', 'with', 'me', 'it', 'be', 'your']


In [35]:
text_vector_dataset = text_dataset.batch(1024).prefetch(AUTOTUNE).map(vectorization_layer).unbatch()
sequences = list(text_vector_dataset.as_numpy_iterator())
print(len(sequences))

for seq in sequences[:10]:
    print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

32777
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[138  36 982 144 673 125  16 106   0   0] => ['before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', '']
[34  0  0  0  0  0  0  0  0  0] => ['all', '', '', '', '', '', '', '', '', '']
[106 106   0   0   0   0   0   0   0   0] => ['speak', 'speak', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[   7   41   34 1286  344    4  200   64    4 3690] => ['you', 'are', 'all', 'resolved', 'rather', 'to', 'die', 'than', 'to', 'famish']
[34  0  0  0  0  0  0  0  0  0] => ['all', '', '', '', '', '', '', '', '', '']
[1286 1286    0    0    0    0    0    0    0    0] => ['resolved', 'resolved', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[  89    7   93 1187  225   12 2442  592    4    2] => ['first', 'you', 'kno

### Generate training data

In [46]:
targets, contexts, labels = GenerateTrainingData(sequences=sequences, window_size=2, num_ns=4, vocab_size=VOCAB_SIZE, seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print(targets)
print(contexts)
print(labels)

100%|██████████| 32777/32777 [00:12<00:00, 2658.72it/s]


[ 270   89  982 ... 1049 1049 1874]
[[  89   45 1468   18  671]
 [ 270    0  279    4 2861]
 [  36  564   30    1   42]
 ...
 [ 129   31    1   21  370]
 [  26    4   52 2271  437]
 [ 129  182  214  121   53]]
[[1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 ...
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]]


### Configuring the Dataset for performance

In [49]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000

dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)

print(dataset)

<PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


### Word2Vec Model

In [50]:
class Word2Vec(keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()

        self.targeting_embedding = layers.Embedding(vocab_size, embedding_dim, input_length=1, name="w2v_embedding")
        self.context_embedding = layers.Embedding(vocab_size, embedding_dim, input_length=num_ns + 1)

    def call(self, pair):
        target, context = pair

        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        
        word_embedding = self.targeting_embedding(target)
        context_embedding = self.context_embedding(context)

        dots = tf.einsum("be,bce->bc", word_embedding, context_embedding)

        return dots
    

### Defining the Loss

In [51]:
def W2VLoss(x_logit, y_true):
    return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)

### Compling the Model + Summary

In [56]:
from keras import losses

embedding_dim = 128
word2vec = Word2Vec(VOCAB_SIZE, embedding_dim=embedding_dim)
word2vec.compile(optimizer="adam", loss=losses.CategoricalCrossentropy(from_logits=True), metrics=["accuracy"])

In [61]:
tensorboard_callback = keras.callbacks.TensorBoard(log_dir="logs")

### Training

In [60]:
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2498bd06920>

In [None]:
%tensorboard --logdir logs

### Get Predictions

In [64]:
weights = word2vec.get_layer("w2v_embedding").get_weights()[0]
vocab = vectorization_layer.get_vocabulary()

out_vectors = io.open("vectors.tsv", "w", encoding="utf-8")
out_metadata = io.open("metadata.tsv", "w", encoding="utf-8")

for index, word in enumerate(vocab):
    if index == 0:
        continue

    vector = weights[index]
    out_vectors.write("\t".join([str(x) for x in vector]) + "\n")
    out_metadata.write(word + "\n")

out_vectors.close()
out_metadata.close()