# Pietro Servadio, Vasu Bansal

Answers at the bottom

In [1]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

In [2]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [3]:
# Seed handed to the candidate sampler so that negative sampling is seeded.
SEED = 42
# Let tf.data pick the prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE

In [4]:
# Toy sentence used to illustrate tokenization and skip-gram generation.
sentence = "The wide road shimmered in the hot sun"
# Lowercase the sentence, then split it on whitespace into word tokens.
tokens = sentence.lower().split()
print(len(tokens))

8


In [5]:
# Map every distinct token to an integer id; id 0 is reserved for padding.
vocab = {'<pad>': 0}
for token in tokens:
  # An unseen token gets the next free id, which equals the current dict size;
  # tokens that are already present keep their id.
  vocab.setdefault(token, len(vocab))
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [6]:
# Reverse lookup table: integer id -> token.
inverse_vocab = dict(zip(vocab.values(), vocab.keys()))
print(inverse_vocab)

{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}


In [7]:
# Encode the sentence as the sequence of its token ids.
example_sequence = []
for word in tokens:
  example_sequence.append(vocab[word])
print(example_sequence)

[1, 2, 3, 4, 5, 1, 6, 7]


In [8]:
# Generate the positive (target, context) pairs found within `window_size`
# words of each other. `negative_samples=0` asks for no negative pairs, so the
# second return value (the labels) is discarded.
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)
print(len(positive_skip_grams))

26


In [9]:
# Show the first five pairs both as ids and as the words they stand for.
for target, context in positive_skip_grams[:5]:
  target_token, context_token = inverse_vocab[target], inverse_vocab[context]
  print(f"({target}, {context}): ({target_token}, {context_token})")

(6, 7): (hot, sun)
(4, 5): (shimmered, in)
(4, 2): (shimmered, wide)
(1, 5): (the, in)
(5, 6): (in, hot)


In [10]:
# Display the complete list of positive skip-gram pairs as the cell output.
positive_skip_grams

[[6, 7],
 [4, 5],
 [4, 2],
 [1, 5],
 [5, 6],
 [3, 2],
 [3, 5],
 [4, 1],
 [1, 2],
 [3, 1],
 [1, 6],
 [1, 4],
 [5, 4],
 [6, 1],
 [2, 1],
 [1, 3],
 [5, 3],
 [4, 3],
 [1, 7],
 [7, 1],
 [3, 4],
 [2, 4],
 [2, 3],
 [6, 5],
 [5, 1],
 [7, 6]]

In [11]:
# Split the first positive pair into its target id and its context id.
target_word = positive_skip_grams[0][0]
context_word = positive_skip_grams[0][1]
print(target_word)

6


In [12]:
# Use the first positive skip-gram as the worked example.
target_word, context_word = positive_skip_grams[0]

# Number of negative context words drawn for each positive pair.
num_ns = 4

# The positive context id as a (1, 1) int64 tensor for the sampler.
context_class = tf.constant([[context_word]], dtype="int64")
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    name="negative_sampling",  # name of this operation
    seed=SEED,  # seed for reproducibility
    range_max=vocab_size,  # sample indices come from [0, vocab_size)
    unique=True,  # all the negative samples should be unique
    num_sampled=num_ns,  # number of negative context words to sample
    num_true=1,  # each positive skip-gram has 1 positive context class
    true_classes=context_class,  # class that should be sampled as 'positive'
)
print(negative_sampling_candidates)
print([inverse_vocab[sampled_id] for sampled_id in negative_sampling_candidates.numpy()])

tf.Tensor([2 1 4 3], shape=(4,), dtype=int64)
['wide', 'the', 'shimmered', 'road']


In [13]:
# Inspect the (1, 1) int64 tensor holding the positive context class.
context_class

<tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]])>

In [14]:
# Drop axis 1 of the (1, 1) positive class so it can be concatenated with the
# 1-D tensor of negative samples.
squeezed_context_class = tf.squeeze(context_class, axis=1)

# Context vector: the positive context word first, then the negatives.
context = tf.concat(
    values=[squeezed_context_class, negative_sampling_candidates], axis=0)

# Labels: a single `1` for the positive word, then `num_ns` zeros.
label = tf.constant([1] + num_ns * [0], dtype="int64")
target = target_word


In [15]:
# Decode the context ids once, then print the example as ids and as words.
context_tokens = [inverse_vocab[context_id] for context_id in context.numpy()]
target_token = inverse_vocab[target_word]
print(f"target_index    : {target}")
print(f"target_word     : {target_token}")
print(f"context_indices : {context}")
print(f"context_words   : {context_tokens}")
print(f"label           : {label}")

target_index    : 6
target_word     : hot
context_indices : [7 2 1 4 3]
context_words   : ['sun', 'wide', 'the', 'shimmered', 'road']
label           : [1 0 0 0 0]


In [16]:
# Print the three parts of the training example in their raw form.
for caption, value in (("target  :", target),
                       ("context :", context),
                       ("label   :", label)):
  print(caption, value)

target  : 6
context : tf.Tensor([7 2 1 4 3], shape=(5,), dtype=int64)
label   : tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)


In [17]:
# Build a sampling table for a toy vocabulary of 10 entries and print its
# per-rank sampling probabilities.
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)

[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]


In [18]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  """Build (target, context, label) training examples for skip-gram word2vec.

  For every positive skip-gram `(target_word, context_word)` of every
  sequence, `num_ns` negative context words are drawn with
  `tf.random.log_uniform_candidate_sampler`. A draw is rejected and repeated
  whenever it contains the target word itself or any word that occurs as a
  positive context of that target in the same sequence.

  To sum up: negatives are simply re-drawn until a draw contains none of the
  positive skip-grams. The approach is probabilistic and may need several
  iterations, but with only a few negatives (4 in this notebook) and a
  vocabulary of 4096 words the odds are not terrible. It also required no
  drastic change to the original code and no softmax computation.

  Args:
    sequences: iterable of int-encoded sentences.
    window_size: skip-gram window size passed to `skipgrams`.
    num_ns: number of negative samples per positive pair.
    vocab_size: vocabulary size; negatives are drawn from `[0, vocab_size)`.
    seed: seed forwarded to the candidate sampler.

  Returns:
    Three parallel lists `(targets, contexts, labels)`: the target word, an
    int64 tensor of shape `(num_ns + 1,)` holding the positive context word
    followed by the negatives, and the matching `[1, 0, ..., 0]` label tensor.

  Raises:
    ValueError: if fewer than `num_ns` vocabulary indices remain once the
      forbidden words of a target are excluded, because the rejection loop
      could then never terminate.
  """
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Group the positive context words by target word once per sequence
    # (instead of rescanning every pair for every pair). Each set also holds
    # the target word itself, which must not be drawn as a negative either.
    forbidden_by_target = {}
    for tw, cw in positive_skip_grams:
      forbidden_by_target.setdefault(int(tw), {int(tw)}).add(int(cw))

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      forbidden = forbidden_by_target[int(target_word)]

      # Fail loudly instead of looping forever when no valid draw exists.
      admissible = vocab_size - sum(1 for w in forbidden if 0 <= w < vocab_size)
      if admissible < num_ns:
        raise ValueError(
            f"Cannot draw {num_ns} negative samples for target {target_word}: "
            f"only {admissible} admissible indices out of {vocab_size}.")

      # The true class does not change between retries, so build it once.
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)

      # Rejection sampling: redraw until no negative collides with the target
      # word or one of its positive context words.
      while True:
        negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
            true_classes=context_class,
            num_true=1,
            num_sampled=num_ns,
            unique=True,
            range_max=vocab_size,
            seed=seed,
            name="negative_sampling")
        if forbidden.isdisjoint(negative_sampling_candidates.numpy().tolist()):
          break

      # Build context and label vectors (for one target word)
      context = tf.concat([tf.squeeze(context_class, 1), negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [19]:
# Download the Shakespeare corpus (or reuse the cached copy) and keep its path.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [20]:
# Read the whole corpus into a list of lines and preview the first twenty.
with open(path_to_file) as text_file:
  lines = text_file.read().splitlines()
for preview_line in lines[:20]:
  print(preview_line)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.


In [21]:
# Stream the corpus line by line, keeping only the non-empty lines.
text_ds = tf.data.TextLineDataset(path_to_file).filter(
    lambda line: tf.strings.length(line) > 0)

In [22]:
# Custom standardization used by the vectorization layer: lowercase the text
# and strip punctuation.
def custom_standardization(input_data):
  """Return `input_data` lowercased with all `string.punctuation` removed."""
  # Character class matching any single punctuation character.
  punctuation_pattern = f"[{re.escape(string.punctuation)}]"
  lowered = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowered, punctuation_pattern, '')


# Vocabulary size and number of tokens kept per sequence.
vocab_size = 4096
sequence_length = 10

# `TextVectorization` normalizes, splits and int-encodes the strings;
# `output_sequence_length` brings every sample to the same length.
vectorize_layer = layers.TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int')

In [23]:
# Build the layer's vocabulary from the corpus, fed in batches of 1024 lines.
vectorize_layer.adapt(text_ds.batch(1024))

In [24]:
# Save the created vocabulary for reference.
# NOTE: this rebinds `inverse_vocab` from the toy dict built for the example
# sentence to a list indexed by token id.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'the', 'and', 'to', 'i', 'of', 'you', 'my', 'a', 'that', 'in', 'is', 'not', 'for', 'with', 'me', 'it', 'be', 'your']


In [25]:
# Vectorize the data in text_ds: batch the lines, map each batch through the
# vectorization layer, then unbatch back to one sequence per line.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [26]:
# Materialize the vectorized dataset as a list of NumPy arrays and report how
# many sequences it holds.
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

32777


In [27]:
# Peek at the first ten int-encoded sequences.
print(sequences[:10])

[array([ 89, 270,   0,   0,   0,   0,   0,   0,   0,   0]), array([138,  36, 982, 144, 673, 125,  16, 106,   0,   0]), array([34,  0,  0,  0,  0,  0,  0,  0,  0,  0]), array([106, 106,   0,   0,   0,   0,   0,   0,   0,   0]), array([ 89, 270,   0,   0,   0,   0,   0,   0,   0,   0]), array([   7,   41,   34, 1286,  344,    4,  200,   64,    4, 3690]), array([34,  0,  0,  0,  0,  0,  0,  0,  0,  0]), array([1286, 1286,    0,    0,    0,    0,    0,    0,    0,    0]), array([ 89, 270,   0,   0,   0,   0,   0,   0,   0,   0]), array([  89,    7,   93, 1187,  225,   12, 2442,  592,    4,    2])]


In [28]:
# Show the first eleven sequences next to the words they decode to.
for seq in sequences[:11]:
  decoded = [inverse_vocab[token_id] for token_id in seq]
  print(f"{seq} => {decoded}")

[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[138  36 982 144 673 125  16 106   0   0] => ['before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', '']
[34  0  0  0  0  0  0  0  0  0] => ['all', '', '', '', '', '', '', '', '', '']
[106 106   0   0   0   0   0   0   0   0] => ['speak', 'speak', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[   7   41   34 1286  344    4  200   64    4 3690] => ['you', 'are', 'all', 'resolved', 'rather', 'to', 'die', 'than', 'to', 'famish']
[34  0  0  0  0  0  0  0  0  0] => ['all', '', '', '', '', '', '', '', '', '']
[1286 1286    0    0    0    0    0    0    0    0] => ['resolved', 'resolved', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[  89    7   93 1187  225   12 2442  592    4    2] => ['first', 'you', 'know', 'c

In [29]:
# Generate the training examples from the vectorized corpus.
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

# Convert the three Python lists into NumPy arrays.
targets, contexts, labels = (
    np.array(part) for part in (targets, contexts, labels))

print('\n')
for array_name, array in (("targets", targets),
                          ("contexts", contexts),
                          ("labels", labels)):
  print(f"{array_name}.shape: {array.shape}")


100%|██████████| 32777/32777 [00:25<00:00, 1295.32it/s]




targets.shape: (64577,)
contexts.shape: (64577, 5)
labels.shape: (64577, 5)


In [30]:
# Sanity check: print one "yay!" for every target equal to index 0.
for zero_target in targets[targets == 0]:
  print("yay!")

In [31]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
# Pair the (target, context) inputs with their labels, shuffle, and group into
# full batches only (the incomplete last batch is dropped).
dataset = (
    tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)
print(dataset)

<_BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [32]:
# Cache the batched dataset and prefetch upcoming batches, with the buffer
# size left to tf.data.
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<_PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [33]:
class Word2Vec(tf.keras.Model):
  """Skip-gram word2vec model with separate target and context embeddings.

  `call` returns, for every example, the dot product between the target
  word's embedding and the embedding of each of its context candidates.

  Args:
    vocab_size: number of rows of both embedding tables.
    embedding_dim: size of each embedding vector.
  """

  def __init__(self, vocab_size, embedding_dim):
    super().__init__()
    # `input_length` is intentionally not passed: it is a deprecated argument,
    # and the previous value relied on the notebook-level global `num_ns`
    # instead of on this constructor's own arguments.
    self.target_embedding = layers.Embedding(vocab_size,
                                             embedding_dim,
                                             name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                              embedding_dim)

  def call(self, pair):
    """Score every context candidate against its target word.

    Args:
      pair: tuple `(target, context)` with `target` of shape `(batch,)` or
        `(batch, 1)` and `context` of shape `(batch, context)`.

    Returns:
      Tensor of shape `(batch, context)` holding the dot products.
    """
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [34]:
# Instantiate the model with 128-dimensional embeddings and configure training.
embedding_dim = 128
word2vec = Word2Vec(vocab_size=vocab_size, embedding_dim=embedding_dim)
word2vec.compile(
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])

In [35]:
# Callback that writes training logs to the "logs" directory for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [36]:
# Train for 20 epochs on the prepared dataset, with the TensorBoard callback.
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7933ec244040>

# Answers

1)	14 in total: 2 each for the first and last words, 3 each for the second and second-to-last words, and 4 for the middle word.

2)	In general, practice shows that for a large dataset it is best to keep the skip-gram window small, around 2 words on each side. This implies a smaller number of skip-grams per word. With small datasets, instead, a larger window of up to 20 words per side works best, which yields more skip-grams per word.

3)	The denominator of the softmax has to be computed over the entire vocabulary, which is computationally heavy.

4)	They are different each time because the log_uniform_candidate_sampler() function chooses the samples randomly.

5)	No, the maximum length of a sentence is limited to 10 words, hence it is not even guaranteed that every word of a sentence is used in the training dataset. Every line is treated separately from the other lines, so no “cross-sentence context” is allowed.

6)	The index zero (aka padding) is found in context vectors but not in the targets. This makes sense because a word can be typically used at the end of a sentence (or in the middle of it), hence more frequently near padding (or away from it).

7)	The sampling table is a list of probabilities based on word-frequency ranks. The function assumes a Zipf distribution of the word frequencies, i.e. sampling_table[i] denotes the probability of sampling the i-th most common word in the dataset, and therefore no reference to the actual text data is needed.