In [1]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers




In [2]:
# Register the TensorBoard notebook extension (IPython magic, not plain Python).
%load_ext tensorboard
# Seed handed to the candidate sampler in the cells below.
SEED = 42
# Sentinel passed to Dataset.prefetch() further down in the notebook.
AUTOTUNE = tf.data.AUTOTUNE

In [3]:
# Toy corpus: a single sentence, lower-cased and split on whitespace.
sentence = "I love mulch mulch gang for life i love ingesting microplastic"
tokens = sentence.lower().split()  # str.split() already returns a list
print(len(tokens))
print(tokens)

11
['i', 'love', 'mulch', 'mulch', 'gang', 'for', 'life', 'i', 'love', 'ingesting', 'microplastic']


Here we create a dictionary that maps each vocabulary word to an integer index.


In [4]:
# Index 0 is reserved for the padding token.
vocab = {"<pad>": 0}

for token in tokens:
    # len(vocab) is always the next unused index; a word that is already
    # present keeps the index it was first given.
    vocab.setdefault(token, len(vocab))

print(vocab)
vocab_size = len(vocab)

{'<pad>': 0, 'i': 1, 'love': 2, 'mulch': 3, 'gang': 4, 'for': 5, 'life': 6, 'ingesting': 7, 'microplastic': 8}


And an inverse dictionary that maps each integer index back to its word.

In [5]:
# Flip the word -> index mapping into index -> word.
inverse_vocab = {index: token for token, index in vocab.items()}

print(inverse_vocab)

{0: '<pad>', 1: 'i', 2: 'love', 3: 'mulch', 4: 'gang', 5: 'for', 6: 'life', 7: 'ingesting', 8: 'microplastic'}


Vectorize the sentence by replacing each token with its vocabulary index.

In [6]:
# Look every token up in the vocabulary, preserving sentence order.
example_sequence = list(map(vocab.__getitem__, tokens))
print(example_sequence)

[1, 2, 3, 3, 4, 5, 6, 1, 2, 7, 8]


# generating skip gram word pairs

In [7]:
window_size = 2

# negative_samples=0 requests positive (target, context) pairs only, so the
# second return value is not needed here.
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
    example_sequence,
    vocabulary_size=vocab_size,
    window_size=window_size,
    negative_samples=0,
)
print(len(positive_skip_grams))

# Show the first few pairs both as ids and as the words they stand for.
for target, context in positive_skip_grams[:5]:
    target_token = inverse_vocab[target]
    context_token = inverse_vocab[context]
    print(f"({target}, {context}): ({target_token}, {context_token})")

38
(6, 1): (life, i)
(5, 6): (for, life)
(4, 5): (gang, for)
(2, 8): (love, microplastic)
(3, 4): (mulch, gang)


# generating negative samples

In [8]:
target_word, context_word = positive_skip_grams[0]
num_ns = 5

# The positive context id as an int64 tensor of shape (1, 1).
context_class = tf.constant([[context_word]], dtype="int64")

negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # the positive class for this skip-gram
    num_true=1,                  # one positive context class per skip-gram
    num_sampled=num_ns,          # how many negative context ids to draw
    unique=True,                 # the drawn ids must all be different
    range_max=vocab_size,        # ids are drawn from the half-open range [0, vocab_size)
    seed=SEED,                   # seed for reproducibility
    name="negative_sampling",    # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[candidate.numpy()] for candidate in negative_sampling_candidates])
print(context_class)

tf.Tensor([3 1 5 2 4], shape=(5,), dtype=int64)
['mulch', 'i', 'for', 'love', 'gang']
tf.Tensor([[1]], shape=(1, 1), dtype=int64)


## Constructing a training example
Concatenate the positive context word with the negative samples and label them: 1 for the true context word, 0 for each negative sample.

In [9]:
# (1, 1) -> (1,) so the positive id can be concatenated with the 1-D negatives.
squeezed_context_class = tf.squeeze(context_class, axis=1)
# Context row: the positive id first, followed by the num_ns negative ids.
context = tf.concat(
    [squeezed_context_class, negative_sampling_candidates],
    axis=0,
)
# Matching label row: 1 for the positive slot, 0 for every negative slot.
label = tf.constant([1] + num_ns * [0], dtype="int64")
target = target_word

# compiling a function that constructs training examples

In [10]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    """Build skip-gram training examples with negative sampling.

    For every sequence, positive (target, context) pairs are produced with
    ``skipgrams``; each pair is then extended with ``num_ns`` negative context
    ids drawn by the log-uniform candidate sampler.

    Args:
        sequences: iterable of integer-encoded sequences (one per sentence).
        window_size: skip-gram window size forwarded to ``skipgrams``.
        num_ns: number of negative samples drawn for each positive pair.
        vocab_size: vocabulary size; ids are sampled from [0, vocab_size).
        seed: seed forwarded to the candidate sampler.

    Returns:
        Three parallel lists ``(targets, contexts, labels)``. Each entry of
        ``contexts`` is an int64 tensor holding the positive context id
        followed by ``num_ns`` sampled ids; each entry of ``labels`` is the
        matching int64 tensor with 1 in the first slot and 0 elsewhere.
    """
    targets, contexts, labels = [], [], []

    # The label row only depends on num_ns, so build it once and reuse it.
    label = tf.constant([1] + [0] * num_ns, dtype="int64")

    for sequence in tqdm.tqdm(sequences):
        # Use the sequence being iterated, not a notebook-level example.
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            window_size=window_size,
            negative_samples=0,
        )

        for target_word, context_word in positive_skip_grams:
            # The true class must be this pair's own context word, shaped (1, 1).
            context_class = tf.constant([[context_word]], dtype="int64")

            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,  # class treated as 'positive'
                num_true=1,  # each positive skip-gram has 1 positive context class
                num_sampled=num_ns,  # number of negative context words to sample
                unique=True,  # all the negative samples should be unique
                range_max=vocab_size,  # sample ids from [0, vocab_size)
                seed=seed,  # honour the caller's seed
                name="negative_sampling",  # name of this operation
            )

            # One positive id followed by num_ns negative ids.
            context = tf.concat(
                [tf.squeeze(context_class, 1), negative_sampling_candidates], 0
            )

            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

            

            

# loading text

In [11]:
# Fetch the file from the URL and keep the local file path it was saved to.
path_to_file = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

# Parse the file into a TextLineDataset; a line of length 0 casts to False
# and is filtered out.
text_ds = tf.data.TextLineDataset(path_to_file).filter(
    lambda line: tf.cast(tf.strings.length(line), bool)
)

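# preprocessing
Standardize the text (lowercase it and strip punctuation), then vectorize each line into a fixed-length sequence of integer token ids.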

In [12]:
def custom_standardization(input_data):
  """Lower-case the input strings and delete every `string.punctuation` character."""
  # Character class matching any single punctuation character, safely escaped.
  punctuation_class = '[%s]' % re.escape(string.punctuation)
  lowered = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowered, punctuation_class, '')

# Vocabulary cap and the fixed number of token ids produced for each line.
vocab_size = 4096
sequence_length = 10

vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Build the layer's vocabulary from the corpus, 1024 lines at a time.
line_batches = text_ds.batch(1024)
vectorize_layer.adapt(line_batches)

# Vectorize batch-wise, then unbatch so each element is one sequence again.
text_vector_ds = line_batches.prefetch(AUTOTUNE).map(vectorize_layer).unbatch()





This returns a tf.data.Dataset object containing the vectorized sequences.
Now we materialize it into a Python list of sequence vectors.

In [13]:
# Pull every element of the vectorized dataset into an in-memory Python list
# (one entry per line of text), then report how many sequences there are.
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

32777


# generate training sequences

In [14]:
# Negatives per positive pair. Rebinding the notebook-level `num_ns` keeps it
# in sync with the data generated here (the demo cell above left it at 5,
# while this call previously passed a bare positional 4).
num_ns = 4

targets, contexts, labels = generate_training_data(
    sequences,
    window_size=5,
    num_ns=num_ns,
    vocab_size=vocab_size,
    seed=SEED,
)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

# Every example is one positive context id followed by `num_ns` negatives,
# with a label row of the same width.
assert contexts.shape == (len(targets), num_ns + 1)
assert labels.shape == contexts.shape

  0%|          | 0/32777 [00:00<?, ?it/s]

100%|██████████| 32777/32777 [19:49<00:00, 27.55it/s] 


In [19]:
BATCH_SIZE = 1024    # examples per training step
BUFFER_SIZE = 10000  # shuffle buffer size

training_ds = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
# The model scores a whole batch at once (its einsum has a leading batch
# axis), so the dataset must be batched before it is handed to fit().
# Shuffling happens after cache() so the order differs between epochs.
training_ds = (
    training_ds
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

# model and training

In [23]:
class Word2Vec(tf.keras.Model):
    """Skip-gram word2vec model trained with negative sampling.

    Holds two embedding tables of identical shape: one looked up for target
    words and one for context words. The forward pass returns the dot product
    between the target embedding and each candidate context embedding.

    Args:
        vocab_size: number of rows in each embedding table.
        embedding_dim: size of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Embedding takes `input_dim` / `output_dim`; it has no `vocab_size`
        # or `embedding_dim` keyword. Names avoid spaces so they are safe to
        # use as scope/variable names. `input_length` is optional and is left
        # out so the class does not depend on a notebook-level variable.
        self.target_embedding = layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim, name="target_layer"
        )
        self.context_embedding = layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim, name="context_layer"
        )

    def call(self, pair):
        """Score every candidate context word against its target word.

        Args:
            pair: a ``(target, context)`` tuple of id tensors. ``target`` may
                carry a trailing axis of size 1, which is squeezed away;
                ``context`` holds the candidate ids for each example.

        Returns:
            Tensor of dot products with one row per example and one column
            per candidate context id (einsum output ``bc``).
        """
        target, context = pair

        # Accept targets with an extra size-1 axis by squeezing axis 1.
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)

        target_emb = self.target_embedding(target)      # indexed 'be' below
        context_emb = self.context_embedding(context)   # indexed 'bce' below

        # Dot product over the embedding axis for every (example, candidate).
        dots = tf.einsum('be,bce->bc', target_emb, context_emb)
        return dots


In [None]:
embedding_dim = 128

word2vec = Word2Vec(vocab_size, embedding_dim)
# The model outputs raw dot products, hence from_logits=True; the label rows
# are one-hot over the candidate context slots.
word2vec.compile(
    optimizer="adam",
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

# Model.fit takes `epochs` and `callbacks` (plural); the singular spellings
# raise a TypeError.
word2vec.fit(training_ds, epochs=5, callbacks=[tensorboard_callback])