In [3]:
import io
import re
import string
import tqdm
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

In [4]:
%load_ext tensorboard

In [6]:
SEED = 42
AUTOTUNE =tf.data.AUTOTUNE

In [7]:
sentence = "The wide road shimmered in the hot sun"
tokens =  list(sentence.lower().split())
print(len(tokens))

8


In [8]:
tokens

['the', 'wide', 'road', 'shimmered', 'in', 'the', 'hot', 'sun']

In [9]:
vocab,index={},1
vocab['<pad>'] = 0
for token in tokens:
  if token not in vocab:
    vocab[token]=index
    index+=1
vocab_size=len(vocab)
print(vocab )

{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [10]:
inverse_vocab={idx: token for token,idx in vocab.items()}
print(inverse_vocab)

{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}


In [11]:
example_sequence = [vocab[word] for word in tokens]
print(example_sequence)

[1, 2, 3, 4, 5, 1, 6, 7]


In [13]:
window_size=2
positive_skip_grams,_=tf.keras.preprocessing.sequence.skipgrams(example_sequence,vocabulary_size=vocab_size,window_size=window_size,negative_samples=0)

In [18]:
skip_grams=[]
for i in range(len(positive_skip_grams)):
  context=inverse_vocab[positive_skip_grams[i][0]]
  target=inverse_vocab[positive_skip_grams[i][1]]
  skip_grams.append((context,target))

In [19]:
skip_grams

[('road', 'in'),
 ('wide', 'shimmered'),
 ('shimmered', 'the'),
 ('road', 'wide'),
 ('shimmered', 'in'),
 ('sun', 'the'),
 ('wide', 'the'),
 ('in', 'shimmered'),
 ('the', 'hot'),
 ('road', 'the'),
 ('the', 'road'),
 ('in', 'the'),
 ('the', 'shimmered'),
 ('in', 'hot'),
 ('shimmered', 'wide'),
 ('hot', 'sun'),
 ('wide', 'road'),
 ('the', 'in'),
 ('the', 'sun'),
 ('hot', 'the'),
 ('hot', 'in'),
 ('sun', 'hot'),
 ('road', 'shimmered'),
 ('in', 'road'),
 ('shimmered', 'road'),
 ('the', 'wide')]

In [20]:
sentence

'The wide road shimmered in the hot sun'

In [21]:
#negative sampling for skipgram

In [22]:
target_word,context_word = positive_skip_grams[0]

num_ns=4

context_class =tf.reshape(tf.constant(context_word,dtype="int64"),(1,1))

In [23]:
context_class


<tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[5]])>

In [24]:
negative_sampling_candidates,_,_=tf.random.log_uniform_candidate_sampler(true_classes=context_class,num_true=1,num_sampled=num_ns,unique=True,range_max=vocab_size,seed=SEED,name="negative_Sampling")

In [25]:
print(negative_sampling_candidates)

tf.Tensor([2 1 4 3], shape=(4,), dtype=int64)


In [29]:
for i in negative_sampling_candidates:
  print(inverse_vocab[i.numpy()])

wide
the
shimmered
road


In [30]:
print(inverse_vocab[context_word])

in


In [32]:
squeezed_context_class = tf.squeeze(context_class,1)

In [33]:
squeezed_context_class

<tf.Tensor: shape=(1,), dtype=int64, numpy=array([5])>

In [34]:
context=tf.concat([squeezed_context_class,negative_sampling_candidates],0)

In [35]:
context

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([5, 2, 1, 4, 3])>

In [39]:
labels=tf.constant([1]+[0]*num_ns,dtype="int64")

In [40]:
labels

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0])>

In [42]:
target=target_word

In [43]:
target

3

In [45]:
print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {labels}")

target_index    : 3
target_word     : road
context_indices : [5 2 1 4 3]
context_words   : ['in', 'wide', 'the', 'shimmered', 'road']
label           : [1 0 0 0 0]


In [47]:
print("target  :", target)
print("context :", context)
print("label   :", labels)

target  : 3
context : tf.Tensor([5 2 1 4 3], shape=(5,), dtype=int64)
label   : tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)


In [48]:
sampling_table=tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)

[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]


In [73]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels


In [51]:
path_to_file = tf.keras.utils.get_file('shakespeare.txt','https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt


In [52]:
with open(path_to_file) as f:
  lines=f.read().splitlines()


for line in lines[:20]:
  print(line)

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.


In [53]:
text_ds=tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x),bool))

In [54]:
text_ds

<_FilterDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [62]:
vocab_size=4096
sequence_length = 10
vectorize_layer=layers.TextVectorization(standardize=coustom_standarization,max_tokens=vocab_size,output_mode='int',output_sequence_length=sequence_length)

In [63]:
def coustom_standarization(input_data):
   lowercase = tf.strings.lower(input_data)
   return tf.strings.regex_replace(lowercase,'[%s]'% re.escape(string.punctuation),'')

In [64]:
vectorize_layer.adapt(text_ds.batch(1024))

In [66]:
inverse_vocab=vectorize_layer.get_vocabulary()

In [68]:
inverse_vocab[:100]

['',
 '[UNK]',
 'the',
 'and',
 'to',
 'i',
 'of',
 'you',
 'my',
 'a',
 'that',
 'in',
 'is',
 'not',
 'for',
 'with',
 'me',
 'it',
 'be',
 'your',
 'his',
 'this',
 'but',
 'he',
 'have',
 'as',
 'thou',
 'him',
 'so',
 'what',
 'thy',
 'will',
 'no',
 'by',
 'all',
 'king',
 'we',
 'shall',
 'her',
 'if',
 'our',
 'are',
 'do',
 'thee',
 'now',
 'lord',
 'good',
 'on',
 'o',
 'come',
 'from',
 'sir',
 'or',
 'which',
 'more',
 'then',
 'well',
 'at',
 'would',
 'was',
 'they',
 'how',
 'here',
 'she',
 'than',
 'their',
 'them',
 'ill',
 'duke',
 'am',
 'hath',
 'say',
 'let',
 'when',
 'one',
 'go',
 'were',
 'love',
 'may',
 'us',
 'make',
 'upon',
 'yet',
 'richard',
 'like',
 'there',
 'must',
 'should',
 'an',
 'first',
 'why',
 'queen',
 'had',
 'know',
 'man',
 'did',
 'tis',
 'where',
 'see',
 'some']

In [87]:

text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [70]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

32777


In [71]:
for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']
[138  36 982 144 673 125  16 106   0   0] => ['before', 'we', 'proceed', 'any', 'further', 'hear', 'me', 'speak', '', '']
[34  0  0  0  0  0  0  0  0  0] => ['all', '', '', '', '', '', '', '', '', '']
[106 106   0   0   0   0   0   0   0   0] => ['speak', 'speak', '', '', '', '', '', '', '', '']
[ 89 270   0   0   0   0   0   0   0   0] => ['first', 'citizen', '', '', '', '', '', '', '', '']


In [74]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")


100%|██████████| 32777/32777 [00:25<00:00, 1278.89it/s]




targets.shape: (64730,)
contexts.shape: (64730, 5)
labels.shape: (64730, 5)


In [75]:
targets

array([3690, 3690, 1286, ...,  129,  129, 1049])

In [76]:
contexts

array([[   4,  170,   19,  826,   57],
       [  64,    1,   20,   84,   11],
       [1286,  280, 1633,    3,    9],
       ...,
       [  26,   94,    9,   10,    0],
       [1874,  112,    2,   32,  655],
       [  26,   12,  133, 2260, 4059]])

In [77]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<_BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [78]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<_PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [79]:
class Word2Vec(tf.keras.Model):

  def __init__(self,vocab_size,embedding_dim):
    super(Word2Vec,self).__init__()
    self.target_embedding = layers.Embedding(vocab_size,embedding_dim,
                                             input_length=1,
                                             name="w2v_embedding")
    self.context_embedding=layers.Embedding(vocab_size,embedding_dim,input_length=num_ns+1)




  def call(self,pair):
    target,context=pair

    if len(target.shape)==2:
      target = tf.squeeze(target,axis=1)

    word_emb=self.target_embedding(target)

    context_emb=self.context_emb(context)

    dots = tf.einsum('be,bce->bc',word_emb,context_emb)
    return dots


In [80]:


def custom_loss(x_logit, y_true):
      return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)

In [81]:
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [82]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [85]:
Word2Vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

AttributeError: ignored