In [1]:
pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.7.3-cp37-cp37m-manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 16.3 MB/s 
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.7.3


In [2]:
import re
import tensorflow as tf
import numpy as np
import math
import tensorflow
import tensorflow_text as tf_text
import tqdm.notebook as note
import scipy
import time

start = time.time()

In [3]:
# read bible text
bible = open("/content/drive/MyDrive/data/bible.txt", "r")
data = bible.read()

In [4]:
# look at first 80 characters
print(data[0:80])

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the 


In [5]:
# preprocessing: lower characters and no special chars or numbers
data = data.lower()
data = re.sub(r"[^a-z ]", "", data)

In [6]:
# look at first 80 characters after preprocessing
print(data[0:80])

the first book of moses  called genesis in the beginning god created the heaven 


In [7]:
# create a smaller sample to see if everything works
smaller_data = data[:400000]

In [8]:
# create date set

tokens = list(smaller_data.split())

vocab, index = {}, 1  
vocab['<pad>'] = 0
for token in tokens:
  if token not in vocab:
    vocab[token] = index
    index += 1
vocab_size = len(vocab)

In [9]:
inverse_vocab = {index: token for token, index in vocab.items()}

In [10]:
sequence = [vocab[word] for word in tokens]

In [11]:
print(len(sequence))

75879


In [12]:
# size of sampling windows (technically half-window). The window of a word w_i will be [i - window_size, i + window_size+1].
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=1)
print(len(positive_skip_grams))

607020


In [13]:
for target, context in positive_skip_grams[:5]:
  print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(369, 1511): (did, many)
(5772, 1693): (rump, outunto)
(1446, 2399): (pieces, isa)
(102, 4888): (two, habitation)
(549, 97): (begat, years)


In [14]:
BATCH_SIZE = 128 

smaller_dataset = positive_skip_grams
smaller_dataset = np.array(smaller_dataset)[:(len(smaller_dataset)//(BATCH_SIZE))*BATCH_SIZE]

smaller_dataset =tf.data.Dataset.from_tensor_slices((smaller_dataset[:,0],smaller_dataset[:,1]))

In [15]:
smaller_dataset = smaller_dataset.shuffle(1000).batch(128).prefetch(64)

smaller_dataset = smaller_dataset.map(lambda a,b: (a,tf.reshape(b,(-1,1))))

In [16]:
for elem in smaller_dataset:
  print(elem)
  break

(<tf.Tensor: shape=(128,), dtype=int64, numpy=
array([  13,   44,  214,  364,    8,  328,    1,   13, 1276,  150,   99,
          1,  201,   13,  205,   34, 3716,   27,  173,  281,  677,    1,
         80,   13,  466,    1,   10, 1998, 1225,  356,  506,   99,   39,
        205,   50,   44,  145,  834,   65, 1569,  314,  156,   13,  163,
       1964,  855,  609,  388,  547,  393,   27,  455,   33, 2210,    4,
        238,   34,  176,   21,   27,  393, 1324,   13,    4, 1254,  364,
       5318,    8,  864,  906,  988,  361,   13,   30, 5412,   13,   34,
       6065,    1,  497,  477,   13, 1159,  286,  209,  679,   39,  512,
       1819,  469,    1, 3277,   43,   39,   13,  238,   13,  124,    1,
          4,  425,  330,   13,    4, 4750,  393, 1301, 2071,    1,  463,
        538,   30,  173,   13,  178,   93,   13, 1296,   13,   13,    1,
        176, 4331,  479,   56,  205,    1,   52])>, <tf.Tensor: shape=(128, 1), dtype=int64, numpy=
array([[2113],
       [2616],
       [ 767],
     

In [17]:
# implement SkipGram model

class SkipGram(tf.keras.Model):
  
  def __init__(self, vocabulary_size, embedding_size):
    super(SkipGram, self).__init__()
    
    self.vocabulary_size = vocabulary_size
    self.embedding_size = embedding_size

  def build(self,input_shape):
    self.embedding_matrix = tf.Variable(tf.random.uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0))
  
    self.score_weights = tf.Variable(tf.random.truncated_normal([self.vocabulary_size, self.embedding_size],stddev=1.0 / math.sqrt(self.embedding_size)))
    
    self.nce_biases = tf.Variable(tf.zeros([self.vocabulary_size])) 

  def call(self, words, labels):
    
    embed = tf.nn.embedding_lookup(self.embedding_matrix, words)

    loss = tf.nn.nce_loss(weights=self.score_weights,biases=self.nce_biases, labels=labels,  inputs=embed, num_sampled=1 ,num_classes=self.vocabulary_size)
    
    return tf.reduce_mean(loss)

In [21]:
def nearest_words(Model, words_of_interest):
    embed = Model.embedding_matrix.numpy()
    
    for word in words_of_interest:

        idx = vocab[word]

        closest = np.argmin([scipy.spatial.distance.cosine(embed[idx],embed[q]) if q!=idx else np.inf for q in range(len(embed))])

        print(f"Closest to {word} is {inverse_vocab[closest]}")

In [22]:
tf.keras.backend.clear_session()

interesting_words = ["holy", "father", "wine", "spirit", "love", "strong", "day"]

num_epochs = 6
hidden_size = 64

VOCAB_SIZE = vocab_size
Model = SkipGram(VOCAB_SIZE,hidden_size)

optimizer = tf.optimizers.Adam(0.001)    

train_losses = np.empty(0)

start_time = time.time()

for epoch in range(num_epochs):
    print(f'Epoch {str(epoch)}')

    epoch_loss_agg = np.empty(0)

    for input, target in note.tqdm(smaller_dataset,position=0,leave=True):
      
      with tf.GradientTape() as tape:
        loss = Model(input,target) 
       
      gradients = tape.gradient(loss, Model.trainable_variables)
      optimizer.apply_gradients(zip(gradients, Model.trainable_variables))
      epoch_loss_agg = np.append(epoch_loss_agg, loss)
                   
    train_losses = np.append(train_losses, tf.reduce_mean(epoch_loss_agg))            
                   
    used_time = np.round(time.time()-start_time)
    print(f"Finished epoch {epoch+1}, it took {used_time //60} minutes, {used_time%60} seconds")
    print(train_losses[-1])
    nearest_words(Model,interesting_words)
    start_time = time.time()

Epoch 0


  0%|          | 0/4742 [00:00<?, ?it/s]

Finished epoch 1, it took 2.0 minutes, 22.0 seconds
5.756366408057275
Closest to holy is all
Closest to father is for
Closest to wine is thewilderness
Closest to spirit is hethat
Closest to love is headand
Closest to strong is grow
Closest to day is ye
Epoch 1


  0%|          | 0/4742 [00:00<?, ?it/s]

Finished epoch 2, it took 2.0 minutes, 22.0 seconds
4.598025658962528
Closest to holy is if
Closest to father is died
Closest to wine is thewilderness
Closest to spirit is every
Closest to love is generations
Closest to strong is grow
Closest to day is ye
Epoch 2


  0%|          | 0/4742 [00:00<?, ?it/s]

Finished epoch 3, it took 1.0 minutes, 34.0 seconds
4.211199457202152
Closest to holy is if
Closest to father is before
Closest to wine is thewilderness
Closest to spirit is every
Closest to love is stuff
Closest to strong is grow
Closest to day is fear
Epoch 3


  0%|          | 0/4742 [00:00<?, ?it/s]

Finished epoch 4, it took 1.0 minutes, 34.0 seconds
3.7820831363975924
Closest to holy is if
Closest to father is before
Closest to wine is thewilderness
Closest to spirit is every
Closest to love is stuff
Closest to strong is grow
Closest to day is fear
Epoch 4


  0%|          | 0/4742 [00:00<?, ?it/s]

Finished epoch 5, it took 2.0 minutes, 22.0 seconds
3.5338860075066094
Closest to holy is if
Closest to father is before
Closest to wine is thewilderness
Closest to spirit is every
Closest to love is stuff
Closest to strong is grow
Closest to day is ye
Epoch 5


  0%|          | 0/4742 [00:00<?, ?it/s]

Finished epoch 6, it took 1.0 minutes, 35.0 seconds
3.499440975150212
Closest to holy is if
Closest to father is before
Closest to wine is thewilderness
Closest to spirit is every
Closest to love is stuff
Closest to strong is grow
Closest to day is fear
