In [1]:
# word2vec
# words -> vectors
# eg:  king - man + woman = queen

# CBOW ( continuous bag of words ) 
#  algo to generate vectors from words ( another one is skipgram ) 

# objective : predict target word from context words ( exact opposite is dont in skipgram ) 


In [5]:
# Data Preparation
import pandas as pd
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.utils import to_categorical, pad_sequences
import numpy as np
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda
from sklearn.metrics.pairwise import euclidean_distances

In [6]:
data = [
    "Natural Language Processing is a field of Artificial Intelligence.",
    "Word embeddings help computers understand human language.",
    "The CBOW model is a part of Word2Vec technique.",
    "CBOW predicts the target word using surrounding context words.",
    "Skip Gram is another architecture of Word2Vec.",
    "Word2Vec is widely used in NLP applications.",
    "Embedding layers in deep learning are used to represent words.",
    "CBOW is faster and works better with frequent words."
]

In [7]:
#Tokenize and build vocabulary
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(data)

# word2id is a dictionary of {word1 : id1 , word2 : id2 ... } 
word2id = tokenizer.word_index
word2id['PAD'] = 0   # manually add a special "padding" token with an ID of 0 | used later to make sure all context windows have the same size
id2word = {v: k for k, v in word2id.items()} # exact ulta of word2id 

# Convert sentences into sequences of IDs
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in data]


vocab_size = len(word2id) # total number of unique words
embed_size = 100 # Dimensionality of our word embeddings. Each word will be represented by a vector of 100 numbers
window_size = 2  # Context window size
# Key parameter for CBOW. It means we will use 2 words to the left and 2 words to the right of a target word as its context

print("Vocabulary Size:", vocab_size)
print("Sample Vocabulary:", list(word2id.items())[:10])

Vocabulary Size: 50
Sample Vocabulary: [('is', 1), ('of', 2), ('cbow', 3), ('word2vec', 4), ('words', 5), ('language', 6), ('a', 7), ('word', 8), ('the', 9), ('used', 10)]


In [8]:
# Generate training data (context -> target pairs)

# This function is the core of the data generation process for CBOW
# It slides a "window" across each sentence to create pairs of (context words, target word).

def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size * 2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word = []            
            start = index - window_size
            end = index + window_size + 1
            
            # pick context (excluding target word)
            context_words.append([words[i] 
                                  for i in range(start, end) 
                                  if 0 <= i < sentence_length and i != index])
            label_word.append(word)

            # pad context & one-hot target
            x = pad_sequences(context_words, maxlen=context_length)
            y = to_categorical(label_word, vocab_size)
            yield (x, y)

In [14]:
# Show few examples of context ,  target pairs
i = 0
for x, y in generate_context_word_pairs(wids, window_size, vocab_size):
    if 0 not in x[0]:  # skip padded ones
        print("Context (X):", [id2word[w] for w in x[0]], "-> Target (Y):", id2word[np.argmax(y[0])])
        i += 1
        if i == 5:
            break

Context (X): ['natural', 'language', 'is', 'a'] -> Target (Y): processing
Context (X): ['language', 'processing', 'a', 'field'] -> Target (Y): is
Context (X): ['processing', 'is', 'field', 'of'] -> Target (Y): a
Context (X): ['is', 'a', 'of', 'artificial'] -> Target (Y): field
Context (X): ['a', 'field', 'artificial', 'intelligence'] -> Target (Y): of


In [10]:
#Build CBOW model
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_shape=(window_size*2,)))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation="softmax"))
cbow.compile(loss="categorical_crossentropy", optimizer="adam")

print(cbow.summary())


# Embedding layer : layer is basically the inpput part of CBOW , the sliding window : this layer outputs the W axb matrix
#  where a = window size and ,b= size of hidden layer , 
#  the middle layer needs W axb matrix for each sliding window as we see in architecture.

# Embedding Layer: This is the heart of the model. It's a lookup table where the model will learn the vector representation for each word
    # input_dim=vocab_size: It knows there are 50 unique words.
    # output_dim=embed_size: It will represent each word as a 100-dimensional vector.
    # input_shape=(window_size*2,): It expects an input of 4 context words for each sample.


#  this is the middle hidden layer : which has size = dimension of vector embedding we want 
# Lambda Layer: This layer performs the "Bag-of-Words" part. It takes the embeddings of the 4 context words 
# and simply averages them to create a single 100-dimensional context vector. K.mean(x, axis=1) does this averaging.


# final output layer , also size of the vocabulary to predict the correct one hot encoded output matrix , via passing through softmax

# Dense Layer: This is the final output layer. It takes the single averaged context vector and tries to predict the target word.
    # vocab_size: The output has 50 neurons, one for each word in the vocabulary.
    # activation="softmax": This activation function converts the output into a probability distribution, 
    # giving the probability of each word being the correct target

# compile: This step configures the model for training.
    # loss="categorical_crossentropy": This is the appropriate loss function when your output is a probability distribution (from softmax)
    # optimizer="adam": A standard, effective algorithm for updating the model's weights during training

  super().__init__(**kwargs)


None


In [11]:
#Train Model
for epoch in range(1, 10):  # run fewer epochs for demo
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(wids, window_size, vocab_size):
        loss += cbow.train_on_batch(x, y)
        i += 1
    print("Epoch:", epoch, "Loss:", loss)

Epoch: 1 Loss: 261.8962821960449
Epoch: 2 Loss: 260.9323539733887
Epoch: 3 Loss: 259.1055529117584
Epoch: 4 Loss: 257.0529055595398
Epoch: 5 Loss: 254.66983246803284
Epoch: 6 Loss: 251.8908874988556
Epoch: 7 Loss: 248.6899390220642
Epoch: 8 Loss: 245.07982182502747
Epoch: 9 Loss: 241.10335493087769


In [16]:
#Save trained word embeddings to a file
weights = cbow.get_weights()[0] # weights from the embedding layer , these weights are used to get the word embeddings
weights = weights[1:]
print(weights.shape)


# display 5 word's embeddings
pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(49, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
of,-0.054321,-0.074739,-0.060069,0.078936,-0.089854,-0.028408,0.097059,0.090719,0.034698,0.029365,...,-0.011336,-0.00309,0.155461,0.004605,0.068141,-0.115233,0.074386,0.160993,-0.035914,0.043878
cbow,-0.093362,0.037589,-0.085925,-0.025712,-0.161978,0.041992,0.171742,0.110891,0.051549,0.245398,...,-0.094618,-0.109927,0.142211,-0.250707,0.094942,0.013233,0.227027,0.115982,-0.042515,-0.106139
word2vec,-0.032512,-0.148713,0.09306,-0.080591,-0.105891,-0.017151,-0.044656,-0.06636,-0.140784,-0.121386,...,-0.192467,-0.171513,-0.04646,0.079466,-0.155151,-0.080826,0.048995,0.086592,-0.068815,0.032962
words,-0.058049,-0.091064,-0.014859,0.103756,-0.001048,0.104799,-0.104455,0.0966,0.133464,0.060599,...,-0.15725,-0.124791,0.13366,-0.165393,-0.171648,-0.024072,-0.02857,0.025416,0.022717,0.087944
language,0.04378,0.023858,0.125315,-0.127571,0.134679,-0.09018,0.038102,-0.116379,-0.050548,0.152756,...,-0.070123,0.069304,-0.096214,-0.076106,-0.024002,0.068214,0.1438,-0.0572,0.064325,0.137079


In [17]:
# demonstrating that the learned embeddings have captured some semantic meaning. 

#Find similar words using Euclidean distance
distance_matrix = euclidean_distances(weights) # calculates the geometric distance between every pair of word vectors.

similar_words = {
    search: [id2word[idx] for idx in distance_matrix[word2id[search]-1].argsort()[1:6]+1]
    for search in ["deep", "cbow"]
}

print("Similar Words:", similar_words)

Similar Words: {'deep': ['embedding', 'learning', 'are', 'in', 'layers'], 'cbow': ['model', 'and', 'predicts', 'target', 'field']}
