In [10]:
# Import necessary libraries
from tensorflow.keras.preprocessing import text
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda
import tensorflow.keras.backend as K
from sklearn.metrics.pairwise import euclidean_distances
import pandas as pd


In [11]:
# Data: list of sentences about deep learning
data = [
    "Deep learning also known as deep structured learning",
    "is part of a broader family of machine learning methods based",
    "on artificial neural networks with representation learning",
    "Learning can be supervised, semi-supervised or unsupervised",
    "Deep-learning architectures such as deep neural networks",
    "deep belief networks, deep reinforcement learning",
    "recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation",
    "where they have produced results comparable to and in some cases surpassing human expert performance"
]

# Tokenize sentences and build vocabulary
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(data)
word2id = tokenizer.word_index
word2id['PAD'] = 0  # Add padding for sequence compatibility
id2word = {v: k for k, v in word2id.items()}

# Convert sentences to sequences of word IDs
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in data]
vocab_size = len(word2id)
embed_size = 100  # Embedding vector size
window_size = 2  # Context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])


Vocabulary Size: 62
Vocabulary Sample: [('learning', 1), ('deep', 2), ('networks', 3), ('neural', 4), ('as', 5), ('of', 6), ('machine', 7), ('supervised', 8), ('and', 9), ('have', 10)]


In [12]:
# Function to generate context-target pairs
from tensorflow.keras.utils import to_categorical

def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size * 2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            # Get context words
            context_words = [
                words[i]
                for i in range(index - window_size, index + window_size + 1)
                if 0 <= i < sentence_length and i != index
            ]
            # Pad context to fixed length
            x = pad_sequences([context_words], maxlen=context_length)[0]
            y = to_categorical(word, vocab_size)
            yield (x, y)

# Display a few context-target pairs
print("\nSample Context-Target Pairs:")
i = 0
for x, y in generate_context_word_pairs(wids, window_size, vocab_size):
    print('Context (X):', [id2word[w] for w in x if w != 0], '-> Target (Y):', id2word[np.argmax(y)])
    if i == 5:  # Show only the first 5 pairs
        break
    i += 1



Sample Context-Target Pairs:
Context (X): ['learning', 'also'] -> Target (Y): deep
Context (X): ['deep', 'also', 'known'] -> Target (Y): learning
Context (X): ['deep', 'learning', 'known', 'as'] -> Target (Y): also
Context (X): ['learning', 'also', 'as', 'deep'] -> Target (Y): known
Context (X): ['also', 'known', 'deep', 'structured'] -> Target (Y): as
Context (X): ['known', 'as', 'structured', 'learning'] -> Target (Y): deep


In [13]:
# Define the CBOW model
cbow = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size * 2),
    Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)),
    Dense(vocab_size, activation='softmax')
])

# Compile the model
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')
print(cbow.summary())


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 4, 100)            6200      
                                                                 
 lambda_1 (Lambda)           (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 62)                6262      
                                                                 
Total params: 12462 (48.68 KB)
Trainable params: 12462 (48.68 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [14]:
# Train the CBOW model
epochs = 50
for epoch in range(1, epochs + 1):
    loss = 0
    for x, y in generate_context_word_pairs(wids, window_size, vocab_size):
        loss += cbow.train_on_batch(x.reshape(1, -1), y.reshape(1, -1))
    if epoch % 10 == 0:
        print('Epoch:', epoch, '\tLoss:', loss)



Epoch: 10 	Loss: 302.17851543426514
Epoch: 20 	Loss: 250.64994937181473
Epoch: 30 	Loss: 209.60554061830044
Epoch: 40 	Loss: 177.3139235600829
Epoch: 50 	Loss: 150.38395673036575


In [15]:
# Extract the learned word embeddings
embedding_weights = cbow.get_weights()[0][1:]  # Exclude the padding index
print("\nEmbedding matrix shape:", embedding_weights.shape)

# Display word embeddings as a DataFrame
print("\nWord Embeddings (Sample):")
embedding_df = pd.DataFrame(embedding_weights, index=[id2word[i] for i in range(1, vocab_size)])
print(embedding_df.head())



Embedding matrix shape: (61, 100)

Word Embeddings (Sample):
                0         1         2         3         4         5   \
learning -0.261622  0.175840  0.327796 -0.032309  0.418981  0.061149   
deep      0.817920 -0.018986  0.653333  0.150152 -0.117192 -0.162195   
networks -0.248330  0.188581 -0.131515  0.175582  0.212774  0.380598   
neural    0.244306 -0.157685  0.272999  0.572180  0.591226 -0.453447   
as        0.119288 -0.057155  0.406259 -0.054688 -0.027695  0.588162   

                6         7         8         9   ...        90        91  \
learning  0.296201  0.049553  0.018468 -0.026031  ...  0.334597 -0.286660   
deep      0.579300  0.282415  0.114659 -0.087520  ...  0.495792  0.215134   
networks -0.004691 -0.450504  0.579308 -0.119742  ... -0.440219 -0.023278   
neural    0.641994  0.288245  0.138223  0.005732  ... -0.356352 -0.151586   
as        0.217130  0.040373 -0.159168  0.029755  ...  0.049202 -0.113313   

                92        93        94    

In [16]:
# Compute pairwise Euclidean distances between embeddings
distance_matrix = euclidean_distances(embedding_weights)
print("\nDistance matrix shape:", distance_matrix.shape)

# Find and display similar words for specific search terms
similar_words = {
    search_term: [
        id2word[idx + 1] for idx in distance_matrix[word2id[search_term] - 1].argsort()[1:6]
    ]
    for search_term in ['deep', 'unsupervised']
}
print("\nSimilar Words:", similar_words)



Distance matrix shape: (61, 61)

Similar Words: {'deep': ['with', 'representation', 'recurrent', 'convolutional', 'known'], 'unsupervised': ['semi', 'or', 'can', 'be', 'based']}
