In [12]:
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Lambda, Dense, Input
from tensorflow.keras.models import Model, Sequential
import tensorflow.keras.backend as K
from sklearn.metrics.pairwise import euclidean_distances
import re

In [13]:
data = """
Climate change refers to significant, long-term changes in the global climate.
It is primarily driven by human activities, such as burning fossil fuels, deforestation, and industrial processes,
which increase the levels of greenhouse gases in the atmosphere. These gases trap heat, leading to a warming effect known as global warming.
Consequences of climate change include more frequent and severe weather events, rising sea levels, and impacts on ecosystems and biodiversity.
Efforts to address climate change focus on reducing emissions, transitioning to renewable energy, and enhancing adaptation strategies.
"""

# Preprocessing: lowercasing and removing punctuation
climate_data = re.sub(r'[^\w\s]', '', data.lower()).split()

In [14]:
tokenizer = tf.keras.layers.TextVectorization(split='whitespace')
tokenizer.adapt(climate_data)
vocab = tokenizer.get_vocabulary()
word2id = {word: index for index, word in enumerate(vocab)}
id2word = {index: word for word, index in word2id.items()}
vocab_size = len(vocab)

In [15]:
print('Vocabulary Size:', vocab_size)
print('Sample Vocabulary:', list(word2id.items())[:10])

Vocabulary Size: 67
Sample Vocabulary: [('', 0), ('[UNK]', 1), ('and', 2), ('to', 3), ('climate', 4), ('the', 5), ('change', 6), ('warming', 7), ('on', 8), ('of', 9)]


In [16]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size * 2
    for i in range(window_size, len(corpus) - window_size):
        context = corpus[i - window_size:i] + corpus[i + 1:i + window_size + 1]
        target = corpus[i]
        x = pad_sequences([context], maxlen=context_length, padding='post')
        y = to_categorical([target], vocab_size)
        yield x, y

In [17]:
corpus_ids = [word2id[word] for word in climate_data]

In [18]:
embed_size = 100  # Embedding vector size
window_size = 2

cbow = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size * 2),
    Lambda(lambda x: K.mean(x, axis=1)),
    Dense(vocab_size, activation='softmax')
])

cbow.compile(optimizer='adam', loss='categorical_crossentropy')
cbow.summary()



In [19]:
epochs = 5
for epoch in range(epochs):
    loss = 0
    for x, y in generate_context_word_pairs(corpus=corpus_ids, window_size=window_size, vocab_size=vocab_size):
        loss += cbow.train_on_batch(x, y)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss}")



Epoch 1/5, Loss: 349.1693768501282
Epoch 2/5, Loss: 347.8436198234558
Epoch 3/5, Loss: 345.6324110031128
Epoch 4/5, Loss: 343.17742824554443
Epoch 5/5, Loss: 340.34699869155884


In [20]:
weights = cbow.get_layer('embedding').get_weights()[0]
print("Embedding Matrix Shape:", weights.shape)

Embedding Matrix Shape: (67, 100)


In [21]:
embeddings_df = pd.DataFrame(weights, index=[id2word[i] for i in range(vocab_size)])
print(embeddings_df.head())

               0         1         2         3         4         5         6   \
         0.034449  0.049132 -0.021081 -0.018232 -0.023004  0.037950  0.017549   
[UNK]    0.042593 -0.016125 -0.007169 -0.027973 -0.009408 -0.020664 -0.028043   
and      0.131469 -0.040384  0.006151 -0.015815 -0.072943 -0.089331  0.038925   
to      -0.044693 -0.061725  0.005863  0.055234 -0.004956 -0.034783  0.041645   
climate -0.072283 -0.019292 -0.034226  0.114746 -0.051277  0.050019 -0.050134   

               7         8         9   ...        90        91        92  \
         0.014841  0.000600  0.049012  ... -0.029408 -0.034956  0.011795   
[UNK]   -0.029421  0.009347 -0.008649  ... -0.014640 -0.003474  0.025219   
and      0.027119 -0.021295  0.009354  ... -0.048958  0.086148 -0.053945   
to      -0.007385 -0.005657  0.068682  ...  0.009357 -0.055786  0.062661   
climate -0.091215 -0.019682 -0.015777  ...  0.056733 -0.003422 -0.001876   

               93        94        95        96        9

In [22]:
distance_matrix = euclidean_distances(weights)


In [23]:
def get_similar_words(search_term, top_n=5):
    term_id = word2id[search_term]
    distances = distance_matrix[term_id]
    closest_ids = distances.argsort()[1:top_n + 1]
    similar_words = [id2word[idx] for idx in closest_ids]
    return similar_words

print("Similar words to 'climate':", get_similar_words('climate'))

Similar words to 'climate': ['change', 'significant', '[UNK]', 'focus', 'it']
