In [32]:
import nltk
import numpy as np
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Lambda, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K

In [59]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
with open("CBOW.txt", "r", encoding='utf-8') as file:
    text = file.read().lower()

# Tokenize and clean
tokens = nltk.word_tokenize(text)
tokens = [t for t in tokens if t.isalpha() and t not in stop_words]

# Create word-index mappings
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokens)
word2idx = tokenizer.word_index
idx2word = {v: k for k, v in word2idx.items()}
vocab_size = len(word2idx) + 1
print(f"Vocabulary size: {vocab_size}")


Vocabulary size: 59


[nltk_data] Downloading package punkt to C:\Users\UTKARSH
[nltk_data]     BRAHMANKAR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\UTKARSH
[nltk_data]     BRAHMANKAR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\UTKARSH
[nltk_data]     BRAHMANKAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [60]:
# --- 2. Generate CBOW Training Data ---
def generate_cbow_data(tokens, window_size):
    data = []
    for i in range(window_size, len(tokens) - window_size):
        context = []
        for j in range(-window_size, window_size + 1):
            if j != 0:
                context.append(word2idx[tokens[i + j]])
        target = word2idx[tokens[i]]
        data.append((context, target))
    return data

window_size = 2
data = generate_cbow_data(tokens, window_size)
contexts = [x[0] for x in data]
targets = [x[1] for x in data]

# Pad context sequences
X = pad_sequences(contexts, maxlen=2 * window_size, padding='pre')
y = to_categorical(targets, num_classes=vocab_size)
print(f"Training samples: {X.shape[0]}")


Training samples: 82


In [61]:
# --- 3. Build CBOW Model ---
embedding_dim = 100

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=2 * window_size))
# Average the embeddings of context words
model.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embedding_dim,)))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# --- 4. Train the Model ---
model.fit(X, y, epochs=100, batch_size=50,verbose=1)


Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.0229 - loss: 4.0777 
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.0607 - loss: 4.0682
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.1161 - loss: 4.0600
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.2115 - loss: 4.0507
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.3166 - loss: 4.0417
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.3787 - loss: 4.0330
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.4379 - loss: 4.0246
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.5081 - loss: 4.0166
Epoch 9/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x27fb3d22860>

In [62]:
# --- 5. Extract Embeddings ---
embeddings = model.layers[0].get_weights()[0]

print("\nSample word embeddings (first 10 words):")
for word in list(word2idx.keys())[:10]:
    print(f"{word}: {embeddings[word2idx[word]][:10]}")


Sample word embeddings (first 10 words):
influenza: [-0.31305823 -0.02625087  0.1182193  -0.18986319  0.0053038  -0.0590611
  0.03090453 -0.22457905 -0.2101159   0.17547247]
virus: [ 0.54733944  0.05526996  0.00867955 -0.26509932  0.20122446  0.13350864
 -0.27953848 -0.06544478 -0.36372963  0.15217884]
transmission: [-0.02443286 -0.14780769 -0.06342787 -0.14713286 -0.37339124  0.20719957
 -0.09729418 -0.05777493  0.3807827   0.1578947 ]
serial: [ 0.14128967 -0.17456798 -0.2041568  -0.06218733  0.13785246  0.36052638
 -0.13735369  0.05098824 -0.29141843  0.19746736]
interval: [ 0.14205581 -0.32855064  0.17584787 -0.24024373  0.2317098   0.31202477
 -0.06205055  0.08338565 -0.26755238  0.2111705 ]
days: [ 0.16996595 -0.05607214  0.21448295 -0.00472284 -0.21462332  0.4181928
 -0.19531159 -0.01286016 -0.22011971  0.40495557]
viruses: [-0.07269853  0.20421495 -0.2552545   0.08393963 -0.08203755  0.17468806
 -0.12803672 -0.32205185  0.15438023  0.05163897]
shorter: [ 0.2117149   0.09534776 

In [63]:
# --- 6. Predict Target Word from Context ---
def predict_target_word(context_words, tokenizer, model, window_size=2):
    # Convert words to indices
    context_indices = [tokenizer.word_index.get(w, 0) for w in context_words if w in tokenizer.word_index]
    # Pad context
    X_test = pad_sequences([context_indices], maxlen=2 * window_size, padding='pre')
    # Predict
    pred_probs = model.predict(X_test, verbose=0)
    target_idx = np.argmax(pred_probs)
    return idx2word.get(target_idx, None)



In [64]:
# Example prediction
example_context = ['hand', 'hygiene', 'and', 'wearing']
predicted_word = predict_target_word(example_context, tokenizer, model, window_size)
print(f"\nPredicted target word for context {example_context}: {predicted_word}")



Predicted target word for context ['hand', 'hygiene', 'and', 'wearing']: driver


In [65]:

test_contexts = [
    ['shorter', 'median', 'incubation', 'period'],
    ['spread', 'faster', 'than', 'covid'],
    ['hand', 'hygiene', 'and', 'wearing'],
    ['effective', 'control', 'measures', 'include']
]
test_targets = ['covid', 'influenza', 'masks', 'social']


In [66]:
correct = 0
for context, true_word in zip(test_contexts, test_targets):
    predicted = predict_target_word(context, tokenizer, model, window_size)
    print(f"Context: {context} -> Predicted: {predicted}, True: {true_word}")
    if predicted == true_word:
        correct += 1
print(f"Test Accuracy: {correct / len(test_contexts) * 100:.2f}%")


Context: ['shorter', 'median', 'incubation', 'period'] -> Predicted: median, True: covid
Context: ['spread', 'faster', 'than', 'covid'] -> Predicted: influenza, True: influenza
Context: ['hand', 'hygiene', 'and', 'wearing'] -> Predicted: driver, True: masks
Context: ['effective', 'control', 'measures', 'include'] -> Predicted: driver, True: social
Test Accuracy: 25.00%
