#### Implement the Continuous Bag of Words (CBOW) model for the given text document (document 1)

In [13]:
# a.Import libraries

import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical


# Fetch the tokenizer data used by word_tokenize. Recent NLTK releases
# (>= 3.8.2) load the 'punkt_tab' resource instead of the pickled 'punkt'
# models, so both are requested; a call is a no-op when the package is
# already up to date.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Load the corpus and lower-case it. The encoding is stated explicitly so the
# result does not depend on the platform default (cp1252 on Windows can fail
# on a UTF-8 file). Undecodable bytes are replaced rather than raising: every
# non-letter character is stripped by the regex below anyway.
with open("CBOW.txt", "r", encoding="utf-8", errors="replace") as file:
    text = file.read().lower()

# Keep only ASCII letters and whitespace (drops punctuation, digits, symbols)
text = re.sub(r'[^a-zA-Z\s]', '', text)

# Split the cleaned text into word tokens
tokens = word_tokenize(text)

print("Total tokens:", len(tokens))
print("Sample tokens:", tokens[:15])

# Vocabulary = sorted unique tokens, so word indices are stable between runs
vocab = sorted(set(tokens))
vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)

# Lookup tables in both directions: word -> index and index -> word
word2idx = {word: i for i, word in enumerate(vocab)}
idx2word = dict(enumerate(vocab))


window_size = 2  # number of context words taken on EACH side of the target
data = []

# Create (context, target) pairs. The context is the `window_size` tokens
# before the target followed by the `window_size` tokens after it, so changing
# `window_size` above really changes the context width. Positions closer than
# `window_size` to either end of the corpus are skipped because they lack a
# full window.
for i in range(window_size, len(tokens) - window_size):
    context = tokens[i - window_size:i] + tokens[i + 1:i + window_size + 1]
    target = tokens[i]
    data.append((context, target))

print("Total training pairs:", len(data))
# Show pair 100 when it exists; fall back to the last pair on a short corpus
# instead of raising IndexError.
if data:
    print("Example pair:", data[min(100, len(data) - 1)])


# One-hot encoding of a single vocabulary word
def one_hot_encode(word):
    """Return a vector of length ``vocab_size`` that is 1 at the word's index and 0 elsewhere.

    The index is looked up in the module-level ``word2idx`` mapping, so a word
    that is not a key of that mapping raises ``KeyError``.
    """
    encoded = np.zeros(vocab_size)
    position = word2idx[word]
    encoded[position] = 1
    return encoded

# Build the training matrices from the (context, target) pairs:
#   X row = element-wise sum of the one-hot vectors of the context words
#   Y row = one-hot vector of the target word
X = np.array([
    np.sum([one_hot_encode(w) for w in context_words], axis=0)
    for context_words, _ in data
])
Y = np.array([one_hot_encode(target_word) for _, target_word in data])

print("Input shape:", X.shape)
print("Output shape:", Y.shape)


# Local imports keep this cell runnable on its own; both come from the
# tensorflow.keras package the notebook already depends on.
from tensorflow.keras.layers import Input
from tensorflow.keras.utils import set_random_seed

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation and batch shuffling are repeatable after a kernel restart.
# NOTE: this also makes later `random.*` calls in the notebook deterministic.
set_random_seed(42)

# Bag-of-context-words input -> 32-unit hidden layer -> softmax over the vocabulary.
# The input width is declared with an explicit Input layer; passing
# `input_dim=` to the first Dense layer is deprecated in Keras 3 and emits a
# UserWarning.
model = Sequential()
model.add(Input(shape=(vocab_size,)))
model.add(Dense(32, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))

# Targets are one-hot rows, hence categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])

# `history.history` holds the per-epoch loss and accuracy
history = model.fit(X, Y, epochs=100, verbose=1)


# Draw one (context, target) pair at random and compare the model's
# prediction with the true target word
import random
idx = random.randint(0, len(data) - 1)
test_context, actual_target = data[idx]

# Encode the context the same way as the training inputs: sum of one-hot vectors
one_hot_rows = [one_hot_encode(w) for w in test_context]
context_vec = np.sum(one_hot_rows, axis=0)

# The model expects a 2-D batch, so the single vector is reshaped to (1, n)
single_batch = context_vec.reshape(1, -1)
pred = model.predict(single_batch)

# The most probable vocabulary index is mapped back to its word
best_index = np.argmax(pred)
pred_word = idx2word[best_index]

print(f"\nRandom example index: {idx}")
print("Context words:", test_context)
print("Actual target word:", actual_target)
print("Predicted target word:", pred_word)


[nltk_data] Downloading package punkt to C:\Users\Gayatri
[nltk_data]     Tagalpallewa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Total tokens: 177
Sample tokens: ['the', 'speed', 'of', 'transmission', 'is', 'an', 'important', 'point', 'of', 'difference', 'between', 'the', 'two', 'viruses', 'influenza']
Vocabulary size: 92
Total training pairs: 173
Example pair: (['in', 'contrast', 'we', 'are'], 'while')
Input shape: (173, 92)
Output shape: (173, 92)
Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 43ms/step - accuracy: 0.0280 - loss: 4.5005
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.0864 - loss: 4.2595
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1295 - loss: 3.9572
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1786 - loss: 3.5946
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1340 - loss: 3.3475
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - ac

In [2]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting click (from nltk)
  Downloading click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.11.3-cp310-cp310-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   --------------------------- ------------ 1.0/1.5 MB 5.0 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 6.2 MB/s  0:00:00
Downloading regex-2025.11.3-cp310-cp310-win_amd64.whl (277 kB)
Downloading click-8.3.0-py3-none-any.whl (107 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, click, nltk

   ---------------------------------------- 0/4 [tqdm]
   ---------------------------------------- 0/4 [tqdm]
   -----------------------------------

In [3]:
# a.Import libraries

import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical



### a. Data preparation

In [7]:
import nltk

# NLTK resources fetched once per machine. word_tokenize needs 'punkt' and,
# on recent NLTK releases (>= 3.8.2), 'punkt_tab'. The tagger and stop-word
# list are not used by the cells shown in this notebook; they are kept so
# that nothing relying on them elsewhere breaks.
# A loop (rather than a bare call as the last expression) also avoids echoing
# a stray `True` as the cell output.
for resource in ('punkt', 'punkt_tab', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(resource)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# Fetch the tokenizer data used by word_tokenize. Recent NLTK releases
# (>= 3.8.2) load the 'punkt_tab' resource instead of the pickled 'punkt'
# models, so both are requested; a call is a no-op when the package is
# already up to date.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Load the corpus and lower-case it. The encoding is stated explicitly so the
# result does not depend on the platform default (cp1252 on Windows can fail
# on a UTF-8 file). Undecodable bytes are replaced rather than raising: every
# non-letter character is stripped by the regex below anyway.
with open("CBOW.txt", "r", encoding="utf-8", errors="replace") as file:
    text = file.read().lower()

# Keep only ASCII letters and whitespace (drops punctuation, digits, symbols)
text = re.sub(r'[^a-zA-Z\s]', '', text)

# Split the cleaned text into word tokens
tokens = word_tokenize(text)

print("Total tokens:", len(tokens))
print("Sample tokens:", tokens[:15])

# Vocabulary = sorted unique tokens, so word indices are stable between runs
vocab = sorted(set(tokens))
vocab_size = len(vocab)
print("Vocabulary size:", vocab_size)

# Lookup tables in both directions: word -> index and index -> word
word2idx = {word: i for i, word in enumerate(vocab)}
idx2word = dict(enumerate(vocab))


Total tokens: 177
Sample tokens: ['the', 'speed', 'of', 'transmission', 'is', 'an', 'important', 'point', 'of', 'difference', 'between', 'the', 'two', 'viruses', 'influenza']
Vocabulary size: 92


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gaura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### b. Generate Training Data

In [9]:
window_size = 2  # number of context words taken on EACH side of the target
data = []

# Create (context, target) pairs. The context is the `window_size` tokens
# before the target followed by the `window_size` tokens after it, so changing
# `window_size` above really changes the context width. Positions closer than
# `window_size` to either end of the corpus are skipped because they lack a
# full window.
for i in range(window_size, len(tokens) - window_size):
    context = tokens[i - window_size:i] + tokens[i + 1:i + window_size + 1]
    target = tokens[i]
    data.append((context, target))

print("Total training pairs:", len(data))
# Show pair 100 when it exists; fall back to the last pair on a short corpus
# instead of raising IndexError.
if data:
    print("Example pair:", data[min(100, len(data) - 1)])


Total training pairs: 173
Example pair: (['in', 'contrast', 'we', 'are'], 'while')


In [10]:
# One-hot encoding of a single vocabulary word
def one_hot_encode(word):
    """Return a vector of length ``vocab_size`` that is 1 at the word's index and 0 elsewhere.

    The index is looked up in the module-level ``word2idx`` mapping, so a word
    that is not a key of that mapping raises ``KeyError``.
    """
    encoded = np.zeros(vocab_size)
    position = word2idx[word]
    encoded[position] = 1
    return encoded

# Build the training matrices from the (context, target) pairs:
#   X row = element-wise sum of the one-hot vectors of the context words
#   Y row = one-hot vector of the target word
X = np.array([
    np.sum([one_hot_encode(w) for w in context_words], axis=0)
    for context_words, _ in data
])
Y = np.array([one_hot_encode(target_word) for _, target_word in data])

print("Input shape:", X.shape)
print("Output shape:", Y.shape)


Input shape: (173, 92)
Output shape: (173, 92)


### c. Train Model

In [5]:
# Local imports keep this cell runnable on its own; both come from the
# tensorflow.keras package the notebook already depends on.
from tensorflow.keras.layers import Input
from tensorflow.keras.utils import set_random_seed

# Seed Python's `random`, NumPy and TensorFlow in one call so that weight
# initialisation and batch shuffling are repeatable after a kernel restart.
# NOTE: this also makes later `random.*` calls in the notebook deterministic.
set_random_seed(42)

# Bag-of-context-words input -> 32-unit hidden layer -> softmax over the vocabulary.
# The input width is declared with an explicit Input layer; passing
# `input_dim=` to the first Dense layer is deprecated in Keras 3 and emits a
# UserWarning.
model = Sequential()
model.add(Input(shape=(vocab_size,)))
model.add(Dense(32, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))

# Targets are one-hot rows, hence categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])

# `history.history` holds the per-epoch loss and accuracy
history = model.fit(X, Y, epochs=100, verbose=1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.0053 - loss: 4.5400   
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1000 - loss: 4.2934
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1548 - loss: 4.0003
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1193 - loss: 3.7172
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2047 - loss: 3.2640
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.2655 - loss: 2.9716
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3231 - loss: 2.7233
Epoch 8/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.4981 - loss: 2.4205
Epoch 9/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

### d. Output

In [12]:
# Draw one (context, target) pair at random and compare the model's
# prediction with the true target word
import random
idx = random.randint(0, len(data) - 1)
test_context, actual_target = data[idx]

# Encode the context the same way as the training inputs: sum of one-hot vectors
one_hot_rows = [one_hot_encode(w) for w in test_context]
context_vec = np.sum(one_hot_rows, axis=0)

# The model expects a 2-D batch, so the single vector is reshaped to (1, n)
single_batch = context_vec.reshape(1, -1)
pred = model.predict(single_batch)

# The most probable vocabulary index is mapped back to its word
best_index = np.argmax(pred)
pred_word = idx2word[best_index]

print(f"\nRandom example index: {idx}")
print("Context words:", test_context)
print("Actual target word:", actual_target)
print("Predicted target word:", pred_word)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step

Random example index: 117
Context words: ['to', 'symptom', 'at', 'present']
Actual target word: onset
Predicted target word: onset
