In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be opened through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os # Import for file handling
import re

import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed, to_categorical

**DOWNLOAD THESE NLTK RESOURCES**

In [None]:
# Download the 'punkt' and 'punkt_tab' NLTK packages.
# NOTE(review): presumably these are the tokenizer resources `word_tokenize` needs at
# runtime - confirm against the installed NLTK version.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# --- Model Parameters ---
# Central place for the settings the rest of the notebook reads.
# TEXT_FILE_PATH is the training corpus; it is opened as UTF-8 text by preprocess_text.
TEXT_FILE_PATH = '/content/drive/MyDrive/colab datasets/LP4_datasets/cbow.txt' # <-- CHANGE THIS if your file name is different
WINDOW_SIZE = 2                # Number of context words to consider on each side
EMBEDDING_DIM = 100            # Dimension of the final word vector
EPOCHS = 50                    # Number of training epochs (Increase for better results)

# **a. Data preparation**

In [None]:
def preprocess_text(file_path):
    """Read a text corpus and turn it into tokens plus vocabulary lookups.

    The text is lower-cased, reduced to the letters a-z and whitespace, and
    tokenized with NLTK's ``word_tokenize``. The vocabulary is the sorted set
    of distinct tokens, so the word/index assignment is deterministic for a
    given corpus.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A 4-tuple ``(tokens, vocab_size, word_to_index, index_to_word)``:
        the token list in corpus order, the number of distinct tokens, and
        the two dictionaries mapping word -> index and index -> word.
    """
    # 1. Read the text from the file
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # 2. Clean and Tokenize
    text = text.lower()
    # Drop apostrophes outright so contractions stay a single token ("don't" -> "dont").
    text = re.sub(r"['’]", '', text)
    # Replace every other non-letter with a space instead of deleting it: deleting
    # glues together words separated only by punctuation or digits
    # ("short-term" -> "shortterm", "end.Start" -> "endstart").
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)

    # 3. Build Vocabulary (sorted for a stable index assignment)
    vocabulary = sorted(set(tokens))

    # Create mappings
    word_to_index = {word: i for i, word in enumerate(vocabulary)}
    index_to_word = {i: word for i, word in enumerate(vocabulary)}

    vocab_size = len(vocabulary)
    print(f"Total vocabulary size: {vocab_size} unique words.")

    return tokens, vocab_size, word_to_index, index_to_word

In [None]:
# Load and process the data
# Runs preprocessing once on the configured corpus; the token list, vocabulary size and
# both lookup dictionaries become notebook-level names used by later cells.
tokens, VOCAB_SIZE, word_to_index, index_to_word = preprocess_text(TEXT_FILE_PATH)

Total vocabulary size: 92 unique words.


# **b. Generate training data (Context-Target Pairs)**

In [None]:
def generate_cbow_data(tokens, word_to_index, vocab_size, window_size):
    """Build the CBOW training matrices from a token sequence.

    Every token position that has at least one neighbour within
    ``window_size`` positions (to the left or right) yields one sample. The
    input row is the sum of the one-hot vectors of those neighbours, so a word
    occurring twice in the window counts twice; the label row is the one-hot
    vector of the token at that position.

    Args:
        tokens: Sequence of word tokens, in corpus order.
        word_to_index: Mapping from each token to its vocabulary index.
        vocab_size: Width of the generated rows.
        window_size: Maximum distance of a context word from the target.

    Returns:
        ``(X_cbow, Y_cbow)``: two float32 arrays of shape
        ``(n_samples, vocab_size)`` holding context rows and target rows.
    """
    # Translate the whole corpus to indices once; windows are then plain slices.
    token_ids = [word_to_index[token] for token in tokens]
    reach = max(window_size, 0)

    samples = []
    for position, target_id in enumerate(token_ids):
        left = token_ids[max(0, position - reach):position]
        right = token_ids[position + 1:position + 1 + reach]
        neighbour_ids = left + right
        # A position without any neighbour produces no training sample.
        if neighbour_ids:
            samples.append((neighbour_ids, target_id))

    n_samples = len(samples)
    X_cbow = np.zeros((n_samples, vocab_size), dtype='float32')
    Y_cbow = np.zeros((n_samples, vocab_size), dtype='float32')

    for row, (neighbour_ids, target_id) in enumerate(samples):
        # np.add.at accumulates repeated indices, so duplicated context words add up.
        np.add.at(X_cbow[row], neighbour_ids, 1)
        Y_cbow[row, target_id] = 1

    print(f"Total training samples generated: {n_samples}")
    print(f"Final Input Shape (X): {X_cbow.shape}")
    print(f"Final Output Shape (Y): {Y_cbow.shape}")

    return X_cbow, Y_cbow

In [None]:
# Generate the data
# X_cbow receives the summed one-hot context rows and Y_cbow the one-hot target rows.

X_cbow, Y_cbow = generate_cbow_data(tokens, word_to_index, VOCAB_SIZE, WINDOW_SIZE)

Total training samples generated: 177
Final Input Shape (X): (177, 92)
Final Output Shape (Y): (177, 92)


# **c. Create & Train the Model**

In [None]:
print("\nDefining CBOW Model Architecture...")

# Seed Python, NumPy and TensorFlow before any weights are created so the initial
# weights - and with them the learned embeddings - are repeatable across runs.
SEED = 42
set_random_seed(SEED)

model = Sequential([
    # Input: summed one-hot context vector (size: VOCAB_SIZE). Declared as an explicit
    # Input object because passing `input_shape=` to the first layer makes Keras emit
    # a UserWarning.
    Input(shape=(VOCAB_SIZE,)),

    # Projection layer: a linear Dense layer whose weights are later read out as the
    # word embeddings (looked up by its name, 'Embedding_Projection').
    Dense(EMBEDDING_DIM, activation='linear', name='Embedding_Projection'),

    # Output Layer: Predicts the target word (size: VOCAB_SIZE)
    Dense(VOCAB_SIZE, activation='softmax', name='Output_Softmax')
])


Defining CBOW Model Architecture...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Configure training: Adam optimizer with a 0.01 learning rate, categorical
# cross-entropy loss (the targets in Y_cbow are one-hot rows), accuracy as the metric.
model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Display the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Training the model
# Fits on the full (X_cbow, Y_cbow) set. No validation data is passed, so the accuracy
# reported per epoch is measured on the training samples themselves.
print(f"\nStarting CBOW model training for {EPOCHS} epochs...")
model.fit(
    X_cbow, Y_cbow,
    epochs=EPOCHS,
    verbose=1
)
print("CBOW model training complete.")


Starting CBOW model training for 50 epochs...
Epoch 1/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.0197 - loss: 4.5098  
Epoch 2/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3891 - loss: 3.6930 
Epoch 3/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.4900 - loss: 2.9448 
Epoch 4/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5458 - loss: 2.2115
Epoch 5/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6931 - loss: 1.5979
Epoch 6/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.8315 - loss: 1.1115
Epoch 7/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9308 - loss: 0.6786 
Epoch 8/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9851 - loss: 0.4294 
Epoch 9/50
[1m

# **d. Output (Extracting and Analysing Embeddings)**

In [None]:
# The word embeddings are the weights of the 'Embedding_Projection' layer.
# get_weights()[0] takes the first weight array of that layer.
# NOTE(review): presumably this is the kernel with one row per vocabulary word and
# EMBEDDING_DIM columns (the bias being a separate array) - confirm.
word_embeddings = model.get_layer('Embedding_Projection').get_weights()[0]

print(f"Extracted Embedding Matrix Shape: {word_embeddings.shape}")

Extracted Embedding Matrix Shape: (92, 100)


In [None]:
def predict_target_word(context_words, model, word_to_index, index_to_word, vocab_size):
    """Predict the most likely target word for a bag of context words.

    The context is encoded the same way as the training inputs: a single row of
    width ``vocab_size`` holding the summed one-hot vectors of the context
    words. Words are looked up exactly first and, failing that, lower-cased,
    because the vocabulary produced by ``preprocess_text`` is entirely
    lower-case. Words still unknown are reported and skipped.

    Args:
        context_words: Iterable of context words.
        model: Object with a Keras-style ``predict(batch, verbose=...)`` method
            returning one row of scores per input row.
        word_to_index: Mapping from word to vocabulary index.
        index_to_word: Mapping from vocabulary index to word.
        vocab_size: Width of the input row.

    Returns:
        ``(predicted_word, predicted_probability)``: the highest-scoring word
        and the score the model assigned to it.
    """
    # 1. Convert context words to indices and then to a summed one-hot vector
    context_vector = np.zeros((1, vocab_size), dtype='float32')
    known_words = 0
    for word in context_words:
        lookup = word
        # Fall back to the lower-cased form so "Period" matches the vocabulary's "period".
        if lookup not in word_to_index and isinstance(word, str):
            lookup = word.lower()

        if lookup in word_to_index:
            context_vector[0, word_to_index[lookup]] += 1
            known_words += 1
        else:
            print(f"Warning: Context word '{word}' not in vocabulary. Skipping.")

    if known_words == 0:
        # An all-zero input still yields a prediction, but it carries no context information.
        print("Warning: No context word is in the vocabulary; the prediction uses an empty context.")

    # 2. Use the model to predict the probability distribution of the target word
    predictions = model.predict(context_vector, verbose=0)[0] # Get the first (and only) sample's predictions

    # 3. Get the index of the word with the highest probability (plain int for dict lookup)
    predicted_index = int(np.argmax(predictions))

    # 4. Convert the index back to a word
    predicted_word = index_to_word[predicted_index]

    # Score the model assigned to the predicted word
    predicted_probability = predictions[predicted_index]

    return predicted_word, predicted_probability

In [None]:
# Example usage:

example_context = ['shorter', 'incubation', 'period'] # Context words for which the model should predict the target word

# Returns the highest-scoring vocabulary word and the score the model gave it.
predicted_word, probability = predict_target_word(
    example_context, model, word_to_index, index_to_word, VOCAB_SIZE
)

In [None]:
# Report the example context together with the predicted word; the probability is
# formatted to four decimal places.
print(f"\nGiven the context words: {example_context}")
print(f"Predicted target word: '{predicted_word}' with probability {probability:.4f}")


Given the context words: ['shorter', 'incubation', 'period']
Predicted target word: 'median' with probability 0.8954
