In [6]:
import nltk
import numpy as np
from nltk.corpus import stopwords
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Lambda, Dense, GlobalAveragePooling1D, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K

In [7]:
# Download the NLTK data packages used by this notebook, in a fixed order.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Keep the English stop words in a set so the membership test used when
# filtering tokens is a constant-time lookup.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to C:\Users\UTKARSH
[nltk_data]     BRAHMANKAR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\UTKARSH
[nltk_data]     BRAHMANKAR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\UTKARSH
[nltk_data]     BRAHMANKAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Read the whole corpus and lower-case it so the vocabulary is case-insensitive.
with open("CBOW.txt", "r", encoding='utf-8') as file:
    text = file.read().lower()

# Tokenise, then keep only purely alphabetic tokens that are not stop words.
tokens = nltk.word_tokenize(text)
tokens = [t for t in tokens if t.isalpha() and t not in stop_words]

# Build the vocabulary; every token is handed to the tokenizer as its own text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokens)
word2idx = tokenizer.word_index

# Reverse lookup (index -> word). `idx2word` is the canonical name;
# `indx2word` is kept as an alias for code that uses the original spelling.
idx2word = {v: k for k, v in word2idx.items()}
indx2word = idx2word

# One extra slot on top of the indexed words: index 0 never appears in
# `word2idx` and is the value `pad_sequences` pads with.
vocab_size = len(word2idx) + 1

In [11]:
def generate_cbow_data(tokens, window_size, vocab=None):
    """Build (context, target) training pairs for a CBOW model.

    For every position that has `window_size` tokens on both sides, the
    context is the list of vocabulary indices of the surrounding tokens
    (left to right, centre excluded) and the target is the index of the
    centre token.

    Args:
        tokens: Sequence of token strings.
        window_size: Number of context tokens taken on each side.
        vocab: Optional mapping from token to integer index. When omitted,
            the module-level `word2idx` mapping is used, as before.

    Returns:
        A list of `(context_indices, target_index)` tuples. It is empty when
        `tokens` has no position with a full window on both sides.

    Raises:
        ValueError: If `window_size` is negative.
        KeyError: If a token is missing from the vocabulary mapping.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")
    if vocab is None:
        # Backward-compatible default: the notebook-level vocabulary.
        vocab = word2idx

    # Relative positions of the context tokens around the centre token.
    offsets = [j for j in range(-window_size, window_size + 1) if j != 0]

    data = []
    for i in range(window_size, len(tokens) - window_size):
        context = [vocab[tokens[i + j]] for j in offsets]
        target = vocab[tokens[i]]
        data.append((context, target))
    return data

window_size = 2
data = generate_cbow_data(tokens, window_size)

# Separate the (context, target) pairs into two parallel lists.
contexts = [context for context, _ in data]
targets = [target for _, target in data]

# Model inputs: one row of 2 * window_size context indices per sample.
X = pad_sequences(contexts, maxlen=2 * window_size, padding='pre')
# Model outputs: one-hot rows over the vocabulary.
y = to_categorical(targets, num_classes=vocab_size)

print(f"Training samples: {X.shape[0]}")


Training samples: 82


In [12]:
embedding_dim = 100

# CBOW network: embed each context index, average the context embeddings,
# then predict the centre word with a softmax over the vocabulary.
model = Sequential([
    # Explicit input spec replaces the deprecated `input_length` argument of
    # Embedding and builds the model before the first call to fit().
    Input(shape=(2 * window_size,), dtype='int32'),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    # Built-in mean over the context axis; same computation as the previous
    # Lambda(K.mean(x, axis=1)) without an anonymous function in the model.
    GlobalAveragePooling1D(),
    Dense(vocab_size, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])



In [13]:
# Train on the full set of (context, target) pairs.
model.fit(
    X,
    y,
    epochs=100,
    batch_size=50,
    verbose=1,
)


Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.0229 - loss: 4.0774 
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.0444 - loss: 4.0681
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0688 - loss: 4.0598
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1590 - loss: 4.0518
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.2589 - loss: 4.0433
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.3343 - loss: 4.0344
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.3654 - loss: 4.0268
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.4904 - loss: 4.0178
Epoch 9/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x2344b4b5780>

In [16]:
# The first weight array of the first layer is the embedding matrix.
embeddings = model.layers[0].get_weights()[0]

# Show the leading ten components for the first ten vocabulary words.
for word in list(word2idx)[:10]:
    print(f"{word}:{embeddings[word2idx[word]][:10]}")

influenza:[-0.01027951 -0.30078283 -0.15246384 -0.03800527 -0.07958699 -0.06314165
 -0.00567844 -0.10172644  0.25050893  0.17349605]
virus:[ 0.03767065 -0.00107381  0.06819395 -0.07838093  0.2625371   0.05990122
 -0.07577944  0.00734517 -0.4451578   0.16453314]
transmission:[ 0.19549832  0.14031906  0.12673391 -0.16028608  0.13717674  0.06604031
  0.26629412 -0.31679893  0.05012747  0.02016014]
serial:[ 0.13859345 -0.16365542  0.36201993 -0.16518399  0.25095743  0.05008023
  0.3840966  -0.3262652  -0.31572375  0.4027784 ]
interval:[ 0.0698977  -0.2868635   0.47709492 -0.141394    0.3624475  -0.4449574
 -0.24338785 -0.10716271 -0.07822799  0.44981706]
days:[ 0.25452623 -0.14476193  0.3112194  -0.16852649  0.13043624  0.00870123
  0.03084818 -0.18850586  0.05373946  0.26630917]
viruses:[ 0.25412437 -0.24210656  0.03196465 -0.04723653 -0.05466776  0.23759036
  0.21340668 -0.28646255  0.27838844  0.0494804 ]
shorter:[ 0.21812955 -0.14889562 -0.01437315  0.15325686  0.2030873  -0.0882653
  

In [17]:
def predict_target_word(context_words, tokenizer, model, window_size=2):
    """Predict the centre word for a list of context words.

    Args:
        context_words: Iterable of context word strings. Words that are not
            in `tokenizer.word_index` are ignored.
        tokenizer: Fitted tokenizer exposing `word_index` (word -> index) and
            `index_word` (index -> word).
        model: Trained model whose `predict` returns one probability row
            over the vocabulary per input row.
        window_size: Context words per side; the model input is
            `2 * window_size` wide.

    Returns:
        The predicted word, or None when no context word is known or the
        predicted index has no word (e.g. the padding index 0).
    """
    vocab = tokenizer.word_index
    context_indices = [vocab[w] for w in context_words if w in vocab]
    if not context_indices:
        # Nothing to condition on: an all-padding input would only yield a
        # meaningless prediction.
        return None

    # Left-pad (and, for over-long input, truncate) to the model's width.
    X_test = pad_sequences([context_indices], maxlen=2 * window_size, padding='pre')
    pred_probs = model.predict(X_test, verbose=0)
    target_idx = int(np.argmax(pred_probs[0]))
    # Use the tokenizer's own reverse map instead of a notebook-level global.
    return tokenizer.index_word.get(target_idx)