#Imports

In [None]:
import torch
from torch import nn

#Data and Pre-Processing:

In [13]:
#1.Corpus and Word Processing:
corpous="the quick brown fox jumped over the lazy dog"
words=corpous.split()
vocab=set(words)
word2inx={w:i for i , w in enumerate(vocab)}
indx2word={i:w for w,i in word2inx.items()}
Vocab_size=len(vocab)

In [12]:
#2.Skip-gram pairs:
def generate_pairs(text, window_size=2):
    pairs = []
    for i, word in enumerate(text):
        for j in range(max(0, i - window_size), min(len(text), i + window_size + 1)):
            if i != j:
                pairs.append((word, text[j]))
    return pairs

pairs = generate_pairs(words, window_size=2)
pairs_idx = [(word2inx[t], word2inx[c]) for t, c in pairs]

#Word2Vec Model:

In [21]:
#3.Model:
from ctypes import BigEndianStructure
class Word2Vec_Model(nn.Module):
  def __init__(self,vocab_size,embed_size):
    super().__init__()
    self.embedding=nn.Embedding(num_embeddings=vocab_size,embedding_dim=embed_size)
    self.weight=nn.Linear(in_features=embed_size,out_features=vocab_size,bias=False)

  def forward(self,x):
    x=self.embedding(x)
    logits=self.weight(x)
    return logits

model=Word2Vec_Model(vocab_size=Vocab_size,embed_size=10)

###Loss Function and Optimizer:

In [29]:
#4.Loss function and optimizer:
loss_fn=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(params=model.parameters(),
                           lr=0.02)

#Training loop:

In [34]:
epochs=500
for epoch in range(epochs):
  total_loss=0
  for target,context in pairs_idx:
    target=torch.tensor(data=target,dtype=torch.long)
    context=torch.tensor(data=context,dtype=torch.long)

    logits=model(target)
    loss=loss_fn(logits,context)
    total_loss+=loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


  if (epoch%100==0):
    print(f"Epoch: {epoch} and loss : {total_loss:.4f}")

Epoch: 0 and loss : 42.2553
Epoch: 100 and loss : 42.3063
Epoch: 200 and loss : 42.2900
Epoch: 300 and loss : 42.1870
Epoch: 400 and loss : 42.2290


#Testing Loop:

In [41]:
model.eval()
with torch.inference_mode():
  for word in vocab:
    target=torch.tensor([word2inx[word]],dtype=torch.long)
    embedding=model.embedding.weight[word2inx[word]].numpy()

    logits=model(target)
    probs=torch.softmax(logits,dim=1)
    top_idx=torch.argmax(probs,dim=1).item()
    predicted_word=indx2word[top_idx]

    print(f"Word : {word}")
    print(f"Embeddings : {probs}")
    print(f"Predicted word: {predicted_word}\n")


Word : fox
Embeddings : tensor([[0.0011, 0.2575, 0.2453, 0.2614, 0.0004, 0.2317, 0.0011, 0.0016]])
Predicted word: over

Word : jumped
Embeddings : tensor([[2.4897e-01, 3.2452e-04, 2.4295e-01, 2.5161e-01, 2.5414e-01, 4.2492e-04,
         1.4436e-04, 1.4426e-03]])
Predicted word: the

Word : brown
Embeddings : tensor([[0.2498, 0.2576, 0.0004, 0.0011, 0.2354, 0.2536, 0.0015, 0.0007]])
Predicted word: jumped

Word : over
Embeddings : tensor([[2.4319e-01, 2.5174e-01, 1.4718e-03, 2.4187e-05, 2.5265e-01, 1.7330e-03,
         2.4888e-01, 3.0818e-04]])
Predicted word: the

Word : the
Embeddings : tensor([[0.0019, 0.1700, 0.1510, 0.1698, 0.0029, 0.1509, 0.1775, 0.1761]])
Predicted word: lazy

Word : quick
Embeddings : tensor([[3.3963e-01, 7.4940e-05, 3.3603e-01, 2.1535e-03, 3.2115e-01, 1.8873e-06,
         9.2807e-04, 3.3287e-05]])
Predicted word: fox

Word : lazy
Embeddings : tensor([[1.9867e-03, 1.0247e-04, 1.8728e-03, 3.3425e-01, 3.3980e-01, 1.0723e-03,
         3.3935e-04, 3.2058e-01]])
Pre