# CBOW
Continous Bag Of Words(CBOW) is a word embedding technique under the Word2Vec method\
Here we use a shallow neural netword to create our word embeddings\
We feed in the surrounding words to the NN to predict the target word.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_size):
        super(CBOW, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, vocab_size)

    def forward(self, context):
        context_embedding = self.embeddings(context.unsqueeze(0)).sum(dim=1)
        res = self.linear(context_embedding)
        return res

In [14]:
window_size = 2
doc = [
    "i am henry",
    "i like college",
    "do henry like college",
    "i am do i like college",
    "i do like henry",
    "do i like henry",
]
raw_text = " ".join(doc)
tokens = raw_text.split()
vocab = set(tokens)
vocab_size = len(vocab)

In [None]:
data = []
word_index = {word: i for i, word in enumerate(vocab)}

for i in range(2, len(tokens) - 2):
    context = [
        word_index[word]
        for word in tokens[i - window_size : i] + tokens[i + 1 : i + window_size + 1]
    ]
    target_word = word_index[tokens[i]]
    data.append((torch.tensor(context), torch.tensor(target_word)))

print(data)

[(tensor([1, 0, 1, 3]), tensor(2)), (tensor([0, 2, 3, 5]), tensor(1)), (tensor([2, 1, 5, 4]), tensor(3)), (tensor([1, 3, 4, 2]), tensor(5)), (tensor([3, 5, 2, 3]), tensor(4)), (tensor([5, 4, 3, 5]), tensor(2)), (tensor([4, 2, 5, 1]), tensor(3)), (tensor([2, 3, 1, 0]), tensor(5)), (tensor([3, 5, 0, 4]), tensor(1)), (tensor([5, 1, 4, 1]), tensor(0)), (tensor([1, 0, 1, 3]), tensor(4)), (tensor([0, 4, 3, 5]), tensor(1)), (tensor([4, 1, 5, 1]), tensor(3)), (tensor([1, 3, 1, 4]), tensor(5)), (tensor([3, 5, 4, 3]), tensor(1)), (tensor([5, 1, 3, 2]), tensor(4)), (tensor([1, 4, 2, 4]), tensor(3)), (tensor([4, 3, 4, 1]), tensor(2)), (tensor([3, 2, 1, 3]), tensor(4)), (tensor([2, 4, 3, 2]), tensor(1))]


In [16]:
embed_size = 10
learning_rate = 0.01
epochs = 1000

model = CBOW(vocab_size, embed_size)
lossfn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [None]:
for epoch in range(epochs):
    total_loss = 0
    for context, target in data:
        optimizer.zero_grad()
        output = model(context)
        loss = lossfn(output, target.unsqueeze(0))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 50 == 0:
        print(epoch, total_loss)

0 43.41829997301102
50 10.840636879205704
100 7.696862827986479
150 6.366553973406553
200 5.675506556406617
250 5.270045618060976
300 5.008826375938952
350 4.827488952549174
400 4.693799571483396
450 4.590332630323246
500 4.507075492525473
550 4.437947267317213
600 4.379104012215976
650 4.327999471192015
700 4.28289133211365
750 4.242555095930584
800 4.20609219041944
850 4.1728397329716245
900 4.142292355696554
950 4.11404877804307


In [18]:
word_to_lookup = "henry"
wi = word_index[word_to_lookup]
embedding = model.embeddings(torch.tensor([wi]))
print(f"Embedding for '{word_to_lookup}': {embedding.detach().numpy()}")

Embedding for 'henry': [[-0.8072041   0.8175361  -0.68911165 -0.32924142 -0.23396243  0.68101513
  -2.9420059  -0.21836732  0.3715383  -0.33744484]]
