In [1]:
# Computing Word Embeddings: Continuous Bag-of-Words

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
EMBEDDING_DIM = 10
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for word, i in word_to_ix.items()}
data = []
for i in range(2, len(raw_text) - 2):
    context = [raw_text[i - 2], raw_text[i - 1],
               raw_text[i + 1], raw_text[i + 2]]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])


class CBOW(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(2 * context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out)  # If we just use CrossEntropyLoss we don't have to use log_softmax
        return log_probs

def make_context_vector(context, word_to_ix):
    idxs = [word_to_ix[w] for w in context]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)


make_context_vector(data[0][0], word_to_ix)  # example

# train
losses = []
loss_function = nn.NLLLoss()  # we need to use this since the last layer computes log_softmax.
model = CBOW(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)  # we could use Adam optimizer too

for epoch in range(10):
    total_loss = torch.Tensor([0])
    for context, target in data:
        # step 1: turn context into a vector wrapped in a Variable
        context_vector = make_context_vector(context, word_to_ix)
        
        # step 2: zero out gradients since pytorch accumulates them
        model.zero_grad()
        
        # step 3: forward pass
        log_probs = model(context_vector)
        
        # step 4: compute loss
        loss = loss_function(log_probs, autograd.Variable(torch.LongTensor([word_to_ix[target]])))
        
        # step 5: backward pass and update the gradients
        loss.backward()
        optimizer.step()
        
        total_loss += loss.data
    losses.append(total_loss)
    
print(losses)

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]
[
 229.9033
[torch.FloatTensor of size 1]
, 
 228.5025
[torch.FloatTensor of size 1]
, 
 227.1134
[torch.FloatTensor of size 1]
, 
 225.7334
[torch.FloatTensor of size 1]
, 
 224.3630
[torch.FloatTensor of size 1]
, 
 223.0003
[torch.FloatTensor of size 1]
, 
 221.6440
[torch.FloatTensor of size 1]
, 
 220.2961
[torch.FloatTensor of size 1]
, 
 218.9544
[torch.FloatTensor of size 1]
, 
 217.6197
[torch.FloatTensor of size 1]
]


In [3]:
outputs = model(make_context_vector(['The', 'evolution', 'a', 'process'], word_to_ix))
_, predicted = torch.max(outputs.data, 1)
print(ix_to_word[predicted[0]])

of


In [5]:
outputs = model(make_context_vector(['conjure', 'the', 'of', 'the'], word_to_ix))
_, predicted = torch.max(outputs.data, 1)
print(ix_to_word[predicted[0]])

inhabit
