In [2]:
%matplotlib inline

In [1]:
# Author: Robert Guthrie

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's default random number generator with a fixed value so the
# randomly initialised layer weights created later start from the same draw.
torch.manual_seed(1)

<torch._C.Generator at 0x295b461c970>

# Exercise: Computing Word Embeddings: Continuous Bag-of-Words

The Continuous Bag-of-Words model (CBOW) is frequently used in NLP deep
learning. It is a model that tries to predict words given the context of
a few words before and a few words after the target word. This is
distinct from language modeling, since CBOW is not sequential and does
not have to be probabilistic. Typically, CBOW is used to quickly train
word embeddings, and these embeddings are used to initialize the
embeddings of some more complicated model. Usually, this is referred to
as *pretraining embeddings*. It almost always helps performance a couple
of percent.

The CBOW model is as follows. Given a target word $w_i$ and a context
window of $N$ words on each side, $w_{i-1}, \dots, w_{i-N}$
and $w_{i+1}, \dots, w_{i+N}$, referring to all context words
collectively as $C$, CBOW tries to minimize

\begin{align}-\log p(w_i | C) = -\log \text{Softmax}(A(\sum_{w \in C} q_w) + b)\end{align}

where $q_w$ is the embedding for word $w$.

Implement this model in PyTorch by filling in the class below. Some
tips:

* Think about which parameters you need to define.
* Make sure you know what shape each operation expects. Use .view() if you need to
  reshape.




In [8]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# Deduplicate the tokens with a set, then sort them so that every run assigns
# the same index to the same word. Iterating a bare set of strings can yield a
# different order in each interpreter session (string hash randomisation),
# which would make the word indices -- and hence the seeded training run --
# non-reproducible.
vocab = sorted(set(raw_text))
vocab_size = len(vocab)

# Word -> integer index lookup used to build the model inputs and targets.
word_to_ix = {word: i for i, word in enumerate(vocab)}

# Build (context, target) training pairs: for every position that has a full
# window on both sides, the context is the CONTEXT_SIZE words before it
# followed by the CONTEXT_SIZE words after it (the target itself is excluded).
data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = (raw_text[i - CONTEXT_SIZE:i]
               + raw_text[i + 1:i + CONTEXT_SIZE + 1])
    target = raw_text[i]
    data.append((context, target))


class CBOW(nn.Module):
    """Continuous Bag-of-Words classifier with one hidden layer.

    The embeddings of all context words are summed into a single vector,
    passed through ``Linear -> ReLU -> Linear`` and normalised with
    ``LogSoftmax``, giving one log-probability per vocabulary word.

    Args:
        vocab_size: Number of distinct words (rows of the embedding table and
            width of the output layer).
        embedding_dim: Size of each word embedding.
        hidden_dim: Width of the hidden layer. Defaults to 128.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim=128):
        super().__init__()
        # Lookup table of shape (vocab_size, embedding_dim).
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Hidden projection: (batch, embedding_dim) -> (batch, hidden_dim).
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.act1 = nn.ReLU()

        # Output projection: (batch, hidden_dim) -> (batch, vocab_size), one
        # score per vocabulary word, turned into log-probabilities along dim 1.
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        self.act2 = nn.LogSoftmax(dim=1)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary.

        Args:
            inputs: LongTensor of word indices, either a single context of
                shape ``(context_len,)`` or a batch of contexts of shape
                ``(batch, context_len)``.

        Returns:
            Tensor of shape ``(1, vocab_size)`` for a single context, or
            ``(batch, vocab_size)`` for a batch.
        """
        # Sum over the context axis (second to last after the embedding
        # lookup) instead of iterating the tensor with the Python built-in
        # ``sum``, which only reduces the first axis.
        summed = self.embeddings(inputs).sum(dim=-2)
        # Always hand a 2-D (batch, embedding_dim) tensor to the linear
        # layers; a single context becomes a batch of one.
        summed = summed.reshape(-1, summed.size(-1))

        hidden = self.act1(self.linear1(summed))
        return self.act2(self.linear2(hidden))


# Create your model and train it. Here are some functions to help you make
# the data ready for use by your module.

def make_context_vector(context, word_to_ix):
    """Convert a sequence of context words into a 1-D tensor of indices.

    Args:
        context: Iterable of words; each one must be a key of ``word_to_ix``.
        word_to_ix: Mapping from word to integer index.

    Returns:
        A ``torch.long`` tensor holding the index of each word, in order.

    Raises:
        KeyError: If a word is missing from ``word_to_ix``.
    """
    indices = []
    for word in context:
        indices.append(word_to_ix[word])
    return torch.tensor(indices, dtype=torch.long)

In [10]:
# Train on the GPU when one is available, otherwise fall back to the CPU so
# this cell also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Summed training loss of each epoch, in order.
losses = []
# The model ends in LogSoftmax, so NLLLoss on its output minimises the
# negative log-probability assigned to the target word.
loss_function = nn.NLLLoss()
model = CBOW(vocab_size, embedding_dim=5).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

for epoch in range(100):
    total_loss = 0
    for context, target in data:
        # Vector representation of the context words (their vocabulary
        # indices), moved to the same device as the model.
        context_vector = make_context_vector(context, word_to_ix).to(device)

        log_probs = model(context_vector)

        # Build the single-element target directly on the training device.
        target_vector = torch.tensor([word_to_ix[target]],
                                     dtype=torch.long, device=device)

        loss = loss_function(log_probs, target_vector)

        # Clear gradients left over from the previous step before
        # back-propagating this example's loss.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)


In [3]:


# Per-epoch summed training loss for this CPU run.
losses = []
# Negative log-likelihood loss, applied to the log-probabilities the model
# returns.
loss_function = nn.NLLLoss()
model = CBOW(vocab_size, embedding_dim=5)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(100):
    total_loss = 0
    for context, target in data:
        # Start every step from zeroed gradients.
        optimizer.zero_grad()

        # Inputs: indices of the context words; label: index of the target.
        context_vector = make_context_vector(context, word_to_ix)
        target_vector = torch.tensor([word_to_ix[target]], dtype=torch.long)

        # Forward pass and loss for this single example.
        log_probs = model(context_vector)
        loss = loss_function(log_probs, target_vector)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    losses.append(total_loss)

In [10]:
#TESTING

# Invert the lookup table used for training instead of enumerating `vocab`
# a second time, so the index -> word mapping always mirrors `word_to_ix`.
ix_to_word = {ix: word for word, ix in word_to_ix.items()}
context = ['As', 'they' ,'processes', 'manipulate']
# Place the input on whichever device the model's parameters live on, so the
# cell works after either the CPU or the GPU training cell.
model_device = next(model.parameters()).device
context_vector = make_context_vector(context, word_to_ix).to(model_device)
# Inference only: no gradients are needed for a prediction.
with torch.no_grad():
    a = model(context_vector)

#Print result
print(f'Raw text: {" ".join(raw_text)}\n')
print(f'Context: {context}\n')
print(f'Prediction: {ix_to_word[torch.argmax(a[0]).item()]}')

Raw text: We are about to study the idea of a computational process. Computational processes are abstract beings that inhabit computers. As they evolve, processes manipulate other abstract things called data. The evolution of a process is directed by a pattern of rules called a program. People create programs to direct processes. In effect, we conjure the spirits of the computer with our spells.

Context: ['As', 'they', 'processes', 'manipulate']

Prediction: evolve,
