<a href="https://colab.research.google.com/github/yucong-guo/cs224/blob/main/lec1_2%20word2vec/skip_gram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn

In [2]:
# Toy training corpus. Tokens are produced by plain whitespace splitting, so
# case is preserved and punctuation stays attached to the neighbouring word
# (e.g. "process." and "process" are distinct tokens).
corpus = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells."""
raw_text = corpus.split()

In [3]:
# Deduplicate the corpus tokens to obtain the vocabulary.
vocab = set(raw_text)
n_vocab = len(vocab)

# The iteration order of a set of strings can differ from one interpreter
# session to the next (string hashing is randomised), so the words are indexed
# in sorted order. This keeps the word <-> index assignment identical across
# kernel restarts.
ix_to_word = dict(enumerate(sorted(vocab)))
word_to_ix = {word: ix for ix, word in ix_to_word.items()}

# Build the training pairs: each entry is (centre word, surrounding words),
# where the surrounding words are the `window` tokens on either side of the
# centre. Positions closer than `window` to either end of the corpus are
# skipped so that every pair has exactly 2 * window surrounding words.
window = 2
data = [
    (raw_text[i], raw_text[i - window:i] + raw_text[i + 1:i + window + 1])
    for i in range(window, len(raw_text) - window)
]
data[:3]

[('about', ['We', 'are', 'to', 'study']),
 ('to', ['are', 'about', 'study', 'the']),
 ('study', ['about', 'to', 'the', 'idea'])]

In [26]:
def make_target_vector(context, word_to_ix):
    """Look up each word's index and pack the indices into a ``torch.long`` tensor.

    Args:
        context: iterable of words; every word must be a key of ``word_to_ix``.
        word_to_ix: mapping from word to its index.

    Returns:
        A tensor of dtype ``torch.long`` holding one index per input word, in
        the same order as ``context``.

    Raises:
        KeyError: if a word is not present in ``word_to_ix``.
    """
    indices = []
    for word in context:
        indices.append(word_to_ix[word])
    return torch.tensor(indices, dtype=torch.long)

In [48]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
n_embed = 100  # dimensionality of the learned word embeddings


class SkipGram(nn.Module):
    """Skip-gram model: word index -> log-probabilities over the vocabulary.

    The input index is embedded, projected back to vocabulary size by a linear
    layer, and normalised with a log-softmax.

    Args:
        n_vocab: number of distinct words (rows of the embedding table and
            width of the output layer).
        n_embed: size of each embedding vector.
    """

    def __init__(self, n_vocab, n_embed):
        super().__init__()

        self.embed = nn.Embedding(n_vocab, n_embed)
        self.output = nn.Linear(n_embed, n_vocab)
        # Normalise over the last axis, which is always the vocabulary axis
        # after the linear layer. A fixed dim=1 is only correct for 2-D input:
        # it fails on a scalar index and picks the wrong axis for (B, k) input.
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        '''
        x: integer tensor of word indices, any shape (e.g. (1,) for one word).

        Returns log-probabilities of shape x.shape + (n_vocab,).
        '''
        embedded = self.embed(x)         # (..., n_embed)
        scores = self.output(embedded)   # (..., n_vocab)
        log_ps = self.log_softmax(scores)  # (..., n_vocab)

        return log_ps

In [49]:

# Seed the global RNG so the randomly initialised weights (and therefore the
# trained model) are the same on every run of this cell.
torch.manual_seed(0)
model = SkipGram(n_vocab, n_embed)

loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# TRAINING
# Full-batch gradient descent: the loss of every (centre word, surrounding
# word) pair is summed over the whole corpus and a single optimizer step is
# taken per epoch.
for epoch in range(50):
    total_loss = 0

    for context, target in data:
        target_vector = make_target_vector(target, word_to_ix)  # (len(target),)

        # Log-probabilities for the centre word: one row over the vocabulary.
        log_probs = model(torch.tensor([word_to_ix[context]]))

        # `log_probs` has a single row, so each surrounding-word index is
        # scored separately as a length-1 target. A distinct loop name avoids
        # rebinding the outer loop's `target`.
        for target_ix in target_vector:
            total_loss += loss_function(log_probs, target_ix.unsqueeze(0))

    # optimize at the end of each epoch
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

In [67]:
context = 'People'
top_k = 3  # how many of the most likely surrounding words to show

# Inference only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    context_log_probs = model(torch.tensor([word_to_ix[context]]))

#Print result
print(f'Context: {context}\n')
# Row 0 is the only row (a single query word); take its highest-scoring indices.
top_indices = torch.topk(context_log_probs[0], top_k).indices
prediction = [ix_to_word[ix] for ix in top_indices.tolist()]
print(f'Prediction: {prediction}')

Context: People

Prediction: ['a', 'create', 'program.']
