In [1]:
# Install all the required dependencies for the project
!pip install pytorch-lightning==1.6.5 spacy==2.2.4

You should consider upgrading via the '/Users/vitalii.mishchenko/Documents/experiments/2302-nlp-course/venv/bin/python -m pip install --upgrade pip' command.[0m


# Build own Word2Vec model

The model could return vector representation (embeddings) for a given word.

## Minimal example

The goal is to provide the word to model and receive vector representation (embeddings).

We are using `torch.nn.Embedding` neural model that creates embedded vector representation for each word.

We are not training `torch.nn.Embedding` model in this example. In practice, it means that resulted embeddings does NOT reflect word semantic.

In [21]:
import torch

# our vocabulary: list of unique words with assigned indices
word_to_ix = {"hello": 0, "world": 1}

# initialize Embedding neural model
embeds = torch.nn.Embedding(2, 5)  # 2 words in vocab, 5 dimensional embeddings

# get word ID
lookup_tensor = torch.tensor([word_to_ix["hello"]], dtype=torch.long)

# get embeddings for the word
hello_embed = embeds(lookup_tensor)
hello_embed

tensor([[ 1.2295,  0.6809, -0.2180,  2.0122, -0.3287]],
       grad_fn=<EmbeddingBackward0>)

## Complete word embeddings training

**Overview**
The goal is to provide the word to the model and receive back word vector representation (embeddings).

This embeddings should reflect the word semantic.

Two semantically similar words should be closer to each other in vector space.

From ML perspective `embeddins` are the features that model will try to learn.

**Model training**
How are we going to train the model: we will train the model to predict the nearby word for the given word

- `torch.nn.Embedding` neural model is going to store word embeddings
- it creates embedded vector representation for each word
- we will train the model to predict the nearby word for the given word
- to train the model we will
  - create a pairs for each word - {current_word, nearby word}
  - `window_size` param defines how many nearby words we need to consider
  - train the model to predict the nearby word for the given word

That's what we will measure during ML training.

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter

# The Word2VecDataset class takes in a list of sentences and a window size as input.
# It creates a list of word pairs within the given window size for each word in each sentence.
class Word2VecDataset(Dataset):
  def __init__(self, sentences, window_size):
    self.window_size = window_size
    self.words = []
    for sentence in sentences:
      self.words.extend(sentence)
    self.word_counts = Counter(self.words)
    self.word_list = list(self.word_counts.keys())
    self.word2idx = {w:i for i,w in enumerate(self.word_list)}
    self.idx2word = {i:w for i,w in enumerate(self.word_list)}
    # creates a list of word pairs within the given window size for each word in each sentence
    # e.g. for sentence "I am here" and window size 2 it will build
    # (I, am), (I, here), (am, I), (am, here), (here, I), (here, am)
    self.data = []
    for sentence in sentences:
      for i,word in enumerate(sentence):
        current_range = range(max(0, i-self.window_size), min(len(sentence), i+self.window_size+1))
        for j in current_range:
          if i != j:
            current_word_idx = self.word2idx[word]
            nearby_word_idx = self.word2idx[sentence[j]]
            # word pair (current_word, nearby_word_inx)
            word_tuple = (current_word_idx, nearby_word_idx)
            self.data.append(word_tuple)

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    return self.data[idx]


# The Word2Vec model consists of an embedding layer and a linear layer.
# The embedding layer maps each word index to a vector of size embedding_dim.
# The linear layer maps the embedded vectors back to the original vocabulary size.
class Word2Vec(nn.Module):
  def __init__(self, vocab_size, embedding_dim):
    super().__init__()
    # embedding layer maps each word index to a vector of size embedding_dim
    self.embeddings = nn.Embedding(vocab_size, embedding_dim)
    # linear layer maps the embedded vectors back to the original vocabulary size
    self.linear1 = nn.Linear(embedding_dim, vocab_size)

  # word_idx - word id
  # if batch_size > 1 then word_idx is list of word indexes
  def forward(self, word_idx):
    # return word embeddings
    embedded = self.embeddings(word_idx)
    # tries to predict nearby word for the given word_idx
    # return probabilities for each word in vocabulary
    output = self.linear1(embedded)
    return output


sentences = [['this', 'is', 'a', 'test'], ['another', 'test', 'sentence']]
dataset = Word2VecDataset(sentences, window_size=1)
dataloader = DataLoader(dataset, batch_size=2, shuffle=False)

model = Word2Vec(len(dataset.word_list), embedding_dim=10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(100):
  running_loss = 0.0

  # data is one "word_tuple" in tensor from the Word2VecDataset
  # e.g. ("I", "am") but with indexes instead of words
  for i, data in enumerate(dataloader):
    # inputs - word_idx, or list of word_idx when batch_size > 1
    # labels - nearby_word_idx that model should predict, or list of nearby_word_idx when batch_size > 1
    inputs, labels = data
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs.view(-1, len(dataset.word_list)), labels)
    loss.backward()
    optimizer.step()
    running_loss += loss.item()
  print(f"Epoch {epoch+1} loss: {running_loss/len(dataset)}")

# model usage
word = 'test'
word_index = dataset.word2idx[word]
embedding = model.embeddings(torch.LongTensor([word_index]))
print(f"Embedding for '{word}': {embedding.tolist()}")

Epoch 1 loss: 0.9059527277946472
Epoch 2 loss: 0.8947277784347534
Epoch 3 loss: 0.8839037656784058
Epoch 4 loss: 0.8734484910964966
Epoch 5 loss: 0.8633340835571289
Epoch 6 loss: 0.8535361886024475
Epoch 7 loss: 0.8440334916114807
Epoch 8 loss: 0.8348073959350586
Epoch 9 loss: 0.8258415579795837
Epoch 10 loss: 0.8171213388442993
Epoch 11 loss: 0.8086339354515075
Epoch 12 loss: 0.8003679752349854
Epoch 13 loss: 0.7923131585121155
Epoch 14 loss: 0.7844602584838867
Epoch 15 loss: 0.7768010020256042
Epoch 16 loss: 0.7693277835845947
Epoch 17 loss: 0.7620337009429932
Epoch 18 loss: 0.754912543296814
Epoch 19 loss: 0.7479583621025085
Epoch 20 loss: 0.7411656260490418
Epoch 21 loss: 0.7345293521881103
Epoch 22 loss: 0.7280447363853455
Epoch 23 loss: 0.7217071652412415
Epoch 24 loss: 0.7155123770236969
Epoch 25 loss: 0.7094561934471131
Epoch 26 loss: 0.7035346508026123
Epoch 27 loss: 0.6977439880371094
Epoch 28 loss: 0.6920804262161255
Epoch 29 loss: 0.686540400981903
Epoch 30 loss: 0.68112038