## Imports

In [8]:
import torch
import torch.nn.functional as F
import sys
from pathlib import Path

## Data loading

This is a small dataset of questions, which makes it a good choice for Word2Vec.

In [2]:
# Read the Quora file into a list with one entry per line; the trailing
# newline of each line is kept. The context manager closes the file handle
# as soon as the lines have been read instead of leaving it open.
with open("../data/quora.txt", encoding="utf-8") as quora_file:
    data = list(quora_file)

# Show one sample line as the cell output.
data[50]

"What TV shows or books help you read people's body language?\n"

## Example of training and using my word2vec model

In [10]:
# Make the parent of the current working directory importable so that the
# local ``word2vec`` module can be found.
parent_folder = Path().resolve().parent
if str(parent_folder) not in sys.path:
    # Guarded so that re-running this cell does not keep appending the same
    # entry to sys.path.
    sys.path.append(str(parent_folder))

from word2vec import (
    Word2VecCBOW,
    Word2VecCBOWModel,
    Word2VecSkipGram,
    Word2VecSkipGramModel,
)

### CBOW

In [11]:
# Create the CBOW word2vec object.
cbow = Word2VecCBOW()

# Set the data: this step stores the text, tokenizes it, and
# creates the word_to_index and index_to_word dictionaries.
cbow.set_text_before_context_pairs(data)

# Set the context groups and the model: this step creates groups of the form
# [center_word, context_word_1, ..., context_word_{window_radius * 2}]
# and sets up the model for training.
cbow.set_context_groups_and_model()

# Subsampling probabilities: this step calculates, for each word, the
# probability of it being dropped from training. The formula from the
# paper is used (presumably Mikolov et al., 2013 -- TODO confirm):
#
#   P(w_i) = 1 - sqrt(t / f(w_i))
#
# where t is the threshold and f(w_i) is the frequency of w_i in the dataset.
cbow.subsampling_probabilities()

# Negative-sampling probabilities: this step calculates, for each word, the
# probability of it being drawn as a negative sample. The formula from the
# paper is used:
#
#   P(w_i) = (f(w_i) / sum(f(w_j))) ^ (3/4) / Z
#
# where f(w_i) is the frequency of w_i in the dataset and Z is the
# normalization constant.
cbow.negative_sampling_probabilities()  

In [None]:
# Train the CBOW model with mini-batches of 128 examples.
# NOTE(review): negative_number=15 is presumably the number of negative samples
# drawn per example, and steps=10001 (rather than 10000) presumably lets the
# final, 10000th step be logged as well -- confirm against word2vec.train_model.
cbow.train_model(steps=10001, batch_size=128, negative_number=15)

  0%|          | 4/10001 [00:00<10:03, 16.58it/s]

Step 0, Loss: 11.091211318969727, learning rate: [0.001]


 10%|█         | 1003/10001 [00:51<07:48, 19.21it/s]

Step 1000, Loss: 3.510890483856201, learning rate: [0.001]


 20%|██        | 2004/10001 [01:44<06:44, 19.79it/s]

Step 2000, Loss: 3.0968456268310547, learning rate: [0.001]


 30%|███       | 3004/10001 [02:37<05:57, 19.57it/s]

Step 3000, Loss: 3.0199851989746094, learning rate: [0.001]


 40%|████      | 4004/10001 [03:30<05:26, 18.36it/s]

Step 4000, Loss: 2.773721694946289, learning rate: [0.001]


 40%|████      | 4040/10001 [03:31<05:03, 19.62it/s]

The loss is no longer decreasing quickly, but that is normal: if you compare training for 2,000 steps with training for 10,000 steps, the difference will still be huge.

In [None]:
# Iterator over the parameters of the underlying CBOW network.
_model_parameters = cbow.model.parameters()

# Pull the first two parameter tensors off the iterator and detach them from
# the autograd graph. Assumption: the first matrix holds the center-word
# embeddings and the second one holds the context-word embeddings.
embedding_matrix_center, embedding_matrix_context = (
    next(_model_parameters).detach() for _ in range(2)
)

In [None]:
def find_nearest(words, k=10, w2v=None, embedding_matrix=None):
    """Return the ``k`` vocabulary words whose embeddings are closest to ``words``.

    Closeness is the cosine similarity between the query embedding and every
    row of the center-word embedding matrix.

    Args:
        words: List of query words, passed to
            ``w2v.get_center_embeddings_by_words``. The similarity call
            broadcasts the query against the whole matrix, so this is meant
            to be a one-element list such as ``["python"]``.
        k: Number of neighbours to return (default 10). Values larger than the
            vocabulary are clamped to the vocabulary size; ``k <= 0`` gives ``[]``.
        w2v: Trained word2vec object to query. Defaults to the notebook-level
            ``cbow`` model.
        embedding_matrix: Optional ``(vocab, dim)`` tensor to search in. By
            default it is read from ``w2v`` itself, so the matrix can never
            belong to a different model than the one being queried.

    Returns:
        A list of ``(word, cosine_similarity)`` tuples ordered from least to
        most similar, i.e. the best match (normally the query itself) is last.
    """
    if w2v is None:
        w2v = cbow
    if embedding_matrix is None:
        # Same assumption the notebook makes when it extracts the matrices:
        # the first parameter of the network is the center-word embedding matrix.
        embedding_matrix = next(w2v.model.parameters()).detach()

    # Pure inference: no autograd graph is needed for the lookup.
    with torch.no_grad():
        word_vector = w2v.get_center_embeddings_by_words(words)
        similarities = F.cosine_similarity(embedding_matrix, word_vector)

    k = max(0, min(int(k), similarities.numel()))
    ranked = torch.argsort(similarities)  # ascending similarity
    # tolist() yields plain Python ints and, unlike .numpy(), also works when
    # the tensors live on a GPU.
    top_indices = ranked[ranked.numel() - k:].tolist()
    return [(w2v.index_to_word[i], similarities[i].item()) for i in top_indices]
# Sanity check: show the ten words closest to "python", least similar first.
find_nearest(["python"])

[('hadoop', 0.7932560443878174),
 ('spring', 0.7990931868553162),
 ('design', 0.8000043034553528),
 ('photoshop', 0.8032585978507996),
 ('algorithms', 0.8079955577850342),
 ('seo', 0.8176975250244141),
 ('php', 0.8207674026489258),
 ('c', 0.8241156339645386),
 ('java', 0.8498246669769287),
 ('python', 0.9999999403953552)]

### Skip-Gram

In [None]:
# Create the Skip-Gram word2vec object.
skipgram = Word2VecSkipGram()

# Set the data: this step stores the text, tokenizes it, and
# creates the word_to_index and index_to_word dictionaries.
skipgram.set_text_before_context_pairs(data)

# Set the context groups and the model: this step creates groups of the form
# [center_word, context_word_1, ..., context_word_{window_radius * 2}]
# and sets up the model for training.
skipgram.set_context_groups_and_model()

# Subsampling probabilities: this step calculates, for each word, the
# probability of it being dropped from training. The formula from the
# paper is used (presumably Mikolov et al., 2013 -- TODO confirm):
#
#   P(w_i) = 1 - sqrt(t / f(w_i))
#
# where t is the threshold and f(w_i) is the frequency of w_i in the dataset.
skipgram.subsampling_probabilities()

# Negative-sampling probabilities: this step calculates, for each word, the
# probability of it being drawn as a negative sample. The formula from the
# paper is used:
#
#   P(w_i) = (f(w_i) / sum(f(w_j))) ^ (3/4) / Z
#
# where f(w_i) is the frequency of w_i in the dataset and Z is the
# normalization constant.
skipgram.negative_sampling_probabilities()  

In [None]:
# Train the Skip-Gram model with mini-batches of 128 examples.
# NOTE(review): negative_number=15 is presumably the number of negative samples
# drawn per example, and steps=10001 (rather than 10000) presumably lets the
# final, 10000th step be logged as well -- confirm against word2vec.train_model.
skipgram.train_model(steps=10001, batch_size=128, negative_number=15)

In [None]:
# Iterator over the parameters of the underlying Skip-Gram network.
_model_parameters = skipgram.model.parameters()

# Pull the first two parameter tensors off the iterator and detach them from
# the autograd graph. Assumption: the first matrix holds the center-word
# embeddings and the second one holds the context-word embeddings.
embedding_matrix_center, embedding_matrix_context = (
    next(_model_parameters).detach() for _ in range(2)
)

In [None]:
def find_nearest(words, k=10, w2v=None, embedding_matrix=None):
    """Return the ``k`` vocabulary words whose embeddings are closest to ``words``.

    Closeness is the cosine similarity between the query embedding and every
    row of the center-word embedding matrix.

    Args:
        words: List of query words, passed to
            ``w2v.get_center_embeddings_by_words``. The similarity call
            broadcasts the query against the whole matrix, so this is meant
            to be a one-element list such as ``["python"]``.
        k: Number of neighbours to return (default 10). Values larger than the
            vocabulary are clamped to the vocabulary size; ``k <= 0`` gives ``[]``.
        w2v: Trained word2vec object to query. Defaults to the notebook-level
            ``skipgram`` model.
        embedding_matrix: Optional ``(vocab, dim)`` tensor to search in. By
            default it is read from ``w2v`` itself, so the matrix can never
            belong to a different model than the one being queried.

    Returns:
        A list of ``(word, cosine_similarity)`` tuples ordered from least to
        most similar, i.e. the best match (normally the query itself) is last.
    """
    if w2v is None:
        w2v = skipgram
    if embedding_matrix is None:
        # Same assumption the notebook makes when it extracts the matrices:
        # the first parameter of the network is the center-word embedding matrix.
        embedding_matrix = next(w2v.model.parameters()).detach()

    # Pure inference: no autograd graph is needed for the lookup.
    with torch.no_grad():
        word_vector = w2v.get_center_embeddings_by_words(words)
        similarities = F.cosine_similarity(embedding_matrix, word_vector)

    k = max(0, min(int(k), similarities.numel()))
    ranked = torch.argsort(similarities)  # ascending similarity
    # tolist() yields plain Python ints and, unlike .numpy(), also works when
    # the tensors live on a GPU.
    top_indices = ranked[ranked.numel() - k:].tolist()
    return [(w2v.index_to_word[i], similarities[i].item()) for i in top_indices]
# Sanity check: show the ten words closest to "python", least similar first.
find_nearest(["python"])