### A very simple implementation of skip-gram in pytorch

see https://towardsdatascience.com/implementing-word2vec-in-pytorch-skip-gram-model-e6bae040d2fb



In [None]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.nn.functional as F

In [None]:
# Toy training corpus: one lower-case, whitespace-separated sentence per entry.
corpus = [
    # person sentences
    "he is a king",
    "she is a queen",
    "he is a man",
    "she is a woman",
    # capital sentences
    "warsaw is poland capital",
    "berlin is germany capital",
    "paris is france capital",
]

In [None]:
def tokenize_corpus(corpus):
    """Split every sentence of *corpus* into whitespace-separated tokens.

    Args:
        corpus: iterable of sentence strings.

    Returns:
        A list containing one list of token strings per sentence, in the
        same order as the input.
    """
    return [sentence.split() for sentence in corpus]

# Tokenize the toy corpus: one list of word tokens per sentence.
tokenized_corpus = tokenize_corpus(corpus)
# Bare expression: presumably shown as the cell output when run in a notebook.
tokenized_corpus

In [None]:
# Build the vocabulary: the distinct word types of the corpus, in order of
# first occurrence.
# dict.fromkeys de-duplicates while preserving insertion order, which avoids
# scanning a list with `token not in vocabulary` for every single token.
vocabulary = list(dict.fromkeys(
    token for sentence in tokenized_corpus for token in sentence
))

# Each word is identified by its position in `vocabulary`.
word2idx = {word: idx for idx, word in enumerate(vocabulary)}
idx2word = dict(enumerate(vocabulary))

vocabulary_size = len(vocabulary)

print(vocabulary,vocabulary_size)

In [None]:
# Context window radius: up to two words to the left and to the right of the
# center word.
window_size = 2

# Training pairs (center word index, context word index); the task is to
# predict the context word from the center word.
idx_pairs = []
for sentence in tokenized_corpus:
    # Map the words of the sentence to their vocabulary indices.
    indices = [word2idx[word] for word in sentence]
    sentence_len = len(indices)
    # Treat every word of the sentence as the center word once.
    for center_word_pos, center_word_idx in enumerate(indices):
        # Clamp the window so it never reaches outside the sentence.
        first = max(0, center_word_pos - window_size)
        last = min(sentence_len, center_word_pos + window_size + 1)
        for context_word_pos in range(first, last):
            # The center word is not its own context.
            if context_word_pos != center_word_pos:
                idx_pairs.append((center_word_idx, indices[context_word_pos]))

idx_pairs = np.array(idx_pairs)  # a NumPy array (one row per pair) is handy for the training loop
print(idx_pairs)

# Each row is a (center, context) pair of vocabulary indices, e.g.
# [[0 1]    -> ("he", "is")
#  [0 2]    -> ("he", "a")
#  ...]

In [None]:
# One-hot encoding: only the entry at the word's index is 1, all others are 0.

def get_input_layer(word_idx):
    """Return the one-hot input vector for a vocabulary index.

    Args:
        word_idx: index of the word in the vocabulary.

    Returns:
        A 1-D float32 tensor whose length is the module-level
        ``vocabulary_size``, with 1.0 at ``word_idx`` and 0.0 elsewhere.
    """
    one_hot = torch.zeros(vocabulary_size, dtype=torch.float32)
    one_hot[word_idx] = 1.0
    return one_hot

In [None]:
embedding_dims = 5

# W1 holds one embedding column per vocabulary word (center-word side);
# W2 turns an embedding into one score per vocabulary word (context side).
# Plain tensors created with requires_grad=True replace the deprecated
# torch.autograd.Variable wrapper.
W1 = torch.randn(embedding_dims, vocabulary_size, dtype=torch.float32, requires_grad=True)
W2 = torch.randn(vocabulary_size, embedding_dims, dtype=torch.float32, requires_grad=True)

num_epochs = 20
learning_rate = 0.001

# Debug aid: `i` counts the processed pairs. Change the `i == -1` test below
# to a non-negative pair number to print that pair's target index, loss and
# log-softmax output and to leave the inner loop.
i = 0
for epo in range(num_epochs):
    loss_val = 0

    # data: vocabulary index of the center word, target: index of the context word
    for data, target in idx_pairs:
        x = get_input_layer(data)  # one-hot vector of the center word
        y_true = torch.tensor([int(target)], dtype=torch.long)  # context word index as class label

        z1 = torch.matmul(W1, x)   # picks the embedding column of the center word
        z2 = torch.matmul(W2, z1)  # one score per vocabulary word

        # Log-probabilities over the vocabulary (their exponentials sum to 1).
        log_softmax = F.log_softmax(z2, dim=0)

        # Negative log-likelihood of the observed context word: nll_loss picks
        # the entry at position y_true; view(1, -1) adds the batch dimension.
        loss = F.nll_loss(log_softmax.view(1, -1), y_true, reduction='mean')

        if i == -1:
            print(y_true,"\t= \tindex\n",loss,"\t= \tloss\n",log_softmax.view(1,-1),"\t=\toutput softmax")
            break
        i += 1

        loss_val += loss.item()  # running sum, only needed for reporting
        loss.backward()

        # Plain SGD step. no_grad keeps the in-place updates out of the
        # autograd graph; the gradients are reset because backward() accumulates.
        with torch.no_grad():
            W1 -= learning_rate * W1.grad
            W2 -= learning_rate * W2.grad
            W1.grad.zero_()
            W2.grad.zero_()

    # Uncomment to monitor the average loss per pair during training:
    # if epo % 10 == 0:
    #     print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

### Explanation of the debug output (to print it, change the `i == -1` check in the training loop to e.g. `i == 1`)

 tensor([2]) 	= 	index
 
 tensor(4.8778, grad_fn=<NllLossBackward>) 	= 	loss: the negated log-softmax value at the position given by the index (2, see the first line)
 
 tensor([[ -4.6247,  -2.6985,  ******-4.8778*****,  -4.9672,  -6.2686,  -0.9419, -11.5144,
          -8.4150,  -3.2777,  -6.4923,  -2.1065,  -8.1870,  -1.2819,  -2.5667,
          -7.0781]], grad_fn=<ViewBackward>) 	=	output softmax
    
 Note: the values differ on every run because the weights are initialised randomly, but the relationship between index, loss and log-softmax output stays the same.

In [None]:
# Compare two word vectors via their cosine similarity
# (dot product divided by the product of the vector norms).

def similarity(v,u):
    """Return the cosine similarity of the 1-D tensors *v* and *u*.

    Args:
        v: first vector (1-D tensor).
        u: second vector (1-D tensor of the same length).

    Returns:
        A scalar tensor: dot(v, u) / (norm(v) * norm(u)).
    """
    dot_product = torch.dot(v, u)
    norm_product = torch.norm(v) * torch.norm(u)
    return dot_product / norm_product

# Cosine similarity between the W2 rows of "she" and "king".
s1=similarity(W2[word2idx["she"]], W2[word2idx["king"]]) 

# Same comparison for "she" and "queen".
s2=similarity(W2[word2idx["she"]], W2[word2idx["queen"]]) 

# Bare expression: presumably shown as the cell output when run in a notebook.
s1,s2

# Example output from one run:
# (tensor(-0.2058, grad_fn=<DivBackward0>),
# tensor(0.4774, grad_fn=<DivBackward0>))
#
# i.e. in that run "she" was more similar to "queen" than to "king".

# Note: the output is not stable across runs; the corpus is too small (a toy example).