<a href="https://colab.research.google.com/github/zyzhang1992/NLP/blob/master/word2vec_with_subword_information_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WORD2VEC SKIPGRAM

In [0]:
'''
IMPORTS
'''

import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F

In [0]:
torch.cuda.is_available()

True

In [0]:
'''
Defining Corpus
'''

corpus = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',   
]

In [0]:
'''
Tokenizing corpus
'''

def tokenize_corpus(corpus):
    tokens = [x.split() for x in corpus]
    return tokens

tokenized_corpus = tokenize_corpus(corpus)

In [0]:
'''
Generating Vocabulary
'''

vocabulary = []
for sentence in tokenized_corpus:
    for token in sentence:
        if token not in vocabulary:
            vocabulary.append(token)

word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

vocabulary_size = len(vocabulary)

In [0]:
vocabulary_size

15

In [0]:
window_size = 2
idx_pairs = []
# for each sentence
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    # for each word, threated as center word
    for center_word_pos in range(len(indices)):
        # for each window position
        for w in range(-window_size, window_size + 1):
            context_word_pos = center_word_pos + w
            # make soure not jump out sentence
            if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                continue
            context_word_idx = indices[context_word_pos]
            idx_pairs.append((indices[center_word_pos], context_word_idx))

idx_pairs = np.array(idx_pairs) # it will be useful to have this as numpy array

In [0]:
#ONE HOT ENCODING FOR INPUT

def get_input_layer(word_idx):
    x = torch.zeros(vocabulary_size).float()
    x[word_idx] = 1.0
    return x

In [0]:
embedding_dims = 8
W1 = Variable(torch.randn(embedding_dims, vocabulary_size).float(), requires_grad=True)
W2 = Variable(torch.randn(vocabulary_size, embedding_dims).float(), requires_grad=True)
num_epochs = 1000
learning_rate = 0.001

In [0]:
for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        
        x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())
       
        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)
    
        log_softmax = F.log_softmax(z2, dim=0)
        
        loss = F.nll_loss(log_softmax.view(1,-1), y_true)
        
        loss_val += loss.item()
        loss.backward()
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()
    if epo % 100 == 0:    
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 5.605964619772775
Loss at epo 100: 2.757629302569798
Loss at epo 200: 2.285395928791591
Loss at epo 300: 2.046659631388528
Loss at epo 400: 1.9045629620552063
Loss at epo 500: 1.8133672765323094
Loss at epo 600: 1.7528529235294887
Loss at epo 700: 1.7106595754623413
Loss at epo 800: 1.6800211565835135
Loss at epo 900: 1.657117143699101


In [0]:
idx2word

{0: 'he',
 1: 'is',
 2: 'a',
 3: 'king',
 4: 'she',
 5: 'queen',
 6: 'man',
 7: 'woman',
 8: 'warsaw',
 9: 'poland',
 10: 'capital',
 11: 'berlin',
 12: 'germany',
 13: 'paris',
 14: 'france'}

In [0]:
W2.data

tensor([[-0.4123, -0.0245,  1.8233,  1.3324,  1.2817, -0.3725,  1.6364, -0.9389],
        [-1.9488, -0.9129, -2.2870, -0.3702,  0.8873,  0.0932,  1.1944, -0.4622],
        [-0.3428, -1.5614, -0.4646, -0.9458, -0.3248, -0.5454, -0.1858, -2.3451],
        [ 0.5922,  0.0228,  0.9677,  0.6391,  0.0165, -0.2109,  1.2660,  1.0933],
        [-0.1302,  0.8932,  0.1873,  0.7610, -1.3264, -0.6507, -0.5616,  1.0430],
        [ 1.7140,  1.5295,  1.6724,  0.3624, -0.6883, -1.8532,  0.9596, -0.5718],
        [ 0.8975,  0.9169,  0.1767, -0.3585, -1.9290,  1.1728,  0.3248, -0.6751],
        [-0.3544,  0.5478, -0.5784, -0.0673, -0.9605, -0.3760, -0.3123, -0.0997],
        [-0.4670, -0.6457,  0.1371, -0.5279, -0.5776, -0.0161, -0.7660,  0.7703],
        [ 1.7737, -0.5718, -0.3229, -1.9442, -1.4522, -0.4140, -0.3360,  0.0713],
        [-0.2404, -0.7714, -0.3713,  0.2627,  0.1469,  0.1600, -0.5915,  1.5953],
        [-0.0424,  2.2461,  0.0638,  1.8172,  1.6007,  1.4632, -0.3946,  1.4902],
        [ 0.6722

In [0]:
def calculate_analogy(a,b,c,d):
  ind_a, ind_b, ind_c, ind_d = word2idx[a], word2idx[b], word2idx[c], word2idx[d]
  
  score_vector = (W2.data[ind_a] - W2.data[ind_b]) /  (W2.data[ind_c] - W2.data[ind_d])
  
  return score_vector

In [0]:
score = calculate_analogy('he', 'she', 'king', 'queen')
score

tensor([ 0.2515,  0.6091, -2.3216,  2.0647,  3.7002,  0.1694,  7.1732, -1.1903])

# WORD2VEC WITH SUBWORD INFORMATION

In [0]:
n = 2
n_gram_vocab = []

for word in vocabulary:
  word = '<' + word + '>'
  
  n_gram_bag = []
  
  for index in range(len(word)):
    if (index+n) <= len(word):
      char_string = ''
      for count in range(n):         
        char_string += (word[index+count])
      n_gram_bag.append(char_string)
  n_gram_bag.append(word)    
  n_gram_vocab.append(n_gram_bag)  

In [0]:
n_gram_vocab

[['<h', 'he', 'e>', '<he>'],
 ['<i', 'is', 's>', '<is>'],
 ['<a', 'a>', '<a>'],
 ['<k', 'ki', 'in', 'ng', 'g>', '<king>'],
 ['<s', 'sh', 'he', 'e>', '<she>'],
 ['<q', 'qu', 'ue', 'ee', 'en', 'n>', '<queen>'],
 ['<m', 'ma', 'an', 'n>', '<man>'],
 ['<w', 'wo', 'om', 'ma', 'an', 'n>', '<woman>'],
 ['<w', 'wa', 'ar', 'rs', 'sa', 'aw', 'w>', '<warsaw>'],
 ['<p', 'po', 'ol', 'la', 'an', 'nd', 'd>', '<poland>'],
 ['<c', 'ca', 'ap', 'pi', 'it', 'ta', 'al', 'l>', '<capital>'],
 ['<b', 'be', 'er', 'rl', 'li', 'in', 'n>', '<berlin>'],
 ['<g', 'ge', 'er', 'rm', 'ma', 'an', 'ny', 'y>', '<germany>'],
 ['<p', 'pa', 'ar', 'ri', 'is', 's>', '<paris>'],
 ['<f', 'fr', 'ra', 'an', 'nc', 'ce', 'e>', '<france>']]

In [0]:
char_seq_vocab = []

for word in n_gram_vocab:
  
  for char_seq in word:
    if char_seq not in char_seq_vocab:
      char_seq_vocab.append(char_seq)

In [0]:
char_seq_vocab_size = len(char_seq_vocab)

In [0]:
char_seq2idx = {w: idx for (idx, w) in enumerate(char_seq_vocab)}
idx2char_seq = {idx: w for (idx, w) in enumerate(char_seq_vocab)}

In [0]:
#ONE HOT ENCODING FOR INPUT

def get_input_layer_char_seq(char_seq_idx):
    x = torch.zeros(char_seq_vocab_size).float()
    x[char_seq_idx] = 1.0
    return x

In [0]:
embedding_dims = 8
W1 = Variable(torch.randn(embedding_dims, char_seq_vocab_size).float(), requires_grad=True)
W2 = Variable(torch.randn(char_seq_vocab_size, embedding_dims).float(), requires_grad=True)
num_epochs = 1000
learning_rate = 0.001

In [0]:
for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        Z1 = 0.0
        for i in range(len(n_gram_vocab[data])):
          char_seq_vec = Variable(get_input_layer_char_seq(i)).float()
          Z1 += torch.matmul(W1, char_seq_vec)
        #x = Variable(get_input_layer(data)).float()
        y_true = Variable(torch.from_numpy(np.array([target])).long())
       
        #z1 = torch.matmul(W1, x)
        Z2 = torch.matmul(W2, Z1)
    
        log_softmax = F.log_softmax(Z2, dim=0)
        
        loss_2 = F.nll_loss(log_softmax.view(1,-1), y_true)
        
        loss_val += loss_2.item()
        loss_2.backward(retain_graph=True)
        W1.data -= learning_rate * W1.grad.data
        W2.data -= learning_rate * W2.grad.data

        W1.grad.data.zero_()
        W2.grad.data.zero_()
    if epo % 100 == 0:    
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

Loss at epo 0: 12.607089247022357
Loss at epo 100: 2.8115211997713363
Loss at epo 200: 2.3070071220397947
Loss at epo 300: 2.079699775150844
Loss at epo 400: 1.9827991894313268
Loss at epo 500: 1.9414680276598248
Loss at epo 600: 1.9159873689923967
Loss at epo 700: 1.8986946037837438
Loss at epo 800: 1.8863659926823206
Loss at epo 900: 1.8772497790200369


In [0]:
W2.data[:5]

tensor([[ 0.4758,  0.0456,  0.4351,  1.5379,  0.0725, -0.0205,  1.5503, -0.7806],
        [ 2.2069, -0.4227,  0.4275, -0.2044, -0.2550,  1.9400,  0.3378, -1.2663],
        [ 2.4431,  0.0764,  2.4822, -1.0189, -1.1074, -1.7477,  0.1908, -0.4770],
        [ 0.8356,  1.3642,  1.3098,  1.6873,  0.2735, -0.8131,  0.3572,  0.1301],
        [ 2.1187, -0.1521,  1.0664,  1.4335,  3.2855, -1.0207,  0.1699,  0.1405]])

In [0]:
def calculate_word_vector(word):
  index = word2idx[word]
  for i in range(len(n_gram_vocab[index])):
    word_vector = 0.0
    char_seq_ind = char_seq2idx[n_gram_vocab[index][i]]
    word_vector += W2.data[char_seq_ind]
    
    return word_vector

In [0]:
def calculate_analogy_2(a,b,c,d):
  
  score_vector = (calculate_word_vector(a) - calculate_word_vector(b)) /  (calculate_word_vector(c) - calculate_word_vector(d))
  return score_vector

In [0]:
score = calculate_analogy_2('he', 'she', 'king', 'queen')
score

tensor([-2.0503, -2.8577, -0.0325,  2.3783, -0.1267,  0.8438,  7.8896,  1.1950])