In [1]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F
import torch.nn as nn

In [2]:
# Toy training corpus: one lowercase, space-separated sentence per entry.
corpus = [
    # "<pronoun> is a <noun>" sentences
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    # "<city> is <country> capital" sentences
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
    # "<noun> is a young <noun>" sentences
    'boy is a young man',
    'girl is a young woman',
]

In [3]:
def tokenize_corpus(corpus):
    """Split every sentence of ``corpus`` on whitespace.

    Returns a list holding one list of word tokens per input sentence, in
    the same order as the input.
    """
    return [sentence.split() for sentence in corpus]

# Tokenized view of the corpus: one list of word strings per sentence.
tokenized_corpus = tokenize_corpus(corpus)

In [4]:
# Vocabulary: every distinct token, in order of first appearance.
# dict.fromkeys de-duplicates while preserving insertion order, so each
# membership check is a hash lookup rather than a linear scan over a list.
vocabulary = list(dict.fromkeys(
    token for sentence in tokenized_corpus for token in sentence
))

# Lookup tables between a word and its integer id (its position in `vocabulary`).
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = dict(enumerate(vocabulary))

vocabulary_size = len(vocabulary)
print("vocabulary size:", vocabulary_size)
for i in range(vocabulary_size):
    print(i, idx2word[i])

vocabulary size: 18
0 he
1 is
2 a
3 king
4 she
5 queen
6 man
7 woman
8 warsaw
9 poland
10 capital
11 berlin
12 germany
13 paris
14 france
15 boy
16 young
17 girl


In [5]:
window_size = 2  # number of neighbours considered on each side of the center word

# Collect (center word index, context word index) pairs for every sentence.
idx_pairs = []
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    sentence_len = len(indices)
    # Treat each word in turn as the center word.
    for center_word_pos, center_word_idx in enumerate(indices):
        # Clamp the window to the sentence so it never runs past either end.
        first_pos = max(0, center_word_pos - window_size)
        stop_pos = min(sentence_len, center_word_pos + window_size + 1)
        for context_word_pos in range(first_pos, stop_pos):
            # The center word is not its own context.
            if context_word_pos != center_word_pos:
                idx_pairs.append((center_word_idx, indices[context_word_pos]))

# A 2-column numpy array (center, context) is convenient for the training loop.
idx_pairs = np.array(idx_pairs)

# Show every training pair as word(index) for a visual sanity check.
for data, target in idx_pairs:
    print(f'{idx2word[data]}({data})  {idx2word[target]}({target})')

he(0)  is(1)
he(0)  a(2)
is(1)  he(0)
is(1)  a(2)
is(1)  king(3)
a(2)  he(0)
a(2)  is(1)
a(2)  king(3)
king(3)  is(1)
king(3)  a(2)
she(4)  is(1)
she(4)  a(2)
is(1)  she(4)
is(1)  a(2)
is(1)  queen(5)
a(2)  she(4)
a(2)  is(1)
a(2)  queen(5)
queen(5)  is(1)
queen(5)  a(2)
he(0)  is(1)
he(0)  a(2)
is(1)  he(0)
is(1)  a(2)
is(1)  man(6)
a(2)  he(0)
a(2)  is(1)
a(2)  man(6)
man(6)  is(1)
man(6)  a(2)
she(4)  is(1)
she(4)  a(2)
is(1)  she(4)
is(1)  a(2)
is(1)  woman(7)
a(2)  she(4)
a(2)  is(1)
a(2)  woman(7)
woman(7)  is(1)
woman(7)  a(2)
warsaw(8)  is(1)
warsaw(8)  poland(9)
is(1)  warsaw(8)
is(1)  poland(9)
is(1)  capital(10)
poland(9)  warsaw(8)
poland(9)  is(1)
poland(9)  capital(10)
capital(10)  is(1)
capital(10)  poland(9)
berlin(11)  is(1)
berlin(11)  germany(12)
is(1)  berlin(11)
is(1)  germany(12)
is(1)  capital(10)
germany(12)  berlin(11)
germany(12)  is(1)
germany(12)  capital(10)
capital(10)  is(1)
capital(10)  germany(12)
paris(13)  is(1)
paris(13)  france(14)
is(1)  paris(13)


In [9]:
def get_input_layer(word_idx, vocab_size=None):
    """Return the one-hot float vector for ``word_idx``.

    Args:
        word_idx: Integer position that is set to 1.0.
        vocab_size: Length of the returned vector. When omitted, the
            module-level ``vocabulary_size`` is used, so existing
            one-argument calls keep working.

    Returns:
        A 1-D ``torch.float32`` tensor of length ``vocab_size`` that is 0.0
        everywhere except for a 1.0 at ``word_idx``.
    """
    if vocab_size is None:
        vocab_size = vocabulary_size
    x = torch.zeros(vocab_size, dtype=torch.float32)
    x[word_idx] = 1.0
    return x

# Example: one-hot encoding of word index 1.
print(get_input_layer(1))

tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


In [13]:
## Define the NN architecture
# Width of the hidden layer: the output size of model1.fc1 and the input size of model1.fc3.
embedding_dims = 50
#embedding_dims_1 = 100

class model1(nn.Module):
    """Two-layer linear network mapping a one-hot word vector to log-probabilities.

    ``fc1`` projects the vocabulary-sized input down to the embedding size and
    ``fc3`` projects it back up to vocabulary size; no non-linearity is
    applied between the two layers.

    Args:
        vocab_size: Size of the input and output layers. Defaults to the
            module-level ``vocabulary_size``.
        embedding_dim: Size of the hidden layer. Defaults to the module-level
            ``embedding_dims``.
    """

    def __init__(self, vocab_size=None, embedding_dim=None):
        super().__init__()
        # Fall back to the notebook-level settings so ``model1()`` still works.
        if vocab_size is None:
            vocab_size = vocabulary_size
        if embedding_dim is None:
            embedding_dim = embedding_dims
        self.fc1 = nn.Linear(vocab_size, embedding_dim)
        self.fc3 = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        """Return log-probabilities over the vocabulary for ``x``.

        ``x`` may be a single vector of shape ``(vocab_size,)`` or a batch of
        shape ``(batch, vocab_size)``.
        """
        x = self.fc1(x)
        x = self.fc3(x)
        # Normalize over the last (vocabulary) axis. For a 1-D input this is
        # identical to dim=0; for a batch, dim=0 would normalize across the
        # samples instead of across the words.
        return F.log_softmax(x, dim=-1)

# Instantiate the network and show its layers.
model = model1()
print(model)


model1(
  (fc1): Linear(in_features=18, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=18, bias=True)
)


In [11]:
# specify optimizer: plain SGD over all of the model's parameters, learning rate 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

In [12]:
n = 300  # number of passes (epochs) over all training pairs

for i in range(n):
    losses = 0.0  # running sum of the per-pair losses in this epoch
    for data, target in idx_pairs:
        optimizer.zero_grad()

        # Plain tensors take part in autograd directly; the deprecated
        # torch.autograd.Variable wrapper is not needed. get_input_layer
        # already returns a float tensor, so no extra cast is required.
        x = get_input_layer(data)
        y_true = torch.tensor([int(target)], dtype=torch.long)

        y_pred = model(x)

        # nll_loss expects a (batch, classes) input, so add a batch axis of 1.
        loss = F.nll_loss(y_pred.view(1, -1), y_true)
        losses += loss.item()
        loss.backward()

        optimizer.step()

    # Report the mean loss per training pair every 10th epoch.
    if i % 10 == 0:
        print(f'Loss at epo {i}: {losses/len(idx_pairs)}')
    
    

Loss at epo 0: 2.7743921644833622
Loss at epo 10: 2.285799517315261
Loss at epo 20: 2.170347728291336
Loss at epo 30: 2.086288149867739
Loss at epo 40: 2.019194165662843
Loss at epo 50: 1.9624687925893434
Loss at epo 60: 1.9143527557655258
Loss at epo 70: 1.8741928892476218
Loss at epo 80: 1.8412507717706719
Loss at epo 90: 1.8147608461428661
Loss at epo 100: 1.793970336719435
Loss at epo 110: 1.7780424873439633
Loss at epo 120: 1.7660598602830146
Loss at epo 130: 1.7571258739549287
Loss at epo 140: 1.7504638549016447
Loss at epo 150: 1.7454607955047063
Loss at epo 160: 1.7416613509460372
Loss at epo 170: 1.7387381323746272
Loss at epo 180: 1.7364587108699643
Loss at epo 190: 1.734657745580284
Loss at epo 200: 1.7332169924463545
Loss at epo 210: 1.7320508123660574
Loss at epo 220: 1.7310963759616929
Loss at epo 230: 1.730307190393915
Loss at epo 240: 1.7296482713855044
Loss at epo 250: 1.7290930814889012
Loss at epo 260: 1.728621278490339
Loss at epo 270: 1.7282170574275815
Loss at epo