In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.nn.functional as F

In [67]:
class SkipGramModel(torch.nn.Module):
    def __init__(self, vocab_size, embed_size, num_sampled=64):
        super(SkipGramModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.proj_matrix = autograd.Variable(torch.FloatTensor(vocab_size, embed_size).normal_())
        self.num_sampled = num_sampled
        
    def forward(self, center_words, target_words, is_correct):
        logits = self.proj_matrix[target_words] * self.embedding(center_words)


In [68]:
vocab_size = 50000
embed_size = 128
model = SkipGramModel(vocab_size, embed_size)

In [5]:
import nltk

tokens = []
token_counter = nltk.Counter()

with open('text8') as f:
    for line in f:
        tokens += line.split()
        token_counter.update(tokens)

In [6]:
token_to_id = {token_tuple[0]: i for i, token_tuple in enumerate(token_counter.most_common(vocab_size))}

In [7]:
token_ids = []
for token in tokens:
    if token in token_to_id:
        token_ids.append(token_to_id[token])
len(token_ids)

16586825

In [8]:
import random

def generate_sample(index_words, context_window_size):
    """ Form training pairs according to the skip-gram model. """
    for index, center in enumerate(index_words):
        context = random.randint(1, context_window_size)
        # get a random target before the center word
        for target in index_words[max(0, index - context): index]:
            yield center, target
        # get a random target after the center wrod
        for target in index_words[index + 1: index + context + 1]:
            yield center, target

In [9]:
def get_batch(iterator, batch_size):
    """ Group a numerical stream into batches and yield them as Numpy arrays. """
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros(batch_size, dtype=np.int32)
        for index in range(batch_size):
            center_batch[index], target_batch[index] = next(iterator)
        yield center_batch, target_batch

In [10]:
skip_window = 1
batch_size = 128

single_gen = generate_sample(token_ids, skip_window)
batch_gen = get_batch(single_gen, batch_size)

In [70]:
num_steps = 10000
print_skip = 1000

batch_loss = 0

loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1.0)

for step in range(num_steps):
    np_batch = next(single_gen)
    #batch_context = autograd.Variable(torch.from_numpy(np_batch[0]).long())
    batch_context = autograd.Variable(torch.LongTensor([np_batch[0]]))
    #batch_target = autograd.Variable(torch.from_numpy(np_batch[1]).long())
    batch_target = autograd.Variable(torch.LongTensor([np_batch[1]]))
    #log_probs = model(batch_context)
    #loss = loss_function(log_probs, batch_target)
    loss = model(batch_context, torch.LongTensor([np_batch[1]]))
    batch_loss += loss.data
    if (step + 1) % print_skip == 0:
        print('Step {}; loss: {}'.format(step + 1, batch_loss / print_skip))
        batch_loss = 0
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Step 1000; loss: 
 778.4898
[torch.FloatTensor of size 1]



KeyboardInterrupt: 

In [None]:
nn.LSTM()