In [122]:
!pip install torch torchtext
!git clone https://github.com/neubig/nn4nlp-code.git

fatal: destination path 'nn4nlp-code' already exists and is not an empty directory.


In [0]:
from collections import defaultdict
import math
import time
import random
import numpy as np
import pdb
import torch
import torch.nn as nn
import torch.nn.functional as F

In [0]:
K=3 #number of negative samples
N=2 #length of window on each side (so N=2 gives a total window size of 5, as in t-2 t-1 t t+1 t+2)
EMB_SIZE = 128 # The size of the embedding

embeddings_location = "embeddings.txt" #the file to write the word embeddings to
labels_location = "labels.txt" #the file to write the labels to

# We reuse the data reading from the language modeling class.
# While the training set is being read, every unseen word gets the next free id.
w2i = defaultdict(lambda: len(w2i))

#word counts for negative sampling (word id -> number of occurrences)
word_counts = defaultdict(int)

S = w2i["<s>"]
UNK = w2i["<unk>"]
def read_dataset(filename, count_words=True):
  """Yield each line of `filename` as a list of word ids.

  Lines are stripped and split on single spaces; words are mapped through the
  module-level `w2i` (whatever it is bound to when the generator is consumed).

  Args:
    filename: path of a text file with one space-tokenised sentence per line.
    count_words: when True (the default) every token also increments the
      module-level `word_counts`. Pass False for held-out data so that it
      does not influence the negative-sampling distribution.

  Yields:
    list[int]: the word ids of one line.
  """
  with open(filename, "r") as f:
    for line in f:
      ids = [w2i[word] for word in line.strip().split(" ")]
      if count_words:
        for word_id in ids:
          word_counts[word_id] += 1
      yield ids


# Read in the training data; this is what builds the vocabulary.
train = list(read_dataset("nn4nlp-code/data/ptb/train.txt"))

# Freeze the vocabulary *before* touching the dev set. The UNK-defaulting
# defaultdict below still inserts every unknown dev word as a new key (mapped
# to UNK), so computing these afterwards could inflate `nwords` and overwrite
# i2w[UNK] with an arbitrary dev word.
nwords = len(w2i)
i2w = {v: k for k, v in w2i.items()}

w2i = defaultdict(lambda: UNK, w2i)
# Dev tokens must not be counted: the noise distribution is a training statistic.
dev = list(read_dataset("nn4nlp-code/data/ptb/valid.txt", count_words=False))


# Unigram distribution over word ids, estimated on the training set.
# It is deliberately left unsmoothed: SGNS.__init__ raises `word_probs` to the
# 3/4 power and renormalises, so applying the exponent here as well would
# smooth twice (effective exponent 0.5625).
word_probabilities = np.zeros(nwords)
for word_id, count in word_counts.items():
  word_probabilities[word_id] = count
word_probabilities /= word_probabilities.sum()

# One vocabulary word per line, in id order (row i of the embeddings file).
with open(labels_location, 'w') as labels_file:
  for i in range(nwords):
    labels_file.write(i2w[i] + '\n')

In [0]:
class Word2Vec(nn.Module):
  """Pair of embedding tables used by skip-gram: input and output vectors.

  Args:
    vocab_size: number of rows in each embedding table.
    embed_dim: dimensionality of every embedding vector.
    padding_idx: id handed to both `nn.Embedding` tables as their padding
      index (defaults to 0).
  """

  def __init__(self, vocab_size, embed_dim, padding_idx=0):
    super().__init__()

    self.vocab_size = vocab_size
    self.embed_dim = embed_dim
    # "i" holds the vectors looked up for input (centre) words,
    # "o" the vectors looked up for output (context / negative) words.
    self.embeddings_i = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
    self.embeddings_o = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)

  def forward(self, x):
    """Default lookup: identical to `forward_i`."""
    return self.forward_i(x)

  def forward_i(self, x):
    """Return the input-side embeddings for the ids in `x`."""
    table = self.embeddings_i
    return table(x)

  def forward_o(self, x):
    """Return the output-side embeddings for the ids in `x`."""
    table = self.embeddings_o
    return table(x)

In [0]:
class SGNS(nn.Module):
  """Skip-gram with negative sampling loss on top of a `Word2Vec` module.

  Args:
    word2vec: module exposing `vocab_size`, `forward_i(ids)` and
      `forward_o(ids)`.
    n_neg_samp: number of negative words drawn per (input, output) pair.
    word_probs: optional array-like of per-word weights indexed by word id.
      It is raised to the 3/4 power and renormalised here, so callers should
      pass the plain (unsmoothed) unigram distribution or raw counts. When
      None, negatives are drawn uniformly over the vocabulary.
  """

  def __init__(self, word2vec, n_neg_samp=10, word_probs=None):
    super(SGNS, self).__init__()
    self.word2vec = word2vec
    self.vocab_size = self.word2vec.vocab_size
    self.n_neg_samp = n_neg_samp
    if word_probs is not None:
      wp = np.power(np.asarray(word_probs, dtype=np.float64), 0.75)
      wp = wp / wp.sum()
      # Kept as a plain CPU tensor (not a buffer): sampling happens on the CPU
      # and only the sampled ids are moved to the model's device.
      self.word_probs = torch.as_tensor(wp, dtype=torch.float32)
    else:
      self.word_probs = None

  def forward(self, iword, owords):
    """Return the mean negative-sampling loss of a batch of word pairs.

    Args:
      iword: LongTensor of shape (bs,) with input (centre) word ids.
      owords: LongTensor of shape (bs,) with the matching output (context)
        word ids, one per input word.

    Returns:
      0-dim tensor: mean over the batch of
      -(log sigmoid(o.i) + sum_k log sigmoid(-n_k.i)).
    """
    batch_size = owords.shape[0]
    # Follow the inputs instead of a notebook-level `device` global, so the
    # module works wherever its inputs live.
    target_device = iword.device
    if self.word_probs is not None:
      neg_words = torch.multinomial(self.word_probs,
                                    batch_size * self.n_neg_samp,
                                    replacement=True).view(batch_size, self.n_neg_samp)
    else:
      # randint's upper bound is exclusive, so every id in [0, vocab_size)
      # can be drawn (truncating uniform_(0, vocab_size - 1) never produced
      # the last id).
      neg_words = torch.randint(0, self.vocab_size, (batch_size, self.n_neg_samp))
    neg_words = neg_words.to(target_device)

    ivectors = self.word2vec.forward_i(iword).unsqueeze(2)  # bs x emb_dim x 1
    ovectors = self.word2vec.forward_o(owords).unsqueeze(1)  # bs x 1 x emb_dim
    nvectors = self.word2vec.forward_o(neg_words)  # bs x neg_samp x emb_dim

    # Explicit reshapes (rather than a bare squeeze) keep the batch dimension
    # intact even for a batch of one or a single negative sample.
    pos_score = F.logsigmoid(torch.bmm(ovectors, ivectors)).view(batch_size)  # bs
    neg_score = F.logsigmoid(-torch.bmm(nvectors, ivectors)).squeeze(2).sum(1)  # bs
    return -(pos_score + neg_score).mean()

In [0]:
def get_next(data, bs=4, window=None, target_device=None):
  """Yield mini-batches of (input word, output word) skip-gram pairs.

  Every word of every sentence is paired with its context words, in sentence
  order, and the pairs are emitted in chunks of `bs`.

  Args:
    data: iterable of sentences, each a sequence of word ids.
    bs: number of pairs per batch; the final batch may be smaller.
    window: number of context positions taken on each side of the input word.
      None (the default) pairs the word with every other position in the
      sentence, which is the original behaviour.
    target_device: torch device for the yielded tensors. None falls back to
      the notebook-level `device` global.

  Yields:
    (iwords, owords): two LongTensors of equal length holding the input ids
    and the corresponding context ids.
  """
  if target_device is None:
    target_device = device

  iwords = []
  owords = []
  for sent in data:
    sent_len = len(sent)
    for word_id, word in enumerate(sent):
      first = 0 if window is None else max(0, word_id - window)
      last = sent_len if window is None else min(sent_len, word_id + window + 1)
      for oword_id in range(first, last):
        if oword_id == word_id:
          continue  # a word is never its own context
        iwords.append(word)
        owords.append(sent[oword_id])
        if len(iwords) >= bs:
          yield torch.LongTensor(iwords).to(target_device), torch.LongTensor(owords).to(target_device)
          iwords, owords = [], []
  # Flush the pairs left over after the last full batch.
  if len(iwords) > 0:
    yield torch.LongTensor(iwords).to(target_device), torch.LongTensor(owords).to(target_device)

In [0]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [129]:
# Embedding tables, sized to the vocabulary, placed on the chosen device.
word2vec = Word2Vec(vocab_size=nwords, embed_dim=EMB_SIZE).to(device)
# Negative-sampling objective: K negatives per pair, drawn from word_probabilities.
model = SGNS(word2vec, n_neg_samp=K, word_probs=word_probabilities).to(device)
# Plain SGD over every parameter of the model.
trainer = torch.optim.SGD(model.parameters(), lr=1e-2)
# Last expression of the cell: display the module structure.
model

SGNS(
  (word2vec): Word2Vec(
    (embeddings_i): Embedding(10000, 128, padding_idx=0)
    (embeddings_o): Embedding(10000, 128, padding_idx=0)
  )
)

In [0]:
# for batch_id, (iword, owords) in enumerate(get_next(train, batch_size)):
#   my_loss = model(iword, owords)
#   if my_loss> 100.0:
#     print("WWW %f" % my_loss)
#     break
#   if batch_id % 10000 == 0:
#     print(my_loss)
#   if batch_id >= 1000000:
#     break

In [131]:
MAX_LEN = 100
batch_size = 64

for ITER in range(100):
  print("started iter %r" % ITER)
  # Perform training
  random.shuffle(train)
  # train_words counts (input, output) pairs; train_loss is the summed per-pair loss.
  train_words, train_loss = 0, 0.0
  start = time.time()
  for batch_id, (iword, owords) in enumerate(get_next(train, batch_size)):
    # PyTorch accumulates gradients across backward() calls; without this
    # reset every step would be taken along the sum of all earlier gradients.
    trainer.zero_grad()
    my_loss = model(iword, owords)
    # The model returns the batch *mean*, so weight it by the batch size to
    # make train_loss/train_words a true per-pair average.
    train_loss += my_loss.item() * len(iword)
    train_words += len(iword)
    my_loss.backward()
    trainer.step()
    if (batch_id+1) % 10000 == 0:
      print(train_loss, train_words)
      print("--finished %r words. loss/word=%.4f" % (train_words, train_loss/train_words))
  print("iter %r: train loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, train_loss/train_words, math.exp(train_loss/train_words), time.time()-start))
  # Evaluate on dev set
  dev_words, dev_loss = 0, 0.0
  start = time.time()
  # Evaluation only: no graph is built and, crucially, no optimizer step is
  # taken, so the dev set can never change the weights.
  with torch.no_grad():
    for iword, owords in get_next(dev, batch_size):
      my_loss = model(iword, owords)
      dev_loss += my_loss.item() * len(iword)
      dev_words += len(iword)
  print("iter %r: dev loss/word=%.4f, ppl=%.4f, time=%.2fs" % (ITER, dev_loss/dev_words, math.exp(dev_loss/dev_words), time.time()-start))

  # Overwrite the embeddings file after every epoch: one tab-separated row per
  # word id.
  print("saving embedding files")
  with open(embeddings_location, 'w') as embeddings_file:
    # .cpu() is required before .numpy() whenever the model lives on the GPU.
    W_w_np = word2vec.embeddings_i.weight.detach().cpu().numpy()
    for i in range(nwords):
      ith_embedding = '\t'.join(map(str, W_w_np[i]))
      embeddings_file.write(ith_embedding + '\n')

started iter 0


KeyboardInterrupt: ignored