In [None]:
import cPickle as pickle
import msgpack

import numpy as np

In [None]:
# Load vocabulary w/ word frequencies
# Later cells index `vocab` as word -> (id, <second field>); going by the
# comment above, the second field is presumably the word frequency — confirm.
with open('wmt11.head.vocab', 'rb') as f:
    vocab = msgpack.load(f)

In [None]:
# Load requisite vector data
# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# only load vector files that come from a trusted source.
# W is later indexed as W[i, :] and sliced by rows, so it is presumably a
# 2-D NumPy array with one vector per row — confirm.
with open('wmt11.head.vectors', 'rb') as f:
    W = pickle.load(f)

In [None]:
# Reverse lookup table: word id -> word (the inverse of the id stored in `vocab`)
id2word = {word_id: word for word, (word_id, _) in vocab.iteritems()}

In [None]:
# Normalize word vectors
# Each row of W is divided, in place, by its own Euclidean norm. This runs over
# every row of W, including the rows that the slice below drops.
# NOTE(review): a row with zero norm would divide by zero here — confirm that
# the loaded vectors never contain an all-zero row.
for i, row in enumerate(W):
    W[i, :] /= np.linalg.norm(row)

# Remove context word vectors
# Only the first len(vocab) rows are kept.
W = W[:len(vocab), :]

In [None]:
def most_similar(positive, negative, topn=10, freq_threshold=5):
    """Return the `topn` vocabulary words closest to a combination of words.

    The query vector is the mean of the vectors of the `positive` words and
    the negated vectors of the `negative` words, rescaled to unit length.
    Every row of the notebook-global matrix `W` is then scored by its dot
    product with that query vector; since the rows of `W` are normalized
    earlier in the notebook, this score is the cosine similarity.

    Relies on the notebook globals `W`, `vocab` (word -> (id, frequency)
    pairs) and `id2word` (id -> word).

    Args:
        positive: sequence of words whose vectors are added to the query.
        negative: sequence of words whose vectors are subtracted from it.
        topn: maximum number of results to return.
        freq_threshold: minimum frequency (second field of the `vocab`
            entry) a word needs in order to be returned.

    Returns:
        A list of at most `topn` `(word, similarity)` tuples, best match
        first. The query words themselves are never returned.

    Raises:
        ValueError: if both `positive` and `negative` are empty.
        KeyError: if a query word is not in `vocab`.
    """
    if len(positive) + len(negative) == 0:
        raise ValueError("need at least one positive or negative word")

    # Build a "mean" vector for the given positive and negative terms
    mean_vecs = [W[vocab[word][0]] for word in positive]
    mean_vecs.extend(-1 * W[vocab[word][0]] for word in negative)

    mean = np.array(mean_vecs).mean(axis=0)
    mean /= np.linalg.norm(mean)

    # Score every word vector against the query vector
    dists = np.dot(W, mean)

    query_words = set(positive) | set(negative)
    result = []
    # Walk the candidates from best to worst and stop as soon as enough have
    # been collected. A fixed-size candidate window could come up short when
    # many of the top-ranked words fall below the frequency threshold.
    for i in np.argsort(dists)[::-1]:
        if len(result) >= topn:
            break
        word = id2word[i]
        if word in query_words:
            continue
        # Compare the frequency field, not the whole (id, frequency) entry:
        # comparing the entry itself to an int never filters anything on
        # Python 2 and raises TypeError on Python 3.
        if vocab[word][1] < freq_threshold:
            continue
        result.append((word, dists[i]))
    return result

In [None]:
# Analogy query: the 50 words nearest to the mean of +king, +woman, -man
most_similar(['king', 'woman'], ['man'], topn=50)

In [None]:
# Analogy query: the 50 words nearest to the mean of +brought, +seek, -bring
most_similar(['brought', 'seek'], ['bring'], topn=50)

In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging

import numpy as np
from numpy.testing import assert_allclose

import evaluate
import glove


# Tiny mock corpus, one sentence per list element
# (shamelessly stolen from Gensim word2vec tests)
test_corpus = """human interface computer
survey user computer system response time
eps user interface system
system human system eps
user response time
trees
graph trees
graph minors trees
graph minors survey
I like graph and stuff
I like trees and stuff
Sometimes I build a graph
Sometimes I build trees""".splitlines()

# Only let ERROR-and-above messages from glove's logger through
glove.logger.setLevel(logging.ERROR)
# Build the vocabulary, co-occurrence data and id -> word lookup from the mock corpus
vocab = glove.build_vocab(test_corpus)
cooccur = glove.build_cooccur(vocab, test_corpus, window_size=10)
id2word = evaluate.make_id2word(vocab)

# Train 10-dimensional vectors for 500 iterations.
# NOTE(review): no random seed is set in this cell; if train_glove initialises
# its parameters randomly, W will differ from run to run — confirm.
W = glove.train_glove(vocab, cooccur, vector_size=10, iterations=500)

# Merge and normalize word vectors
W = evaluate.merge_main_context(W)


def test_similarity():
    """Smoke test: look up the neighbours of 'graph' and log them at DEBUG level."""
    neighbours = evaluate.most_similar(W, vocab, id2word, 'graph')
    logging.debug(neighbours)


In [2]:
# Bare last expression so the notebook displays the word-vector matrix W
W

array([[  1.76863036e-01,   1.06420739e-01,  -5.68388346e-02,
         -3.25742154e-01,   5.66774998e-01,   8.34974421e-02,
          5.12050915e-01,  -1.45726657e-01,   1.37232722e-01,
          4.66459590e-01],
       [  1.86942051e-02,  -1.32604399e-01,   1.46012603e-01,
          2.10063053e-01,  -3.67849356e-02,   5.00128860e-01,
          2.31580088e-01,   2.64782724e-01,   9.09439805e-02,
          7.30156414e-01],
       [ -3.21640051e-01,   4.21456363e-01,   2.62546959e-01,
          4.48856056e-01,  -1.37258049e-01,  -5.14317641e-01,
         -1.62838295e-01,  -1.43768005e-01,   9.42472382e-02,
         -3.30286312e-01],
       [  4.46461637e-01,  -2.46975138e-02,   1.93156120e-01,
          1.05366072e-01,  -9.74688845e-02,   3.16066382e-01,
         -1.59454129e-01,   5.11101031e-01,  -1.61036659e-01,
          5.74168890e-01],
       [  1.45025910e-01,  -3.38898352e-01,   3.98213183e-01,
          4.96835855e-02,  -4.47507024e-01,   4.22373595e-02,
         -3.69258732e-01