In [16]:
# import
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

In [17]:
# load word vectors
word2vec = {}
embedding = []
idx2word = []

with open("embeddings/glove.6B.50d.txt") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype=float)
        word2vec[word] = vec
        embedding.append(vec)
        idx2word.append(word)
    print("Loaded %s words" % len(word2vec))
embedding = np.array(embedding)
V,  D = embedding.shape

Loaded 400000 words


In [18]:
# distance definitions
def dist1(a, b):
    # euclidian distance
    return np.linalg.norm(a - b)

def dist2(a, b):
    # cosine distance
    return (1 - a.dot(b)) / (np.linalg.norm(a) * np.linalg.norm(b))

# pick a distance type
dist, metric = dist2, "cosine"

In [19]:
# analogy functions
def find_analogies1(w1, w2, w3):
    # make sure all words are in vocabulary
    for w in (w1, w2, w3):
        if w not in word2vec:
            print("%s not in vocabulary" % w)
            return

    d1 = word2vec[w1]
    d2 = word2vec[w2]
    d3 = word2vec[w3]
    v0 = d1 - d2 + d3
    
    min_dist = np.infty
    best_word = ""
    for w, v in iteritems(word2vec):
        if w not in (w1, w2, w3):
            d = dist(v, v0)
            if d < min_dist:
                min_dist = d
                best_word = w
    print("%s - %s = %s - %s" % (w1, w2, w3, best_word))
      
# a more efficient one without for loop
def find_analogies2(w1, w2, w3):
    # make sure all words are in vocabulary
    for w in (w1, w2, w3):
        if w not in word2vec:
            print("%s not in vocabulary" % w)
            return

    d1 = word2vec[w1]
    d2 = word2vec[w2]
    d3 = word2vec[w3]
    v0 = d1 - d2 + d3
    
    distances = pairwise_distances(v0.reshape(1,D), embedding, metric=metric).reshape(V)
    idx = distances.argmin()
    best_word = idx2word[idx]
    
    print("%s - %s = %s - %s" % (w1, w2, w3, best_word))
    
# pick a function
find_analogies = find_analogies2

In [26]:
# function to find nearest neighbour of a given word
def nearest_neighbors(w, n=5):
    if w not in word2vec:
        print("%s not in vocabulary" % w)
        return
    
    v = word2vec[w]
    distances = pairwise_distances(v.reshape(1,D), embedding, metric=metric).reshape(V)
    idxs = distances.argsort()[1:n+1]
    print("neighbors of: %s" % w)
    for idx in idxs:
        print("\t%s" % idx2word[idx])

In [27]:
find_analogies("king", "man", "woman")

king - man = woman - king


In [28]:
find_analogies("france", "paris", "rome")

france - paris = rome - italy


In [29]:
nearest_neighbors("january")

neighbors of : january
	december
	february
	october
	september
	november


In [30]:
nearest_neighbors("king")

neighbors of : king
	prince
	queen
	ii
	emperor
	son


In [31]:
nearest_neighbors("einstein")

neighbors of : einstein
	relativity
	bohr
	physics
	heisenberg
	freud


In [32]:
nearest_neighbors("rome")

neighbors of : rome
	naples
	venice
	italy
	turin
	pope
