## Here we first load the glove vectors as a dictionary - `embeddings_index`
`embeddings_index['banana']` would give some 100 length vector for the word `'banana'`

The object `GLOVE_DIR` points to the text file which containes the vectors, but it could also be downloaded form http://nlp.stanford.edu/data/glove.6B.zip and saved on disk

In [1]:
import os
import numpy as np
GLOVE_DIR = '/home/datasets/glove.6B/'

print('Indexing word vectors.')
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


## Let's find the top 7 words that are closest to Sunday

In [6]:
u = embeddings_index['compute']
norm_u = np.linalg.norm(u)
similarity = []


for word in embeddings_index.keys():
    v = embeddings_index[word]
    cosine = np.dot(u, v)/norm_u/np.linalg.norm(v)
    similarity.append((word, cosine))


print(len(similarity))

400000


In [7]:
sorted(similarity, key=lambda x: x[1], reverse=True)[:10]

[('compute', 1.0),
 ('calculate', 0.72220618),
 ('algorithm', 0.64410579),
 ('computed', 0.61362344),
 ('algorithms', 0.61343831),
 ('equivalently', 0.59991407),
 ('formula_1', 0.5970425),
 ('formula_2', 0.59485167),
 ('formula_3', 0.59312898),
 ('formula_5', 0.59209329)]

## Now let's do vector algebra.

### First we subtract the vector for `italy` from `paris`. This could be imagined as a vector pointing from country to its capital. Then we add the vector of `nepal`. Let's see if it does point to the country's capital

In [15]:
output = embeddings_index['queen'] - embeddings_index['king'] + embeddings_index['man']
norm_out = np.linalg.norm(u)

similarity = []
for word in embeddings_index.keys():
    v = embeddings_index[word]
    cosine = np.dot(output, v)/norm_out/np.linalg.norm(v)
    similarity.append((word, cosine))


print(len(similarity))

400000


In [16]:
sorted(similarity, key=lambda x: x[1], reverse=True)[:7]

[('woman', 1.1351414),
 ('man', 1.1000676),
 ('girl', 1.037657),
 ('she', 0.96262932),
 ('her', 0.93075562),
 ('mother', 0.92366594),
 ('boy', 0.91029704)]

In [10]:
embeddings_index['look']

array([-0.23191001,  0.61425   ,  0.72978997, -0.56645   , -0.34542999,
        0.0035128 , -0.24593   ,  0.25248   , -0.18366   , -0.21272001,
       -0.066142  , -0.17658   ,  0.43088001,  0.065273  ,  0.1621    ,
        0.31865001,  0.017354  ,  0.54931003, -0.020546  ,  0.37718999,
        0.081528  ,  0.29773   , -0.13805   , -0.39225   ,  0.014614  ,
        0.018266  , -0.1661    , -0.83392   , -0.14606   , -0.51199001,
        0.13350999,  0.15918   , -0.21639   , -0.19966   ,  0.36950001,
        0.36482999, -0.36995   , -0.17254999,  0.21675   , -0.37445   ,
       -0.0090887 , -0.56870002, -0.2499    , -0.41793001, -0.74956   ,
        0.177     ,  0.084483  ,  0.36882001,  0.11713   , -1.07780004,
        0.31172001, -0.094204  , -0.060947  ,  1.0474    , -0.098987  ,
       -2.51880002,  0.12349   ,  0.25422001,  1.39339995, -0.027555  ,
       -0.43748999,  1.52740002, -0.53972   , -0.027559  ,  0.91613001,
       -0.01512   ,  0.46156999,  0.59706998, -0.57489997, -0.17