In [3]:
from bisect import bisect_left
from pickle import dump

import numpy as np

First, read the pre-trained GloVe word embeddings from disk into <code>words</code> (the tokens) and <code>vecs</code> (their vectors).

In [4]:
def read_glove_vecs(glove_file):
    """Load word embeddings from a GloVe-style text file.

    Every non-blank line is expected to contain a token followed by its
    whitespace-separated vector components, e.g. ``the 0.418 0.24968 ...``.

    Parameters
    ----------
    glove_file : str or path-like
        Path of the text file to read.

    Returns
    -------
    words : list of str
        The tokens, in file order.
    vecs : list of numpy.ndarray
        One ``float64`` vector per token, aligned index-by-index with ``words``.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    words = []
    vecs = []
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which mis-decodes or rejects non-ASCII tokens on some systems.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank (e.g. trailing) lines carry no token; skipping them
                # avoids an IndexError on fields[0].
                continue
            words.append(fields[0])
            vecs.append(np.array(fields[1:], dtype=np.float64))
    return words, vecs

In [5]:
# Load the GloVe vectors; the path is relative, so the file must be in the
# notebook's working directory.
words, vecs = read_glove_vecs('glove.6B.50d.txt')

Sort <code>words</code> and <code>vecs</code> together, keyed on the word, so that binary search can be used for lookups later on.

In [6]:
# Pair each word with its vector, then order the pairs by word so the two
# stay aligned after sorting.
wordvec = sorted(zip(words, vecs), key=lambda pair: pair[0])

In [7]:
# Peek at the first (word, vector) pair of the sorted list.
wordvec[0]

('!', array([-0.58402 ,  0.39031 ,  0.65282 , -0.3403  ,  0.19493 , -0.83489 ,
         0.11929 , -0.57291 , -0.56844 ,  0.72989 , -0.56975 ,  0.53436 ,
        -0.38034 ,  0.22471 ,  0.98031 , -0.2966  ,  0.126   ,  0.55222 ,
        -0.62737 , -0.082242, -0.085359,  0.31515 ,  0.96077 ,  0.31986 ,
         0.87878 , -1.5189  , -1.7831  ,  0.35639 ,  0.9674  , -1.5497  ,
         2.335   ,  0.8494  , -1.2371  ,  1.0623  , -1.4267  , -0.49056 ,
         0.85465 , -1.2878  ,  0.60204 , -0.35963 ,  0.28586 , -0.052162,
        -0.50818 , -0.63459 ,  0.33889 ,  0.28416 , -0.2034  , -1.2338  ,
         0.46715 ,  0.78858 ]))

In [8]:
# Split the sorted pairs back into two parallel lists.
words = [word for word, _ in wordvec]
vecs = [vec for _, vec in wordvec]

In [9]:
# Sanity check: the two parallel lists should have the same length.
len(words), len(vecs)

(400000, 400000)

Convert <code>vecs</code> from a list of vectors into a single 2-D NumPy array so that similarities can be computed with vectorized operations.

In [10]:
# Stack the per-word vectors into one 2-D array (one row per word).
vecs = np.array(vecs)
vecs.shape

(400000, 50)

Function for Binary Search in <code>list</code> of <code>words</code>

In [11]:
def search(wrd, arr):
    """Binary-search a sorted sequence for an exact match.

    Parameters
    ----------
    wrd : object
        The value to look for (a word, in this notebook).
    arr : sequence
        Sequence sorted in ascending order under ``<``; results are
        meaningless if it is not sorted.

    Returns
    -------
    int
        Index of ``wrd`` in ``arr`` (the leftmost one if it occurs more than
        once), or ``-1`` if ``wrd`` is not present.
    """
    # bisect_left returns the insertion point that keeps arr sorted; wrd is
    # present exactly when that slot already holds an equal element.
    pos = bisect_left(arr, wrd)
    if pos < len(arr) and arr[pos] == wrd:
        return pos
    return -1

In [12]:
# Look up the index of a word in the sorted vocabulary.
search('mango', words)

233026

Cosine-similarity function: returns the cosine similarity between a single vector and every word vector.

In [13]:
def similarity(u, v):
    """Cosine similarity between every row of ``u`` and the single vector ``v``.

    Parameters
    ----------
    u : array_like, shape (n, d) or (d,)
        Vectors to compare, one per row. A 1-D input is treated as one row.
    v : array_like, shape (1, d) or (d,)
        The single reference vector.

    Returns
    -------
    numpy.ndarray, shape (n,)
        ``sim[i]`` is the cosine of the angle between ``u[i]`` and ``v``.
        Where either vector has zero norm the cosine is undefined; ``0.0`` is
        returned there instead of ``nan``/``inf`` with a runtime warning.

    Raises
    ------
    ValueError
        If ``v`` does not hold exactly ``d`` components.
    """
    u = np.atleast_2d(u)
    v = np.asarray(v).reshape(-1)
    if v.shape[0] != u.shape[1]:
        raise ValueError(
            "v must be a single vector with %d components, got %d values"
            % (u.shape[1], v.shape[0])
        )
    num = np.dot(u, v)
    den = np.linalg.norm(u, axis=1) * np.linalg.norm(v)
    # Pre-fill with zeros and divide only where the denominator is non-zero,
    # so zero-norm vectors yield 0.0 rather than a division warning.
    sim = np.zeros(num.shape, dtype=np.result_type(num, den))
    np.divide(num, den, out=sim, where=den != 0)
    return sim

In [15]:
# Smoke test with an all-ones query vector: expect one similarity per row of vecs.
similarity(vecs, np.ones((1,50))).shape

(400000,)

From the similarity values, select the indices whose similarity is greater than 0.7 and return the corresponding words as <code>(word, similarity_value)</code> pairs.

In [16]:
def find_similar(word, words, vecs, threshold=0.7):
    """Find all vocabulary words whose similarity to ``word`` exceeds a threshold.

    Parameters
    ----------
    word : str
        The query word.
    words : sequence of str
        Vocabulary, sorted ascending (it is looked up with binary search).
    vecs : numpy.ndarray, shape (len(words), d)
        Embedding matrix; row ``i`` is the vector of ``words[i]``.
    threshold : float, optional
        Only words with similarity strictly greater than this are returned.
        Defaults to 0.7.

    Returns
    -------
    list of (str, float) or str
        ``(word, similarity)`` pairs in vocabulary order, or the string
        ``"Word not in vocabulary!"`` when ``word`` is not found.
    """
    idx = search(word, words)
    if idx == -1:
        return "Word not in vocabulary!"
    # -1 lets NumPy infer the embedding width, so this is not tied to
    # 50-dimensional vectors.
    query = vecs[idx].reshape((1, -1))
    sims = similarity(vecs, query)
    return [(words[i], sims[i]) for i in np.where(sims > threshold)[0]]

Sort the resulting similar words by descending order of similarity and return top 5 similar words.

In [17]:
def filter_results(simwords, top_n=5):
    """Return the words with the highest similarity scores.

    Parameters
    ----------
    simwords : list of (str, float) or str
        ``(word, similarity)`` pairs, or an error-message string, which is
        passed through unchanged.
    top_n : int, optional
        Maximum number of words to return. Defaults to 5.

    Returns
    -------
    list of str or str
        Up to ``top_n`` words ordered by descending similarity (ties keep
        their input order); an empty list for empty input; or the error
        string when one was passed in.
    """
    if isinstance(simwords, str):
        return simwords
    ranked = sorted(simwords, key=lambda pair: pair[1], reverse=True)
    # Plain slicing also works for an empty list and keeps the words as
    # ordinary str objects (no round-trip through a NumPy string array).
    return [word for word, _ in ranked[:top_n]]

In [18]:
# End-to-end example: the most similar words to 'india'.
filter_results(find_similar('india', words, vecs))

['india', 'indian', 'pakistan', 'malaysia', 'bangladesh']

Save <code>vecs</code> into a <i>.npy</i> file.

In [19]:
# Write the embedding matrix to disk in NumPy's .npy format.
np.save('vecs.npy', vecs)

Save the <code>words</code> list to a binary file using <code>pickle</code>.

In [21]:
# Pickle the sorted vocabulary; list index i corresponds to row i of the
# vecs array written to vecs.npy.
with open('words.dat', 'wb') as f:
    dump(words, f)