In [2]:
import numpy as np

In [3]:
# Maps each vocabulary word to its embedding vector (1-D float64 array).
word_to_vec = dict()

# Every non-empty line is split on whitespace: the first token is the word,
# the remaining tokens are the components of its vector.
with open('glove.6B.50d.txt', 'r', encoding='utf8') as f:
    for line in f:
        splits = line.split()

        if not splits:  # tolerate blank lines instead of failing on splits[0]
            continue

        word = splits[0]

        vec = splits[1:]

        # `np.float` was removed in NumPy 1.24. It was an alias of the builtin
        # float, so float64 keeps exactly the dtype the original produced.
        word_to_vec[word] = np.array(vec, dtype=np.float64)

In [3]:
def normalise(vector):
    """
    Scale a word vector to unit Euclidean length.

    Unit-length vectors let a plain dot product be used as the cosine
    similarity.
    """
    return vector / np.linalg.norm(vector)

In [4]:
def get_wordict(word_to_vec):
    """
    Build the two lookup tables between words and integer ids.

    Ids are assigned by enumerating the keys of `word_to_vec` in iteration
    order, starting at 0.

    Returns
    -------
    (id_to_word, word_to_id) : tuple of dict
        `id_to_word` maps id -> word, `word_to_id` maps word -> id.
    """
    id_to_word = {}
    word_to_id = {}

    for idx, word in enumerate(word_to_vec.keys()):
        id_to_word[idx] = word
        word_to_id[word] = idx

    return id_to_word, word_to_id

In [5]:
# Lookup tables between vocabulary words and integer ids.
id_to_word, word_to_id = get_wordict(word_to_vec)

# One unit-length vector per word, stacked in the dict's iteration order, so
# row i of word_norm belongs to id_to_word[i].
word_norm = np.array([normalise(vec) for vec in word_to_vec.values()])

In [6]:
def similarity(word_1, word_2):
    """
    Compute the cosine similarity of two words.

    Parameters
    ----------
    word_1 : str
    word_2 : str
        Both words must be keys of ``word_to_vec``.

    Returns
    -------
    float
        Cosine similarity of the two word vectors, or 0.0 when either
        vector has zero length.

    Raises
    ------
    KeyError
        If either word is not in ``word_to_vec``.
    """
    vec_1, vec_2 = word_to_vec[word_1], word_to_vec[word_2]

    # word_to_vec stores the vectors as loaded (not unit length), so the dot
    # product has to be divided by both lengths to obtain a cosine.
    denom = np.linalg.norm(vec_1) * np.linalg.norm(vec_2)
    if denom == 0:
        return 0.0

    cos_ = (vec_1 @ vec_2) / denom
    return cos_

In [7]:
def top_similarity(word, topk=5):
    """
    Return the topk words most similar to `word`.

    Parameters
    ----------
    word : str
        Query word; must be a key of ``word_to_vec``.
    topk : int, default 5
        Number of words to return.

    Returns
    -------
    list of str
        Words ordered from most to least similar. The query word is not
        excluded from the candidates.

    Raises
    ------
    KeyError
        If `word` is not in ``word_to_vec``.
    """
    # One score per row of word_norm. The query vector is used as stored (not
    # normalised): that multiplies every score by the same factor (its
    # length), which does not change the order of the scores.
    cos_ = word_norm @ word_to_vec[word]

    # argsort is ascending, so reverse it before taking the first topk ids.
    top_words = np.argsort(cos_)[::-1][:topk]

    return [id_to_word[i] for i in top_words]

In [9]:
def cos_add(a_, a, b):
    """
    Implementation of the 3CosAdd function.

    Forms the vector ``a_ - a + b`` from the lower-cased input words and
    returns the vocabulary word whose row in ``word_norm`` has the largest
    dot product with it.

    Parameters
    ----------
    a_, a, b : str
        Words of the analogy query ``a_ - a + b``.

    Returns
    -------
    str
        The best-scoring word, or ``'<unknown>'`` when an input is not a
        string or is not in the vocabulary.

    Notes
    -----
    The input words themselves are not excluded from the candidates.
    """
    words = (a_, a, b)

    # Non-string inputs cannot be looked up; treat them like unknown words.
    if not all(isinstance(w, str) for w in words):
        return '<unknown>'

    # Only a failed vocabulary lookup means "unknown". Any other error (a
    # missing global, a shape mismatch, ...) is a real bug and must surface.
    try:
        vec_a_, vec_a, vec_b = (word_to_vec[w.lower()] for w in words)
    except KeyError:  # Handling out of vocabulary words
        return '<unknown>'

    val = vec_a_ - vec_a + vec_b

    cos_ = word_norm @ val

    b_ = int(np.argmax(cos_))

    return id_to_word[b_]


# Word analogy 3cosMul
def cos_mul(a_, a, b):
    """
    Implementation of the 3CosMul function (Levy & Goldberg, 2014).

    For the analogy query ``a_ - a + b`` every vocabulary word ``x`` is
    scored with

        cos(x, a_) * cos(x, b) / (cos(x, a) + eps)

    where each cosine is first shifted from [-1, 1] to [0, 1], and the
    best-scoring word is returned.

    Parameters
    ----------
    a_, a, b : object
        Words of the analogy query; each is converted with ``str()`` and
        lower-cased before the lookup.

    Returns
    -------
    str
        The best-scoring word, or ``'<unknown>'`` when a word is not in the
        vocabulary.

    Notes
    -----
    The input words themselves are not excluded from the candidates.
    """
    eps = 1e-3  # keeps the denominator away from zero (value from the paper)

    # Only a failed vocabulary lookup means "unknown"; other errors surface.
    try:
        a_, a, b = str(a_).lower(), str(a).lower(), str(b).lower()

        vec_a_, vec_a, vec_b = word_to_vec[a_], word_to_vec[a], word_to_vec[b]

    except KeyError:  # Handling out of vocabulary words
        return '<unknown>'

    def shifted_cos(vec):
        # Cosine of every vocabulary row with `vec`, mapped to [0, 1] so the
        # product and the quotient below stay non-negative. word_to_vec holds
        # the vectors as loaded, hence the explicit normalisation.
        return (word_norm @ (vec / np.linalg.norm(vec)) + 1) / 2

    cos_ = shifted_cos(vec_a_) * shifted_cos(vec_b) / (shifted_cos(vec_a) + eps)

    b_ = int(np.argmax(cos_))

    return id_to_word[b_]

## Word similarity

In [10]:
# Nearest neighbours of 'man', using the default topk=5.
top_similarity('man')

['man', 'woman', 'boy', 'another', 'old']

In [11]:
# Nearest neighbours of 'banana', using the default topk=5.
top_similarity('banana')

['banana', 'bananas', 'coconut', 'pineapple', 'mango']

In [13]:
# Nearest neighbours of 'plane', using the default topk=5.
top_similarity('plane')

['plane', 'airplane', 'flight', 'planes', 'crashed']

## Word analogy

In [14]:
# Analogy queries: cos_add(a_, a, b) returns the vocabulary word that scores
# highest against the vector a_ - a + b.
print(f"king - queen + woman = {cos_add('king', 'queen', 'woman')}")
print(f"france - paris + berlin = {cos_add('france', 'paris', 'berlin')}")

king - queen + woman = man
france - paris + berlin = germany
