# setup

In [1]:
import os
import argparse
import numpy as np

In [2]:
# Location of the embeddings text file, taken from the process environment.
# The value is None when the VECTOR_FILE variable is not set.
VECTOR_FILE = os.environ.get("VECTOR_FILE")
print(f"VECTOR_FILE: {VECTOR_FILE}")

VECTOR_FILE: /veld/input/vectors.txt


In [3]:
# Token -> embedding lookup, filled from the text file at VECTOR_FILE.
# Each line is split on single spaces: the first field is the token, the
# remaining fields are parsed as floats and stored as a 1-D numpy array.
VECTORS = {}
with open(VECTOR_FILE, 'r') as f:
    for line in f:
        vals = line.rstrip().split(' ')
        # A blank line (or a line holding only a token) would be stored as a
        # zero-length vector, which cannot be dotted against the real
        # embeddings in the similarity functions; skip it instead.
        if len(vals) < 2:
            continue
        VECTORS[vals[0]] = np.array([float(x) for x in vals[1:]])

# functions

In [4]:
def get_cosine_similarity_of_vectors(v1, v2):
    """Return the cosine similarity of two vectors.

    The result is the dot product of ``v1`` and ``v2`` divided by the product
    of their norms (``np.linalg.norm`` with default arguments).

    Args:
        v1: first vector, anything ``np.dot`` / ``np.linalg.norm`` accept.
        v2: second vector, same requirements as ``v1``.

    Returns:
        The similarity value produced by the division.

    Note:
        There is no guard for a zero norm: a zero vector makes the
        denominator 0.
    """
    inner_product = np.dot(v1, v2)
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return inner_product / norm_product

In [5]:
def get_cosine_similarity_of_words(w1, w2):
    """Return the cosine similarity between the embeddings of two words.

    Both words are lower-cased before being looked up in ``VECTORS``.

    Args:
        w1: first word.
        w2: second word.

    Returns:
        The value of ``get_cosine_similarity_of_vectors`` for the two
        looked-up vectors.

    Raises:
        KeyError: if a lower-cased word is not a key of ``VECTORS``.
    """
    first, second = (VECTORS[word.lower()] for word in (w1, w2))
    return get_cosine_similarity_of_vectors(first, second)

In [6]:
def get_nearest_words_of_vector(v1, limit_results=None):
    """Rank every entry of ``VECTORS`` by cosine similarity to ``v1``.

    Args:
        v1: query vector, compared against each stored vector with
            ``get_cosine_similarity_of_vectors``.
        limit_results: when not None, the ranked list is cut with
            ``[:limit_results]``; None keeps every entry.

    Returns:
        A list of ``(word, similarity)`` tuples ordered from highest to
        lowest similarity.
    """
    ranked = [
        (word, get_cosine_similarity_of_vectors(v1, vector))
        for word, vector in VECTORS.items()
    ]
    # Negated score as the key => descending order, ties keep dict order.
    ranked.sort(key=lambda pair: -pair[1])
    return ranked if limit_results is None else ranked[:limit_results]

In [7]:
def get_nearest_words_of_word(w1, limit_results=None):
    """Rank the entries of ``VECTORS`` by cosine similarity to the word ``w1``.

    The word is looked up exactly as given first; if that key is absent the
    lower-cased form is tried, so mixed-case input such as "Woman" is accepted
    here just as ``get_cosine_similarity_of_words`` accepts it.

    Args:
        w1: query word.
        limit_results: forwarded to ``get_nearest_words_of_vector``; None
            keeps every entry.

    Returns:
        Whatever ``get_nearest_words_of_vector`` returns for the word's
        vector: a list of ``(word, similarity)`` tuples, best match first.

    Raises:
        KeyError: if neither ``w1`` nor ``w1.lower()`` is a key of ``VECTORS``.
    """
    # Exact match wins so existing callers keep their behaviour; the
    # lower-case fallback only applies when the exact key is missing.
    key = w1 if w1 in VECTORS else w1.lower()
    v1 = VECTORS[key]
    return get_nearest_words_of_vector(v1, limit_results)

# testing

In [8]:
# Sanity check of the loaded embeddings: print shape and raw values for two
# known words, one value per output line.
v1 = VECTORS["woman"]
print(v1.shape, v1, sep="\n")
v2 = VECTORS["man"]
print(v2.shape, v2, sep="\n")

(50,)
[ 0.096015 -0.096682  0.712446  0.328539 -0.23651  -1.474321 -0.029442
 -0.736665  0.526664 -1.273229 -0.027323  0.416276  1.191343  1.071498
 -0.918529 -0.007542  0.428583  0.364168  0.348376  0.197446  0.327013
 -0.586403 -0.037338  0.548331 -0.634218 -0.214358  0.394453  0.674985
 -0.388423 -0.711256  0.958919  0.243313 -0.596865 -0.49274   0.251286
  0.684247 -0.152321 -0.499066  1.069465 -0.074243 -0.232348  0.317385
 -0.290931 -0.135448 -0.179189  0.948896 -1.538548  0.943505  0.117741
 -0.413323]
(50,)
[ 0.413997 -0.715977  0.882065 -0.086934  0.269493 -1.020816 -0.566689
 -1.021524  0.783288 -1.380319 -0.223243  0.39901   0.834728  1.061541
  0.219635 -0.015897  0.771733  0.199281  0.382025 -0.251229  0.712457
  0.370199  0.00492   0.594028 -0.425458 -0.981287  0.31461   0.491601
  0.165573 -0.796892  0.628136  0.356457 -0.055861 -1.028748  0.083855
  0.303463  0.215678 -0.672988  1.109875  0.79904  -0.387502  0.298688
  0.474208 -0.055073  0.162977  0.730034 -1.715664  0

In [9]:
# Cosine similarity between the embeddings of "woman" and "man".
print(get_cosine_similarity_of_words("woman", "man"))

0.7952947101180879


In [10]:
# Top 20 entries ranked by similarity to "woman"; the query word itself is
# part of the vocabulary, so it appears first in the result.
get_nearest_words_of_word("woman", limit_results=20)

[('woman', 1.0000000000000002),
 ('man', 0.7952947101180879),
 ('child', 0.7806146936214137),
 ('young', 0.7562922416266591),
 ('person', 0.7267350203722399),
 ('children', 0.7239497013266021),
 ('girl', 0.7176226225869569),
 ('husband', 0.6984533750825099),
 ('female', 0.6947973660144121),
 ('wife', 0.6894388168312731),
 ('male', 0.6858978209260197),
 ('mother', 0.6734433603918374),
 ('she', 0.6652213993854329),
 ('pregnant', 0.661924485049929),
 ('men', 0.6589109848297353),
 ('women', 0.6584881682302154),
 ('adult', 0.6573114479805181),
 ('married', 0.6474592184630725),
 ('born', 0.6442165423027911),
 ('living', 0.6317649533348679)]

In [11]:
# Top 20 entries ranked by similarity to "man"; the query word itself is
# part of the vocabulary, so it appears first in the result.
get_nearest_words_of_word("man", limit_results=20)

[('man', 1.0),
 ('woman', 0.7952947101180879),
 ('my', 0.73484674315297),
 ('person', 0.7345752142068436),
 ('men', 0.7330896669750585),
 ('she', 0.727424508917398),
 ('love', 0.726554415712222),
 ('said', 0.7231162337200449),
 ('life', 0.7171124319835173),
 ('young', 0.7041745343157707),
 ('who', 0.6958270692565538),
 ('thing', 0.6841281755375672),
 ('like', 0.6809043909571346),
 ('father', 0.6751276156290783),
 ('girl', 0.6716541252654835),
 ('spirit', 0.6704746742563483),
 ('alone', 0.6686527012562634),
 ('another', 0.6661819937580753),
 ('he', 0.6649867068908999),
 ('good', 0.6647117318539704)]