In [1]:
import numpy as np
from numpy.linalg import norm

# Read the current vector model from a whitespace-separated text file:
# each line holds a word followed by the components of its vector.
# NOTE(review): no explicit encoding is passed to open(), so decoding follows
# the platform default -- confirm the file's encoding before pinning one.
with open("wordvecs50k.vec") as f:
    model_lines = f.readlines()

N = len(model_lines)  # number of raw lines read (blank lines included)
vecs = dict()

for raw_line in model_lines:
    # split() with no arguments already ignores leading/trailing whitespace.
    fields = raw_line.split()
    if not fields:
        # A blank or whitespace-only line (e.g. a trailing newline at the end
        # of the file) has no word; indexing fields[0] would raise IndexError.
        continue
    word = fields[0]
    vec = np.array([float(component) for component in fields[1:]])
    # Vectors are stored as read, without length normalization.
    vecs[word] = vec

In [2]:
# Load the older vector model from its JSON dump; the word -> vector mapping
# lives under the top-level "vectors" key.
import json

with open("wordvecs77000.json") as old_model_file:
    old_entries = json.load(old_model_file)["vectors"]

# The "77000" entry is not a word vector, so drop it before converting
# (raises KeyError if the entry is missing, same as a plain `del`).
old_entries.pop("77000")

# Turn every remaining JSON list into a numpy array.
vecs_old = {word: np.array(values) for word, values in old_entries.items()}

In [14]:
# get words with most extreme values at given dimension

def get_extreme_dim_words(vecs, dim, n=20):
    """Return the words with the most extreme values along one dimension.

    Words are ordered by ``vecs[word][dim]`` ascending. The result contains
    the ``n`` lowest-valued words followed by the ``n`` highest-valued words,
    each as a ``(word, value)`` tuple.

    Parameters
    ----------
    vecs : mapping of word -> indexable vector
        Every vector must be indexable at ``dim``.
    dim : int
        Index of the vector component to rank by.
    n : int, optional
        How many words to take from each end (default 20, the value that was
        previously hard-coded).

    Returns
    -------
    list of (word, value) tuples
        Ascending by value. When ``vecs`` holds ``2 * n`` words or fewer, the
        two ends would overlap, so every word is returned exactly once instead
        of being duplicated. ``n <= 0`` yields an empty list.
    """
    if n <= 0:
        # Guard explicitly: ranked[-0:] would be the whole list, not nothing.
        return []

    ranked = sorted(vecs, key=lambda k: vecs[k][dim])

    if len(ranked) <= 2 * n:
        # Head and tail slices would overlap (or exactly tile the list);
        # return each word once rather than listing some of them twice.
        selected = ranked
    else:
        selected = ranked[:n] + ranked[-n:]

    return [(k, vecs[k][dim]) for k in selected]

# Display the 20 lowest- and 20 highest-valued words along dimension 232
# of the current model (`vecs`); the list is shown as the cell's result.
get_extreme_dim_words(vecs, 232)

[('serait', -1.0604),
 ('tfded', -0.5198),
 ('rkd', -0.4469),
 ('roedd', -0.4267),
 ('afded', -0.4151),
 ('kotbot', -0.3762),
 ('ahh', -0.3542),
 ('cluebot', -0.3506),
 ('satyrbot', -0.3383),
 ('delanoygabsadds', -0.3207),
 ('taxdetails', -0.3109),
 ('mergeaccount', -0.2974),
 ('stboti', -0.2843),
 ('mnm', -0.2722),
 ('csdwarnbot', -0.2502),
 ('gmina', -0.2429),
 ('ahhh', -0.2416),
 ('dywedodd', -0.2383),
 ('yourpaintings', -0.2111),
 ('proded', -0.2072),
 ('angeles', 0.5804),
 ('nadu', 0.5887),
 ('monoxide', 0.5957),
 ('kyi', 0.5968),
 ('talk', 0.6037),
 ('rican', 0.6092),
 ('century', 0.6108),
 ('jiabao', 0.6132),
 ('lankan', 0.6168),
 ('lumpur', 0.6292),
 ('laden', 0.6358),
 ('aires', 0.6384),
 ('janeiro', 0.6499),
 ('sciences', 0.6661),
 ('pradesh', 0.6759),
 ('jintao', 0.6928),
 ('dioxide', 0.7159),
 ('prnewswire', 0.7249),
 ('rid', 0.7426),
 ('userreport', 0.8824)]

In [13]:
# Check 3COSADD word similarity
def cos_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as ``a.dot(b) / (norm(a) * norm(b))``, so ``a`` must provide a
    ``dot`` method (e.g. a numpy array).

    If either vector has zero length the cosine is undefined; 0.0 is returned
    in that case instead of letting 0/0 produce ``nan`` (and a RuntimeWarning),
    because ``nan`` scores make any ranking built on this function unreliable.
    """
    denom = norm(a) * norm(b)
    if denom == 0:
        return 0.0
    return a.dot(b) / denom

def find_similar(vecs, word_orig, word_sub, word_add, topn=10):
    """Print the words most similar to ``word_orig - word_sub + word_add``.

    The target vector is built by vector arithmetic on the three looked-up
    vectors, then every word in ``vecs`` is ranked by ``cos_sim`` against that
    target, best first. The three query words are not filtered out of the
    ranking, so they can appear among the printed hits.

    Parameters
    ----------
    vecs : mapping of word -> vector
        Vectors must support ``-`` and ``+`` with each other and be valid
        arguments to ``cos_sim``.
    word_orig, word_sub, word_add : hashable
        Keys of ``vecs`` combined as ``orig - sub + add``.
    topn : int, optional
        Number of top-ranked words to print (default 10, the value that was
        previously hard-coded).

    Returns
    -------
    None
        One line per hit is printed: the word, a space, its similarity.

    Raises
    ------
    KeyError
        If any of the three query words is missing from ``vecs``.
    """
    target = vecs[word_orig] - vecs[word_sub] + vecs[word_add]

    # Score each word exactly once; the scores are reused for both ranking
    # and printing instead of recomputing the similarity for the printed hits.
    scores = {word: cos_sim(target, vecs[word]) for word in vecs}

    # sorted() is stable (also with reverse=True), so words with equal scores
    # keep the iteration order of `vecs`.
    for word in sorted(scores, key=scores.get, reverse=True)[:topn]:
        print(word, scores[word])
        
# Analogy query "king - man + woman". The active call runs it against the
# older model (`vecs_old`); the commented-out call is the same query against
# the newer model (`vecs`).
#find_similar(vecs, "king", "man", "woman")
find_similar(vecs_old, "king", "man", "woman")

king 0.846306552010454
queen 0.7335261779290196
prince 0.70112989695682
emperor 0.6842812566496257
empress 0.676760579698921
throne 0.6742607999288218
monarch 0.669020535149873
heir 0.6616695849405633
aragon 0.6599179621530367
pharaoh 0.6531471520045711
