In [1]:
import os
import gensim


path = os.path.join('word2vec',
        'GoogleNews-vectors-negative300.bin')
model = gensim.models.KeyedVectors.load_word2vec_format(
        path,
        binary=True
    )

In [2]:
import math
import numpy
from operator import itemgetter
from numpy.linalg import norm

EPSILON = 1e-6

def euclidean(vec1, vec2):
  diff = vec1 - vec2
  return math.sqrt(diff.dot(diff))

def cosine_sim(vec1, vec2):
  vec1 += EPSILON * numpy.ones(len(vec1))
  vec2 += EPSILON * numpy.ones(len(vec1))
  return vec1.dot(vec2)/(norm(vec1)*norm(vec2))

def assign_ranks(item_dict):
  ranked_dict = {}
  sorted_list = [(key, val) for (key, val) in sorted(item_dict.items(),
                                                     key=itemgetter(1),
                                                     reverse=True)]
  for i, (key, val) in enumerate(sorted_list):
    same_val_indices = []
    for j, (key2, val2) in enumerate(sorted_list):
      if val2 == val:
        same_val_indices.append(j+1)
    if len(same_val_indices) == 1:
      ranked_dict[key] = i+1
    else:
      ranked_dict[key] = 1.*sum(same_val_indices)/len(same_val_indices)
  return ranked_dict

def correlation(dict1, dict2):
  avg1 = 1.*sum([val for key, val in dict1.iteritems()])/len(dict1)
  avg2 = 1.*sum([val for key, val in dict2.iteritems()])/len(dict2)
  numr, den1, den2 = (0., 0., 0.)
  for val1, val2 in zip(dict1.itervalues(), dict2.itervalues()):
    numr += (val1 - avg1) * (val2 - avg2)
    den1 += (val1 - avg1) ** 2
    den2 += (val2 - avg2) ** 2
  return numr / math.sqrt(den1 * den2)

def spearmans_rho(ranked_dict1, ranked_dict2):
  assert len(ranked_dict1) == len(ranked_dict2)
  if len(ranked_dict1) == 0 or len(ranked_dict2) == 0:
    return 0.
  x_avg = 1.*sum([val for val in ranked_dict1.values()])/len(ranked_dict1)
  y_avg = 1.*sum([val for val in ranked_dict2.values()])/len(ranked_dict2)
  num, d_x, d_y = (0., 0., 0.)
  for key in ranked_dict1.keys():
    xi = ranked_dict1[key]
    yi = ranked_dict2[key]
    num += (xi-x_avg)*(yi-y_avg)
    d_x += (xi-x_avg)**2
    d_y += (yi-y_avg)**2
  return num/(math.sqrt(d_x*d_y))

In [3]:
vec1=model['test'].tolist()
vec2 =model['exam'].tolist()
cosine_sim(vec1,vec2)

0.5065850456386709

In [4]:
data = model.vectors
print(data.shape)

(3000000, 300)


In [42]:
import numpy as np
num_points = 100000
selected_index = np.random.choice(data.shape[0],size=num_points, replace=True)
data = data[selected_index,:]
words = np.array(model.index2word)[selected_index]

In [43]:
import umap
fit = umap.UMAP(n_neighbors=15, random_state=42,n_components=30, metric='cosine')
%time u = fit.fit(data)

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File ".venv/lib/python3.6/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^

  state.func_ir.loc))
  n_components


CPU times: user 8min 28s, sys: 12.8 s, total: 8min 41s
Wall time: 5min 19s


In [44]:
%time  u.transform([ model['test']])

The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File ".venv/lib/python3.6/site-packages/umap/nndescent.py", line 124:
    @numba.njit(parallel=True)
    def init_from_random(n_neighbors, data, query_points, heap, rng_state):
    ^

  state.func_ir.loc))
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File ".venv/lib/python3.6/site-packages/umap/nndescent.py", line 135:
    @numba.njit(parallel=True)
    def init_from_tree(tree, data, query_points, heap, rng_state):
    ^

  state.func_ir.loc))


CPU times: user 1.52 s, sys: 0 ns, total: 1.52 s
Wall time: 1.51 s


array([[ 6.3946724 , -0.30041966,  0.14776045,  0.32056022, -0.08450462,
        -0.9038257 , -0.43019935, -0.9653172 ,  0.21375048, -0.7295887 ,
         0.40557215,  0.18605426, -0.65948   ,  0.6652238 ,  0.45901376,
        -0.3479045 , -1.568384  ,  1.2294029 , -0.88471395, -0.7208823 ,
         2.1124218 , -1.3881054 , -0.38281393, -1.016794  ,  0.46783683,
        -1.3906313 , -0.02890567,  0.68667674,  0.77653843, -0.36162987]],
      dtype=float32)

In [45]:
for line in open("data/word-sim/EN-RW-STANFORD.txt"):
    line = line.strip().lower()
    word1, word2, val = line.split()
    try:
        vec1= model[word1].tolist()
        vec2 = model[word2].tolist()
       
        u_vec1 = u.transform([vec1] )[0]
        u_vec2 =  u.transform([vec2] )[0]
         
    except Exception as e:
        next
    print(word1,word2,val, cosine_sim(vec1, vec2),cosine_sim(u_vec1, u_vec2))

squishing squirt 5.88 0.371217937103479 0.9993143
undated undatable 5.83 -0.026137934255355592 0.99931425
circumvents beat 5.33 0.06424554383742188 0.56993926
circumvents ebb 3.25 0.002631993984925384 0.7105238
dispossess deprive 6.83 0.3823724922087398 0.56194013
provincialism narrow-mindedness 8.11 0.12013389284618387 0.56194013
provincialism partiality 4.50 0.3802551081133371 0.9988425
instrumentality department 3.00 0.00841556574646629 0.8858651
instrumentality utility 7.29 0.025310278168486293 0.9497739
involvement action 6.86 0.20863856809983605 0.98113894
involvement implication 4.17 0.3210843007222329 0.8720968
ecclesiastic clergyman 8.67 0.3639152267291902 0.997778
brigadier general 8.17 0.20714583810517445 0.9693018
carbonic chemical 5.17 0.44973107675164625 0.9958838
carbonic paper 5.17 0.1732526163222226 0.6288291
aspirate pronounce 5.20 0.1718900801083876 0.70210963
aspirate remove 2.50 0.19280796505738604 0.77262425
monotype machine 5.25 0.08733217481225172 0.7734215
inco

lastingly wear 3.00 0.006089335962038003 0.91496724
lastingly populate 0.43 0.09103716895024902 0.94250274
unexpected unannounced 8.00 0.3233091596008463 0.88168836
interlink intercommunicate 7.00 0.4402033797130174 0.5340334
interlink connect 9.22 0.5540525156570625 0.58073205
brained kill 1.75 0.14066978449010953 0.8530815
brained hit 3.40 0.021940798319572807 0.5139274
unicycles wheel 6.88 0.22109048299643871 0.75905347
unicycles bicycle 6.57 0.45955642180308437 0.7701854
preservers cook 5.17 0.1981367751138853 0.86219954
preservers worker 2.80 0.034475221946611775 0.86738086
autografts graft 5.33 0.3680154095079689 -0.23483904
retarding decelerate 7.88 0.28513918285707335 0.72487646
retarding stay 4.20 -0.09420506341296708 0.7649304
subfamily group 8.00 0.1955251280085922 0.82412326
encrust coat 8.44 0.24307100452660774 0.93706435
encrust decorate 6.71 0.3446564630969083 0.9781925
wingless flightless 8.80 0.45661499150075496 0.8026351
intraspecific interspecies 5.50 0.5056233279790

autofocus optical 6.71 0.39240019831540807 0.9934979
conversely interview 0.38 -0.03320795038386774 0.41623935
conversely proposition 0.14 0.16472485085446711 0.4148617
ceaseless continuous 9.33 0.5570674719587815 0.7164407
hybridise breed 6.86 0.32857795381725435 0.9847718
antitumor brain 0.71 0.2810598165927085 0.21705821
parallelism similarity 8.17 0.28652297293578943 0.59415066
sightedness sight 6.88 0.23909773551541946 0.79935145
battleships dreadnought 5.50 0.4363030631154009 0.6667391
subarctic polar 7.29 0.5812414608165154 0.8999269
subarctic overshoe 0.14 0.1533547983293686 0.9305185
sufferance self 2.29 0.125289443026627 0.9992832
uncomprehending undiscerning 6.57 0.43701180219467795 0.99781877
regretful penitent 8.78 0.2686817564691353 0.9633802
monoplanes airplane 7.62 0.4620693900783808 0.99849415
steepen change 3.83 0.20182740162050336 0.7714985
transfuse breathe 3.00 0.3019830321577407 0.9975917
transfuse pour 4.89 0.31683236898677397 0.9145099
hyperextension extension 6

interjection break 7.14 0.06912979504334031 0.5576367
interjection exclamation 8.00 0.40770244292195473 0.9996126
consequences position 0.56 0.0745289259226878 0.84883404
consequences result 9.38 0.3270429024892837 0.9696777
preschoolers child 7.83 0.44331882145091134 0.6855121
unmentionables garment 6.00 0.27140834531855623 0.9658127
subeditor editor 6.71 0.5107062986951805 0.98215055
standardize regulate 8.38 0.41253215534650817 0.57798636
standardize measure 4.60 0.18586182804892115 0.57711333
winners walloper 1.67 0.00546795342380992 0.57711345
persuasions electioneering 2.75 0.19017131450408645 0.78740346
persuasions belief 7.50 0.1796196251461136 0.9986927
conformations balance 2.33 0.08977460932319839 0.79131734
conformations curvature 4.75 0.2759398673108777 0.9761115
seriousness badness 5.80 0.29439975278338865 0.9987837
seriousness gravity 7.67 0.4041989987863065 0.7151698
metabolism organic 5.80 0.17125752279598783 0.9588316
reprints reproduce 8.44 0.2713127349486787 0.67976

commodes fixture 5.00 0.07766752856281996 0.58596164
conscripting enlist 7.67 0.35943172598659096 0.9992262
depopulate shrink 7.33 0.2637674642550109 0.9531219
directional leading 5.00 0.0115752165152462 0.94868207
disbelieving doubt 9.25 0.18927685625187562 0.996654
disbelieving incredulous 9.44 0.5982360860210418 0.99987775
hypervelocity speed 8.67 0.25386166689389655 0.90617543
interdisciplinary nonindulgent 4.25 0.03967578199543789 0.9061755
nonverbally numerical 4.00 0.21530824046020908 0.67631745
pressurise change 5.57 0.21530824046020908 0.67631745
measurements viscometry 4.40 0.3507463161924987 0.6763175
nonfunctional run-down 7.60 0.1305736615912411 0.67631763
severer intense 8.25 0.27314634696455226 0.67655057
brainless unintelligent 9.67 0.5894661745771158 0.99760747
marinate steep 7.14 -0.060956832831216265 0.93066067
freighter cargo 6.17 0.5863129701233459 0.99797374
terrorize coerce 4.80 0.4869937204021609 0.8561073
terrorize frighten 8.67 0.6453452460504638 0.84593827
pr

amounted work 2.50 -0.04883517874259273 0.8831273
vegetational growth 7.57 -0.04883517874259273 0.8831273
vegetational forest 7.71 -0.04883517874259273 0.8831274
unfavourable adverse 8.20 -0.04883517874259273 0.8831274
unfavourable discriminatory 6.33 -0.04883517874259273 0.88312745
vocalism voice 7.67 0.3830599678073007 0.9747271
vocalism system 0.67 0.010503768090247117 0.53536034
continence self-discipline 4.83 0.020124313881651777 0.53536034
immoderate excessive 7.50 0.4039387177678933 0.99913305
internships position 7.17 0.19451206696549192 0.88557875
translunar heavenly 6.67 0.19451206696549192 0.8855786
ideality quality 5.20 0.1635419698876103 0.780233
importance momentousness 6.83 0.38078162731350684 0.8188754
importance primacy 7.17 0.44844167137622853 0.8731397
jarringly move 3.86 0.053984992154449775 0.9600292
jarringly conflict 2.60 0.12202369721634332 0.8181832
affectional emotional 7.83 0.2590135858392114 0.84279615
rediscovery discovery 6.60 0.41119467647224384 0.590426


refuted disprove 9.62 0.5507577875504919 0.9990029
greenly discolor 4.20 0.1822192014193622 0.55740863
greenly emerald 9.12 0.09977690899549672 0.6180743
importances standing 4.29 0.09977690899549672 0.61807436
importances deal 3.20 0.09977690899549672 0.61807436
autoimmune carrier 5.14 0.046495608490554764 0.78625894
autoimmune exempt 4.50 0.02294402802736209 0.77940226
circumnavigations travel 8.25 0.15185975848604674 0.59555596
interrelationship psychodynamics 6.00 0.36803450177133074 0.99889463
monoatomic small 5.86 0.36803450177133074 0.9988945
monoatomic thermonuclear 4.33 0.36803450177133074 0.9988946
undefinable undefined 8.44 0.39363991852099556 0.99619704
catalogued compose 4.86 0.39363991852099556 0.99619704
catalogued classify 8.78 0.39363991852099556 0.996197
heterosexism discrimination 3.67 0.4943805634657751 0.90990543
inflicted intrude 5.29 0.20895367607794407 0.7672118
preaching evangelize 8.38 0.4633231497424216 0.9999213
preaching sermonize 8.67 0.483814685437642 0.9

syntactic plan 5.60 -0.071592878479913 0.5059741
reproducible duplicable 9.50 0.4415966328788403 0.9276313
monopolist person 4.80 0.13740296052982903 0.8979788
comportment manner 7.00 0.33893164023576927 0.99735314
roofers thatcher 6.83 0.245716099986653 0.7781095
improving relieve 4.50 0.16425577288514856 0.8089707
improving reform 7.50 0.21603274369464018 0.8311903
adjustor investigator 6.83 0.2544828657565802 0.7367404
dooming convict 5.67 0.06693934508617322 0.8466639
preadolescent young 8.50 0.3755991215130276 0.8663937
depictive representational 9.67 0.37881117988748236 0.9848631
stoical unemotional 10.00 0.46007053277153964 0.99967253
dynastic ruler 7.86 0.34644830560448586 0.8517072
hinduism religion 8.00 0.4673637438266454 0.99054396
pathfinder usher 5.67 0.09803190878258929 0.7567039
romanic italian 6.00 0.09803190878258929 0.75670403
overlying lie 5.00 0.1472945586104814 0.5495308
overlying kill 0.38 0.03833708489832268 0.60932326
refinery plant 7.60 0.5582200854540739 0.999

inheritor heiress 9.12 0.20838341791898893 0.972211
conspicuousness boldness 5.57 0.2936664137085431 0.9990478
preconceptions opinion 6.00 0.24626087813066805 0.995953
preconceptions experimenter 1.00 0.2157215542379279 0.90572304
uproariously combustion 4.83 0.12931823063435605 0.72862375
uproariously noise 7.33 0.1298164614086524 0.6674588
glistens spangle 6.83 0.39009109323828056 0.9910563
glistens brightness 8.00 0.27819253838322233 0.7560164
sexless asexual 8.50 0.5858299447521869 0.98882586
sexless unsexy 6.17 0.5199679798388707 0.9960583
spellers writer 6.80 0.17414539800109047 0.7396062
spellers primer 5.20 0.00608814475284252 0.7064279
orchestrations musical 7.14 0.5243881839020913 0.9957637
orchestrations arrangement 7.12 0.24525899255441222 0.52306944
embroiderer needleworker 9.33 0.11199745341963345 0.5230696
arousal desire 9.29 0.24606670767531624 0.7045036
arousal inflammation 5.50 0.3205863329034155 0.99967813
extending increase 7.12 0.20829973452075776 0.8664721
extendi

technology aeronautical 3.83 0.23014411706469048 0.8839834
technology science 7.50 0.4275680193009395 0.8490098
transfusing pour 5.86 0.1354101220720495 0.93872845
transfusing lend 3.71 0.1151587363065402 0.8014969
prolapse descend 6.67 0.017787757354332075 0.85160565
circularize canvass 5.43 0.015110268902584085 0.6108855
circularize poll 5.43 -0.04996148564943428 0.65481657
greenness profusion 1.25 0.27011508943180973 0.99984026
greenness ripeness 7.33 0.3893585779523773 0.98132825
formalisms philosophic 4.00 0.387508963488836 0.99862576
formalisms imitation 1.14 0.14063349667550942 0.99303544
interpenetrate spiritize 4.60 0.13427780959311458 0.99303555
worsens inflame 7.14 0.2976896366249363 0.76370335
worsens tumble 1.75 0.15153736906738127 0.7526516
pathfinders hunt 7.71 0.0773903914058017 0.67277485
demanded clamor 4.83 0.345478200222531 0.84421885
demanded cost 1.12 0.17103437981969313 0.6203952
unequivocal unambiguous 9.00 0.7517441545540192 0.9945667
unequivocal explicit 8.86 

muscularity condition 5.43 0.09430391847481392 0.748672
unspecialised generalized 8.29 0.27203726212767826 0.7099584
appearances manifestation 6.88 0.018878173329221922 0.9123072
disarranged randomize 7.50 0.07565231722650576 0.80130506
sniffers person 3.86 0.06119100517671943 0.6694167
irritatingly worsen 4.50 -0.013029432300069872 0.87640804
irritatingly fret 4.17 0.22116528129244917 0.9920541
exaction demand 7.50 0.08280232290918525 0.94792354
sailings travel 7.43 0.3784671258338042 0.80745006
sailings swan 3.50 0.10577699458093484 0.93438447
objector dissenter 8.20 0.38784928504347765 0.93631166
earmuffs covering 8.25 0.05029004286082606 0.7675374
synoptic same 6.40 0.08831867041825132 0.7966718
infolding organic 0.50 0.08831867041825132 0.7966718
smallish small 8.83 0.5006031522961893 0.8490784
digitise change 4.50 0.5006031522961893 0.8490785
receptions tea 5.00 0.1754625759192048 0.39618802
receptions greeting 8.22 0.1526013651421384 0.50314194
corpulence fleshiness 8.17 0.40285

immobilizing withhold 3.80 0.031651340533610475 0.78316796
promised declare 6.20 0.1970218646727375 0.8957623
employments state 3.75 0.11869983836383863 0.5639685
employments populace 3.33 0.18315322070221746 0.79830575
transposable exchangeable 7.17 0.15693275379533408 0.9428286
protractors drafting 5.83 0.09834533036829354 0.60383546
religiousness piety 7.50 0.49966816294792016 0.64257944
religiousness conscientiousness 4.00 0.3542907123091696 0.99623924
concerts settle 0.44 0.03238202106034021 0.72671646
concerts plan 0.50 0.10607712012545384 0.61323494
postholes hole 5.50 0.31296092618288046 0.7399404
liveable habitable 10.00 0.5643487487373537 0.86919993
besieging attack 8.22 0.2148585159583338 0.9976384
besieging distress 5.50 0.09436832227101086 0.6303956
irregardless look 0.00 0.130441651036438 0.9928371
irregardless prize 0.00 0.0656212652406403 0.93685055
attendance frequency 4.83 0.10827563899501032 0.6157105
attendance presence 8.25 0.20149451526145293 0.7970277
computer ex

devilishly cook 0.22 0.12689509042886465 0.7814678
devilishly antagonize 5.50 0.06825184792618076 0.98975855
disapproving discountenance 5.40 0.12029525362684747 0.6271425
subspaces mathematical 6.57 0.12029525362684747 0.6271424
connoting imply 7.67 0.3868400372736799 0.9984549
connoting express 5.57 0.16487845280564206 0.8921753
inheritance transfer 5.71 0.17483153932472567 0.8753358
inheritance acquisition 7.00 0.0697375916835711 0.72855884
archery sport 7.00 0.36779559669991624 0.8052603
sufficed serve 5.20 0.1306698313583624 0.7305479
belligerence hostility 8.67 0.5591278685973143 0.9990659
procreating brood 7.17 0.4347010660624506 0.8541559
gelatinous thick 8.00 0.38155476926348925 0.970977
villainous wicked 9.75 0.4782544196456637 0.9668473
harmony congruity 8.25 0.39924173309824545 0.98982143
harmony music 7.25 0.24206421833633804 0.8533177
inoffensive innocuous 8.83 0.5288178348839964 0.9992307
insurrectionist young 3.00 0.09875673946438918 0.9167688
inquisitive curious 8.40 0

In [46]:
print ('======================================================================================')
print ("%15s" % "Num Pairs", "%10s" % "Not found", "%15s" % "Word2Vec Rho",  "%15s" % "UMAPed Rho", "word2vec/umap","File name")
print ('======================================================================================')

word_sim_file="data/word-sim/EN-RW-STANFORD.txt"
word_sim_dir="data/word-sim"
manual_dict, auto_dict,reduced_dict = ({}, {},{})
not_found, total_size = (0, 0)
for i, filename in enumerate(os.listdir(word_sim_dir)):
    not_found=0
    for line in open(word_sim_dir+"/"+filename,'r'):
        line = line.strip().lower()
        word1, word2, val = line.split()
        try:
            vec1= model[word1].tolist()
            vec2 = model[word2].tolist()
            
            u_vec1 = u.transform([vec1] )[0]
            u_vec2 =  u.transform([vec2] )[0]
         
        except Exception as e:
            not_found+=1
            next 
        manual_dict[(word1, word2)] = float(val)
        auto_dict[(word1, word2)] = cosine_sim(vec1, vec2)
        reduced_dict[(word1, word2)] = cosine_sim(u_vec1, u_vec2)
        total_size += 1    
    print ( "%15s" % str(total_size), "%10s" % str(not_found), "%15.4f" % spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto_dict)),
           "%15.4f" % spearmans_rho(assign_ranks(manual_dict), assign_ranks(reduced_dict)),"%15.4f" % spearmans_rho(assign_ranks(auto_dict), assign_ranks(reduced_dict)),
          filename)


      Num Pairs  Not found    Word2Vec Rho      UMAPed Rho word2vec/umap File name
            130          0          0.5590         -0.1030          0.2255 EN-YP-130.txt
            195          0          0.6318          0.0089          0.2285 EN-RG-65.txt
            482         12          0.4243          0.2001          0.3717 EN-MTurk-287.txt
           1481          0          0.5444          0.2054          0.4437 EN-SIMLEX-999.txt
           1625          9          0.5657          0.2081          0.4283 EN-VERB-143.txt
           1877          2          0.5054          0.1898          0.4166 EN-WS-353-REL.txt
           3911        209          0.3948          0.1469          0.4126 EN-RW-STANFORD.txt
           6911         54          0.3477          0.2550          0.4810 EN-MEN-TR-3k.txt
           7264          3          0.3450          0.2525          0.4812 EN-WS-353-ALL.txt
           7294          0          0.3425          0.2516          0.4812 EN-MC-30.txt
    