In [1]:
from gensim.models import word2vec

In [3]:

# Reference: https://bit.ly/3DAGC4r

# Input: a list of sentences, where each sentence is a list of word tokens.
sample_sents = [['this', 'is', 'a', 'first', 'sentence', '.'],
                ['this', 'is', 'a', 'second', 'sentence', '.']]

# Train Word2Vec on the toy corpus; the trained model is kept in `model`.
# docs: https://bit.ly/3ctEVd9
#
# Reproducibility: the seed is pinned explicitly (1 is gensim's documented
# default, so the initial vectors do not change) and training is limited to a
# single worker thread. The gensim docs state that multi-threaded training
# introduces ordering jitter from OS thread scheduling, so a fully
# deterministic run requires workers=1.
model = word2vec.Word2Vec(sentences=sample_sents,
                          vector_size=100, window=5, min_count=1,
                          seed=1, workers=1)


In [4]:
# Look up the distributed representation (embedding vector) of a single word.
model.wv.get_vector('this')

array([-0.00713902,  0.00124103, -0.00717672, -0.00224462,  0.0037193 ,
        0.00583312,  0.00119818,  0.00210273, -0.00411039,  0.00722533,
       -0.00630704,  0.00464721, -0.00821997,  0.00203647, -0.00497705,
       -0.00424769, -0.00310899,  0.00565521,  0.0057984 , -0.00497465,
        0.00077333, -0.00849578,  0.00780981,  0.00925729, -0.00274233,
        0.00080022,  0.00074665,  0.00547788, -0.00860608,  0.00058445,
        0.00686942,  0.00223159,  0.00112468, -0.00932216,  0.00848237,
       -0.00626413, -0.00299237,  0.00349379, -0.00077263,  0.00141129,
        0.00178199, -0.0068289 , -0.00972481,  0.00904058,  0.00619805,
       -0.00691293,  0.00340348,  0.00020606,  0.00475374, -0.00711994,
        0.00402695,  0.00434743,  0.00995737, -0.00447374, -0.00138927,
       -0.00731732, -0.00969783, -0.00908026, -0.00102276, -0.00650329,
        0.00484973, -0.00616403,  0.00251919,  0.00073944, -0.00339216,
       -0.00097922,  0.00997912,  0.00914589, -0.00446183,  0.00

In [7]:
# Retrieve the top-N words most similar to a given word,
# as (word, similarity) pairs.
N = 5
model.wv.most_similar(positive=['this'], topn=N)

[('a', 0.17018888890743256),
 ('second', 0.13887983560562134),
 ('first', 0.03476494178175926),
 ('sentence', 0.004503031261265278),
 ('.', -0.02775035984814167)]

In [10]:
# Compute the cosine similarity between two words.
w1 = 'this'
w2 = 'is'
model.wv.similarity(w1, w2)

-0.04461707

In [11]:
# Addition and subtraction of word vectors.
# This enables queries in the spirit of: king - man + woman = queen.
# The king example would be written as:
# most_similar(positive=['king', 'woman'], negative=['man'])
model.wv.most_similar(
    positive=['this', 'first'],
    negative=['second'],
)


[('a', 0.15189601480960846),
 ('.', 0.029083924368023872),
 ('sentence', -0.07171175628900528),
 ('is', -0.08107341825962067)]

In [12]:
# Save the word vectors in word2vec text format.
# First line of the output file: number of words and vector dimensionality.
# Every following line: one word per line, followed by its vector values.
model.wv.save_word2vec_format(fname='word2vec.txt', binary=False)


# Vectors are often saved and published in binary form to reduce file size:
# model.wv.save_word2vec_format('word2vec.bin', binary=True)

"""
7 100
. -0.00053622725 0.00023643016 0.0051033497 0.009009273 -0.0093029495 -0.007116809 0.0064588715 0.008972988 -0.005015428 -0.003763373 0.0073805046 -0.0015334726 -0.0045366143 0.0065540504 -0.0048601604 -0.0018160177 0.0028765798 0.0009918738 -0.008285215 -0.009448819 0.007311766 0.005070262 0.0067576934 0.00076286553 0.0063508893 -0.0034053659 -0.00094640255 0.0057685734 -0.0075216386 -0.003936105 -0.007511582 -0.00093004224 0.009538119 -0.007319167 -0.0023337698 -0.0019377422 0.008077435 -0.005930896 4.5161247e-05 -0.004753735 -0.009603551 0.005007293 -0.008759587 -0.0043918253 -3.5099984e-05 -0.00029618264 -0.00766124 0.009614741 0.0049820566 0.009233143 -0.008157918 0.004495797 -0.0041370774 0.0008245349 0.008498618 -0.004462178 0.0045175003 -0.0067869616 -0.0035484887 0.009398508 -0.0015776539 0.00032137157 -0.00414063 -0.007682688 -0.0015080094 0.0024697948 -0.0008880281 0.0055336617 -0.002742977 0.0022600652 0.0054557943 0.008345952 -0.0014537406 -0.009208143 0.004370551 0.00057178497 0.0074419067 -0.0008132839 -0.0026384138 -0.008753009 -0.0008565569 0.002826562 0.005401428 0.0070526553 -0.005703123 0.0018588186 0.006088862 -0.0047980524 -0.0031072616 0.0067976285 0.0016314745 0.00018991709 0.0034736372 0.0002177763 0.009618826 0.005060604 -0.008917391 -0.0070415614 0.0009014559 0.006392534
sentence -0.0086196875 0.003665738 0.0051898835 0.005741937 0.007466917 -0.0061676763 0.0011056137 0.0060472824 -0.0028400517 -0.0061735227 -0.000410223 -0.00836895 -0.005600014 0.0071045374 0.0033525396 0.0072256685 0.0068002464 0.007530742 -0.0037891555 -0.00056180713 0.0023483753 -0.0045190332 0.008388732 -0.009858165 0.006764641 0.0029144168 -0.004932833 0.004398186 -0.0017395759 0.006711383 0.00996485 -0.004362445 -0.000599339 -0.0056956387 0.0038508223 0.0027866268 0.0068910765 0.0061010956 0.009538496 0.009273417 0.007898068 -0.006989505 -0.009155865 -0.0003557539 -0.003099842 0.007894316 0.005938573 -0.0015456629 0.0015109634 0.0017900396 0.007817571 -0.009510188 -0.00020553112 0.0034691954 -0.00093897345 0.008381772 0.009010782 0.006536505 -0.00071162224 0.007710404 -0.008534336 0.0032071066 -0.004637997 -0.0050889566 0.0035896183 0.005370338 0.007769513 -0.0057665063 0.0074333595 0.006625495 -0.0037098003 -0.008745641 0.0054374672 0.006509755 -0.0007875514 -0.006709857 -0.0070859264 -0.0024970602 0.0051432536 -0.0036652375 -0.00937006 0.0038267397 0.004884479 -0.0064285635 0.0012085581 -0.0020748782 2.4402141e-05 -0.009883509 0.0026920033 -0.0047501065 0.0010876465 -0.0015762257 0.002196672 -0.007881577 -0.002717185 0.0026631975 0.005346682 -0.0023915148 -0.009510095 0.0045058774
a 9.456396e-05 0.0030773187 -0.0068126465 -0.0013754654 0.007668581 0.0073464084 -0.0036732983 0.0026427007 -0.00831713 0.0062054847 -0.0046373224 -0.003164108 0.009311357 0.0008733845 0.0074907015 -0.006074064 0.005160506 0.009922821 -0.0084573915 -0.0051356913 -0.0070648384 -0.0048626517 -0.003778565 -0.008536201 0.007955606 -0.004843938 0.008423612 0.0052625705 -0.006550026 0.00395787 0.0054701497 -0.007426536 -0.0074057197 -0.0024752307 -0.008625727 -0.0015815735 -0.00040343284 0.0032996845 0.0014418793 -0.00088142155 -0.005594058 0.0017303658 -0.0008973718 0.00679369 0.003973589 0.0045294715 0.0014343048 -0.0026998566 -0.0043668128 -0.0010320758 0.0014370275 -0.00264601 -0.007073784 -0.007805307 -0.009121788 -0.0059351707 -0.0018474245 -0.0043238713 -0.0064606713 -0.0037173224 0.004289157 -0.0037390448 0.008378174 0.0015339922 -0.0072423196 0.0094337985 0.007631211 0.005493281 -0.0068488456 0.0058226776 0.0040090918 0.005185368 0.0042559 0.0019397545 -0.0031701636 0.008353843 0.009612178 0.003792603 -0.002836995 7.126331e-06 0.0012188172 -0.008458326 -0.008223946 -0.00023101569 0.0012372875 -0.005743382 -0.004725275 -0.0073460746 0.008328614 0.00012129784 -0.0045093987 0.005701704 0.009180014 -0.004099872 0.00796468 0.005375433 0.005879123 0.0005125904 0.008213084 -0.0070190406
is -0.008242679 0.009299355 -0.00019766092 -0.0019672776 0.004603629 -0.004095316 0.002743113 0.0069399667 0.006065426 -0.0075107957 0.00938235 0.0046718074 0.003966119 -0.0062435055 0.008459978 -0.002150166 0.008825188 -0.0053620026 -0.008129421 0.0068245577 0.0016711927 -0.00219851 0.0095136 0.009493854 -0.009774048 0.0025052286 0.0061566923 0.0038724565 0.002022786 0.0004305005 0.0006736302 -0.0038206363 -0.0071402504 -0.0020888734 0.0039238976 0.008818683 0.00925915 -0.005975938 -0.009402671 0.009764376 0.0034297847 0.0051661157 0.006282344 -0.0028042626 0.0073227026 0.0028302716 0.0028710032 -0.002380371 -0.0031282497 -0.0023701428 0.0042764354 7.605791e-05 -0.009584279 -0.009665544 -0.0061481954 -0.0001285696 0.001997416 0.0094319675 0.00558435 -0.0042906976 0.00027831554 0.0049643586 0.0076983096 -0.0011442233 0.0043234206 -0.0058143805 -0.00080419064 0.00810005 -0.002360065 -0.009663455 0.0057792594 -0.0039298222 -0.0012228728 0.0099805165 -0.0022563506 -0.004757066 -0.0053293873 0.006980889 -0.0057088733 0.0021136617 -0.005255661 0.006120713 0.0043573068 0.0026063537 -0.0014910841 -0.0027460647 0.008992936 0.0052157734 -0.0021625208 -0.00947031 -0.007426052 -0.0010637427 -0.00079494715 -0.0025629092 0.00968272 -0.00045852186 0.005873761 -0.0074475883 -0.002506075 -0.005549865
this -0.007139015 0.0012410306 -0.0071767163 -0.002244618 0.0037193035 0.005833123 0.0011981821 0.0021027303 -0.0041103913 0.0072253323 -0.0063070417 0.0046472144 -0.008219974 0.0020364665 -0.004977052 -0.004247689 -0.0031089855 0.0056552077 0.0057983994 -0.004974649 0.00077332975 -0.008495779 0.0078098057 0.0092572905 -0.0027423287 0.0008002233 0.0007466507 0.005477884 -0.008606079 0.0005844545 0.006869421 0.0022315932 0.0011246753 -0.009322157 0.008482366 -0.0062641287 -0.0029923748 0.0034937859 -0.00077262876 0.0014112901 0.0017819905 -0.0068288995 -0.009724813 0.009040585 0.0061980532 -0.0069129276 0.0034034825 0.00020606279 0.0047537447 -0.0071199443 0.004026954 0.0043474343 0.0099573685 -0.004473741 -0.0013892651 -0.0073173214 -0.00969783 -0.009080257 -0.0010227561 -0.006503291 0.0048497273 -0.0061640264 0.0025191857 0.0007394409 -0.0033921553 -0.0009792233 0.009979124 0.009145887 -0.0044618296 0.009083025 -0.005641763 0.005930922 -0.0030972194 0.0034317516 0.0030172253 0.006900461 -0.0023738837 0.008775037 0.007589428 -0.009547646 -0.008008211 -0.0076378966 0.0029232574 -0.0027947235 -0.0069295214 -0.008128264 0.008309178 0.0019904876 -0.009328017 -0.0047927164 0.0031367373 -0.004713206 0.005280843 -0.0042334413 0.0026417947 -0.008045687 0.006209886 0.004818888 0.0007871914 0.0030134462
second -0.008727483 0.0021301603 -0.0008735442 -0.009319088 -0.0094281435 -0.001410718 0.0044324086 0.003704071 -0.0064986944 -0.006873069 -0.0049994136 -0.0022868442 -0.0072502876 -0.009603319 -0.0027436304 -0.008362842 -0.006038876 -0.005670929 -0.0023441387 -0.0017069983 -0.0089569995 -0.0007351994 0.008152506 0.0076904297 -0.007206116 -0.0036668323 0.0031185509 -0.0095707225 0.001476438 0.006524465 0.0057464195 -0.008763063 -0.004517145 -0.008140161 4.595518e-05 0.009263632 0.0059733056 0.005067308 0.0050610616 -0.0032429171 0.009552184 -0.0073564244 -0.007270389 -0.002265389 -0.00077856064 -0.0032161046 -0.000592587 0.007488823 -0.0006975198 -0.0016249418 0.002744398 -0.008359101 0.007855804 0.008536103 -0.009584088 0.0024462652 0.009904971 -0.0076658037 -0.00696692 -0.0077365185 0.008395922 -0.0006813359 0.009144409 -0.008158221 0.0037430834 0.0026350426 0.000742712 0.002327676 -0.007469094 -0.009358375 0.0023545765 0.006148455 0.007985687 0.0057358933 -0.0007773375 0.008306164 -0.009336314 0.0034061312 0.00026675223 0.003857243 0.0073857834 -0.0067251683 0.0055844807 -0.009522226 -0.00080446 -0.008688739 -0.0050986744 0.0092892265 -0.001858263 0.0029144264 0.009071278 0.008938131 -0.008208435 -0.0030123137 0.009886606 0.0051044296 -0.0015880871 -0.0086920215 0.0029615164 -0.006675899
first 0.0081322715 -0.0044573355 -0.0010683584 0.0010063648 -0.00019111396 0.0011481761 0.0061138608 -2.0272731e-05 -0.0032459665 -0.0015107298 0.005897298 0.001514101 -0.0007242632 0.009333245 -0.0049212836 -0.00083841087 0.0091754105 0.0067494274 0.0015028549 -0.008882561 0.001148746 -0.0022882556 0.009368236 0.0012099266 0.0014900636 0.0024064088 -0.0018360066 -0.0049996353 0.0002324295 -0.0020141816 0.006600933 0.008940122 -0.00067475555 0.0029770136 -0.0061076544 0.0016993236 -0.006926234 -0.008694027 -0.005900204 -0.008956475 0.0072775935 -0.005772033 0.008276351 -0.0072435453 0.0034216738 0.009674997 -0.007785449 -0.009945059 -0.0043291473 -0.0026831317 -0.00027128935 -0.008831551 -0.0086175585 0.0028002094 -0.0082064085 -0.0090693375 -0.002340467 -0.008631809 -0.007056651 -0.008401151 -0.0003013301 -0.0045642997 0.0066271736 0.0015271592 -0.003341477 0.006108971 -0.0060132863 -0.0046561696 -0.0072075105 -0.004336581 -0.00180933 0.0064896415 -0.002770393 0.0049189664 0.0069044423 -0.0074637057 0.00456485 0.0061269784 -0.0029544758 0.0066250204 0.0061258795 -0.006443486 -0.0067645526 0.0025389576 -0.0016238189 -0.006065128 0.009499209 -0.005130148 -0.0065540983 -0.0001198864 -0.002701428 0.0004444003 -0.0035374593 -0.0004193306 -0.00070861576 0.00082282064 0.008194816 -0.0057367086 -0.0016595292 0.005571606
"""


In [None]:
# Load the saved word vectors back from the word2vec text file.
from gensim.models import KeyedVectors

# NOTE(review): this rebinds `model` from the trained Word2Vec object to the
# loaded KeyedVectors object. Earlier cells access `model.wv`, so confirm they
# are not re-run after this cell.
model = KeyedVectors.load_word2vec_format(fname='word2vec.txt', binary=False)
# Binary variant:
# model = KeyedVectors.load_word2vec_format('word2vec.bin', binary=True)