## word2vecのモデルにvocabを追加することの効果を調べる

In [1]:
from gensim.models.word2vec import Word2Vec

sentences = ['らく賃 担当者 誰', 'NHK 担当者 誰', 'パソコン 担当者 誰', '営業 担当者 誰', '今日 天気 晴れ', 'あなた パソコン 壊す した']
add_tokens = [t.split(' ') for t in ['ハウスコム 担当 誰', '雨 降る 都市', 'シータ 担当 誰']]
tokens =[t.split(' ') for t in sentences]
model = Word2Vec(tokens, min_count=1)
# vocabに登録されない単語をあとで追加する
tokens = tokens + add_tokens
model.wv.vocab

{'NHK': <gensim.models.keyedvectors.Vocab at 0x10d17a400>,
 'あなた': <gensim.models.keyedvectors.Vocab at 0x10b025b00>,
 'した': <gensim.models.keyedvectors.Vocab at 0x10b025be0>,
 'らく賃': <gensim.models.keyedvectors.Vocab at 0x10d2e2da0>,
 'パソコン': <gensim.models.keyedvectors.Vocab at 0x10d2c6518>,
 '今日': <gensim.models.keyedvectors.Vocab at 0x10b025c18>,
 '営業': <gensim.models.keyedvectors.Vocab at 0x10d263e48>,
 '壊す': <gensim.models.keyedvectors.Vocab at 0x10b025c50>,
 '天気': <gensim.models.keyedvectors.Vocab at 0x10d263a58>,
 '担当者': <gensim.models.keyedvectors.Vocab at 0x10b025b70>,
 '晴れ': <gensim.models.keyedvectors.Vocab at 0x10b025d30>,
 '誰': <gensim.models.keyedvectors.Vocab at 0x10b025ba8>}

In [2]:
for i, v in enumerate(tokens):
    print(i, v)

0 ['らく賃', '担当者', '誰']
1 ['NHK', '担当者', '誰']
2 ['パソコン', '担当者', '誰']
3 ['営業', '担当者', '誰']
4 ['今日', '天気', '晴れ']
5 ['あなた', 'パソコン', '壊す', 'した']
6 ['ハウスコム', '担当', '誰']
7 ['雨', '降る', '都市']
8 ['シータ', '担当', '誰']


In [3]:
from gensim.similarities import WmdSimilarity

wmd = WmdSimilarity(tokens, model.wv, num_best=10)

In [4]:
wmd[['今日', '天気', '雨']]

[(4, 0.68704560115561886),
 (5, 0.42023935849543298),
 (1, 0.42022590182637543),
 (0, 0.41805943781947158),
 (2, 0.41619713997288266),
 (6, 0.41571665239292749),
 (8, 0.41571665239292749),
 (3, 0.41430921916305696)]

In [5]:
wmd[['営業', '担当者', '誰']]

[(3, 1.0),
 (2, 0.68061603996031361),
 (0, 0.66797254822508778),
 (1, 0.66068406272664593),
 (6, 0.51299115659824612),
 (8, 0.51299115659824612),
 (5, 0.42098044042256116),
 (4, 0.41435237418029619)]

In [6]:
sim1 = wmd[['ハウスコム', '担当者', '誰']]
sim1

[(0, 0.68376281084565471),
 (3, 0.68197202719555461),
 (1, 0.68122908930812476),
 (2, 0.68082096878287812),
 (6, 0.57433511305204843),
 (8, 0.57433511305204843),
 (4, 0.41697747434257598),
 (5, 0.41589424961603599)]

In [7]:
model.build_vocab(add_tokens, update=True)
model.wv.vocab

{'NHK': <gensim.models.keyedvectors.Vocab at 0x10d17a400>,
 'あなた': <gensim.models.keyedvectors.Vocab at 0x10b025b00>,
 'した': <gensim.models.keyedvectors.Vocab at 0x10b025be0>,
 'らく賃': <gensim.models.keyedvectors.Vocab at 0x10d2e2da0>,
 'シータ': <gensim.models.keyedvectors.Vocab at 0x10b0255c0>,
 'ハウスコム': <gensim.models.keyedvectors.Vocab at 0x10b0256a0>,
 'パソコン': <gensim.models.keyedvectors.Vocab at 0x10d2c6518>,
 '今日': <gensim.models.keyedvectors.Vocab at 0x10b025c18>,
 '営業': <gensim.models.keyedvectors.Vocab at 0x10d263e48>,
 '壊す': <gensim.models.keyedvectors.Vocab at 0x10b025c50>,
 '天気': <gensim.models.keyedvectors.Vocab at 0x10d263a58>,
 '担当': <gensim.models.keyedvectors.Vocab at 0x10b025518>,
 '担当者': <gensim.models.keyedvectors.Vocab at 0x10b025b70>,
 '晴れ': <gensim.models.keyedvectors.Vocab at 0x10b025d30>,
 '誰': <gensim.models.keyedvectors.Vocab at 0x10b025ba8>,
 '都市': <gensim.models.keyedvectors.Vocab at 0x10b0258d0>,
 '降る': <gensim.models.keyedvectors.Vocab at 0x10b0256d8>,
 '雨':

In [8]:
wmd2 = WmdSimilarity(tokens, model.wv, num_best=10)
sim2 = wmd2[['ハウスコム', '担当者', '誰']]
sim2

[(3, 0.68296789697174864),
 (2, 0.67661758903888269),
 (6, 0.66309490029838059),
 (0, 0.65615384683845934),
 (1, 0.65428452487797861),
 (8, 0.52569002489342975),
 (4, 0.4201626171953331),
 (5, 0.41779377114491612),
 (7, 0.41509462028740246)]

うまく行かなかった

## fastTextでも試してみる

In [9]:
from gensim.models.fasttext import FastText

ft = FastText(tokens, min_count=1)
ft.wv.vocab

{'NHK': <gensim.models.keyedvectors.Vocab at 0x10d3065c0>,
 'あなた': <gensim.models.keyedvectors.Vocab at 0x10d3068d0>,
 'した': <gensim.models.keyedvectors.Vocab at 0x10d3063c8>,
 'らく賃': <gensim.models.keyedvectors.Vocab at 0x10d3069b0>,
 'シータ': <gensim.models.keyedvectors.Vocab at 0x10d306748>,
 'ハウスコム': <gensim.models.keyedvectors.Vocab at 0x10d306630>,
 'パソコン': <gensim.models.keyedvectors.Vocab at 0x10d306978>,
 '今日': <gensim.models.keyedvectors.Vocab at 0x10d306940>,
 '営業': <gensim.models.keyedvectors.Vocab at 0x10d306908>,
 '壊す': <gensim.models.keyedvectors.Vocab at 0x10d3067b8>,
 '天気': <gensim.models.keyedvectors.Vocab at 0x10d306550>,
 '担当': <gensim.models.keyedvectors.Vocab at 0x10d3061d0>,
 '担当者': <gensim.models.keyedvectors.Vocab at 0x10d306358>,
 '晴れ': <gensim.models.keyedvectors.Vocab at 0x10d306780>,
 '誰': <gensim.models.keyedvectors.Vocab at 0x10d306390>,
 '都市': <gensim.models.keyedvectors.Vocab at 0x10d3066a0>,
 '降る': <gensim.models.keyedvectors.Vocab at 0x10d306470>,
 '雨':

In [10]:
ft_wmd = WmdSimilarity(tokens, ft.wv, num_best=10)

In [11]:
ft_wmd[['今日', '天気', '雨']]

[(4, 0.69145699614667888),
 (7, 0.51195537646029243),
 (3, 0.4250534667048862),
 (2, 0.4246838361758315),
 (8, 0.42345340384927677),
 (5, 0.42253090139887278),
 (6, 0.42121676386405643),
 (1, 0.42113327558094021),
 (0, 0.4201475017110804)]

In [12]:
ft_wmd[['営業', '担当者', '誰']]

[(3, 1.0),
 (0, 0.68282325370155317),
 (1, 0.68245570600744221),
 (2, 0.66490962040567869),
 (8, 0.54078081044998683),
 (6, 0.52920984604441657),
 (5, 0.42532976337983863),
 (4, 0.42128467111341),
 (7, 0.42087137422751869)]

In [13]:
ft_sim1 = ft_wmd[['ハウスコム', '担当者', '誰']]
ft_sim1

[(6, 0.70366444296287634),
 (0, 0.70156887296119619),
 (3, 0.68097732765106977),
 (2, 0.66979634869876514),
 (1, 0.66569929231130931),
 (8, 0.53744106574999362),
 (5, 0.43141169672901297),
 (7, 0.42238419005841726),
 (4, 0.41934127221494194)]

fastTextだとやはり多少未知語に強いようだ、あやういが予測できている

In [14]:
ft.build_vocab(add_tokens, update=True)
ft.wv.vocab

{'NHK': <gensim.models.keyedvectors.Vocab at 0x10d3065c0>,
 'あなた': <gensim.models.keyedvectors.Vocab at 0x10d3068d0>,
 'した': <gensim.models.keyedvectors.Vocab at 0x10d3063c8>,
 'らく賃': <gensim.models.keyedvectors.Vocab at 0x10d3069b0>,
 'シータ': <gensim.models.keyedvectors.Vocab at 0x10d306748>,
 'ハウスコム': <gensim.models.keyedvectors.Vocab at 0x10d306630>,
 'パソコン': <gensim.models.keyedvectors.Vocab at 0x10d306978>,
 '今日': <gensim.models.keyedvectors.Vocab at 0x10d306940>,
 '営業': <gensim.models.keyedvectors.Vocab at 0x10d306908>,
 '壊す': <gensim.models.keyedvectors.Vocab at 0x10d3067b8>,
 '天気': <gensim.models.keyedvectors.Vocab at 0x10d306550>,
 '担当': <gensim.models.keyedvectors.Vocab at 0x10d3061d0>,
 '担当者': <gensim.models.keyedvectors.Vocab at 0x10d306358>,
 '晴れ': <gensim.models.keyedvectors.Vocab at 0x10d306780>,
 '誰': <gensim.models.keyedvectors.Vocab at 0x10d306390>,
 '都市': <gensim.models.keyedvectors.Vocab at 0x10d3066a0>,
 '降る': <gensim.models.keyedvectors.Vocab at 0x10d306470>,
 '雨':

In [15]:
ft_wmd2 = WmdSimilarity(tokens, ft.wv, num_best=10)
ft_sim2 = wmd2[['ハウスコム', '担当者', '誰']]
ft_sim2

[(3, 0.68296789697174864),
 (2, 0.67661758903888269),
 (6, 0.66309490029838059),
 (0, 0.65615384683845934),
 (1, 0.65428452487797861),
 (8, 0.52569002489342975),
 (4, 0.4201626171953331),
 (5, 0.41779377114491612),
 (7, 0.41509462028740246)]

ボキャブラリを入れることでfastTextも変化したが、結果は悪くなった