In [9]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from wikipedia2vec import Wikipedia2Vec
from tqdm import tqdm

In [2]:
# load wikipedia2vec model
en_w2v = Wikipedia2Vec.load("../model/enwiki_20180420_300d.pkl")
ja_w2v = Wikipedia2Vec.load("../model/jawiki_20180420_300d.pkl")

In [3]:
# load word pair dict
pair_df = pd.read_csv("../data/title_pair.csv")
print(len(pair_df))
pair_df.head()

244517


Unnamed: 0,ja,en
0,ベルギー,Belgium
1,幸福,Happiness
2,ジョージ・ワシントン,George Washington
3,ジャック・バウアー,Jack Bauer
4,ダグラス・アダムズ,Douglas Adams


In [10]:
# transfer word to vector

en_emb_list = np.empty((0, 300))
ja_emb_list = np.empty((0, 300))
tmp_en_emb_list = np.empty((0, 300))
tmp_ja_emb_list = np.empty((0, 300))

for i, (ja_word, en_word) in tqdm.tqdm(pair_df.iterrows()):
    try:
        en_emb = en_w2v.get_entity_vector(en_word)
        ja_emb = ja_w2v.get_entity_vector(ja_word)
        
        tmp_en_emb_list = np.concatenate([tmp_en_emb_list, [en_emb]], axis=0)
        tmp_ja_emb_list = np.concatenate([tmp_ja_emb_list, [ja_emb]], axis=0)
    except KeyError:
        pass
    
    if i % 5000 is 0:
        en_emb_list = np.concatenate([en_emb_list, tmp_en_emb_list], axis=0)
        ja_emb_list = np.concatenate([ja_emb_list, tmp_ja_emb_list], axis=0)
        tmp_en_emb_list = np.empty((0, 300))
        tmp_ja_emb_list = np.empty((0, 300))


0it [00:00, ?it/s][A
389it [00:00, 3884.87it/s][A
676it [00:00, 3512.13it/s][A
909it [00:00, 3044.33it/s][A
1121it [00:00, 2687.21it/s][A
1316it [00:00, 2254.81it/s][A
1502it [00:00, 1907.39it/s][A
1674it [00:00, 1803.21it/s][A
1843it [00:00, 1610.87it/s][A
2000it [00:01, 1432.62it/s][A
2144it [00:01, 1201.96it/s][A
2271it [00:01, 987.68it/s] [A
2381it [00:01, 880.77it/s][A
2479it [00:01, 755.99it/s][A
2565it [00:01, 639.71it/s][A
2640it [00:02, 613.71it/s][A
2709it [00:02, 573.45it/s][A
2783it [00:02, 613.93it/s][A
2851it [00:02, 629.55it/s][A
2918it [00:02, 629.01it/s][A
2984it [00:02, 632.48it/s][A
3049it [00:02, 623.36it/s][A
3118it [00:02, 638.49it/s][A
3195it [00:02, 670.34it/s][A
3264it [00:02, 655.28it/s][A
3331it [00:03, 654.61it/s][A
3398it [00:03, 587.52it/s][A

KeyboardInterrupt: 

In [44]:
en_emb_list[:10][0]

array([ 0.04199366, -0.60735202, -0.36353555,  0.55984443, -0.92610371,
       -0.10738079,  0.44493377,  0.3870979 ,  0.21835753, -0.44637027,
        0.2812449 ,  0.28480837, -0.31673071,  0.46639094,  0.31210932,
        0.08475059, -0.78310657, -0.04274072, -0.70363504, -0.17509326,
       -0.26412761,  0.10655801, -0.23898382, -0.52869177, -0.33934885,
       -0.8641175 , -0.89067405, -0.08751788, -0.38996187, -0.00936786,
       -0.6905877 , -0.5043236 ,  0.11451458, -0.16271381, -0.51021987,
        0.1227823 ,  0.08466765,  0.0481359 , -0.24872038, -0.12288736,
        0.49604076,  0.17922913, -0.20951915, -0.10902337, -1.63814545,
       -0.32620218, -0.40836349,  0.21626809, -0.05882427,  0.25855434,
        0.20233738,  0.18874514, -0.21631315,  0.55183357,  0.51564914,
        0.02362371,  0.62603164,  0.03196934, -0.45830867, -0.14025842,
       -0.25002351, -0.02315799, -0.53689247, -0.96613908,  0.16132088,
       -0.1089105 , -0.6492762 , -1.18328333,  0.09193588,  0.00

In [None]:
# fit transfer matrix
model = LinearRegression()
model.fit(X=en_emb_list, y=ja_emb_list)

In [37]:
# Test
# input English word
input_word = "Miss. Kobayashi's Dragon Maid"
input_vec = en_w2v.get_entity_vector(input_word)
output_vec = model.predict([input_vec])[0]
ja_w2v.most_similar_by_vector(output_vec)[:10]

[(<Entity オキソリン酸>, 0.7555915449362515),
 (<Entity ノルフロキサシン>, 0.7436902074687589),
 (<Entity ピルビン酸デヒドロゲナーゼ (アセチル基転移)>, 0.7303942425794281),
 (<Entity 加水分解コムギ>, 0.7293561607706759),
 (<Entity 歯科用ヨード・グリセリン>, 0.7277512937548961),
 (<Entity プロテウス菌>, 0.7273846029213599),
 (<Entity プロベナゾール>, 0.7269191762401745),
 (<Entity ペンタクロロフェノール>, 0.7268611448069725),
 (<Entity ペンタクロロベンゼン>, 0.7260605008378938),
 (<Entity ジベカシン>, 0.7238942176608539)]