In [46]:
import os
import pickle
import numpy as np
import gensim
import wandb
from pathlib import Path
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
from opencc import OpenCC
from utils.custom_tokenizer import load_jieba_tokenizer, load_janome_tokenizer

# Set before any tokenizer is loaded.
# NOTE(review): presumably read by the HuggingFace `tokenizers` backend to turn
# off its internal thread pool — confirm against the tokenizer helpers in utils.
os.environ.update(TOKENIZERS_PARALLELISM="False")

In [27]:
# Character converter used by the query cells: Traditional-script input is
# passed through cc.convert() before it is looked up in the Chinese models.
# NOTE(review): "t2s" is OpenCC's Traditional-to-Simplified configuration —
# confirm against the installed opencc package.
cc = OpenCC("t2s")

In [2]:
# Open a W&B run for this analysis session; the artifact cells fetch their
# inputs through `run.use_artifact(...)`.
run = wandb.init(
    project="phonetic-translation",
    entity="windsuzu",
    group="word2vec",
    job_type="analyze",
)

[34m[1mwandb[0m: Currently logged in as: [33mwindsuzu[0m (use `wandb login --relogin` to force relogin)


In [7]:
# Fetch the trained word2vec artifact and resolve the four model files in it.
word2vec_art = run.use_artifact("language_specific_word2vec:latest")
word2vec_dir = word2vec_art.download()

# Despite the `_dir` suffix, each of these names points at a single model file.
ch_word2vec_dir, chp_word2vec_dir, jp_word2vec_dir, jpp_word2vec_dir = (
    Path(word2vec_dir, model_file)
    for model_file in ("ch.word2vec", "chp.word2vec", "jp.word2vec", "jpp.word2vec")
)

[34m[1mwandb[0m: Downloading large artifact language_specific_word2vec:latest, 254.19MB. 4 files... Done. 0:0:0


In [14]:
# Load each saved Word2Vec model and keep only its KeyedVectors (`.wv`),
# which is all the similarity queries need.
ch_word2vec, chp_word2vec, jp_word2vec, jpp_word2vec = (
    Word2Vec.load(str(model_path)).wv
    for model_path in (
        ch_word2vec_dir,
        chp_word2vec_dir,
        jp_word2vec_dir,
        jpp_word2vec_dir,
    )
)

In [20]:
# Ten nearest neighbours of the token "ㄋㄧˇ" in the `chp` vectors.
chp_word2vec.most_similar(positive=["ㄋㄧˇ"], topn=10)

[('ㄌㄞˊ ㄉㄠˋ', 0.337485671043396),
 ('ㄇㄧˊ', 0.3326432704925537),
 ('ㄋㄧㄣˊ', 0.3191660940647125),
 ('ㄒㄩㄝˋ ㄩ', 0.3157498836517334),
 ('ㄆㄤˋ', 0.31506600975990295),
 ('ㄒㄧㄣ ㄌㄧ˙', 0.3147471249103546),
 ('ㄔㄨㄣ˙', 0.31471967697143555),
 ('ㄇㄛ', 0.3123966157436371),
 ('ㄎㄨˇ', 0.3078014552593231),
 ('ㄨㄛˇ', 0.30537787079811096)]

In [28]:
# Ten nearest neighbours in the `ch` vectors; the Traditional-script query is
# run through `cc` first.
ch_word2vec.most_similar(positive=[cc.convert("這")], topn=10)

[('上述', 0.4571683704853058),
 ('这些', 0.4495928883552551),
 ('这一', 0.43432921171188354),
 ('这个', 0.4288410246372223),
 ('这样', 0.42115911841392517),
 ('有', 0.4114645719528198),
 ('即', 0.4106895625591278),
 ('那', 0.4105702042579651),
 ('这种', 0.4095064699649811),
 ('它', 0.40909796953201294)]

In [32]:
# Ten nearest neighbours of "か" in the `jp` vectors.
jp_word2vec.most_similar(positive=["か"], topn=10)

[('どう', 0.8308656215667725),
 ('否', 0.7216336131095886),
 ('かの', 0.6315698623657227),
 ('たか', 0.5826131105422974),
 ('かも', 0.5628167986869812),
 ('かが', 0.5509108901023865),
 ('どの', 0.48146137595176697),
 ('かに', 0.4794462323188782),
 ('うるか', 0.463559627532959),
 ('どれ', 0.4630514681339264)]

In [33]:
# Same query as the `jp` cell, against the `jpp` vectors for comparison.
jpp_word2vec.most_similar(positive=["か"], topn=10)

[('かの', 0.4391637146472931),
 ('が', 0.4341663420200348),
 ('かが', 0.42150723934173584),
 ('な', 0.4179091453552246),
 ('を', 0.41109925508499146),
 ('も', 0.40225765109062195),
 ('に', 0.4002080261707306),
 ('する', 0.3997371196746826),
 ('こと', 0.39620280265808105),
 ('てき', 0.395681768655777)]

In [36]:
# Fetch the embedding matrices. The download must go through `embedding_art`:
# calling `word2vec_art.download()` here would return the word2vec directory,
# which does not hold the `*_meta_embedding.npy` files loaded later.
embedding_art = run.use_artifact("language_specific_embedding:latest")
embedding_dir = embedding_art.download()

# Fetch the tokenizer definitions whose vocabularies index those matrices.
tokenizer_art = run.use_artifact("language_specific:latest")
tokenizer_dir = tokenizer_art.download()

[34m[1mwandb[0m: Downloading large artifact language_specific_embedding:latest, 732.42MB. 8 files... Done. 0:0:0


In [48]:
# Load the Chinese tokenizer definition and the Chinese meta-embedding matrix
# from their downloaded artifact directories.
ch_tokenzier = load_jieba_tokenizer(Path(tokenizer_dir, "jieba_tokenizer.json"))
ch_meta_embedding = np.load(Path(embedding_dir, "ch_meta_embedding.npy"))

In [72]:
# Invert the tokenizer's token -> id mapping into an id-indexed list, so that
# vocab[i] is the token for row i of `ch_meta_embedding`.
# The tokenizer is bound to `ch_tokenzier` (sic) in the loading cell; the
# previous `jieba_tokenzier` name is not defined anywhere in this notebook.
# The list is sized from the embedding matrix instead of a hard-coded 32000.
vocab = [None] * len(ch_meta_embedding)
for token, token_id in ch_tokenzier.get_vocab().items():
    vocab[token_id] = token

In [74]:
# Wrap the raw meta-embedding matrix in a KeyedVectors index so it can be
# queried with `most_similar`. The vector size is read from the matrix rather
# than hard-coded, so it cannot drift from the stored embeddings.
chm_word2vec = KeyedVectors(vector_size=ch_meta_embedding.shape[1])
chm_word2vec.add_vectors(vocab, ch_meta_embedding)

In [79]:
# Ten nearest neighbours in the meta-embedding vectors; the Traditional-script
# query is run through `cc` first.
chm_word2vec.most_similar(positive=[cc.convert("這")], topn=10)

[('浙', 0.5726149082183838),
 ('有', 0.5041649341583252),
 ('这一', 0.4879700839519501),
 ('这个', 0.4818565845489502),
 ('这种', 0.4598851799964905),
 ('这是', 0.4543885290622711),
 ('上述', 0.4490727484226227),
 ('这些', 0.4464312195777893),
 ('这样', 0.44638335704803467),
 ('它', 0.4336070716381073)]