In [291]:
import pickle
import numpy as np
import gensim
from pathlib import Path
from gensim.models.word2vec import Word2Vec
from tokenizers import Tokenizer, decoders, pre_tokenizers

In [292]:
# Training text files for the Chinese and Japanese sides of ASPEC-JC.
# In this notebook they are only referenced by the commented-out tokenization step below.
train_ch = Path("../dataset/ASPEC-JC/train/filtered_ch.txt")
train_jp = Path("../dataset/ASPEC-JC/train/filtered_jp.txt")

In [293]:
# Load the Chinese and Japanese tokenizers from their JSON files.
# The path is built with Path and converted to str before being handed to from_file.
tokenizer_ch = Tokenizer.from_file(str(Path("../tokenizer/ch_tokenizer.json")))
tokenizer_jp = Tokenizer.from_file(str(Path("../tokenizer/jp_tokenizer.json")))

In [294]:
# Report both vocabulary sizes, Chinese first, one per line.
print(tokenizer_ch.get_vocab_size(), tokenizer_jp.get_vocab_size(), sep="\n")

32000
32000


# Tokenize raw sentences

In [295]:
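# NOTE: one-off preprocessing, intentionally left commented out. It tokenizes the raw
# training text and pickles the result into the tokenized_sentences/ files that the
# next cell loads. The [1:-1] slice drops the first and last token of every encoded line.
# with open(train_ch) as f: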
#     ch_texts = [tokenizer_ch.encode(line).tokens[1:-1] for line in f.readlines()]

# with open(train_jp) as f:
#     jp_texts = [tokenizer_jp.encode(line).tokens[1:-1] for line in f.readlines()]

# with open("tokenized_sentences/ch.txt", "wb") as f:
#     pickle.dump(ch_texts, f)

# with open("tokenized_sentences/jp.txt", "wb") as f:
#     pickle.dump(jp_texts, f)

In [296]:
# Read back the cached tokenized corpora. Despite the .txt suffix, both files are
# opened in binary mode and decoded with pickle.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
with open("tokenized_sentences/ch.txt", "rb") as ch_file, open("tokenized_sentences/jp.txt", "rb") as jp_file:
    ch_texts = pickle.load(ch_file)
    jp_texts = pickle.load(jp_file)

# Helper Functions

In [297]:
def check_coverage(vocab, embedding):
    """
    Print the share of tokenizer vocabulary entries that have a trained vector.

    vocab = vocabulary from tokenizer, a dict mapping token -> index
            (only the keys are used here)
    embedding = word2vec embedding; any container supporting ``token in embedding``

    Prints "<percent> (<covered>/<total>) is covered." and returns None.
    An empty vocabulary is reported as 0% instead of raising ZeroDivisionError.
    """
    # Membership test instead of indexing inside a bare ``except``: the bare except
    # also hid unrelated failures (and KeyboardInterrupt) as "not covered".
    count = sum(1 for word in vocab if word in embedding)
    total = len(vocab)
    ratio = count / total if total else 0.0

    print(f"{ratio:.0%} ({count}/{total}) is covered.")

In [298]:
def build_embedding_matrix(vocab, embedding, vector_size=None):
    """
    Build a (len(vocab), vector_size) matrix whose row i holds the vector of the
    token that the vocabulary maps to index i.

    vocab = vocabulary from tokenizer, a dict mapping token -> row index
    embedding = word2vec embedding; must support ``token in embedding`` and
                ``embedding[token]``
    vector_size = width of each row. When None, ``embedding.vector_size`` is used
                  if the embedding exposes it, otherwise 300 (the width that was
                  previously hard-coded).

    Tokens that are missing from the embedding keep an all-zero row.
    Returns a float64 numpy array.
    """
    if vector_size is None:
        # Follow the embedding's own dimensionality instead of assuming 300.
        vector_size = getattr(embedding, "vector_size", 300)

    embed_matrix = np.zeros((len(vocab), vector_size))

    for word, i in vocab.items():
        if word in embedding:
            embed_matrix[i] = embedding[word]

    return embed_matrix

# Train Semantic Embedding

In [365]:
import re

# Character class covering Chinese and Japanese scripts; used with .search() to test
# whether a token contains at least one such character:
#   \u3040-\u30ff  hiragana and katakana
#   \u3400-\u4dbf  CJK Unified Ideographs Extension A
#   \u4e00-\u9fff  CJK Unified Ideographs
#   \uf900-\ufaff  CJK Compatibility Ideographs
#   \uff66-\uff9f  half-width katakana
reg = re.compile(r'[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff66-\uff9f]')

In [373]:
def my_rule(word, count, min_count):
    """
    Vocabulary trim rule handed to Word2Vec through ``trim_rule``.

    A token containing at least one character matched by ``reg`` is left to
    gensim's default handling (RULE_DEFAULT); every other token is discarded
    (RULE_DISCARD). ``count`` and ``min_count`` belong to the callback signature
    but are not consulted here.
    """
    has_cjk_char = reg.search(word) is not None
    return gensim.utils.RULE_DEFAULT if has_cjk_char else gensim.utils.RULE_DISCARD

In [374]:
def build_word2vec(tokenized_corpus, vector_size=300, max_vocab_size=32000):
    """
    Train and return a gensim Word2Vec model.

    tokenized_corpus = sentences, passed to Word2Vec unchanged
    vector_size = dimensionality of the learned vectors
    max_vocab_size = forwarded to Word2Vec's ``max_vocab_size``

    All remaining hyper-parameters are fixed below; ``my_rule`` is applied as the
    vocabulary trim rule.
    """
    settings = dict(
        vector_size=vector_size,
        max_vocab_size=max_vocab_size,
        sg=1,        # skip-gram
        hs=0,        # hierarchical softmax off ...
        negative=5,  # ... negative sampling with 5 noise words instead
        workers=32,
        min_count=5,
        trim_rule=my_rule,
        epochs=5,
    )
    return Word2Vec(tokenized_corpus, **settings)

## Chinese

In [375]:
# Train the Chinese model on the tokenized Chinese corpus.
ch_model = build_word2vec(ch_texts)

# Persist the model, then reload it so the following cells work from the saved artifact.
ch_model.save("word2vec/semantic/ch_word2vec")
ch_model = Word2Vec.load("word2vec/semantic/ch_word2vec")

In [376]:
# Shape of the trained vector table: (tokens that received a vector, vector size).
print(ch_model.wv.vectors.shape)

(27382, 300)


In [385]:
# Report how much of the Chinese tokenizer vocabulary has a trained vector.
check_coverage(tokenizer_ch.get_vocab(), ch_model.wv)

86% (27382/32000) is covered.


In [378]:
# Matrix indexed by tokenizer id; tokens without a trained vector keep an all-zero row.
ch_semantic_embedding = build_embedding_matrix(tokenizer_ch.get_vocab(), ch_model.wv)
# Save as .npy and read it back, so the shape displayed below is that of the file on disk.
np.save("word2vec/semantic/ch_embedding.npy", ch_semantic_embedding)
ch_semantic_embedding = np.load("word2vec/semantic/ch_embedding.npy")
ch_semantic_embedding.shape

(32000, 300)

## Japanese

In [386]:
# Train the Japanese model on the tokenized Japanese corpus.
jp_model = build_word2vec(jp_texts)

# Persist the model, then reload it so the following cells work from the saved artifact.
jp_model.save("word2vec/semantic/jp_word2vec")
jp_model = Word2Vec.load("word2vec/semantic/jp_word2vec")

In [387]:
# Shape of the trained vector table: (tokens that received a vector, vector size).
print(jp_model.wv.vectors.shape)

(28313, 300)


In [388]:
# Report how much of the Japanese tokenizer vocabulary has a trained vector.
check_coverage(tokenizer_jp.get_vocab(), jp_model.wv)

88% (28313/32000) is covered.


In [389]:
# Matrix indexed by tokenizer id; tokens without a trained vector keep an all-zero row.
jp_semantic_embedding = build_embedding_matrix(tokenizer_jp.get_vocab(), jp_model.wv)
# Save as .npy and read it back, so the shape displayed below is that of the file on disk.
np.save("word2vec/semantic/jp_embedding.npy", jp_semantic_embedding)
jp_semantic_embedding = np.load("word2vec/semantic/jp_embedding.npy")
jp_semantic_embedding.shape

(32000, 300)

# Train Phonetic Embedding

## Chinese

In [425]:
from dragonmapper import hanzi
# Preview: pair every token of one Chinese sentence with its zhuyin transcription.
# `c` is a whole tokenizer token here, not a single character.
[(hanzi.to_zhuyin(c), c) for c in ch_texts[2]]

[('_ㄋㄨㄥˊ ㄧㄝˋ', '_农业'),
 ('ㄗㄨㄛˋ ㄨㄟˊ', '作为'),
 ('ㄅㄟˇ ㄏㄞˇ ㄉㄠˋ', '北海道'),
 ('ㄉㄜ˙ ㄐㄧ ㄔㄨˇ', '的基础'),
 ('ㄔㄢˇ ㄧㄝˋ', '产业'),
 ('_,', '_,'),
 ('_ㄊㄚ ㄉㄜ˙', '_它的'),
 ('ㄏㄨㄛˊ ㄒㄧㄥˋ ㄏㄨㄚˋ', '活性化'),
 ('ㄉㄨㄟˋ ㄩˊ', '对于'),
 ('ㄉㄧˋ ㄑㄩ', '地区'),
 ('ㄐㄧㄥ ㄐㄧˋ ㄉㄜ˙', '经济的'),
 ('ㄏㄨㄛˊ ㄒㄧㄥˋ ㄏㄨㄚˋ', '活性化'),
 ('ㄌㄞˊ ㄕㄨㄛ', '来说'),
 ('ㄕˋ ㄅㄧˋ ㄧㄠˋ ㄉㄜ˙', '是必要的'),
 ('_,', '_,'),
 ('_ㄉㄢˋ ㄕˋ', '_但是'),
 ('ㄓㄜˋ ㄧ', '这一'),
 ('ㄌㄧㄥˇ ㄩˋ ㄉㄜ˙', '领域的'),
 ('ㄔㄢˇ', '产'),
 ('ㄒㄩㄝˊ', '学'),
 ('ㄌㄧㄢˊ ㄏㄜˊ', '联合'),
 ('ㄘㄨㄣˊ ㄗㄞˋ ㄓㄜ˙', '存在着'),
 ('ㄐㄧˇ ㄍㄜ˙', '几个'),
 ('ㄊㄜˋ ㄕㄨ', '特殊'),
 ('ㄨㄣˋ ㄊㄧˊ', '问题'),
 ('_。', '_。')]

## Japanese

In [426]:
import pykakasi

In [430]:
# Shared converter instance, created once and reused by to_hira.
kks = pykakasi.kakasi()


def to_hira(kanji):
    """Return the text with each converted segment replaced by its "hira" field, concatenated."""
    return "".join(segment["hira"] for segment in kks.convert(kanji))

In [433]:
# Preview: pair every token of one Japanese sentence with its hiragana reading.
[(to_hira(c), c) for c in jp_texts[1]]

[('_い', '_異'),
 ('ぎょう', '業'),
 ('たね', '種'),
 ('ねっとわーく', 'ネットワーク'),
 ('からの', 'からの'),
 ('ちいき', '地域'),
 ('ぶ', 'ブ'),
 ('らんど', 'ランド'),
 ('か', '化'),
 ('_−', '_−'),
 ('_はいきぶつ', '_廃棄物'),
 ('じゅんかん', '循環'),
 ('かた', '型'),
 ('のうぎょう', '農業'),
 ('の', 'の'),
 ('じぎょう', '事業'),
 ('か', '化'),
 ('せんりゃく', '戦略'),
 ('_−', '_−')]