In [86]:
import pickle
import numpy as np
import gensim
import wandb
from pathlib import Path
from gensim.models.word2vec import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from tokenizers import Tokenizer, decoders, pre_tokenizers

In [87]:
# Create the W&B run object; the cells below use it to fetch and log artifacts.
run = wandb.init(
    project="phonetic-translation",
    entity="windsuzu",
    group="embedding",
    name="language_specific",  # or sentencepiece
    job_type="word2vec",
)

# Path Preparation

## Path of Train Data + Tokenizer

In [88]:
# Download the "train:latest" artifact and point at the two raw corpus files in it.
train_data_art = run.use_artifact("train:latest")
train_data_dir = train_data_art.download()

train_data_root = Path(train_data_dir)
train_ch = train_data_root / "ch.txt"
train_jp = train_data_root / "jp.txt"

wandb: Downloading large artifact train:latest, 205.64MB. 2 files... Done. 0:0:0


In [89]:
# Download the tokenizer artifact and load both tokenizers from their JSON files.
tokenizer_art = run.use_artifact("language_specific:latest")  # or sentencepiece:latest
tokenizer_dir = tokenizer_art.download()

tokenizer_root = Path(tokenizer_dir)
tokenizer_ch_dir = tokenizer_root / "jieba_tokenizer.json"  # or ch_tokenizer.json
tokenizer_jp_dir = tokenizer_root / "janome_tokenizer.json"  # or jp_tokenizer.json

# Tokenizer.from_file is given a plain string path here, not a Path object.
tokenizer_ch = Tokenizer.from_file(str(tokenizer_ch_dir))
tokenizer_jp = Tokenizer.from_file(str(tokenizer_jp_dir))

## Path of Tokenized Train Data

In [90]:
# Directory holding the cached tokenized sentences; created on first run.
tokenized_path = Path("../tokenized_sentences/language_specific")  # or sentencepiece
tokenized_path.mkdir(parents=True, exist_ok=True)

In [91]:
# Cache files for the tokenized corpora. Despite the .txt suffix they are
# written and read with pickle in the tokenization cells.
tokenized_ch_path = tokenized_path / "ch.txt"
tokenized_jp_path = tokenized_path / "jp.txt"

In [92]:
# Cache files for the phonetic versions of the tokenized corpora. Despite the
# .txt suffix they are written and read with pickle.
tokenized_ch_phonetic_path = tokenized_path / "chp.txt"
tokenized_jp_phonetic_path = tokenized_path / "jpp.txt"

## Path of Word2Vec Embedding

In [93]:
# Root output directory for the trained models and embedding matrices.
word2vec_path = Path("../word2vec/language_specific/")  # or sentencepiece
word2vec_path.mkdir(parents=True, exist_ok=True)

In [94]:
# Sub-directory for everything trained on the plain (semantic) tokens.
semantic_path = word2vec_path / "semantic/"
semantic_path.mkdir(parents=True, exist_ok=True)

# Saved Word2Vec models (written with model.save / read with Word2Vec.load).
ch_semantic_path = semantic_path / "ch_word2vec"
jp_semantic_path = semantic_path / "jp_word2vec"

# Embedding matrices aligned to the tokenizer vocabulary (np.save / np.load).
ch_semantic_embedding_path = semantic_path / "ch_embedding.npy"
jp_semantic_embedding_path = semantic_path / "jp_embedding.npy"

In [95]:
# Sub-directory for everything trained on the phonetic tokens.
phonetic_path = word2vec_path / "phonetic/"
phonetic_path.mkdir(parents=True, exist_ok=True)

# Saved Word2Vec models (written with model.save / read with Word2Vec.load).
ch_phonetic_path = phonetic_path / "chp_word2vec"
jp_phonetic_path = phonetic_path / "jpp_word2vec"

# Embedding matrices aligned to the tokenizer vocabulary (np.save / np.load).
ch_phonetic_embedding_path = phonetic_path / "chp_embedding.npy"
jp_phonetic_embedding_path = phonetic_path / "jpp_embedding.npy"

# Tokenize Raw Sentences

In [96]:
# Tokenize the raw Chinese corpus once; later runs reuse the pickled result.
if not tokenized_ch_path.exists():
    with open(train_ch, encoding="utf8") as raw_file:
        # tokens[1:-1] drops the first and the last token of every encoded line.
        ch_texts = [tokenizer_ch.encode(raw_line).tokens[1:-1] for raw_line in raw_file]

    with open(tokenized_ch_path, "wb") as cache_file:
        pickle.dump(ch_texts, cache_file)
else:
    with open(tokenized_ch_path, "rb") as cache_file:
        ch_texts = pickle.load(cache_file)

In [97]:
# Tokenize the raw Japanese corpus once; later runs reuse the pickled result.
if not tokenized_jp_path.exists():
    with open(train_jp, encoding="utf8") as raw_file:
        # tokens[1:-1] drops the first and the last token of every encoded line.
        jp_texts = [tokenizer_jp.encode(raw_line).tokens[1:-1] for raw_line in raw_file]

    with open(tokenized_jp_path, "wb") as cache_file:
        pickle.dump(jp_texts, cache_file)
else:
    with open(tokenized_jp_path, "rb") as cache_file:
        jp_texts = pickle.load(cache_file)

# Tokenize Phonetic Sentences

## Chinese

In [98]:
from dragonmapper import hanzi

def to_zhuyin(word):
    """Convert a token to Zhuyin, falling back to the token itself.

    Args:
        word: Token text handed to ``hanzi.to_zhuyin``.

    Returns:
        The result of ``hanzi.to_zhuyin(word)``, or ``word`` unchanged when
        that call raises an ordinary exception.
    """
    try:
        return hanzi.to_zhuyin(word)
    except Exception:
        # Best effort: keep the original token when it cannot be transcribed.
        # Catching Exception (instead of a bare except plus a return inside
        # finally) lets KeyboardInterrupt / SystemExit propagate, so a long
        # corpus conversion can still be interrupted.
        return word

def generate_ch_phonetic_sentences(ch_texts):
    """Return a new list of sentences with every token passed through ``to_zhuyin``."""
    phonetic_sentences = []
    for sentence in ch_texts:
        phonetic_sentences.append(list(map(to_zhuyin, sentence)))
    return phonetic_sentences

In [99]:
# Build the Zhuyin version of the Chinese corpus once and cache it as a pickle.
if not tokenized_ch_phonetic_path.exists():
    chp_texts = generate_ch_phonetic_sentences(ch_texts)
    with open(tokenized_ch_phonetic_path, 'wb') as cache_file:
        pickle.dump(chp_texts, cache_file)
else:
    with open(tokenized_ch_phonetic_path, 'rb') as cache_file:
        chp_texts = pickle.load(cache_file)

## Japanese

In [100]:
import pykakasi
# Module-level converter instance; to_hira calls kks.convert on it.
kks = pykakasi.kakasi()

def to_hira(kanji):
    """Join the ``"hira"`` field of every item returned by ``kks.convert(kanji)``."""
    readings = (segment["hira"] for segment in kks.convert(kanji))
    return "".join(readings)

def generate_jp_phonetic_sentences(jp_texts):
    """Return a new list of sentences with every token passed through ``to_hira``."""
    phonetic_sentences = []
    for sentence in jp_texts:
        phonetic_sentences.append(list(map(to_hira, sentence)))
    return phonetic_sentences

In [101]:
# Build the hiragana version of the Japanese corpus once and cache it as a pickle.
if not tokenized_jp_phonetic_path.exists():
    jpp_texts = generate_jp_phonetic_sentences(jp_texts)
    with open(tokenized_jp_phonetic_path, 'wb') as cache_file:
        pickle.dump(jpp_texts, cache_file)
else:
    with open(tokenized_jp_phonetic_path, 'rb') as cache_file:
        jpp_texts = pickle.load(cache_file)

# Embedding Helper Functions

In [102]:
def check_coverage(vocab, embedding, map_func=None):
    """Print how many vocabulary entries have a vector in ``embedding``.

    Args:
        vocab: Mapping whose keys are the vocabulary tokens (only the keys
            are used here).
        embedding: Object supporting ``embedding[word]`` that raises
            ``KeyError`` for unknown words.
        map_func: Optional callable applied to each token before the lookup.

    Returns:
        None. The summary line is printed.
    """
    total = len(vocab)
    count = 0
    for word in vocab.keys():
        if map_func:
            word = map_func(word)
        try:
            found = embedding[word] is not None
        except KeyError:
            # Only a missing key means "not covered"; any other error is a
            # real problem and is allowed to surface.
            found = False
        if found:
            count += 1

    # Guard against an empty vocabulary instead of dividing by zero.
    ratio = count / total if total else 0.0
    print(f"{ratio:.0%} ({count}/{total}) is covered.")

In [103]:
def build_embedding_matrix(vocab, embedding, map_func=None, seed=42):
    """Build a ``(len(vocab), dim)`` matrix aligned to the vocabulary indices.

    Rows are first drawn from a normal distribution with the mean and standard
    deviation of ``embedding.vectors``; every row whose (optionally mapped)
    token is present in ``embedding`` is then overwritten with its vector.

    Args:
        vocab: Mapping of token -> row index. Indices are used directly as row
            numbers, so they are assumed to lie in ``range(len(vocab))``.
        embedding: Object exposing a 2-D ``vectors`` array and supporting
            ``word in embedding`` and ``embedding[word]``.
        map_func: Optional callable applied to each token before the lookup.
        seed: Seed for the random initialisation of rows without a vector.

    Returns:
        The filled ``numpy.ndarray``.
    """
    mean = embedding.vectors.mean()
    std = embedding.vectors.std()
    print(f"Mean of embedding: {mean}")
    print(f"std of embedding: {std}")

    # Take the width from the trained vectors instead of assuming 300, so the
    # row assignment below works for any vector size.
    dim = embedding.vectors.shape[1]
    embed_matrix = np.random.default_rng(seed).normal(mean, std, size=(len(vocab), dim))

    for word, i in vocab.items():
        if map_func:
            word = map_func(word)

        if word in embedding:
            embed_matrix[i] = embedding[word]

    return embed_matrix

In [104]:
class callback(CallbackAny2Vec):
    """Print the loss attributed to each training epoch."""

    def __init__(self):
        # Epoch number used in the printed message.
        self.epoch = 0
        # Value returned by get_latest_training_loss() at the previous epoch end.
        self.loss_to_be_subed = 0

    def on_epoch_end(self, model):
        """Print the change in the model's reported loss since the last epoch end."""
        reported_loss = model.get_latest_training_loss()
        epoch_loss = reported_loss - self.loss_to_be_subed
        print(f'Loss after epoch {self.epoch}: {epoch_loss}')
        # Remember the current value so the next epoch reports only its own share.
        self.loss_to_be_subed = reported_loss
        self.epoch += 1

In [105]:
def build_word2vec(tokenized_corpus, vector_size=300, max_vocab_size=32000, rule=None,
                   epochs=13, workers=4, min_count=5, seed=42):
    """Train a Word2Vec model (``sg=1``, ``hs=0``, ``negative=5``) on a tokenized corpus.

    The former hard-coded training constants are now keyword parameters whose
    defaults equal the previous values, so existing calls behave the same.

    Args:
        tokenized_corpus: Iterable of token lists passed to ``Word2Vec``.
        vector_size: Dimensionality of the learned vectors.
        max_vocab_size: Forwarded as ``Word2Vec(max_vocab_size=...)``.
            NOTE(review): gensim documents this as a pruning threshold that
            limits RAM while the vocabulary is being built, not as a cap on
            the final vocabulary (that is ``max_final_vocab``) -- confirm
            which one is intended.
        rule: Optional trim rule forwarded as ``trim_rule``.
        epochs: Number of training epochs.
        workers: Number of worker threads.
        min_count: Minimum token frequency forwarded to ``Word2Vec``.
        seed: Random seed forwarded to ``Word2Vec``.
            NOTE(review): gensim states that a fully reproducible run also
            needs ``workers=1`` -- confirm whether that matters here.

    Returns:
        The ``Word2Vec`` instance created from the corpus.
    """
    return Word2Vec(
        tokenized_corpus,
        vector_size=vector_size,
        max_vocab_size=max_vocab_size,
        sg=1,
        hs=0,
        negative=5,
        workers=workers,
        min_count=min_count,
        trim_rule=rule,
        epochs=epochs,
        seed=seed,
        compute_loss=True,  # required for the per-epoch loss printed by callback
        callbacks=[callback()]
    )

# Train Semantic Embedding

In [106]:
import re

# Matches a single character from any of the listed kana / CJK code-point ranges.
reg = re.compile(r'[\u3040-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff66-\uff9f]')

def semantic_rule(word, count, min_count):
    """Trim rule: default handling for words matching ``reg``, discard the rest.

    ``count`` and ``min_count`` are accepted but not used.
    """
    matched = reg.search(word) is not None
    return gensim.utils.RULE_DEFAULT if matched else gensim.utils.RULE_DISCARD

## Chinese

In [107]:
# Train the Chinese semantic model only when no saved model exists yet.
if not ch_semantic_path.exists():
    ch_model = build_word2vec(ch_texts)
    ch_model.save(str(ch_semantic_path))
else:
    ch_model = Word2Vec.load(str(ch_semantic_path))

In [108]:
# Share of the Chinese tokenizer vocabulary that has a vector in the semantic model.
check_coverage(tokenizer_ch.get_vocab(), ch_model.wv)

93% (29613/32000) is covered.


In [109]:
# Build the vocabulary-aligned Chinese semantic matrix once, then reuse the .npy file.
if not ch_semantic_embedding_path.exists():
    ch_semantic_embedding = build_embedding_matrix(tokenizer_ch.get_vocab(), ch_model.wv)
    np.save(ch_semantic_embedding_path, ch_semantic_embedding)
else:
    ch_semantic_embedding = np.load(ch_semantic_embedding_path)

ch_semantic_embedding.shape

Mean of embedding: -0.0014938187086954713
std of embedding: 0.2587750554084778


(32000, 300)

## Japanese

In [111]:
# Train the Japanese semantic model only when no saved model exists yet.
if not jp_semantic_path.exists():
    jp_model = build_word2vec(jp_texts)
    jp_model.save(str(jp_semantic_path))
else:
    jp_model = Word2Vec.load(str(jp_semantic_path))

In [112]:
# Shape of the trained vector table of the Japanese semantic model.
print(jp_model.wv.vectors.shape)

(29801, 300)


In [113]:
# Share of the Japanese tokenizer vocabulary that has a vector in the semantic model.
check_coverage(tokenizer_jp.get_vocab(), jp_model.wv)

93% (29801/32000) is covered.


In [114]:
# Build the vocabulary-aligned Japanese semantic matrix once, then reuse the .npy file.
if not jp_semantic_embedding_path.exists():
    jp_semantic_embedding = build_embedding_matrix(tokenizer_jp.get_vocab(), jp_model.wv)
    np.save(jp_semantic_embedding_path, jp_semantic_embedding)
else:
    jp_semantic_embedding = np.load(jp_semantic_embedding_path)

jp_semantic_embedding.shape

Mean of embedding: -0.002687199506908655
std of embedding: 0.2520033121109009


(32000, 300)

# Train Phonetic Embedding

In [115]:
# https://stackoverflow.com/questions/16027450/is-there-a-way-to-know-whether-a-unicode-string-contains-any-chinese-japanese-ch
import unicodedata

def has_zhuyin(s):
    """Return True if any character of ``s`` has "BOPOMOFO" in its Unicode name.

    Characters without a Unicode name (for example control characters) are
    skipped instead of ending the scan, and the result is always a bool.
    """
    # The "" default replaces the ValueError that unicodedata.name raises for
    # unnamed characters, so one such character no longer hides later matches.
    return any("BOPOMOFO" in unicodedata.name(c, "") for c in s)

def has_hira(s):
    """Return True if any character of ``s`` has "HIRAGANA" in its Unicode name.

    Characters without a Unicode name (for example control characters) are
    skipped instead of ending the scan, and the result is always a bool.
    """
    # The "" default replaces the ValueError that unicodedata.name raises for
    # unnamed characters, so one such character no longer hides later matches.
    return any("HIRAGANA" in unicodedata.name(c, "") for c in s)

def phonetic_rule(word, count, min_count):
    """Trim rule: default handling for words with Zhuyin or hiragana, discard the rest.

    ``count`` and ``min_count`` are accepted but not used.
    """
    is_phonetic = has_zhuyin(word) or has_hira(word)
    if not is_phonetic:
        return gensim.utils.RULE_DISCARD
    return gensim.utils.RULE_DEFAULT

## Chinese

In [116]:
# Train the Chinese phonetic model only when no saved model exists yet.
if not ch_phonetic_path.exists():
    chp_model = build_word2vec(chp_texts)
    chp_model.save(str(ch_phonetic_path))
else:
    chp_model = Word2Vec.load(str(ch_phonetic_path))

In [117]:
# Shape of the trained vector table of the Chinese phonetic model.
print(chp_model.wv.vectors.shape)

(25412, 300)


In [118]:
# Each vocabulary token is passed through to_zhuyin before it is looked up in
# the phonetic model.
check_coverage(tokenizer_ch.get_vocab(), chp_model.wv, map_func=to_zhuyin)

97% (31068/32000) is covered.


In [119]:
# Build the vocabulary-aligned Chinese phonetic matrix once, then reuse the .npy file.
if not ch_phonetic_embedding_path.exists():
    ch_phonetic_embedding = build_embedding_matrix(tokenizer_ch.get_vocab(), chp_model.wv, map_func=to_zhuyin)
    np.save(ch_phonetic_embedding_path, ch_phonetic_embedding)
else:
    ch_phonetic_embedding = np.load(ch_phonetic_embedding_path)

ch_phonetic_embedding.shape

Mean of embedding: 0.0009710414451546967
std of embedding: 0.2592071294784546


(32000, 300)

## Japanese

In [120]:
# Train the Japanese phonetic model only when no saved model exists yet.
if not jp_phonetic_path.exists():
    jpp_model = build_word2vec(jpp_texts)
    jpp_model.save(str(jp_phonetic_path))
else:
    jpp_model = Word2Vec.load(str(jp_phonetic_path))

In [121]:
# Shape of the trained vector table of the Japanese phonetic model.
print(jpp_model.wv.vectors.shape)

(24742, 300)


In [122]:
# Each vocabulary token is passed through to_hira before it is looked up in
# the phonetic model.
check_coverage(tokenizer_jp.get_vocab(), jpp_model.wv, map_func=to_hira)

96% (30809/32000) is covered.


In [123]:
# Build the vocabulary-aligned Japanese phonetic matrix once, then reuse the .npy file.
if not jp_phonetic_embedding_path.exists():
    jp_phonetic_embedding = build_embedding_matrix(tokenizer_jp.get_vocab(), jpp_model.wv, map_func=to_hira)
    np.save(jp_phonetic_embedding_path, jp_phonetic_embedding)
else:
    jp_phonetic_embedding = np.load(jp_phonetic_embedding_path)

jp_phonetic_embedding.shape

Mean of embedding: -0.003116233041509986
std of embedding: 0.25073009729385376


(32000, 300)

# Concatenate Embedding

In [124]:
# The semantic and phonetic matrices are concatenated along axis 1 and averaged
# element-wise below; print their shapes to check they line up first.
print(ch_semantic_embedding.shape)
print(jp_semantic_embedding.shape)

print(ch_phonetic_embedding.shape)
print(jp_phonetic_embedding.shape)

(32000, 300)
(32000, 300)
(32000, 300)
(32000, 300)


In [125]:
# Column-wise concatenation: phonetic columns first, semantic columns second.
ch_embedding = np.concatenate([ch_phonetic_embedding, ch_semantic_embedding], axis=1)
ch_embedding.shape

(32000, 600)

In [126]:
# Column-wise concatenation: phonetic columns first, semantic columns second.
jp_embedding = np.concatenate([jp_phonetic_embedding, jp_semantic_embedding], axis=1)
jp_embedding.shape

(32000, 600)

In [127]:
# Persist the concatenated matrices directly under the word2vec output directory.
np.save(word2vec_path / "ch-embedding-concat.npy", ch_embedding)
np.save(word2vec_path / "jp-embedding-concat.npy", jp_embedding)

# Meta Embedding

In [128]:
# Mixing weight for the meta embedding: the phonetic matrix is scaled by
# 2*beta and the semantic matrix by 2 - 2*beta before averaging. The value is
# also embedded in the saved meta-embedding file names.
beta = 0.5

In [129]:
# Element-wise mean of the two re-weighted matrices. The weights sum to 2, so
# the mean over the two terms is a beta-weighted mix of semantic and phonetic.
semantic_weight = 2 - (beta * 2)
phonetic_weight = beta * 2
ch_meta_embedding = np.mean(
    [semantic_weight * ch_semantic_embedding, phonetic_weight * ch_phonetic_embedding],
    axis=0,
)

In [130]:
# The averaged matrix keeps the shape of its two inputs.
ch_meta_embedding.shape

(32000, 300)

In [131]:
# Element-wise mean of the two re-weighted matrices. The weights sum to 2, so
# the mean over the two terms is a beta-weighted mix of semantic and phonetic.
semantic_weight = 2 - (beta * 2)
phonetic_weight = beta * 2
jp_meta_embedding = np.mean(
    [semantic_weight * jp_semantic_embedding, phonetic_weight * jp_phonetic_embedding],
    axis=0,
)

In [132]:
# The averaged matrix keeps the shape of its two inputs.
jp_meta_embedding.shape

(32000, 300)

In [133]:
# "{beta=}" renders as "beta=<value>", so the file name records the mixing weight
# (for example "ch-embedding-meta-beta=0.5.npy").
np.save(word2vec_path / f"ch-embedding-meta-{beta=}.npy", ch_meta_embedding)
np.save(word2vec_path / f"jp-embedding-meta-{beta=}.npy", jp_meta_embedding)

# Save Embedding Artifact

In [134]:
# Bundle the four saved Word2Vec model files into one W&B artifact and log it.
# NOTE(review): gensim's save() can write large arrays to separate sibling
# .npy files next to the main file; if that happened here, those files are not
# added below -- confirm the artifact can be loaded on its own.
word2vec_art = wandb.Artifact('language_specific_word2vec', "word2vec")  # or sentencepiece_word2vec
for model_file, artifact_name in (
    (ch_semantic_path, "ch.word2vec"),
    (jp_semantic_path, "jp.word2vec"),
    (ch_phonetic_path, "chp.word2vec"),
    (jp_phonetic_path, "jpp.word2vec"),
):
    word2vec_art.add_file(model_file, artifact_name)

run.log_artifact(word2vec_art)

<wandb.sdk.wandb_artifacts.Artifact at 0x2411dcaa040>

In [135]:
# Collect every exported embedding matrix into one W&B artifact and log it.
embedding_art = wandb.Artifact("language_specific_embedding", "embedding")  # or sentencepiece_embedding
embedding_art.add_file(ch_semantic_embedding_path, "ch_embedding.npy")
embedding_art.add_file(ch_phonetic_embedding_path, "chp_embedding.npy")

embedding_art.add_file(jp_semantic_embedding_path, "jp_embedding.npy")
embedding_art.add_file(jp_phonetic_embedding_path, "jpp_embedding.npy")

embedding_art.add_file(word2vec_path / "ch-embedding-concat.npy", "ch_concat_embedding.npy")
embedding_art.add_file(word2vec_path / "jp-embedding-concat.npy", "jp_concat_embedding.npy")

# Derive the meta file names from `beta`, the same way the np.save cell writes
# them, instead of hard-coding "beta=0.5". Changing beta can then no longer
# upload a stale file or fail on a missing one.
embedding_art.add_file(word2vec_path / f"ch-embedding-meta-{beta=}.npy", "ch_meta_embedding.npy")
embedding_art.add_file(word2vec_path / f"jp-embedding-meta-{beta=}.npy", "jp_meta_embedding.npy")

run.log_artifact(embedding_art)

<wandb.sdk.wandb_artifacts.Artifact at 0x2411e6649a0>

In [136]:
# Close the W&B run after both artifacts have been logged.
run.finish()

VBox(children=(Label(value=' 732.42MB of 732.42MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=…