In [66]:
import re
import argparse
from string import punctuation

import torch
import yaml
import numpy as np
from torch.utils.data import DataLoader
from pypinyin import pinyin, Style

from utils.model import get_model, get_vocoder
from utils.tools import to_device, synth_samples
from dataset import TextDataset
from text import phoneme_to_ids

In [67]:
def read_lexicon(lex_path):
    """Load a pronunciation lexicon into a ``word -> phoneme list`` dict.

    Every non-blank line is split on whitespace: the first field is the word
    and the remaining fields are its phonemes. Words are lower-cased, and when
    a word occurs more than once only its first entry is kept.

    Args:
        lex_path: Path to the lexicon text file. It is read as UTF-8 so that
            non-ASCII words and phoneme symbols decode the same way on every
            platform.

    Returns:
        dict mapping each lower-cased word to its list of phoneme strings.
    """
    lexicon = {}
    with open(lex_path, encoding="utf-8") as f:
        for line in f:
            # str.split() drops leading/trailing whitespace (including "\r"),
            # so no empty-string word or phoneme can sneak in.
            fields = line.split()
            if not fields:
                # Blank line: nothing to register.
                continue
            # setdefault keeps the first pronunciation seen for a word.
            lexicon.setdefault(fields[0].lower(), fields[1:])
    return lexicon

In [68]:
# Read the preprocessing config, then load the lexicon whose location it names.
preprocess_config_path = "config/preprocess.yaml"
with open(preprocess_config_path, "r", encoding="utf-8") as config_file:
    preprocess_config = yaml.safe_load(config_file)

lexicon_path = preprocess_config["path"]["lexicon_path"]
lexicon = read_lexicon(lexicon_path)

In [69]:
# Sentence to synthesize, lower-cased so it matches the lower-cased lexicon keys.
text = "tôi tên là dương văn tuyển nhé".lower()

# Split on single punctuation/whitespace characters; the capture group keeps
# each delimiter as its own item in the result.
delimiter_pattern = re.compile(r"([,;.\-\?\!\s+])")
words = delimiter_pattern.split(text)

In [70]:
# Expand every token into phonemes: lexicon entries contribute their phoneme
# list, whitespace-only/empty tokens are dropped, and anything else (unknown
# words, punctuation) is passed through unchanged as a single item.
phonemes = []
for token in words:
    if token in lexicon:
        phonemes.extend(lexicon[token])
    elif token.strip():
        phonemes.append(token)

In [71]:
# NOTE(review): this assignment replaces whatever `phonemes` held before this
# cell ran, so the sequence below -- not `text` -- is what gets synthesized.
phoneme_string = "tɕ ɨə˦˥ k̚ x iː˨˨ l aː˨˨ m m oː˨˩ t̚ v iə˨˩ k̚ tʰ a˨˨ ɲ k o˨˨ ŋm h a˨ˀ˥ j k oː˨˦ tɕ əː˨˩˨ tʰ a˨˨ ɲ m oː˨˩ t̚ ŋ ɨə˨˨ j tʰ a˨˨ ɲ k o˨˨ ŋm"
phonemes = phoneme_string.split()
# Shorter alternative input, kept disabled:
# phonemes = "ʔ a˨˥ n".split()

In [72]:
# Join the phoneme tokens with single spaces and hand the string to the
# project's phoneme_to_ids helper (presumably returns one integer id per
# phoneme -- confirm in the `text` module).
phoneme_sequence = " ".join(phonemes)
phoneme_ids = phoneme_to_ids(phoneme_sequence)

In [73]:
# Add a leading batch axis of size 1. Passing -1 lets NumPy infer the sequence
# length, so re-running this cell on the already-reshaped array is a no-op
# instead of an error (len() of the 2-D array would be 1, not the length).
phoneme_ids = np.asarray(phoneme_ids).reshape(1, -1)

In [74]:
def _load_yaml(config_path):
    """Parse one YAML config file and return the loaded object.

    The file is opened as UTF-8 inside a ``with`` block so the handle is
    closed as soon as parsing finishes.
    """
    with open(config_path, "r", encoding="utf-8") as config_file:
        # NOTE(review): FullLoader is kept to preserve existing behaviour;
        # yaml.safe_load would do if these configs use only plain YAML types.
        return yaml.load(config_file, Loader=yaml.FullLoader)


preprocess_config = _load_yaml("config/preprocess.yaml")
model_config = _load_yaml("config/model.yaml")
train_config = _load_yaml("config/train.yaml")

# Bundled in this order because get_model receives the three configs together.
configs = (preprocess_config, model_config, train_config)

In [75]:
class args:
    """Minimal stand-in for parsed command-line arguments.

    Only carries the single attribute set below; an instance of this class is
    what gets passed to ``get_model``.
    """

    # Presumably the training step of the checkpoint to restore -- confirm
    # against utils.model.get_model.
    restore_step: int = 76000

In [76]:
# Instantiate the argument stand-in; the instance is passed to get_model.
_args = args()

In [77]:
# Build the acoustic model and the vocoder on the CPU. train=False asks
# get_model for a non-training model (presumably restoring the checkpoint
# named by _args.restore_step -- confirm in utils.model).
device = "cpu"
model = get_model(_args, configs, device=device, train=False)
vocoder = get_vocoder(model_config, device=device)

Removing weight norm...


In [78]:
# Length of the single phoneme sequence in the batch, as a 1-element array.
sequence_length = len(phoneme_ids[0])
text_lens = np.array([sequence_length])

# First 100 characters of the input text, wrapped in a list.
ids = [text[:100]]

# One-item batch: (name, raw text, phoneme ids, sequence lengths, max length).
max_text_len = max(text_lens)
batch = [("test", text, phoneme_ids, text_lens, max_text_len)]

In [79]:
# Inference only: disable autograd so no computation graph is recorded for the
# forward pass (less memory, and the outputs do not require grad).
with torch.no_grad():
    output = model(
        texts=torch.tensor(phoneme_ids),
        src_lens=torch.tensor(text_lens),
        max_src_len=max(text_lens),
    )

In [80]:
# Hand the batch and model output to synth_samples together with the vocoder
# and configs; the result location comes from the training config (presumably
# where the generated samples are written -- confirm in utils.tools).
result_path = train_config["path"]["result_path"]
synth_samples(batch, output, vocoder, model_config, preprocess_config, result_path)