In [None]:
import librosa
import pandas as pd
from glob import glob
import numpy as np

In [None]:
# Histogram of the first-axis size of every .npy array stored under the
# data_new "mel" directory (presumably the frame count per utterance — confirm).
mel_dir = "/data/codes/speech-to-text/stt/data/data_new/mel"
length = [np.load(npy_path).shape[0] for npy_path in glob(f'{mel_dir}/*.npy')]
pd.DataFrame(length).hist(bins=1000)

In [None]:
# Histogram of the first-axis size of every .npy array stored under the
# data_sale "mel" directory (presumably the frame count per utterance — confirm).
mel_dir = "/data/codes/speech-to-text/stt/data/data_sale/mel"
length = [np.load(npy_path).shape[0] for npy_path in glob(f'{mel_dir}/*.npy')]
pd.DataFrame(length).hist(bins=1000)

In [None]:
from torch.utils.data import DataLoader
import numpy as np
import argparse
import torch
import yaml
import re
import shutil

from utils.model import get_model, get_vocoder
from utils.tools import to_device, synth_samples
from text import phoneme_to_ids


In [None]:
def read_lexicon(lex_path):
    """Load a pronunciation lexicon into a ``{word: [phone, ...]}`` dict.

    Each non-blank line is read as ``WORD PHONE PHONE ...`` with fields
    separated by any run of whitespace. Words are lower-cased; when a word
    appears more than once, only its first entry is kept.

    Args:
        lex_path: Path to the lexicon text file (decoded as UTF-8).

    Returns:
        dict mapping each lower-cased word to its list of phone strings.
    """
    lexicon = {}
    # Explicit UTF-8: this notebook looks up non-ASCII (Vietnamese) words, so
    # decoding must not depend on the platform's default encoding.
    with open(lex_path, encoding="utf-8") as f:
        for line in f:
            # str.split() ignores leading/trailing whitespace, so indented
            # lines and trailing spaces no longer produce empty fields.
            fields = line.split()
            if not fields:
                # Blank line: skip instead of registering an empty-string word.
                continue
            # setdefault keeps the first pronunciation seen for a word.
            lexicon.setdefault(fields[0].lower(), fields[1:])
    return lexicon

def text_to_phonemes(text, lexicon):
    """Turn raw text into the single-utterance batch list passed to the model.

    The text is lower-cased and split on the characters ``, ; . - ? ! +`` and
    on whitespace. Tokens found in ``lexicon`` are replaced by their phones;
    the punctuation marks ``, . ? !`` are kept as phoneme symbols; ``;``,
    ``-`` and ``+`` are dropped. Any other token missing from the lexicon is
    skipped, and a ``UserWarning`` listing the skipped tokens is emitted so the
    omission is visible.

    Args:
        text: Input sentence.
        lexicon: Mapping from lower-cased word to its list of phones.

    Returns:
        A 13-element list:
        ``[speakers, text, texts, src_lens, max_src_len, mels, mel_lens,
        p_targets, e_targets, d_targets, p_control, e_control, d_control]``
        where ``text`` is the lower-cased input string, ``texts`` is a
        ``(1, n_ids)`` tensor of phoneme ids, ``src_lens`` is a 1-element
        tensor holding ``n_ids``, ``max_src_len`` is its maximum, the three
        control values are ``1.0`` and every remaining entry is ``None``.
    """
    import warnings  # local import so this cell does not rely on another cell

    text = text.lower()
    # The capturing group keeps separators as tokens. Inside a character class
    # "+" is a literal plus sign (not a quantifier), so the split happens on
    # every single separator character.
    tokens = re.split(r"([,;.\-\?\!\s+])", text)

    kept_punctuation = {",", ".", "?", "!"}
    dropped_separators = {";", "-", "+"}

    phonemes = []
    unknown_words = []
    for token in tokens:
        if token in lexicon:
            phonemes += lexicon[token]
        elif len(token.strip()) == 0:
            continue
        elif token in kept_punctuation:
            phonemes.append(token)
        elif token in dropped_separators:
            continue
        else:
            # Out-of-vocabulary word: still skipped, but reported below.
            unknown_words.append(token)

    if unknown_words:
        warnings.warn(
            "text_to_phonemes: skipped words missing from the lexicon: "
            + ", ".join(unknown_words),
            stacklevel=2,
        )

    phoneme_ids = phoneme_to_ids(" ".join(phonemes))
    # Add a leading batch dimension of size 1.
    phoneme_ids = torch.tensor(phoneme_ids).reshape(1, len(phoneme_ids))

    src_lens = torch.tensor([len(phoneme_ids[0])])

    speakers = None
    texts = phoneme_ids
    max_src_len = src_lens.max()
    mels = None
    mel_lens = None
    p_targets = None
    e_targets = None
    d_targets = None
    p_control = 1.0
    e_control = 1.0
    d_control = 1.0

    # NOTE(review): the batch has no max_mel_len slot between mel_lens and
    # p_targets (the original defined that local but never used it) — confirm
    # this layout against the model's expected input.
    batch = [
        speakers, text, texts, src_lens,
        max_src_len, mels, mel_lens,
        p_targets, e_targets, d_targets,
        p_control, e_control, d_control]

    return batch

In [None]:
# Load the preprocess / model / train YAML configuration files into dicts and
# bundle them as `configs` in that order.
config_paths = (
    "configs/preprocess.yaml",
    "configs/model.yaml",
    "configs/train.yaml",
)

loaded_configs = []
for config_path in config_paths:
    # The context manager closes each file; the bare open(...) calls used
    # before left all three handles open.
    with open(config_path, "r") as config_file:
        # NOTE(review): FullLoader accepts more than plain data types; prefer
        # yaml.safe_load if these files could ever come from an untrusted source.
        loaded_configs.append(yaml.load(config_file, Loader=yaml.FullLoader))

preprocess_config, model_config, train_config = loaded_configs
configs = (preprocess_config, model_config, train_config)

In [None]:
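# Disabled one-off utility, kept for reference: re-saves checkpoint 100000 as
# 110000, stripping the leading "module." prefix from the model state-dict keys
# that start with it and keeping the optimizer state unchanged.
# path = "/data/codes/speech-to-text/stt/output/ckpt/100000.pth.tar"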
# state_dict = torch.load(path)

# new_state_dict = {
#     "model": dict(),
#     "optimizer": state_dict["optimizer"]
# }
# for key, value in state_dict["model"].items():
#     if key.startswith("module"):
#         key = "module.".join(key.split("module.")[1:])

#     new_state_dict["model"][key] = value

# torch.save(new_state_dict, "/data/codes/speech-to-text/stt/output/ckpt/110000.pth.tar")

In [None]:
# Minimal stand-in for an argument namespace: only `restore_step` is set.
# NOTE(review): presumably selects which checkpoint step get_model restores —
# confirm against utils.model.get_model.
class args:
    restore_step = 110000

# Build the model and the vocoder on CPU with train=False, then load the lexicon.
model = get_model(args(), configs, device="cpu", train=False)
vocoder = get_vocoder(model_config, device="cpu")
lexicon = read_lexicon("data/lexicon")

# Sentence to synthesize, converted into the batch list passed to the model.
text = "bị kẻ trộm đi nhà lấy đồ hả"
input_batch = text_to_phonemes(text, lexicon)


In [None]:
# Forward pass for the prepared batch. Inference needs no gradients, so
# autograd is disabled to avoid building a computation graph.
with torch.no_grad():
    output = model(
        input_batch
    )

# Hand the batch and model output to synth_samples together with the vocoder
# and configs. NOTE(review): presumably renders audio/plots under
# train_config["path"]["result_path"] — confirm in utils.tools.
synth_samples(
    input_batch,
    output,
    vocoder,
    model_config,
    preprocess_config,
    train_config["path"]["result_path"],
)

In [1]:
# Longer sample sentence; punctuation marks are already separated by spaces.
text = " thông báo đấu giá quyền sử dụng đất gồm năm mươi mốt lô đất ở tại khu dân cư thị trấn rừng thông , huyện đông sơn , mặt bằng quy hoạch số bốn nghìn một trăm ba mươi hai , hai nghìn bảy trăm bốn mươi hai , một nghìn tám trăm bảy mươi chín . "

# Number of whitespace-delimited tokens (each space-separated punctuation mark counts as one).
print(len(text.split()))

57
