In [4]:
# %matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import numpy as np
from text.symbols import symbols


# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
from scipy.io.wavfile import write




def chinese_cleaners1(text):
    from pypinyin import Style, pinyin

    phones = [phone[0] for phone in pinyin(text, style=Style.TONE3)]
    return ' '.join(phones)


pinyin2pinlv_dict = np.load(r'D:\project\try\pinyin2pinlv\pinyin2pinlv_dict.npy', allow_pickle=True).item()


def to_pinlv(cn_pinying):
    # print(cn_pinying.split())
    cn_pinlv = []
    for i in cn_pinying.split():
        if i in pinyin2pinlv_dict.keys():
            cn_pinlv.append(pinyin2pinlv_dict[i])
        else:
            cn_pinlv.append(i)
    # print(cn_pinlv)
    # print(' '.join(cn_pinlv))
    cn_pinlv = ' '.join(cn_pinlv)
    return cn_pinlv

def txt2seq(text):
    clean_text = chinese_cleaners1(text)
#     clean_text = to_pinlv(clean_text)

    sequence = []

    for symbol in clean_text:
        symbol_id = _symbol_to_id[symbol]
        sequence += [symbol_id]
    return sequence


def get_text(text, hps):
    # text_norm = text_to_sequence(text, hps.data.text_cleaners)
    text_norm = txt2seq(text)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    print(text_norm)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

hps = utils.get_hparams_from_file("./logs/config.json")

net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).cpu()
_ = net_g.eval()

_ = utils.load_checkpoint("./logs/G_10000.pth", net_g, None)

# # stn_tst = get_text("VITS is Awesome!", hps)
# stn_tst = get_text("我是中国人，微软小冰在苏州，我在做变声器项目。", hps)

In [6]:
stn_tst = get_text("王总，昨晚您好厉害哦！今天来个什么项目呀？", hps)
with torch.no_grad():
    x_tst = stn_tst.cpu().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cpu()
    audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
print(audio)
# audio.export('./0.wav', format='wav')
print(type(audio))
print(audio.shape)
print(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
aa = ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
print(dir(aa))

[0, 62, 0, 40, 0, 53, 0, 46, 0, 67, 0, 13, 0, 65, 0, 54, 0, 53, 0, 46, 0, 68, 0, 13, 0, 3, 0, 13, 0, 65, 0, 60, 0, 54, 0, 67, 0, 13, 0, 62, 0, 40, 0, 53, 0, 68, 0, 13, 0, 53, 0, 48, 0, 53, 0, 67, 0, 13, 0, 47, 0, 40, 0, 54, 0, 68, 0, 13, 0, 51, 0, 48, 0, 69, 0, 13, 0, 47, 0, 40, 0, 48, 0, 69, 0, 13, 0, 54, 0, 67, 0, 13, 0, 5, 0, 13, 0, 49, 0, 48, 0, 53, 0, 66, 0, 13, 0, 59, 0, 48, 0, 40, 0, 53, 0, 66, 0, 13, 0, 51, 0, 40, 0, 48, 0, 67, 0, 13, 0, 46, 0, 44, 0, 69, 0, 13, 0, 58, 0, 47, 0, 44, 0, 53, 0, 67, 0, 13, 0, 52, 0, 44, 0, 13, 0, 63, 0, 48, 0, 40, 0, 53, 0, 46, 0, 69, 0, 13, 0, 52, 0, 60, 0, 69, 0, 13, 0, 64, 0, 40, 0, 13, 0, 6, 0]
[-5.42710586e-05 -2.88438987e-05 -2.45090225e-04 ...  1.07074586e-04
  2.55662337e-04  3.03065986e-04]
<class 'numpy.ndarray'>
(84992,)
<IPython.lib.display.Audio object>


['__bool__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__']
