In [36]:
# %matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
import numpy as np
from text.symbols import symbols


# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
from scipy.io.wavfile import write




def chinese_cleaners1(text):
    from pypinyin import Style, pinyin

    phones = [phone[0] for phone in pinyin(text, style=Style.TONE3)]
    return ' '.join(phones)


pinyin2pinlv_dict = np.load(r'D:\project\try\pinyin2pinlv\new_pinyin2pinlv_dict.npy', allow_pickle=True).item()


def to_pinlv(cn_pinying):
    # print(cn_pinying.split())
    cn_pinlv = []
    for i in cn_pinying.split():
        if i in pinyin2pinlv_dict.keys():
            cn_pinlv.append(pinyin2pinlv_dict[i])
        else:
            cn_pinlv.append(i)
    # print(cn_pinlv)
    # print(' '.join(cn_pinlv))
    cn_pinlv = ' '.join(cn_pinlv)
    return cn_pinlv

def txt2seq(text):
    clean_text = chinese_cleaners1(text)
#     clean_text = to_pinlv(clean_text)

    sequence = []
    print(clean_text)
    for symbol in clean_text:
        symbol_id = _symbol_to_id[symbol]
        sequence += [symbol_id]
    return sequence


def get_text(text, hps):
    # text_norm = text_to_sequence(text, hps.data.text_cleaners)
    text_norm = txt2seq(text)
    print(text_norm)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    print(text_norm)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

hps = utils.get_hparams_from_file("./logs/pinyin/config.json")

net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).cpu()
_ = net_g.eval()

_ = utils.load_checkpoint("./logs/pinyin/G_116000.pth", net_g, None)

In [37]:
stn_tst = get_text("如果你想拥有一个雾面哑光，如果你想拥有的是一个雾面哑光的，一个底妆呢，就去拍零三色号，它是透明哑光的不会影响我们底妆原本的一个颜色和肤感哦，让我们的底妆更加的清爽云静。", hps)
stn_tst = get_text("所以，所以为了能探索一个既能探索一个既能保持口红，但哑光妆效易涂抹肤感，又能使料体保持一定硬度，可以。微雕花纹的料体配方。", hps)
stn_tst = get_text('他说，哎呀我今天都没化妆，结果你的妆容这么精致美丽，他就说和我一对比，他就感觉自己很邋遢不修边幅的样子。他说，感觉餐厅里的人都在看她，她很不好意思。', hps)
with torch.no_grad():
    x_tst = stn_tst.cpu().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cpu()
    audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
print(audio)
# audio.export('./0.wav', format='wav')
print(type(audio))
print(audio.shape)
print(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
# aa = ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))
print(dir(aa))


ru2 guo3 ni3 xiang3 yong1 you3 yi2 ge4 wu4 mian4 ya3 guang1 ， ru2 guo3 ni3 xiang3 yong1 you3 de shi4 yi2 ge4 wu4 mian4 ya3 guang1 de ， yi2 ge4 di3 zhuang1 ne ， jiu4 qu4 pai1 ling2 san1 se4 hao4 ， ta1 shi4 tou4 ming2 ya3 guang1 de bu2 hui4 ying3 xiang3 wo3 men di3 zhuang1 yuan2 ben3 de yi2 ge4 yan2 se4 he2 fu1 gan3 o2 ， rang4 wo3 men de di3 zhuang1 geng4 jia1 de qing1 shuang3 yun2 jing4 。
[57, 60, 68, 13, 46, 60, 54, 69, 13, 53, 48, 69, 13, 63, 48, 40, 53, 46, 69, 13, 64, 54, 53, 46, 67, 13, 64, 54, 60, 69, 13, 64, 48, 68, 13, 46, 44, 70, 13, 62, 60, 70, 13, 52, 48, 40, 53, 70, 13, 64, 40, 69, 13, 46, 60, 40, 53, 46, 67, 13, 3, 13, 57, 60, 68, 13, 46, 60, 54, 69, 13, 53, 48, 69, 13, 63, 48, 40, 53, 46, 69, 13, 64, 54, 53, 46, 67, 13, 64, 54, 60, 69, 13, 43, 44, 13, 58, 47, 48, 70, 13, 64, 48, 68, 13, 46, 44, 70, 13, 62, 60, 70, 13, 52, 48, 40, 53, 70, 13, 64, 40, 69, 13, 46, 60, 40, 53, 46, 67, 13, 43, 44, 13, 3, 13, 64, 48, 68, 13, 46, 44, 70, 13, 43, 48, 69, 13, 65, 47, 60, 40, 53, 46

[-1.0700166e-03 -1.0407178e-03 -9.5184078e-04 ...  2.2152311e-04
  5.5412300e-05  1.6925801e-04]
<class 'numpy.ndarray'>
(314368,)
<IPython.lib.display.Audio object>


['__bool__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__']
