In [1]:
import numpy as np
import soundfile as sf
import matplotlib.pyplot as plt
import IPython.display as dp

from pathlib import Path

from typing import Optional
from typing import Optional
from paddle.io import Dataset

import os
import paddle
from paddle import inference

In [2]:
# inference
def get_predictor(
        model_dir: Optional[os.PathLike]=None,
        model_file: Optional[os.PathLike]=None,
        params_file: Optional[os.PathLike]=None,
        device: str='cpu',
        # for gpu
        use_trt: bool=False,
        # for trt
        use_dynamic_shape: bool=True,
        min_subgraph_size: int=5,
        # for cpu
        cpu_threads: int=1,
        use_mkldnn: bool=False,
        # for trt or mkldnn
        precision: str="fp32"):
    """Build a Paddle Inference predictor for an exported static-graph model.

    Args:
        model_dir (os.PathLike): root path of model.pdmodel and model.pdiparams.
        model_file (os.PathLike): name of model_file, relative to model_dir.
        params_file (os.PathLike): name of params_file, relative to model_dir.
        device (str): Choose the device you want to run, it can be: cpu/gpu, default is cpu.
        use_trt (bool): whether to use TensorRT or not in GPU.
        use_dynamic_shape (bool): use dynamic shape or not in TensorRT.
        min_subgraph_size (int): forwarded as ``min_subgraph_size`` to
            ``enable_tensorrt_engine``; only used when use_trt is True.
        cpu_threads (int): num of thread when use CPU.
        use_mkldnn (bool): whether to use MKLDNN or not in CPU.
        precision (str): mode of running (fp32/fp16/bf16/int8).

    Returns:
        The object returned by ``inference.create_predictor(config)``.

    Raises:
        ValueError: if use_trt is True while device is not "gpu".
        TypeError: if model_dir, model_file or params_file is None.
        AssertionError: if use_trt is True and precision is not one of
            int8/fp32/fp16 (bf16 is mapped to fp16 first).

    Note:
        With use_trt and use_dynamic_shape both True, if the tuned shape file
        ``<model_file stem>_<precision>.txt`` does not exist in model_dir, the
        config is switched to CPU shape collection and TensorRT is NOT enabled
        for the returned predictor. Presumably the caller has to run inference
        once with it and then call this function again -- TODO confirm.
    """
    if device != "gpu" and use_trt:
        raise ValueError(
            "Predict by TensorRT mode: {}, expect device=='gpu', but device == {}".
            format(precision, device))

    # All three paths default to None only for keyword-style calling; fail with
    # a readable message instead of the opaque error raised by Path(None).
    missing = [
        arg_name
        for arg_name, arg_value in (("model_dir", model_dir),
                                    ("model_file", model_file),
                                    ("params_file", params_file))
        if arg_value is None
    ]
    if missing:
        raise TypeError("get_predictor() requires {}; got None.".format(
            ", ".join(missing)))

    # Set to True when this call only collects dynamic-shape info, in which
    # case the TensorRT engine must not be enabled.
    rerun_flag = False

    config = inference.Config(
        str(Path(model_dir) / model_file), str(Path(model_dir) / params_file))
    config.enable_memory_optim()
    config.switch_ir_optim(True)
    if device == "gpu":
        config.enable_use_gpu(100, 0)
    else:
        config.disable_gpu()
        config.set_cpu_math_library_num_threads(cpu_threads)
        if use_mkldnn:
            # fp32
            config.enable_mkldnn()
            if precision == "int8":
                config.enable_mkldnn_int8({
                    "conv2d_transpose", "conv2d", "depthwise_conv2d", "pool2d",
                    "transpose2", "elementwise_mul"
                })
            elif precision in {"fp16", "bf16"}:
                # Both fp16 and bf16 requests enable MKLDNN bfloat16.
                config.enable_mkldnn_bfloat16()
            print("MKLDNN with {}".format(precision))
    if use_trt:
        if precision == "bf16":
            print("paddle trt does not support bf16, switching to fp16.")
            precision = "fp16"
        precision_map = {
            "int8": inference.Config.Precision.Int8,
            "fp32": inference.Config.Precision.Float32,
            "fp16": inference.Config.Precision.Half,
        }
        assert precision in precision_map, (
            "unsupported TensorRT precision: {}".format(precision))
        # os.fspath keeps the original result for str inputs and additionally
        # accepts the os.PathLike objects the signature advertises.
        pdtxt_name = os.fspath(model_file).split(".")[0] + "_" + precision + ".txt"
        if use_dynamic_shape:
            dynamic_shape_file = os.path.join(model_dir, pdtxt_name)
            if os.path.exists(dynamic_shape_file):
                config.enable_tuned_tensorrt_dynamic_shape(dynamic_shape_file,
                                                           True)
                # for fastspeech2
                config.exp_disable_tensorrt_ops(["reshape2"])
                print("trt set dynamic shape done!")
            else:
                # In order to avoid memory overflow when collecting dynamic shapes, it is changed to use CPU.
                config.disable_gpu()
                config.set_cpu_math_library_num_threads(10)
                config.collect_shape_range_info(dynamic_shape_file)
                print("Start collect dynamic shape...")
                rerun_flag = True

        if not rerun_flag:
            print("Tensor RT with {}".format(precision))
            config.enable_tensorrt_engine(
                workspace_size=1 << 30,
                max_batch_size=1,
                min_subgraph_size=min_subgraph_size,
                precision_mode=precision_map[precision],
                use_static=True,
                use_calib_mode=False, )

    predictor = inference.create_predictor(config)
    return predictor

def get_voc_output(voc_predictor, input):
    """Run one forward pass of the vocoder predictor and return its first output.

    Args:
        voc_predictor: predictor object exposing the input/output handle API
            used below (e.g. the value returned by ``get_predictor``).
        input: array fed to the predictor's first input; its ``shape`` is used
            to reshape the input handle before copying.

    Returns:
        The content of the predictor's first output, copied back to CPU.
    """
    # Feed the first declared input.
    first_input_name = voc_predictor.get_input_names()[0]
    feed_handle = voc_predictor.get_input_handle(first_input_name)
    feed_handle.reshape(input.shape)
    feed_handle.copy_from_cpu(input)

    voc_predictor.run()

    # Fetch the first declared output.
    first_output_name = voc_predictor.get_output_names()[0]
    return voc_predictor.get_output_handle(first_output_name).copy_to_cpu()

## HiFiGAN -> PaddleInference 无法推理，用 exe.run() 验证下量化模型是否有问题

In [4]:
# Load the saved feature array that every inference cell below feeds to the
# vocoder. Presumably the ground-truth mel-spectrogram of utterance 009901
# (inferred from the gt_ prefix and file name) -- TODO confirm.
gt_mel_path = 'dump/test/raw/009901_feats.npy'
gt_mel = np.load(gt_mel_path)

### fp32 静态图推理

In [5]:
# Baseline run: build a CPU + MKLDNN predictor in fp32 for the model files
# below and run it on gt_mel.
model_file = 'hifigan_csmsc.pdmodel'
params_file = 'hifigan_csmsc.pdiparams'
model_dir = './hifigan_csmsc_static_1.4.0'

# NOTE: `device` is also read by the int8 cell further down, which does not
# redefine it.
device = 'cpu'
precision = 'fp32'
voc_predictor = get_predictor(
        model_dir=model_dir,
        model_file=model_file,
        params_file=params_file,
        device=device,
        use_mkldnn=True,
        precision=precision
    )
wav = get_voc_output(voc_predictor=voc_predictor, input=gt_mel)

MKLDNN with fp32


In [6]:
# Save the fp32 result to disk and render an audio player for it.
# NOTE(review): 24000 is assumed to be the model's sample rate -- confirm.
wav_path = 'hifigan_fp32_infer.wav'
sf.write(wav_path, wav, samplerate=24000)
dp.Audio(wav_path, rate=24000)

### PaddleInference int8 推理报错 ->❎

In [7]:
# Same pipeline as the fp32 cell, but pointing at the quantized export and
# requesting int8 through MKLDNN. The recorded output shows this run fails
# with a RuntimeError in the dequantize_linear operator.
model_file = 'hifigan_csmsc.pdmodel'
params_file = 'hifigan_csmsc.pdiparams'
model_dir = './exp/default/inference/hifigan_csmsc_quant'
precision = 'int8'
# NOTE: `device` is not set in this cell; it relies on the value assigned in
# the fp32 cell above, so that cell must have been executed first.
voc_predictor = get_predictor(
        model_dir=model_dir,
        model_file=model_file,
        params_file=params_file,
        device=device,
        use_mkldnn=True,
        precision=precision
    )
wav = get_voc_output(voc_predictor=voc_predictor, input=gt_mel)

MKLDNN with int8


RuntimeError: (PreconditionNotMet) The number of first scale values must be the same with quant_axis dimension value of Input(X) when the `scale` has only one element, but 512 != 256 here.
  [Hint: Expected scale->numel() == in_tmp.dims()[quant_axis], but received scale->numel():512 != in_tmp.dims()[quant_axis]:256.] (at /paddle/paddle/fluid/operators/quantize_linear_op.h:172)
  [operator < dequantize_linear > error]

### int8 exe.run() 推理 -> 报错，而且和 PaddleInference 报错一样 ->❎

In [8]:
# Cross-check the quantized model with the static-graph Executor instead of
# Paddle Inference. The recorded output shows the same dequantize_linear
# RuntimeError as the Paddle Inference int8 run.
# NOTE: enable_static() switches paddle to static-graph mode for the rest of
# the kernel session.
paddle.enable_static()
# Here model_dir is a path prefix (directory + model name without extension),
# unlike the directory-only model_dir used in the cells above.
model_dir = './exp/default/inference/hifigan_csmsc_quant/hifigan_csmsc'
# NOTE(review): this executor targets GPU 0 while the predictor cells above
# run with device='cpu' -- confirm this is intended.
exe = paddle.static.Executor(paddle.CUDAPlace(0))
# NOTE(review): the mb_melgan_* names look like leftovers from another
# notebook; the model loaded here is the HiFiGAN export.
[mb_melgan_inference, feed_target_names, fetch_targets] = paddle.static.load_inference_model(model_dir, exe)
# 009901
mb_melgan_int8_runexe = exe.run(mb_melgan_inference, feed={feed_target_names[0]:gt_mel},fetch_list=fetch_targets)

RuntimeError: (PreconditionNotMet) The number of first scale values must be the same with quant_axis dimension value of Input(X) when the `scale` has only one element, but 512 != 256 here.
  [Hint: Expected scale->numel() == in_tmp.dims()[quant_axis], but received scale->numel():512 != in_tmp.dims()[quant_axis]:256.] (at /paddle/paddle/fluid/operators/quantize_linear_op.h:172)
  [operator < dequantize_linear > error]