From fb25500f4e3e70e5d71462715b83fb3bedcf8bd5 Mon Sep 17 00:00:00 2001
From: 34j <55338215+34j@users.noreply.github.com>
Date: Wed, 22 Mar 2023 20:06:01 +0900
Subject: [PATCH] fix: fix RTF calculation (#70)

---
 src/so_vits_svc_fork/inference/infer_tool.py |  5 +-
 src/so_vits_svc_fork/inference_main.py       |  9 ++-
 src/so_vits_svc_fork/utils.py                | 69 ++++++++++++--------
 3 files changed, 48 insertions(+), 35 deletions(-)

diff --git a/src/so_vits_svc_fork/inference/infer_tool.py b/src/so_vits_svc_fork/inference/infer_tool.py
index 03a1fd0c..0eb2d1dd 100644
--- a/src/so_vits_svc_fork/inference/infer_tool.py
+++ b/src/so_vits_svc_fork/inference/infer_tool.py
@@ -209,10 +209,9 @@ def infer(
             predict_f0=auto_predict_f0,
             noice_scale=noise_scale,
         )[0, 0].data.float()
-        realtime_coef = len(audio) / (t.elapsed * self.target_sample)
+        audio_duration = audio.shape[-1] / self.target_sample
         LOG.info(
-            f"Inferece time: {t.elapsed:.2f}s, Realtime coef: {realtime_coef:.2f} "
-            f"Input shape: {audio.shape}, Output shape: {audio.shape}"
+            f"Inference time: {t.elapsed:.2f}s, RTF: {t.elapsed / audio_duration:.2f}"
         )
         return audio, audio.shape[-1]
 
diff --git a/src/so_vits_svc_fork/inference_main.py b/src/so_vits_svc_fork/inference_main.py
index 7b5800e6..fff2e242 100644
--- a/src/so_vits_svc_fork/inference_main.py
+++ b/src/so_vits_svc_fork/inference_main.py
@@ -161,7 +161,7 @@ def realtime(
         f"Input Device: {devices[input_device]['name']}, Output Device: {devices[output_device]['name']}"
     )
 
-    # the model realtime coef is somewhat significantly low only in the first inference
+    # the model RTF is significantly higher only in the first inference
     # there could be no better way to warm up the model than to do a dummy inference
     # (there are not differences in the behavior of the model between the first and the later inferences)
     # so we do a dummy inference to warm up the model (1 second of audio)
@@ -211,7 +211,10 @@ def callback(
             outdata[:] = (indata + inference) / 2
         else:
             outdata[:] = inference
-        LOG.info(f"True Realtime coef: {block_seconds / t.elapsed:.2f}")
+        rtf = t.elapsed / block_seconds
+        LOG.info(f"Realtime inference time: {t.elapsed:.3f}s, RTF: {rtf:.3f}")
+        if rtf > 1:
+            LOG.warning("RTF is too high, consider increasing block_seconds")
 
     with sd.Stream(
         device=(input_device, output_device),
@@ -221,6 +224,6 @@
         blocksize=int(block_seconds * svc_model.target_sample),
         latency="low",
     ) as stream:
+        LOG.info(f"Latency: {stream.latency}")
         while True:
-            LOG.info(f"Latency: {stream.latency}")
             sd.sleep(1000)
diff --git a/src/so_vits_svc_fork/utils.py b/src/so_vits_svc_fork/utils.py
index b8cff712..6efb8321 100644
--- a/src/so_vits_svc_fork/utils.py
+++ b/src/so_vits_svc_fork/utils.py
@@ -11,6 +11,7 @@
 import requests
 import torch
 import torchcrepe
+from cm_time import timer
 from numpy import dtype, float32, ndarray
 from scipy.io.wavfile import read
 from torch import FloatTensor, Tensor
@@ -245,20 +246,24 @@ def compute_f0(
     method: Literal["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"] = "crepe",
     **kwargs,
 ):
-    wav_numpy = wav_numpy.astype(np.float32)
-    wav_numpy /= np.quantile(np.abs(wav_numpy), 0.999)
-    if method in ["dio", "harvest"]:
-        return compute_f0_pyworld(wav_numpy, p_len, sampling_rate, hop_length, method)
-    elif method == "crepe":
-        return compute_f0_crepe(wav_numpy, p_len, sampling_rate, hop_length, **kwargs)
-    elif method == "crepe-tiny":
-        return compute_f0_crepe(
-            wav_numpy, p_len, sampling_rate, hop_length, model="tiny", **kwargs
-        )
-    elif method == "parselmouth":
-        return compute_f0_parselmouth(wav_numpy, p_len, sampling_rate, hop_length)
-    else:
-        raise ValueError("type must be dio, crepe, harvest or parselmouth")
+    with timer() as t:
+        wav_numpy = wav_numpy.astype(np.float32)
+        wav_numpy /= np.quantile(np.abs(wav_numpy), 0.999)
+        if method in ["dio", "harvest"]:
+            f0 = compute_f0_pyworld(wav_numpy, p_len, sampling_rate, hop_length, method)
+        elif method == "crepe":
+            f0 = compute_f0_crepe(wav_numpy, p_len, sampling_rate, hop_length, **kwargs)
+        elif method == "crepe-tiny":
+            f0 = compute_f0_crepe(
+                wav_numpy, p_len, sampling_rate, hop_length, model="tiny", **kwargs
+            )
+        elif method == "parselmouth":
+            f0 = compute_f0_parselmouth(wav_numpy, p_len, sampling_rate, hop_length)
+        else:
+            raise ValueError("method must be dio, crepe, crepe-tiny, harvest or parselmouth")
+    rtf = t.elapsed / (len(wav_numpy) / sampling_rate)
+    LOG.info(f"F0 inference time: {t.elapsed:.3f}s, RTF: {rtf:.3f}")
+    return f0


 def f0_to_coarse(f0: torch.Tensor | float):
@@ -338,21 +343,27 @@ def get_hubert_model():
 
 
 def get_hubert_content(hmodel, wav_16k_tensor):
-    feats = wav_16k_tensor
-    if feats.dim() == 2:  # double channels
-        feats = feats.mean(-1)
-    assert feats.dim() == 1, feats.dim()
-    feats = feats.view(1, -1)
-    padding_mask = torch.BoolTensor(feats.shape).fill_(False)
-    inputs = {
-        "source": feats.to(wav_16k_tensor.device),
-        "padding_mask": padding_mask.to(wav_16k_tensor.device),
-        "output_layer": 9,  # layer 9
-    }
-    with torch.no_grad():
-        logits = hmodel.extract_features(**inputs)
-        feats = hmodel.final_proj(logits[0])
-    return feats.transpose(1, 2)
+    with timer() as t:
+        feats = wav_16k_tensor
+        if feats.dim() == 2:  # double channels
+            feats = feats.mean(-1)
+        assert feats.dim() == 1, feats.dim()
+        feats = feats.view(1, -1)
+        padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+        inputs = {
+            "source": feats.to(wav_16k_tensor.device),
+            "padding_mask": padding_mask.to(wav_16k_tensor.device),
+            "output_layer": 9,  # layer 9
+        }
+        with torch.no_grad():
+            logits = hmodel.extract_features(**inputs)
+            feats = hmodel.final_proj(logits[0])
+    res = feats.transpose(1, 2)
+    wav_len = wav_16k_tensor.shape[-1] / 16000
+    LOG.info(
+        f"HuBERT inference time: {t.elapsed:.3f}s, RTF: {t.elapsed / wav_len:.3f}"
+    )
+    return res
 
 
 def get_content(cmodel: Any, y: ndarray) -> ndarray:
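Note on the metric: throughout this patch, RTF (real-time factor) is computed as processing time divided by audio duration, replacing the inverted "realtime coef". Lower is better, and RTF > 1 means processing is slower than real time. A minimal sketch of the pattern under that definition, using the cm_time timer imported above (measure_rtf and the np.abs stand-in step are illustrative, not part of the patch):

    import numpy as np
    from cm_time import timer

    def measure_rtf(audio: np.ndarray, sampling_rate: int) -> float:
        """Return the real-time factor of a (stand-in) processing step."""
        with timer() as t:
            _ = np.abs(audio)  # placeholder for inference / F0 / HuBERT extraction
        audio_duration = audio.shape[-1] / sampling_rate  # seconds of input audio
        return t.elapsed / audio_duration  # > 1 means slower than real time

For example, spending 0.5 s to process 2.0 s of audio gives RTF = 0.25, comfortably faster than real time.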