Skip to content

Commit

Permalink
feat(infer): add max_chunk_seconds option (#550)
Browse files (browse the repository at this point in the history)
Add max_chunk_seconds, an option that splits overly long chunks to avoid out-of-memory (OOM) errors.
  • Loading branch information
maximxlss authored May 4, 2023
1 parent f7b0819 commit 101b948
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 1 deletion.
9 changes: 9 additions & 0 deletions src/so_vits_svc_fork/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,13 @@ def train(
default=False,
help="absolute thresh",
)
@click.option(
"-mc",
"--max-chunk-seconds",
type=float,
default=40,
help="maximum allowed single chunk length, set lower if you get out of memory (0 to disable)",
)
def infer(
# paths
input_path: Path,
Expand All @@ -221,6 +228,7 @@ def infer(
pad_seconds: float = 0.5,
chunk_seconds: float = 0.5,
absolute_thresh: bool = False,
max_chunk_seconds: float = 40,
device: str | torch.device = get_optimal_device(),
):
"""Inference"""
Expand Down Expand Up @@ -264,6 +272,7 @@ def infer(
pad_seconds=pad_seconds,
chunk_seconds=chunk_seconds,
absolute_thresh=absolute_thresh,
max_chunk_seconds=max_chunk_seconds,
device=device,
)

Expand Down
5 changes: 5 additions & 0 deletions src/so_vits_svc_fork/default_gui_presets.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"pad_seconds": 0.1,
"chunk_seconds": 0.5,
"absolute_thresh": true,
"max_chunk_seconds": 40,
"crossfade_seconds": 0.05,
"block_seconds": 0.35,
"additional_infer_before_seconds": 0.15,
Expand All @@ -27,6 +28,7 @@
"pad_seconds": 0.1,
"chunk_seconds": 0.5,
"absolute_thresh": true,
"max_chunk_seconds": 40,
"crossfade_seconds": 0.05,
"block_seconds": 1.5,
"additional_infer_before_seconds": 0.01,
Expand All @@ -45,6 +47,7 @@
"pad_seconds": 0.1,
"chunk_seconds": 0.5,
"absolute_thresh": true,
"max_chunk_seconds": 40,
"crossfade_seconds": 0.05,
"block_seconds": 2.5,
"additional_infer_before_seconds": 0.01,
Expand All @@ -63,6 +66,7 @@
"pad_seconds": 0.1,
"chunk_seconds": 0.5,
"absolute_thresh": true,
"max_chunk_seconds": 40,
"crossfade_seconds": 0.04,
"block_seconds": 0.15,
"additional_infer_before_seconds": 0.05,
Expand All @@ -81,6 +85,7 @@
"pad_seconds": 0.1,
"chunk_seconds": 0.5,
"absolute_thresh": true,
"max_chunk_seconds": 40,
"auto_play": true,
"passthrough_original": false
}
Expand Down
11 changes: 11 additions & 0 deletions src/so_vits_svc_fork/gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,16 @@ def main():
resolution=0.01,
),
],
[
sg.Text("Max chunk seconds (set lower if Out Of Memory, 0 to disable)"),
sg.Push(),
sg.Slider(
range=(0.0, 240.0),
orientation="h",
key="max_chunk_seconds",
resolution=1.0,
),
],
[
sg.Checkbox(
key="absolute_thresh",
Expand Down Expand Up @@ -644,6 +654,7 @@ def apply_preset(name: str) -> None:
pad_seconds=values["pad_seconds"],
chunk_seconds=values["chunk_seconds"],
absolute_thresh=values["absolute_thresh"],
max_chunk_seconds=values["max_chunk_seconds"],
device="cpu"
if not values["use_gpu"]
else get_optimal_device(),
Expand Down
14 changes: 13 additions & 1 deletion src/so_vits_svc_fork/inference/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def split_silence(
frame_length: int = 2048,
hop_length: int = 512,
aggregate: Callable[[ndarray[Any, dtype[float32]]], float] = np.mean,
max_chunk_length: int = 0,
) -> Iterable[Chunk]:
non_silence_indices = librosa.effects.split(
audio,
Expand All @@ -79,7 +80,16 @@ def split_silence(
yield Chunk(
is_speech=False, audio=audio[last_end:start], start=last_end, end=start
)
yield Chunk(is_speech=True, audio=audio[start:end], start=start, end=end)
while max_chunk_length > 0 and end - start > max_chunk_length:
yield Chunk(
is_speech=True,
audio=audio[start : start + max_chunk_length],
start=start,
end=start + max_chunk_length,
)
start += max_chunk_length
if end - start > 0:
yield Chunk(is_speech=True, audio=audio[start:end], start=start, end=end)
last_end = end
if last_end != len(audio):
yield Chunk(
Expand Down Expand Up @@ -248,6 +258,7 @@ def infer_silence(
pad_seconds: float = 0.5,
chunk_seconds: float = 0.5,
absolute_thresh: bool = False,
max_chunk_seconds: float = 40,
# fade_seconds: float = 0.0,
) -> np.ndarray[Any, np.dtype[np.float32]]:
sr = self.target_sample
Expand All @@ -267,6 +278,7 @@ def infer_silence(
frame_length=chunk_length_min * 2,
hop_length=chunk_length_min,
ref=1 if absolute_thresh else np.max,
max_chunk_length=int(max_chunk_seconds * sr),
):
LOG.info(f"Chunk: {chunk}")
if not chunk.is_speech:
Expand Down
2 changes: 2 additions & 0 deletions src/so_vits_svc_fork/inference/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def infer(
pad_seconds: float = 0.5,
chunk_seconds: float = 0.5,
absolute_thresh: bool = False,
max_chunk_seconds: float = 40,
device: str | torch.device = get_optimal_device(),
):
model_path = Path(model_path)
Expand Down Expand Up @@ -65,6 +66,7 @@ def infer(
pad_seconds=pad_seconds,
chunk_seconds=chunk_seconds,
absolute_thresh=absolute_thresh,
max_chunk_seconds=max_chunk_seconds,
)

soundfile.write(output_path, audio, svc_model.target_sample)
Expand Down

0 comments on commit 101b948

Please sign in to comment.