feat(infer): add max_chunk_seconds option (#550)

Add max_chunk_seconds, an option that splits overly long speech chunks into shorter pieces to avoid out-of-memory (OOM) errors during inference (set to 0 to disable).
voicepaw · May 4, 2023 · 101b948 · 101b948
1 parent f7b0819
commit 101b948
Show file tree

Hide file tree

Showing 5 changed files with 40 additions and 1 deletion.
diff --git a/src/so_vits_svc_fork/__main__.py b/src/so_vits_svc_fork/__main__.py
@@ -202,6 +202,13 @@ def train(
     default=False,
     help="absolute thresh",
 )
+@click.option(
+    "-mc",
+    "--max-chunk-seconds",
+    type=float,
+    default=40,
+    help="maximum allowed single chunk length, set lower if you get out of memory (0 to disable)",
+)
 def infer(
     # paths
     input_path: Path,
@@ -221,6 +228,7 @@ def infer(
     pad_seconds: float = 0.5,
     chunk_seconds: float = 0.5,
     absolute_thresh: bool = False,
+    max_chunk_seconds: float = 40,
     device: str | torch.device = get_optimal_device(),
 ):
     """Inference"""
@@ -264,6 +272,7 @@ def infer(
         pad_seconds=pad_seconds,
         chunk_seconds=chunk_seconds,
         absolute_thresh=absolute_thresh,
+        max_chunk_seconds=max_chunk_seconds,
         device=device,
     )
 

diff --git a/src/so_vits_svc_fork/default_gui_presets.json b/src/so_vits_svc_fork/default_gui_presets.json
@@ -9,6 +9,7 @@
     "pad_seconds": 0.1,
     "chunk_seconds": 0.5,
     "absolute_thresh": true,
+    "max_chunk_seconds": 40,
     "crossfade_seconds": 0.05,
     "block_seconds": 0.35,
     "additional_infer_before_seconds": 0.15,
@@ -27,6 +28,7 @@
     "pad_seconds": 0.1,
     "chunk_seconds": 0.5,
     "absolute_thresh": true,
+    "max_chunk_seconds": 40,
     "crossfade_seconds": 0.05,
     "block_seconds": 1.5,
     "additional_infer_before_seconds": 0.01,
@@ -45,6 +47,7 @@
     "pad_seconds": 0.1,
     "chunk_seconds": 0.5,
     "absolute_thresh": true,
+    "max_chunk_seconds": 40,
     "crossfade_seconds": 0.05,
     "block_seconds": 2.5,
     "additional_infer_before_seconds": 0.01,
@@ -63,6 +66,7 @@
     "pad_seconds": 0.1,
     "chunk_seconds": 0.5,
     "absolute_thresh": true,
+    "max_chunk_seconds": 40,
     "crossfade_seconds": 0.04,
     "block_seconds": 0.15,
     "additional_infer_before_seconds": 0.05,
@@ -81,6 +85,7 @@
     "pad_seconds": 0.1,
     "chunk_seconds": 0.5,
     "absolute_thresh": true,
+    "max_chunk_seconds": 40,
     "auto_play": true,
     "passthrough_original": false
   }

diff --git a/src/so_vits_svc_fork/gui.py b/src/so_vits_svc_fork/gui.py
@@ -290,6 +290,16 @@ def main():
                     resolution=0.01,
                 ),
             ],
+            [
+                sg.Text("Max chunk seconds (set lower if Out Of Memory, 0 to disable)"),
+                sg.Push(),
+                sg.Slider(
+                    range=(0.0, 240.0),
+                    orientation="h",
+                    key="max_chunk_seconds",
+                    resolution=1.0,
+                ),
+            ],
             [
                 sg.Checkbox(
                     key="absolute_thresh",
@@ -644,6 +654,7 @@ def apply_preset(name: str) -> None:
                             pad_seconds=values["pad_seconds"],
                             chunk_seconds=values["chunk_seconds"],
                             absolute_thresh=values["absolute_thresh"],
+                            max_chunk_seconds=values["max_chunk_seconds"],
                             device="cpu"
                             if not values["use_gpu"]
                             else get_optimal_device(),

diff --git a/src/so_vits_svc_fork/inference/core.py b/src/so_vits_svc_fork/inference/core.py
@@ -64,6 +64,7 @@ def split_silence(
     frame_length: int = 2048,
     hop_length: int = 512,
     aggregate: Callable[[ndarray[Any, dtype[float32]]], float] = np.mean,
+    max_chunk_length: int = 0,
 ) -> Iterable[Chunk]:
     non_silence_indices = librosa.effects.split(
         audio,
@@ -79,7 +80,16 @@ def split_silence(
             yield Chunk(
                 is_speech=False, audio=audio[last_end:start], start=last_end, end=start
             )
-        yield Chunk(is_speech=True, audio=audio[start:end], start=start, end=end)
+        while max_chunk_length > 0 and end - start > max_chunk_length:
+            yield Chunk(
+                is_speech=True,
+                audio=audio[start : start + max_chunk_length],
+                start=start,
+                end=start + max_chunk_length,
+            )
+            start += max_chunk_length
+        if end - start > 0:
+            yield Chunk(is_speech=True, audio=audio[start:end], start=start, end=end)
         last_end = end
     if last_end != len(audio):
         yield Chunk(
@@ -248,6 +258,7 @@ def infer_silence(
         pad_seconds: float = 0.5,
         chunk_seconds: float = 0.5,
         absolute_thresh: bool = False,
+        max_chunk_seconds: float = 40,
         # fade_seconds: float = 0.0,
     ) -> np.ndarray[Any, np.dtype[np.float32]]:
         sr = self.target_sample
@@ -267,6 +278,7 @@ def infer_silence(
             frame_length=chunk_length_min * 2,
             hop_length=chunk_length_min,
             ref=1 if absolute_thresh else np.max,
+            max_chunk_length=int(max_chunk_seconds * sr),
         ):
             LOG.info(f"Chunk: {chunk}")
             if not chunk.is_speech:

diff --git a/src/so_vits_svc_fork/inference/main.py b/src/so_vits_svc_fork/inference/main.py
@@ -36,6 +36,7 @@ def infer(
     pad_seconds: float = 0.5,
     chunk_seconds: float = 0.5,
     absolute_thresh: bool = False,
+    max_chunk_seconds: float = 40,
     device: str | torch.device = get_optimal_device(),
 ):
     model_path = Path(model_path)
@@ -65,6 +66,7 @@ def infer(
         pad_seconds=pad_seconds,
         chunk_seconds=chunk_seconds,
         absolute_thresh=absolute_thresh,
+        max_chunk_seconds=max_chunk_seconds,
     )
 
     soundfile.write(output_path, audio, svc_model.target_sample)