Skip to content

Commit

Permalink
feat(preprocessing): add pre-classify command to manually classify fi…
Browse files Browse the repository at this point in the history
…les (#527)
  • Loading branch information
34j committed Apr 30, 2023
1 parent e76f91f commit 7a0319c
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 2 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ svc infer source.wav
- If your dataset has BGM, please remove the BGM using software such as [Ultimate Vocal Remover](https://ultimatevocalremover.com/). `3_HP-Vocal-UVR.pth` or `UVR-MDX-NET Main` is recommended. [^1]
- If your dataset is a long audio file with a single speaker, use `svc pre-split` to split the dataset into multiple files (using `librosa`).
- If your dataset is a long audio file with multiple speakers, use `svc pre-sd` to split the dataset into multiple files (using `pyannote.audio`). Further manual classification may be necessary due to accuracy issues. If speakers speak with a variety of speech styles, set `--min-speakers` to a value larger than the actual number of speakers. Due to unresolved dependencies, please install `pyannote.audio` manually: `pip install pyannote-audio`.
- To manually classify audio files, `svc pre-classify` is available. Up and down arrow keys can be used to change the playback speed.

[^1]: https://ytpmv.info/how-to-use-uvr/

Expand Down Expand Up @@ -209,7 +210,8 @@ Options:
Commands:
clean Clean up files, only useful if you are using the default file structure
infer Inference
onnx Export model to onnx
onnx Export model to onnx (currently not working)
pre-classify Classify multiple audio files into multiple files
pre-config Preprocessing part 2: config
pre-hubert Preprocessing part 3: hubert If the HuBERT model is not found, it will be...
pre-resample Preprocessing part 1: resample
Expand Down
41 changes: 40 additions & 1 deletion src/so_vits_svc_fork/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,6 +716,45 @@ def pre_split(
)


@cli.command()
@click.option(
    "-i",
    "--input-dir",
    type=click.Path(exists=True),
    required=True,
    help="path to source dir",
)
@click.option(
    "-o",
    "--output-dir",
    type=click.Path(),
    default=None,
    help="path to output dir",
)
@click.option(
    "-c/-nc",
    "--create-new/--no-create-new",
    type=bool,
    default=True,
    help="create a new folder for the speaker if not exist",
)
def pre_classify(
    input_dir: Path | str,
    output_dir: Path | str | None,
    create_new: bool,
) -> None:
    """Classify multiple audio files into multiple files"""
    # NOTE: the docstring above doubles as the command's help text, so it is
    # kept short on purpose.
    # The implementation is imported inside the command body rather than at
    # module level.
    from so_vits_svc_fork.preprocessing.preprocess_classify import preprocess_classify

    # When no output directory is given, the input directory is used as the
    # output directory as well.
    target_dir = input_dir if output_dir is None else output_dir
    preprocess_classify(
        input_dir=input_dir,
        output_dir=target_dir,
        create_new=create_new,
    )


@cli.command
def clean():
"""Clean up files, only useful if you are using the default file structure"""
Expand Down Expand Up @@ -763,8 +802,8 @@ def clean():
def onnx(
input_path: Path, output_path: Path, config_path: Path, device: torch.device | str
) -> None:
"""Export model to onnx (currently not working)"""
raise NotImplementedError("ONNX export is not yet supported")
"""Export model to onnx"""
input_path = Path(input_path)
if input_path.is_dir():
input_path = list(input_path.glob("*.pth"))[0]
Expand Down
95 changes: 95 additions & 0 deletions src/so_vits_svc_fork/preprocessing/preprocess_classify.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from __future__ import annotations

from logging import getLogger
from pathlib import Path

import keyboard
import librosa
import sounddevice as sd
import soundfile as sf
from rich.console import Console
from tqdm.rich import tqdm

LOG = getLogger(__name__)


def _read_pressed_key() -> str:
    """Block until a key is pressed and return its name (or scan code) as a string."""
    while True:
        event = keyboard.read_event()
        # ``keyboard`` reports both press and release events. Reacting to the
        # release as well would handle every keystroke twice (double speed
        # change, or the release classifying the next file unheard).
        if event.event_type == keyboard.KEY_DOWN:
            return str(event.name or event.scan_code)


def preprocess_classify(
    input_dir: Path | str, output_dir: Path | str, create_new: bool = True
) -> None:
    """Interactively sort the audio files of a directory into sub-folders.

    Each file directly inside ``input_dir`` is played in a loop. Pressing the
    up/down arrow keys speeds up/slows down the playback; pressing any other
    key moves the file into the first sub-folder of ``output_dir`` whose name
    starts with the name of the pressed key.

    Args:
        input_dir: Directory containing the audio files to classify.
        output_dir: Directory that holds (or will hold) one sub-folder per
            class. It is created if missing.
        create_new: If no existing sub-folder starts with the pressed key,
            create a sub-folder named after the key. When ``False`` the file
            is left in place and the next file is presented instead.

    Raises:
        ValueError: If ``input_dir`` is not a directory.
    """
    # paths
    input_dir_ = Path(input_dir)
    output_dir_ = Path(output_dir)
    speed = 1.0
    if not input_dir_.is_dir():
        raise ValueError(f"{input_dir} is not a directory.")
    output_dir_.mkdir(parents=True, exist_ok=True)

    console = Console()
    # get audio paths and folders; directories whose name contains a dot
    # (e.g. class folders when output_dir == input_dir) are not audio files
    audio_paths = [x for x in input_dir_.glob("*.*") if x.is_file()]
    last_folders = [x for x in output_dir_.glob("*") if x.is_dir()]
    console.print("Press ↑ or ↓ to change speed. Press any other key to classify.")
    console.print(f"Folders: {[x.name for x in last_folders]}")

    # Summary of the last successful move, shown next to the current duration.
    # Kept separate from the displayed description so that skipped files do
    # not pile up stale duration prefixes.
    last_result = ""

    pbar = tqdm(audio_paths)
    for audio_path in pbar:
        # read file
        try:
            audio, sr = sf.read(audio_path)
        except RuntimeError as e:
            # soundfile's read errors derive from RuntimeError; a stray
            # non-audio file should not abort the whole session.
            LOG.warning(f"Skipping {audio_path.name}: {e}")
            continue

        # soundfile returns multichannel data as (frames, channels) whereas
        # librosa expects time on the last axis; for mono (1-D) data the
        # transpose is a no-op.
        audio_t = audio.T

        # update description
        duration = librosa.get_duration(y=audio_t, sr=sr)
        pbar.set_description(f"{duration:.1f} {last_result}")

        while True:
            # start playing (sounddevice wants (frames, channels) again)
            stretched = librosa.effects.time_stretch(audio_t, rate=speed)
            sd.play(stretched.T, sr, loop=True)

            # wait for key press
            key = _read_pressed_key()
            if key == "down":
                speed /= 1.1
                console.print(f"Speed: {speed:.2f}")
            elif key == "up":
                speed *= 1.1
                console.print(f"Speed: {speed:.2f}")
            else:
                break

        # stop playing
        sd.stop()

        # print if folder changed
        folders = [x for x in output_dir_.glob("*") if x.is_dir()]
        if folders != last_folders:
            console.print(f"Folders updated: {[x.name for x in folders]}")
            last_folders = folders

        # get folder
        folder_candidates = [x for x in folders if x.name.startswith(key)]
        if not folder_candidates:
            if not create_new:
                console.print(f"No folder starts with {key}.")
                continue
            folder = output_dir_ / key
        else:
            if len(folder_candidates) > 1:
                LOG.warning(
                    f"Multiple folders ({[x.name for x in folder_candidates]}) start with {key}. "
                    f"Using first one ({folder_candidates[0].name})."
                )
            folder = folder_candidates[0]
        folder.mkdir(exist_ok=True)

        # move file
        new_path = folder / audio_path.name
        audio_path.rename(new_path)

        # remember the result for the next description
        last_result = f"Last: {audio_path.name} -> {folder.name}"

0 comments on commit 7a0319c

Please sign in to comment.