In [1]:
# !pip install --upgrade pydantic
# !pip install gradio==3.48.0
# !pip install soundfile

In [2]:
# Prediction interface for Cog ⚙️
# https://cog.run/python

from cog import BasePredictor, Path

print("Starting up. Please be patient...")

import datetime
import os
import sys
from typing import Optional
import json
import utils

import torch
import yaml
import soundfile as sf

from common.log import logger
from common.tts_model import ModelHolder
from infer import InvalidToneError
from text.japanese import g2kata_tone, kata_tone2phone_tone, text_normalize

# True only when the SYSTEM environment variable is exactly "spaces"
# (presumably the marker for a Hugging Face Spaces deployment, going by the name).
is_hf_spaces = os.environ.get("SYSTEM") == "spaces"
# Maximum number of input characters tts_fn accepts when is_hf_spaces is set.
limit = 150

# Get path settings
with open(os.path.join("configs", "paths.yml"), "r", encoding="utf-8") as f:
    path_config: dict[str, str] = yaml.safe_load(f)

# Directory handed to ModelHolder as the root of the model assets.
assets_root = path_config["assets_root"]


def tts_fn(
    model_name,
    model_path,
    text,
    language,
    reference_audio_path,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    line_split,
    split_interval,
    assist_text,
    assist_text_weight,
    use_assist_text,
    style,
    style_weight,
    kata_tone_json_str,
    use_tone,
    speaker,
):
    """Run one text-to-speech request against the globally held model.

    Relies on the module-level ``model_holder`` (created in
    ``Predictor.setup``): the requested model is loaded if none is loaded yet,
    or swapped in if the loaded model's path differs from ``model_path``.

    Args:
        model_name: Name passed to ``model_holder.load_model_gr``.
        model_path: Path of the model file; compared against
            ``current_model.model_path`` to decide whether to swap models.
        text: Text to synthesize. Must be at least 2 characters; when
            ``is_hf_spaces`` is set it must not exceed ``limit`` characters.
        language: Language code forwarded to ``infer``; ``"JP"`` additionally
            enables returning accent information.
        reference_audio_path: Optional reference audio forwarded to ``infer``.
            When truthy, the style is logged as ``"External Audio"``.
        sdp_ratio, noise_scale, noise_scale_w, length_scale: Forwarded to
            ``infer`` as ``sdp_ratio``, ``noise``, ``noisew`` and ``length``.
        line_split, split_interval: Forwarded to ``infer``.
        assist_text, assist_text_weight, use_assist_text: Forwarded to ``infer``.
        style, style_weight: Forwarded to ``infer``.
        kata_tone_json_str: JSON list of ``[kana, tone]`` pairs (tone 0 or 1)
            used as an explicit accent specification when ``use_tone`` is set.
        use_tone: Whether to apply ``kata_tone_json_str``.
        speaker: Key looked up in ``current_model.spk2id`` to get the speaker id.

    Returns:
        A 3-tuple ``(message, audio, kata_tone_json_str)``. On success
        ``audio`` is the ``(sr, audio)`` pair returned by ``infer``; on a
        rejected input or a handled inference error it is ``None`` and
        ``message`` describes the problem.

    Raises:
        KeyError: If ``speaker`` is not present in ``current_model.spk2id``.
    """
    # Echo every argument, in signature order, for debugging.
    for arg_name, arg_value in (
        ("model_name", model_name),
        ("model_path", model_path),
        ("text", text),
        ("language", language),
        ("reference_audio_path", reference_audio_path),
        ("sdp_ratio", sdp_ratio),
        ("noise_scale", noise_scale),
        ("noise_scale_w", noise_scale_w),
        ("length_scale", length_scale),
        ("line_split", line_split),
        ("split_interval", split_interval),
        ("assist_text", assist_text),
        ("assist_text_weight", assist_text_weight),
        ("use_assist_text", use_assist_text),
        ("style", style),
        ("style_weight", style_weight),
        ("kata_tone_json_str", kata_tone_json_str),
        ("use_tone", use_tone),
        ("speaker", speaker),
    ):
        print(f"[!] {arg_name}: {arg_value}")

    if len(text) < 2:
        return "Please enter some text.", None, kata_tone_json_str

    if is_hf_spaces and len(text) > limit:
        return (
            f"Too long! There is a character limit of {limit} characters.",
            None,
            kata_tone_json_str,
        )

    # Make sure the requested model is the one currently loaded.
    if not model_holder.current_model:
        model_holder.load_model_gr(model_name, model_path)
        logger.info(f"Loaded model '{model_name}'")
    if model_holder.current_model.model_path != model_path:
        model_holder.load_model_gr(model_name, model_path)
        logger.info(f"Swapped to model '{model_name}'")
    speaker_id = model_holder.current_model.spk2id[speaker]
    start_time = datetime.datetime.now()

    wrong_tone_message = ""
    kata_tone: Optional[list[tuple[str, int]]] = None
    if use_tone and kata_tone_json_str != "":
        # These two conditions only produce a warning that is prepended to the
        # final message; the accent data is still parsed below.
        if language != "JP":
            wrong_tone_message = "アクセント指定は現在日本語のみ対応しています。"
        if line_split:
            wrong_tone_message = (
                "アクセント指定は改行で分けて生成を使わない場合のみ対応しています。"
            )
        try:
            kata_tone = []
            # Convert the JSON pairs to tuples, validating each entry.
            for kana, accent in json.loads(kata_tone_json_str):
                # Explicit raise instead of `assert`, so the validation is not
                # stripped when Python runs with -O.
                if not (isinstance(kana, str) and accent in (0, 1)):
                    raise ValueError(f"{kana}, {accent}")
                kata_tone.append((kana, accent))
        except Exception as e:
            logger.warning(f"Error occurred when parsing kana_tone_json: {e}")
            wrong_tone_message = f"アクセント指定が不正です: {e}"
            kata_tone = None

    # `tone` is not None only when an accent specification is actually fed
    # into the synthesis.
    tone: Optional[list[int]] = None
    if kata_tone is not None:
        phone_tone = kata_tone2phone_tone(kata_tone)
        tone = [t for _, t in phone_tone]

    try:
        sr, audio = model_holder.current_model.infer(
            text=text,
            language=language,
            reference_audio_path=reference_audio_path,
            sdp_ratio=sdp_ratio,
            noise=noise_scale,
            noisew=noise_scale_w,
            length=length_scale,
            line_split=line_split,
            split_interval=split_interval,
            assist_text=assist_text,
            assist_text_weight=assist_text_weight,
            use_assist_text=use_assist_text,
            style=style,
            style_weight=style_weight,
            given_tone=tone,
            sid=speaker_id,
        )
    except InvalidToneError as e:
        logger.error(f"Tone error: {e}")
        return f"Error: アクセント指定が不正です:\n{e}", None, kata_tone_json_str
    except ValueError as e:
        logger.error(f"Value error: {e}")
        return f"Error: {e}", None, kata_tone_json_str

    end_time = datetime.datetime.now()
    duration = (end_time - start_time).total_seconds()

    if tone is None and language == "JP":
        # Return the accent information so the caller can reuse it as an
        # accent specification.
        norm_text = text_normalize(text)
        kata_tone = g2kata_tone(norm_text)
        kata_tone_json_str = json.dumps(kata_tone, ensure_ascii=False)
    elif tone is None:
        kata_tone_json_str = ""

    if reference_audio_path:
        style = "External Audio"
    logger.info(
        f"Successful inference, took {duration}s | {speaker} | {language}/{sdp_ratio}/{noise_scale}/{noise_scale_w}/{length_scale}/{style}/{style_weight} | {text}"
    )
    message = f"Success, time: {duration} seconds."
    if wrong_tone_message != "":
        message = wrong_tone_message + "\n" + message
    return message, (sr, audio), kata_tone_json_str


def load_voicedata():
    """Index the enabled voices listed in ``voicelist.json``.

    Returns:
        A pair ``([envoices, jpvoices], styledict)``. Each voice is a tuple
        ``(name, model_path, title, speakerid, datasetauthor, cover)``; voices
        whose ``primarylang`` is ``"JP"`` go into ``jpvoices`` and all others
        into ``envoices``. ``styledict`` maps each model path to the keys of
        the ``style2id`` table read from that model's ``config.json``.
    """
    print("Loading voice data...")
    english_bucket = []
    japanese_bucket = []
    styles_by_model = {}

    with open("voicelist.json", "r", encoding="utf-8") as voicelist_file:
        voice_table = json.load(voicelist_file)

    for name, entry in voice_table.items():
        if not entry["enable"]:
            continue

        model_path = entry["model_path"]
        record = (
            name,
            model_path,
            entry["title"],
            entry["speakerid"],
            entry["datasetauthor"],
            entry["cover"],
        )

        # Several voices can share one model; read its config only once.
        if model_path not in styles_by_model:
            hparams = utils.get_hparams_from_file(
                f"model_assets/{model_path}/config.json"
            )
            styles_by_model[model_path] = hparams.data.style2id.keys()

        print(f"Indexed voice {record[2]}")

        bucket = japanese_bucket if entry["primarylang"] == "JP" else english_bucket
        bucket.append(record)

    return [english_bucket, japanese_bucket], styles_by_model


class Predictor(BasePredictor):
    """Cog predictor that exposes ``tts_fn`` as a speaker-name-driven TTS call."""

    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        # tts_fn reads model_holder as a module-level global, so it is
        # published here rather than stored on the instance.
        global model_holder

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        model_holder = ModelHolder(assets_root, self.device)

        self.languages = ["EN", "JP", "ZH"]
        self.langnames = ["English", "Japanese"]

        model_names = model_holder.model_names
        if len(model_names) == 0:
            logger.error(f"No models found. Please place the model in {assets_root}.")
            sys.exit(1)
        initial_id = 0
        initial_pth_files = model_holder.model_files_dict[model_names[initial_id]]

        # voicedata is [english_voices, japanese_voices]; styledict maps a
        # model path to its style names (see load_voicedata).
        self.voicedata, self.styledict = load_voicedata()

        # Load the initial model
        model_holder.load_model_gr(model_names[initial_id], initial_pth_files[0])
        print(f"Loaded initial model: {model_names[initial_id]}")

        # Verify that the model is loaded
        if model_holder.current_model is None:
            raise RuntimeError("Failed to load the initial model.")
        else:
            print(f"Model loaded successfully: {model_holder.current_model.model_path}")

    def predict(
        self,
        speaker: str = "MoriCalliope",  # Default speaker
        text_input: str = "Hello there! This is test audio of a new Hololive text to speech tool.",
        language: str = "EN",
        reference_audio_path: str = None,
        line_split: bool = True,
        split_interval: float = 0.5,
        style: str = "Neutral",
        style_weight: float = 5.0,
        use_tone: bool = False,
        sdp_ratio: float = 0.2,
        noise_scale: float = 0.6,
        noise_scale_w: float = 0.8,
        length_scale: float = 1.0,
        style_text_weight: float = 0.7,
        use_style_text: bool = False,
        style_text: str = "",
    ) -> Path:
        """Synthesize ``text_input`` with the given speaker and write it to ``output.wav``.

        Args:
            speaker: Speaker id; matched against the ``speakerid`` field
                (index 3) of the voice tuples in ``self.voicedata``.
            text_input: Text to synthesize.
            language: Language code forwarded to ``tts_fn``.
            reference_audio_path: Optional reference audio; an empty value is
                forwarded as ``None``.
            line_split, split_interval: Forwarded to ``tts_fn``.
            style, style_weight: Forwarded to ``tts_fn``. ``style_weight`` is
                annotated ``float`` because it is passed on via ``float()``.
            use_tone: Forwarded to ``tts_fn``; no accent JSON is supplied, so
                ``kata_tone_json_str`` is always passed as ``""``.
            sdp_ratio, noise_scale, noise_scale_w, length_scale: Forwarded to
                ``tts_fn`` as floats.
            style_text_weight, use_style_text, style_text: Forwarded to
                ``tts_fn`` as its ``assist_text*`` arguments.

        Returns:
            ``Path`` of the written ``output.wav`` file.

        Raises:
            ValueError: If ``speaker`` is unknown, if ``tts_fn`` reports a
                failure (its message is used as the error text), or if the
                returned audio is empty.
        """
        # Find the first voice whose speaker id matches the request.
        selected_voice = next(
            (
                voice
                for voice_group in self.voicedata
                for voice in voice_group
                if voice[3] == speaker
            ),
            None,
        )
        if selected_voice is None:
            raise ValueError(f"Speaker {speaker} not found in voicedata.")

        _, model_dir, _, speakerid, _, _ = selected_voice
        model_name = model_dir  # The model name is the model directory name
        model_path = f"model_assets/{model_dir}/{model_dir}.safetensors"

        # Debugging: print the speakers that were actually indexed, without
        # reusing the name `speaker` (which would clobber the parameter).
        english_voices, japanese_voices = self.voicedata
        print("Available speakers for English:")
        for voice in english_voices:
            print(f" - {voice[3]}")

        print("\nAvailable speakers for Japanese:")
        for voice in japanese_voices:
            print(f" - {voice[3]}")

        print(f"\nSelected speaker ID: {speakerid}")

        # Ensure the types of the parameters match the expected types in tts_fn
        message, audio_result, _ = tts_fn(
            model_name,
            model_path,
            text_input,
            language,
            reference_audio_path or None,
            float(sdp_ratio),
            float(noise_scale),
            float(noise_scale_w),
            float(length_scale),
            bool(line_split),
            float(split_interval),
            style_text,
            float(style_text_weight),
            bool(use_style_text),
            style,
            float(style_weight),
            "",  # kata_tone_json_str
            bool(use_tone),
            speakerid,
        )

        # tts_fn signals failure by returning None in the audio slot together
        # with an explanatory message; surface that message instead of
        # crashing while unpacking None.
        if audio_result is None:
            raise ValueError(message)
        sr, audio = audio_result

        # Check if audio data is valid
        if audio is None or len(audio) == 0:
            raise ValueError("Invalid audio data received from tts_fn")

        # Save the audio output to a file using soundfile
        output_path = "output.wav"
        sf.write(output_path, audio, sr)

        # Return a Path object to match the declared return type.
        return Path(output_path)

Starting up. Please be patient...


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Create the predictor and run its one-time setup: this builds the global
# ModelHolder, indexes voicelist.json via load_voicedata() and loads the
# first available model.
predictor = Predictor()
predictor.setup()

Loading voice data...
Indexed voice Mori Calliope
Indexed voice Takanashi Kiara
Indexed voice Ninomae Ina'nis
Indexed voice Gawr Gura
Indexed voice Amelia Watson
Indexed voice IRyS
Indexed voice Tsukumo Sana
Indexed voice Ceres Fauna
Indexed voice Ouro Kronii
Indexed voice Nanashi Mumei
Indexed voice Hakos Baelz
Indexed voice Shiori Novella
Indexed voice Koseki Bijou
Indexed voice Nerissa Ravencroft
Indexed voice Airani Iofifteen
Indexed voice Kureiji Ollie
Indexed voice Anya Melfissa
Indexed voice Vestia Zeta
Indexed voice Tokino Sora
Indexed voice Hoshimachi Suisei
Indexed voice AZKi
Indexed voice Yozora Mel
Indexed voice Natsuiro Matsuri
Indexed voice Aki Rosenthal
Indexed voice Akai Haato
Indexed voice Minato Aqua
Indexed voice Nakiri Ayame
Indexed voice Nekomata Okayu
Indexed voice Shiranui Flare
Indexed voice Shirogane Noel
Indexed voice Houshou Marine
Indexed voice Tokoyami Towa
Indexed voice Yukihana Lamy
Indexed voice La+ Darknesss
Indexed voice Takane Lui
Indexed voice Hakui 

In [4]:
# Smoke test: synthesize one English sentence with the "MoriCalliope" voice.
# Keyword arguments are grouped in the same order as Predictor.predict's signature.

output = predictor.predict(
    # What to say, and who says it
    speaker="MoriCalliope",  # Matched against the speakerid field of the voice list
    text_input="Hello there! This is test audio of a new Hololive text to speech tool.",
    language="EN",
    reference_audio_path="",
    # Line splitting
    line_split=True,
    split_interval=0.5,
    # Style and accent control
    style="Neutral",
    style_weight=5.0,
    use_tone=False,
    # Synthesis parameters
    sdp_ratio=0.2,
    noise_scale=0.6,
    noise_scale_w=0.8,
    length_scale=1.0,
    # Style (assist) text
    style_text_weight=0.7,
    use_style_text=False,
    style_text="",
)

print(f"Output audio saved to: {output}")

Available speakers for English:
 - MoriCalliope
 - TakanashiKiara
 - NinomaeInanis
 - GawrGura
 - AmeliaWatson
 - IRyS
 - TsukumoSana
 - CeresFauna
 - OuroKronii
 - NanashiMumei
 - HakosBaelz
 - ShioriNovella
 - KosekiBijou
 - NerissaRavencroft
 - AiraniIofifteen
 - KureijiOllie
 - AnyaMelfissa
 - VestiaZeta

Available speakers for Japanese:
 - TokinoSora
 - HoshimachiSuisei
 - AZKi
 - YozoraMel
 - NatsuiroMatsuri
 - AkiRosenthal
 - AkaiHaato
 - MinatoAqua
 - NakiriAyame
 - NekomataOkayu
 - ShiranuiFlare
 - ShiroganeNoel
 - HoushouMarine
 - TokoyamiTowa
 - YukihanaLamy
 - LaplusDarknesss
 - TakaneLui
 - HakuiKoyori
 - SakamataChloe
 - IchijouRirika

Selected speaker ID: MoriCalliope
[!] model_name: SBV2_HoloLow
[!] model_path: model_assets/SBV2_HoloLow/SBV2_HoloLow.safetensors
[!] text: Hello there! This is test audio of a new Hololive text to speech tool.
[!] language: EN
[!] reference_audio_path: None
[!] sdp_ratio: 0.2
[!] noise_scale: 0.6
[!] noise_scale_w: 0.8
[!] length_scale: 1.

  return F.conv1d(input, weight, bias, self.stride,


[32m06-03 15:12:27[0m |[1m  INFO  [0m| 2094862345.py:161 | Successful inference, took 6.403847s | MoriCalliope | EN/0.2/0.6/0.8/1.0/Neutral/5.0 | Hello there! This is test audio of a new Hololive text to speech tool.
Output audio saved to: output.wav


In [7]:
# Show the style names available for each model, keyed by model directory
# name (collected by load_voicedata() during predictor.setup()).

predictor.styledict

{'SBV2_HoloLow': dict_keys(['Neutral', 'Kronii', 'NerissaLaugh', 'Calli', 'Nerissa']),
 'SBV2_TakanashiKiara': dict_keys(['Neutral', 'Normal', 'Japanese', 'Happy', 'Reading', 'Excited']),
 'SBV2_HoloHi': dict_keys(['Neutral', 'Fauna', 'Amelia', 'MumeiLaugh', 'Shiori', 'IRyS', 'Ina', 'Gura', 'Mumei', 'ShioriLaugh']),
 'SBV2_HoloAus': dict_keys(['Neutral', 'Sana', 'Baelz1', 'Baelz2', 'BaelzShouting']),
 'SBV2_KosekiBijou': dict_keys(['Neutral', 'Normal', 'Scared', 'Angry', 'Excited']),
 'SBV2_HoloESL': dict_keys(['Neutral', 'Anya', 'IofiLoud', 'Iofi']),
 'SBV2_HoloIDFlu': dict_keys(['Neutral', 'ZetaSoft', 'Zeta', 'ZetaLoud', 'Ollie']),
 'SBV2_HoloJPTest2': dict_keys(['Neutral', 'Koyori', 'Chloe', 'Lamy', 'Aqua', 'Sora', 'Towa', 'Suisei', 'Ayame']),
 'SBV2_HoloJPTest2.5': dict_keys(['Neutral', 'Haato', 'Matsuri', 'Mel', 'Aki', 'Lui', 'AZKi']),
 'SBV2_HoloJPTest': dict_keys(['Neutral', 'Flare', 'Ririka', 'Laplus', 'Noel', 'Okayu', 'Marine'])}