In [1]:
# Fetch the community VibeVoice sources into the current working directory.
# NOTE(review): a later cell installs from /content/VibeVoice, so this presumably runs with /content as the cwd — confirm.
!git clone https://github.com/vibevoice-community/VibeVoice

Cloning into 'VibeVoice'...
remote: Enumerating objects: 639, done.[K
remote: Counting objects: 100% (225/225), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 639 (delta 205), reused 177 (delta 177), pack-reused 414 (from 1)[K
Receiving objects: 100% (639/639), 39.07 MiB | 27.01 MiB/s, done.
Resolving deltas: 100% (385/385), done.


In [2]:
# Mount Google Drive at /content/drive; the "colab" profile further down reads
# its input text / reference voice from, and writes its output under, /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Install the cloned VibeVoice checkout in editable mode into the system Python environment.
!uv pip --quiet install --system -e /content/VibeVoice

In [4]:
# Text-normalisation library used by BookAudioGenerator (imported as `from wetext import Normalizer`).
# Pinned to the version this notebook was last executed with so re-runs stay reproducible,
# and installed with --system for consistency with the VibeVoice install cell.
!uv pip install --system wetext==0.1.2

[2mUsing Python 3.12.12 environment at: /usr[0m
[2K[2mResolved [1m6 packages[0m [2min 217ms[0m[0m
[2K[2mPrepared [1m6 packages[0m [2min 155ms[0m[0m
[2K[2mInstalled [1m6 packages[0m [2min 15ms[0m[0m
 [32m+[39m [1manyascii[0m[2m==0.3.3[0m
 [32m+[39m [1mcontractions[0m[2m==0.1.73[0m
 [32m+[39m [1mkaldifst[0m[2m==1.7.17[0m
 [32m+[39m [1mpyahocorasick[0m[2m==2.2.0[0m
 [32m+[39m [1mtextsearch[0m[2m==0.0.24[0m
 [32m+[39m [1mwetext[0m[2m==0.1.2[0m


In [1]:
import types
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
from transformers.utils import logging
from wetext import Normalizer
from typing import Dict
from pathlib import Path
import torch
import re
import os

# `logging` here is transformers.utils.logging (see the imports above), not the stdlib module.
# Switch it to INFO verbosity so model/tokenizer loading steps are reported.
logging.set_verbosity_info()
# Module-level logger used by BookAudioGenerator for progress messages and warnings.
logger = logging.get_logger(__name__)

# Punctuation marks that end a sentence when a paragraph is split for TTS.
sentence_splitter = [
    "！", "；", "？", "～",
    "?", "!", "：", "～",
    "…", "……", "。",
]

# Punctuation / symbol substitutions applied to the text before synthesis
# (mostly full-width Chinese punctuation mapped to ASCII equivalents).
char_rep_map = {
    # dashes, colons, commas and sentence terminators
    "——": ".",
    "：": ",",
    "；": ",",
    ";": ",",
    "，": ",",
    "。": ".",
    "！": "!",
    "？": "?",
    "·": "-",
    "、": ",",
    # ellipsis variants
    "...": "…",
    ",,,": "…",
    "，，，": "…",
    "……": "…",
    # quotation marks
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    # brackets of every flavour
    "（": "'",
    "）": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    # remaining dashes / tildes / corner quotes / ASCII colon
    "—": "-",
    "～": "-",
    "~": "-",
    "「": "'",
    "」": "'",
    ":": ",",
    # special characters mapped to readable Chinese characters
    "〇": "零",
    "○": "零",
    "卐": "万",
}

def replace_chars(full_script, char_rep_map):
    """Replace every occurrence of a key of ``char_rep_map`` with its value.

    Keys may be longer than one character (e.g. ``"——"`` or ``"..."``).
    Matching is done in a single left-to-right pass and, at each position,
    the longest key wins, so ``"——"`` is replaced as a unit rather than as
    two separate ``"—"`` characters. Replacement text is never re-scanned.

    Args:
        full_script: Text to clean up.
        char_rep_map: Mapping of substring -> replacement.

    Returns:
        The text with all substitutions applied; the input is returned
        unchanged when either the text or the mapping is empty.
    """
    # Empty keys would match at every position, so they are ignored.
    keys = [key for key in char_rep_map if key]
    if not full_script or not keys:
        return full_script

    # Longest keys first: regex alternation picks the first alternative that matches.
    keys.sort(key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in keys))
    return pattern.sub(lambda match: char_rep_map[match.group(0)], full_script)

class BookAudioGenerator:
    """Generate speech audio from text files with a VibeVoice model.

    Three entry points are offered:

    * ``generate`` - split a whole book into chunks and synthesise them in
      batches with a single reference voice, writing numbered ``.wav`` files
      plus ``.txt`` sidecars containing the text of each chunk.
    * ``generate_single`` - synthesise one text file with a single voice.
    * ``generate_single_dialog`` - synthesise one multi-speaker script.
    """

    def __init__(self, tts_model, device) -> None:
        """Load the processor and the inference model.

        Args:
            tts_model: Model identifier or local path handed to ``from_pretrained``.
            device: Device string, passed as ``device`` to the processor and
                as ``device_map`` to the model.
        """
        self.processor = VibeVoiceProcessor.from_pretrained(
                tts_model,
                device=device
            )
        model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                tts_model,
                torch_dtype=torch.bfloat16,
                device_map=device,)
        model.eval()
        model.set_ddpm_inference_steps(num_steps=10)
        self.model=model

        # Script prefix put in front of every line in single-voice mode.
        self.default_prefix="Speaker 1:"
        # Name of the speaker used when a dialogue line has no usable speaker tag.
        self.default_speaker = "旁白"
        # Forwarded to model.generate(); raise it if audio gets cut off.
        self.max_length_times = 3
        self.normalizer = Normalizer(lang="zh", operator="tn", remove_erhua=True, traditional_to_simple=False)

    def batch_process(self, i_file, batch_size, process_size):
        """Yield ``(batch, batch_index)`` pairs read from a text file.

        The file is read line by line (one paragraph per line). Non-empty
        lines are accumulated into a chunk until the chunk holds at least
        ``process_size`` characters, so a paragraph is never cut in half.
        ``batch_size`` chunks form one batch. Every paragraph is then split
        into sentences, so each chunk is a flat list of sentences.

        Args:
            i_file: Path of the UTF-8 text file.
            batch_size: Number of chunks per yielded batch.
            process_size: Minimum number of characters per chunk.

        Yields:
            ``(batch, batch_index)`` where ``batch`` is a list of chunks and
            each chunk is a list of sentence strings; ``batch_index`` counts
            from 0.

        Raises:
            Exception: If the file contains no lines.
        """
        def _read_file():
            with open(i_file, 'r', encoding='utf-8') as f:
                _lines = f.read().splitlines()
            if not _lines:
                raise Exception(f'not content in {i_file}')

            results = []        # chunks collected for the current batch
            current_lines = []  # paragraphs of the chunk being built
            current_length = 0  # characters accumulated in that chunk

            # NOTE: chunk boundaries determine the output file names, and an
            # interrupted run is resumed by file name, so this grouping must
            # stay stable between runs.
            for _line in _lines:
                if not _line:
                    continue
                current_lines.append(_line)
                current_length += len(_line)
                if current_length >= process_size:
                    results.append(current_lines)
                    current_lines = []
                    current_length = 0
                    if len(results) == batch_size:
                        yield results
                        results = []
            if current_lines:
                results.append(current_lines)
            if results:
                yield results

        for batch_index, batch in enumerate(_read_file()):
            processed_batch = []
            for sub_list in batch:
                processed_sub_list = []
                for item in sub_list:
                    stripped_item = item.strip()
                    if stripped_item:
                        # Split the paragraph into sentences: very long single
                        # lines make the generated speech speed up.
                        processed_sub_list.extend(self.split_sentence(stripped_item))
                processed_batch.append(processed_sub_list)
            yield processed_batch, batch_index

    def split_sentence(self, sentence):
        """Split text into sentences, keeping the closing punctuation.

        A run of consecutive terminators (for example ``"？！"`` or ``"……"``)
        closes one sentence as a whole instead of producing extra sentences
        that consist of punctuation only.

        Args:
            sentence: Text to split.

        Returns:
            List of stripped, non-empty sentence strings.
        """
        terminators = "".join(sentence_splitter)
        if not terminators:
            stripped = sentence.strip()
            return [stripped] if stripped else []

        pattern = r'([' + re.escape(terminators) + r']+)\s*'
        # With one capturing group, re.split alternates text (even indexes)
        # and captured terminator runs (odd indexes).
        parts = re.split(pattern, sentence)
        sentences = []
        current_sentence = ""
        for index, part in enumerate(parts):
            if index % 2 == 1:
                current_sentence += part
                sentences.append(current_sentence.strip())
                current_sentence = ""
            elif part.strip():
                current_sentence += part
        if current_sentence.strip():
            sentences.append(current_sentence.strip())
        return sentences

    def _tts_generate(self, to_tts_batch, voice_sample):
        """Run the processor and the model on prepared scripts.

        Args:
            to_tts_batch: Script text(s) handed to the processor as ``text``.
            voice_sample: Reference voice sample(s) handed to the processor
                as ``voice_samples``.

        Returns:
            The object returned by ``model.generate``; callers read its
            ``speech_outputs`` and ``reach_max_step_sample`` attributes.
        """
        inputs = self.processor(
            text=to_tts_batch,
            voice_samples=voice_sample,
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

        outputs = self.model.generate(
            **inputs,
            max_new_tokens=None,
            cfg_scale=1.3,
            tokenizer=self.processor.tokenizer,
            # Sampling alternative:
            # generation_config={'do_sample': True, 'temperature': 0.99, 'top_p': 0.99, 'top_k': 3},
            generation_config={'do_sample': False},
            verbose=True,
            max_length_times=self.max_length_times, #default 2
        )
        return outputs

    def txt_normlize(self, txt):
        """Return ``txt`` after running it through the wetext normalizer."""
        return self.normalizer.normalize(txt)

    def tts_txt_preprocess(self, txt):
        """Turn one piece of text into a single-speaker script line.

        Parenthesised remarks (full-width or ASCII parentheses) are removed,
        punctuation is mapped through ``char_rep_map`` and the default
        speaker prefix is put in front.

        Args:
            txt: Text without a speaker prefix. Any ``":"`` it contains is
                replaced, so already-prefixed text must not be passed in.

        Returns:
            The cleaned text prefixed with ``self.default_prefix``.
        """
        chinese_pattern = r"（.*?）"
        english_pattern = r"\([^)]*?\)"

        combined_pattern = f"{chinese_pattern}|{english_pattern}"
        _txt = re.sub(combined_pattern, "", txt)
        # Replace Chinese punctuation; the VibeVoice docs recommend English punctuation.
        _txt = self.default_prefix + replace_chars(_txt, char_rep_map)
        return _txt

    def gererator_speech_with_default_voice(
            self,
            chunk,
            batch_index,
            single_speaker,
            output_dir,
            project_name=None,
            ):
        """Synthesise one batch of chunks and write the audio/text files.

        Files are named ``{output_dir}/{project_name}-{batch_index}_{n}.wav``
        with a ``.txt`` sidecar holding the chunk's sentences (as split,
        before normalisation). If the first ``.wav`` of the batch already
        exists the whole batch is skipped, which lets an interrupted run be
        resumed.

        Args:
            chunk: Batch as produced by ``batch_process`` - a list of chunks,
                each a list of sentence strings.
            batch_index: Index of the batch, used in the file names.
            single_speaker: Reference voice sample used for every chunk.
            output_dir: Directory the files are written to (created if missing).
            project_name: File-name prefix. Defaults to the last component of
                ``output_dir`` so the method no longer depends on a
                module-level variable being defined.
        """
        if project_name is None:
            project_name = Path(output_dir).name

        output_stem = f"{output_dir}/{project_name}-{batch_index}"
        output_path_wav = f"{output_stem}_0.wav"
        # Check before doing any work: normalisation is wasted on a skipped batch.
        if os.path.exists(output_path_wav):
            logger.warning(f'⚠️ file {output_path_wav} exists, so batch will not process.')
            return

        # Fail early on an unwritable location instead of after a long generation.
        Path(output_stem).parent.mkdir(parents=True, exist_ok=True)

        to_tts_batch = [
            [self.txt_normlize(item) for item in s_batch]
            for s_batch in chunk
        ]
        _tts_text = [
            "\n".join(self.tts_txt_preprocess(item) for item in s_batch)
            for s_batch in to_tts_batch
        ]

        outputs = self._tts_generate(_tts_text, [[single_speaker]] * len(chunk))
        for check in outputs.reach_max_step_sample.tolist():
            if check:
                logger.warning(f'⚠️  reach max length, audio may cut up, you may increase [max_length_times] and current is [{self.max_length_times}]')

        for _index, (output_speech, txt) in enumerate(zip(outputs.speech_outputs, chunk)):
            output_path_wav = f"{output_stem}_{_index}.wav"
            output_path_txt = f"{output_stem}_{_index}.txt"

            self.processor.save_audio(
                output_speech,
                output_path=output_path_wav,
            )
            Path(output_path_txt).write_text("\n".join(txt), encoding='utf-8')
            logger.info(f'finish process ouput file : {output_path_wav} \n {output_path_txt}')

    def generate(self, to_tts_file, output_dir, single_speaker, batch_size = 4, process_size = 9000):
        """Synthesise a whole text file batch by batch.

        Args:
            to_tts_file: Path of the UTF-8 text file; its stem becomes the
                prefix of the output file names.
            output_dir: Directory for the generated files.
            single_speaker: Reference voice sample used throughout.
            batch_size: Chunks synthesised per model call.
            process_size: Minimum characters per chunk.
        """
        project = Path(to_tts_file).stem
        for _b, _i in self.batch_process(to_tts_file, batch_size, process_size):
            self.gererator_speech_with_default_voice(_b, _i, single_speaker, output_dir, project)

    def generate_single_dialog(self, to_tts_file, txt_speeker, speeker_voice):
        """Synthesise a multi-speaker script into one ``.wav`` next to the file.

        Lines of the form ``name: text`` are rewritten to ``Speaker N: text``
        where N is the 1-based position of ``name`` in ``txt_speeker``. Lines
        without a tag keep the speaker of the previous line. A speaker that
        is not listed falls back to the label of ``self.default_speaker``
        (or ``Speaker 1`` when that one is not listed either).

        Args:
            to_tts_file: Path of the UTF-8 script file.
            txt_speeker: Speaker names as they appear in the script.
            speeker_voice: Reference voice samples, in the order of ``txt_speeker``.
        """
        with open(to_tts_file, 'r', encoding='utf-8') as f:
            _lines = f.read().splitlines()
        output_path_wav = Path(to_tts_file).with_suffix(".wav")

        speaker_labels = [f"Speaker {i+1}" for i in range(len(txt_speeker))]
        speaker_map = dict(zip(txt_speeker, speaker_labels))
        # The fallback must be a valid "Speaker N" label, not part of the
        # default speaker's display name.
        fallback_label = speaker_map.get(self.default_speaker, self.default_prefix.rstrip(":"))

        SPEAKER_PATTERN = re.compile(r'^([^:]+):')

        to_tts_batch = []
        pre_speaker = self.default_speaker
        for item in _lines:
            if not item:
                continue
            match = SPEAKER_PATTERN.match(item)
            if match:
                speaker_name = match.group(1).strip()
                item_content = item[match.end():].strip()  # text after the colon
                pre_speaker = speaker_name
            else:
                speaker_name = pre_speaker
                item_content = item
            to_tts_batch.append(speaker_map.get(speaker_name, fallback_label) + ": " + item_content)

        to_tts_batch = ["\n".join(to_tts_batch)]

        outputs = self._tts_generate(to_tts_batch, speeker_voice)
        self.processor.save_audio(
            outputs.speech_outputs[0],
            output_path=output_path_wav,
        )

    def generate_single(self, to_tts_file, voice_samples):
        """Synthesise one text file with a single voice into a ``.wav`` next to it.

        Each non-blank line is normalised and preprocessed on its own, the
        same way the batch path does it, so every script line carries exactly
        one intact speaker prefix.

        Args:
            to_tts_file: Path of the UTF-8 text file.
            voice_samples: One reference voice sample, or a list/tuple of them.

        Raises:
            Exception: If the file has no non-blank line.
        """
        with open(to_tts_file, 'r', encoding='utf-8') as f:
            _lines = f.read().splitlines()

        output_path_wav = Path(to_tts_file).with_suffix(".wav")
        script_lines = [
            self.tts_txt_preprocess(self.txt_normlize(line.strip()))
            for line in _lines
            if line.strip()
        ]
        if not script_lines:
            raise Exception(f'not content in {to_tts_file}')
        to_tts_txt = "\n".join(script_lines)

        # Same input layout as the batch path: a list of scripts plus one
        # list of voice samples per script.
        if isinstance(voice_samples, (list, tuple)):
            samples = list(voice_samples)
        else:
            samples = [voice_samples]
        outputs = self._tts_generate([to_tts_txt], [samples])
        self.processor.save_audio(
            outputs.speech_outputs[0],
            output_path=output_path_wav,
        )

# ---------------------------------------------------------------------------
# Runtime configuration
# ---------------------------------------------------------------------------
# Active environment profile; one of: colab, modelscope, local.
env_type = "colab"

# Per-environment settings: root data directory, model to load, torch device.
env_config = {
    "local": dict(
        drive_dir="/Volumes/sw/MyDrive",
        # alternative: "/Volumes/sw/pretrained_models/VibeVoice-1.5B"
        model_name="/Volumes/sw/hf_models/VibeVoice-1.5B-ft",
        device="mps",
    ),
    "modelscope": dict(
        drive_dir="/mnt/workspace",
        # alternative: "/mnt/workspace/pretrained_models/VibeVoice-1.5B"
        model_name="/mnt/workspace/pretrained_models/VibeVoice-1.5B-ft",
        device="cuda",
    ),
    "colab": dict(
        drive_dir="/content/drive/MyDrive",
        # alternative: "microsoft/VibeVoice-1.5B"
        model_name="tardigrade-doc/VibeVoice-1.5B-ft",
        device="cuda",
    ),
}
if env_type not in env_config:
    raise Exception(f"not supported env {env_type}")

config_dict = env_config[env_type]
config = types.SimpleNamespace(**config_dict)
drive_dir, model_name, device = config.drive_dir, config.model_name, config.device

# ---------------------------------------------------------------------------
# Inputs and outputs
# ---------------------------------------------------------------------------
# Book to read aloud and the reference recording of the narrator voice.
input_file = f"{drive_dir}/data_src/tianchaoyaoyuan2.txt"
speaker_phi0 = f"{drive_dir}/data_src/qinsheng.wav"

# The book's file stem names both the output folder and the generated files.
input_file_path = Path(input_file)
project_name = input_file_path.stem
output_dir = f"{drive_dir}/{project_name}"

# ---------------------------------------------------------------------------
# Run
# ---------------------------------------------------------------------------
bookAudioGen = BookAudioGenerator(model_name, device)

# Whole book: 4 chunks per model call, at least 8000 characters per chunk.
bookAudioGen.generate(input_file, output_dir, speaker_phi0, batch_size=4, process_size=8000)

# Other usages, kept for reference.
#
# Regenerate a single chunk whose .wav from the batch run above turned out bad,
# using the .txt that was written next to it:
# bookAudioGen.generate_single("/content/drive/MyDrive/fubaiyufanfu/fubaiyufanfu-1_0.txt", speaker_phi0)
# bookAudioGen.generate_single("/Volumes/sw/MyDrive/zhengzhi1/output/zhengzhi1-4_2.txt", [speaker_phi0])
# bookAudioGen.generate_single("/Volumes/sw/tmp/zhengzhi1-5_4.txt", [speaker_phi0])
#
# Multi-speaker dialogue (speaker names in the script, then one voice per name):
# bookAudioGen.generate_single_dialog(
#     "/Users/larry/github.com/tardigrade-dot/colab-script/data_src/sugeladizhisi_part1.txt",
#     ["旁白", "欧", "苏"],
#     [f"{drive_dir}/data_src/youyi.wav", f"{drive_dir}/data_src/sample_zhongdong.wav", f"{drive_dir}/data_src/gdg_voice_06.wav"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/351 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/merges.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/tokenizer_config.json
loading file chat_template.jinja from cache at None
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load fr

config.json: 0.00B [00:00, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--VibeVoice-1.5B/snapshots/1904eae38036e9c780d28e27990c27748984eafe/config.json
Model config VibeVoiceConfig {
  "acoustic_tokenizer_config": {
    "causal": true,
    "channels": 1,
    "conv_bias": true,
    "conv_norm": "none",
    "corpus_normalize": 0.0,
    "decoder_depths": null,
    "decoder_n_filters": 32,
    "decoder_ratios": [
      8,
      5,
      5,
      4,
      2,
      2
    ],
    "disable_last_norm": true,
    "encoder_depths": "3-3-3-3-3-3-8",
    "encoder_n_filters": 32,
    "encoder_ratios": [
      8,
      5,
      5,
      4,
      2,
      2
    ],
    "fix_std": 0.5,
    "layer_scale_init_value": 1e-06,
    "layernorm": "RMSNorm",
    "layernorm_elementwise_affine": true,
    "layernorm_eps": 1e-05,
    "mixer_layer": "depthwise_conv",
    "model_type": "vibevoice_acoustic_tokenizer",
    "pad_mode": "constant",
    "std_dist_type": "gaussian",
    "vae_dim":

model.safetensors.index.json: 0.00B [00:00, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--microsoft--VibeVoice-1.5B/snapshots/1904eae38036e9c780d28e27990c27748984eafe/model.safetensors.index.json


Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Instantiating VibeVoiceForConditionalGenerationInference model under default dtype torch.bfloat16.
Generate config GenerationConfig {}

Instantiating Qwen2Model model under default dtype torch.bfloat16.
Instantiating VibeVoiceAcousticTokenizerModel model under default dtype torch.bfloat16.
Instantiating VibeVoiceSemanticTokenizerModel model under default dtype torch.bfloat16.
Instantiating VibeVoiceDiffusionHead model under default dtype torch.bfloat16.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing VibeVoiceForConditionalGenerationInference.

All the weights of VibeVoiceForConditionalGenerationInference were initialized from the model checkpoint at microsoft/VibeVoice-1.5B.
If your task is similar to the task the model of the checkpoint was trained on, you can already use VibeVoiceForConditionalGenerationInference for predictions without further training.
Generation config file not found, using a generation config created from the model config.
Generating (active: 2/2):  68%|██████▊   | 13156/19425 [43:19<25:40,  4.07it/s]

Samples [0] reached EOS token at step 13157.


Generating (active: 1/2):  75%|███████▍  | 14495/19425 [48:51<17:10,  4.78it/s]

Samples [1] reached EOS token at step 14496.


Generating (active: 2/2):  65%|██████▌   | 13501/20673 [45:05<30:15,  3.95it/s]

Samples [1] reached EOS token at step 13502.


Generating (active: 1/2):  71%|███████▏  | 14775/20673 [50:21<17:01,  5.78it/s]

Samples [0] reached EOS token at step 14776.


Generating (active: 2/2):  72%|███████▏  | 14374/19899 [48:26<23:35,  3.90it/s]

Samples [0] reached EOS token at step 14375.


Generating (active: 1/2):  78%|███████▊  | 15606/19899 [53:45<15:33,  4.60it/s]

Samples [1] reached EOS token at step 15607.


Generating (active: 2/2):  41%|████      | 7477/18435 [22:12<36:33,  5.00it/s]

Samples [1] reached EOS token at step 7478.


Generating (active: 1/2):  60%|██████    | 11139/18435 [34:56<27:36,  4.40it/s]

Samples [0] reached EOS token at step 11141.


Generating (active: 2/2):  61%|██████▏   | 12394/20205 [40:34<31:25,  4.14it/s]

Samples [0] reached EOS token at step 12395.


Generating (active: 1/2):  69%|██████▊   | 13875/20205 [46:30<26:25,  3.99it/s]

Samples [1] reached EOS token at step 13877.


Generating (active: 2/2):  69%|██████▉   | 14867/21588 [51:22<29:43,  3.77it/s]

Samples [1] reached EOS token at step 14868.


Generating (active: 1/2):  74%|███████▎  | 15871/21588 [55:46<21:10,  4.50it/s]

Samples [0] reached EOS token at step 15872.


Generating (active: 2/2):  75%|███████▍  | 15341/20532 [53:11<22:53,  3.78it/s]

Samples [0] reached EOS token at step 15342.


Generating (active: 1/2):  77%|███████▋  | 15721/20532 [54:51<17:35,  4.56it/s]

Samples [1] reached EOS token at step 15722.


Generating (active: 2/2):  71%|███████   | 14553/20508 [49:43<25:47,  3.85it/s]

Samples [1] reached EOS token at step 14554.


Generating (active: 1/2):  71%|███████   | 14611/20508 [49:58<20:47,  4.73it/s]

Samples [0] reached EOS token at step 14612.


Generating (active: 2/2):  80%|███████▉  | 15269/19170 [51:58<17:04,  3.81it/s]

Samples [1] reached EOS token at step 15270.


Generating (active: 1/2):  82%|████████▏ | 15804/19170 [54:16<12:04,  4.64it/s]

Samples [0] reached EOS token at step 15805.


Generating (active: 2/2):  66%|██████▋   | 13855/20895 [46:21<29:43,  3.95it/s]

Samples [1] reached EOS token at step 13856.


Generating (active: 1/2):  69%|██████▉   | 14417/20895 [48:41<22:38,  4.77it/s]

Samples [0] reached EOS token at step 14418.


Generating (active: 2/2):  68%|██████▊   | 13583/19953 [45:19<27:00,  3.93it/s]

Samples [1] reached EOS token at step 13584.


Generating (active: 1/2):  79%|███████▉  | 15753/19953 [54:39<15:22,  4.55it/s]

Samples [0] reached EOS token at step 15754.


Generating (active: 2/2):  72%|███████▏  | 14111/19734 [48:11<24:19,  3.85it/s]

Samples [1] reached EOS token at step 14112.


Generating (active: 1/2):  79%|███████▉  | 15612/19734 [54:38<15:08,  4.54it/s]

Samples [0] reached EOS token at step 15613.


Generating (active: 2/2):  73%|███████▎  | 14440/19674 [50:00<22:55,  3.81it/s]

Samples [1] reached EOS token at step 14441.


Generating (active: 1/2):  74%|███████▍  | 14513/19674 [50:19<18:28,  4.66it/s]

Samples [0] reached EOS token at step 14514.


Generating (active: 2/2):  32%|███▏      | 6139/19056 [18:40<42:28,  5.07it/s]

Samples [1] reached EOS token at step 6140.


Generating (active: 1/2):  86%|████████▌ | 16297/19056 [57:35<10:17,  4.46it/s]

Samples [0] reached EOS token at step 16298.


