In [None]:
# Fetch the community VibeVoice sources into ./VibeVoice.
# A later cell expects the checkout to live at /content/VibeVoice.
!git clone https://github.com/vibevoice-community/VibeVoice

In [None]:
!pip uninstall modelscope

In [None]:
# Mount Google Drive at /content/drive; the input text, the reference voice
# clip, the model checkpoint and the generated output are all addressed
# through /content/drive/MyDrive in the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from modelscope import snapshot_download

# Download directly into the Drive folder that the inference cell loads from
# (its model_path resolves to /content/drive/MyDrive/pretrained_models/VibeVoice-1.5B).
# A relative local_dir would land under the current working directory instead,
# where the inference cell never looks.
snapshot_download(
    'microsoft/VibeVoice-1.5B',
    local_dir='/content/drive/MyDrive/pretrained_models/VibeVoice-1.5B',
)

# snapshot_download('microsoft/VibeVoice-7B', local_dir='/content/drive/MyDrive/pretrained_models/VibeVoice-7B')

In [None]:
import os

# Work from inside the cloned repository and echo the resulting directory so
# the cell output confirms where the following cells will run.
os.chdir("/content/VibeVoice")
print(os.getcwd())

In [None]:
# Install the cloned VibeVoice checkout in editable mode (-e) into the system
# interpreter (--system) so that `vibevoice` can be imported by the next cell.
!uv pip --quiet install --system -e /content/VibeVoice

In [None]:
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
import torch
from transformers.utils import logging
import re
import os
from pathlib import Path

# Raise transformers' log verbosity to INFO and create the logger used by the
# helper functions below (e.g. the over-length warning in combine_to_max_length).
logging.set_verbosity_info()
logger = logging.get_logger(__name__)

# Reference voice clip used as the default speaker sample by gererator_speech.
speaker_phi0 = "/content/drive/MyDrive/data_src/clip_0_2.WAV"
# Text file read by main().
input_file = "/content/drive/MyDrive/data_src/from_epub.txt"
# Root folder on the mounted Drive; generated files go under "<output_dir>/output".
output_dir = "/content/drive/MyDrive"

# Checkpoint directory handed to from_pretrained() in gererator_speech.
model_path = f"{output_dir}/pretrained_models/VibeVoice-1.5B"
# model_path = "pretrained_models/VibeVoice-7B"

# Chunk-size limit (in characters) that main() passes to combine_to_max_length.
max_length_def = 3000
# Character replacement table (kept as-is): maps full-width / CJK punctuation,
# quotes and brackets to simpler equivalents before sentence splitting.
# Most keys are single characters; a few are multi-character sequences.
char_rep_map = {
    "：": ",",
    "；": ",",
    ";": ",",
    "，": ",",
    "。": ".",
    "！": "!",
    "？": "?",
    # "\n": " ",
    "·": "-",
    "、": ",",
    "...": "…",
    ",,,": "…",
    "，，，": "…",
    "……": "…",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "（": "'",
    "）": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "～": "-",
    "~": "-",
    "「": "'",
    "」": "'",
    # ":": ",",
    "〇": "零",
    "○": "零",
}

def replace_chars(full_script, char_rep_map):
    """
    Replace every occurrence of a key of ``char_rep_map`` with its value.

    The text is scanned once, left to right. At each position the longest
    matching key wins, so multi-character keys such as ``"..."`` or ``"……"``
    are honoured as well as single-character ones. Replacement output is
    never re-scanned.

    Args:
        full_script: The text to normalise.
        char_rep_map: Mapping of source substring -> replacement string.

    Returns:
        The text with all replacements applied; unchanged when the mapping
        has no usable (non-empty) keys.
    """
    # Longest keys first: regex alternation picks the first alternative that
    # matches, so "..." must be tried before any shorter key could match.
    keys = sorted((key for key in char_rep_map if key), key=len, reverse=True)
    if not keys:
        return full_script

    pattern = re.compile("|".join(re.escape(key) for key in keys))
    return pattern.sub(lambda match: char_rep_map[match.group(0)], full_script)

def combine_to_max_length(combined_sentences: list, max_length: int = 400):
    """
    Pack consecutive sentences into chunks of at most ``max_length`` characters.

    Sentences inside one chunk are joined with ``"#"``; that separator counts
    toward the chunk length. The order of the input sentences is always
    preserved in the output.

    Args:
        combined_sentences: A list of string sentences.
        max_length: The maximum character length for each combined string.

    Returns:
        A list of combined strings. A sentence that is longer than
        ``max_length`` on its own is not split: it is emitted as a separate
        item and a warning is logged.
    """
    if not combined_sentences:
        return []

    result_list = []
    current_string = ""

    for sentence in combined_sentences:
        if len(sentence) > max_length:
            logger.warning(f"Warning: A single sentence exceeds the max_length ({len(sentence)} > {max_length}). It will be added as a separate item.")
            # Flush the pending chunk first so the over-long sentence does not
            # jump ahead of the sentences that came before it.
            if current_string:
                result_list.append(current_string)
                current_string = ""
            result_list.append(sentence)
            continue

        if not current_string:
            # Nothing pending: no separator is needed, and the sentence is
            # already known to fit within max_length.
            current_string = sentence
        elif len(current_string) + 1 + len(sentence) <= max_length:
            # +1 accounts for the "#" separator.
            current_string += "#" + sentence
        else:
            # It would exceed the limit: finalize the chunk and start a new one.
            result_list.append(current_string)
            current_string = sentence

    # Add the last chunk if anything is still pending.
    if current_string:
        result_list.append(current_string)

    return result_list

def process_line(s_line: str):
    """
    Normalise punctuation in ``s_line`` and split it into sentences.

    The text is first passed through ``replace_chars`` with the module-level
    ``char_rep_map``. It is then cut after every run of sentence terminators
    (``?``, ``!``, ``.`` and their full-width forms); a whole run such as
    ``"?!"`` or ``"..."`` stays attached to the sentence it ends instead of
    producing punctuation-only items.

    Args:
        s_line: Raw text, typically one line of the input file.

    Returns:
        A list of sentence strings in their original order, with newlines
        removed and surrounding whitespace stripped. Trailing text without a
        terminator is returned as the final item. Whitespace-only fragments
        are dropped.
    """
    s_line = replace_chars(s_line, char_rep_map)

    terminators = "?!.？！。"
    # The capturing group keeps the delimiters in the result; `+` keeps a run
    # of terminators together as one fragment. Full-width forms are listed too
    # in case the replacement map did not convert them.
    fragments = re.split('([?!.？！。]+)', s_line)

    combined_sentences = []
    temp_str = ""
    for fragment in fragments:
        fragment = fragment.replace("\n", "").strip()
        if not fragment:
            continue

        # Text fragments never contain a terminator, so checking the first
        # character is enough to recognise a delimiter fragment.
        if fragment[0] in terminators:
            if temp_str:
                combined_sentences.append(temp_str + fragment)
                temp_str = ""
            elif combined_sentences:
                # Only whitespace separated this run from the previous
                # sentence: glue it onto that sentence.
                combined_sentences[-1] += fragment
            else:
                # Leading punctuation with nothing before it is kept as-is.
                combined_sentences.append(fragment)
        else:
            if temp_str:
                combined_sentences.append(temp_str)
            temp_str = fragment

    # Text after the last terminator (or a line without any terminator).
    if temp_str:
        combined_sentences.append(temp_str)

    return combined_sentences

# Loaded (processor, model) pairs keyed by (checkpoint path, device), so that
# repeated calls to gererator_speech do not reload the checkpoint from disk.
_VIBEVOICE_CACHE = {}


def _load_vibevoice(path, device):
    """Return the ``(processor, model)`` pair for ``path``/``device``, loading it on first use."""
    key = (path, device)
    if key not in _VIBEVOICE_CACHE:
        processor = VibeVoiceProcessor.from_pretrained(path)
        model = VibeVoiceForConditionalGenerationInference.from_pretrained(
            path,
            # torch_dtype=torch.bfloat16,
            device_map=device,
        )
        model.eval()
        model.set_ddpm_inference_steps(num_steps=10)
        _VIBEVOICE_CACHE[key] = (processor, model)
    return _VIBEVOICE_CACHE[key]


def gererator_speech(to_tts_txt,
                     voice_samples=[speaker_phi0],
                     output_index=0):
    """
    Synthesize each text chunk into a WAV file plus a matching transcript.

    For chunk number ``i`` (1-based) the files
    ``{output_dir}/output/xuese-{output_index}_{i}.wav`` and
    ``{output_dir}/output/xuese-{output_index}_{i}.txt`` are produced. A file
    that already exists is left alone, so an interrupted run can be resumed
    without regenerating finished chunks.

    Args:
        to_tts_txt: Iterable of text chunks; sentences inside one chunk are
            separated by ``"#"`` and each becomes its own ``Speaker 1:`` line.
        voice_samples: Reference voice sample path(s) handed to the processor.
            The default list is only read, never mutated.
        output_index: Number embedded in the output file names.
    """
    # torch.backends.mps.is_available() is the long-standing availability check
    # and also exists on builds where torch.mps does not expose one.
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"use device[{device}]")

    processor, model = _load_vibevoice(model_path, device)

    # Make sure the output folder exists before any file is written into it.
    Path(f"{output_dir}/output").mkdir(parents=True, exist_ok=True)

    for _index, _line in enumerate(to_tts_txt, start=1):
        output_path_wav = f"{output_dir}/output/xuese-{output_index}_{_index}.wav"
        output_path_txt = f"{output_dir}/output/xuese-{output_index}_{_index}.txt"

        print(f'processing index[{_index}]')
        if not os.path.exists(output_path_txt):
            # Write the transcript under the same name that was just checked
            # (and that the WAV uses), without the internal "#" separators.
            Path(output_path_txt).write_text(_line.replace("#", ""), encoding='utf-8')

        if not os.path.exists(output_path_wav):
            new_txt_list = ["Speaker 1: " + item for item in _line.split("#")]

            inputs = processor(
                text=["\n".join(new_txt_list)],  # Wrap in list for batch processing
                voice_samples=[voice_samples],  # Wrap in list for batch processing
                padding=True,
                return_tensors="pt",
                return_attention_mask=True,
            )

            outputs = model.generate(
                **inputs,
                max_new_tokens=None,
                cfg_scale=1.3,
                tokenizer=processor.tokenizer,
                # generation_config={'do_sample': True, 'temperature': 0.99, 'top_p': 0.99, 'top_k': 3},
                generation_config={'do_sample': False},
                verbose=True,
                max_length_times=3, #default 2
            )

            processor.save_audio(
                outputs.speech_outputs[0],  # First (and only) batch item
                output_path=output_path_wav,
            )
            print(f'finish process ouput file : {output_path_wav}')

def main():
    """
    Convert every non-blank line of ``input_file`` to speech.

    Each line is normalised and split into sentences, packed into chunks of at
    most ``max_length_def`` characters, and handed to ``gererator_speech``.
    The zero-based line number is used as ``output_index``, so blank lines
    still consume an index and output names stay tied to source line numbers.
    """
    # Explicit UTF-8: the input is non-ASCII text and the transcripts are
    # written as UTF-8 as well.
    with open(input_file, 'r', encoding='utf-8') as f:
        for line_index, line in enumerate(f):
            if not line.strip():
                continue
            to_tts_txt = combine_to_max_length(process_line(line), max_length=max_length_def)
            if to_tts_txt:
                gererator_speech(to_tts_txt, output_index=line_index)


def test_input_example():
    """Smoke test: synthesize one hard-coded snippet as output index 1."""
    sample_text = """you text"""
    # Collapse the snippet onto a single line before sending it to TTS.
    single_line = sample_text.replace('\n', '')
    gererator_speech([single_line], output_index=1)

# Entry point of this cell: run the pipeline on `input_file`.
# Swap the comment markers to run the single-snippet smoke test instead.
main()
# test_input_example()



use device[cuda]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/merges.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loadin

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [1]:
# Record the installed transformers version for reproducibility.
import transformers
print(transformers.__version__)

4.51.3
