In [None]:
# Clone the community VibeVoice repository into the current working directory.
# Later cells chdir into /content/VibeVoice and install it in editable mode.
!git clone https://github.com/vibevoice-community/VibeVoice

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read inputs from and write
# outputs to paths under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
!pip uninstall modelscope

In [None]:
from modelscope import snapshot_download

# Download the VibeVoice-1.5B checkpoint via ModelScope into a directory that is
# relative to the current working directory.
# NOTE(review): later cells load the model from
# /content/drive/MyDrive/pretrained_models/VibeVoice-1.5B — confirm the download
# is copied there, or point local_dir at that location.
snapshot_download('microsoft/VibeVoice-1.5B', local_dir='pretrained_models/VibeVoice-1.5B')

# snapshot_download('microsoft/VibeVoice-7B', local_dir='pretrained_models/VibeVoice-7B')

In [None]:
import os

# Work from inside the cloned repository and echo the resulting directory.
os.chdir("/content/VibeVoice")
print(os.getcwd())

In [None]:
# Install the cloned repository in editable mode into the system environment so
# the `vibevoice` package can be imported by the following cells.
!uv pip --quiet install --system -e /content/VibeVoice

In [None]:
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
import torch
from transformers.utils import logging
import re
import os
from pathlib import Path

# Raise the transformers logger to INFO and create the logger used by the
# helper functions below.
logging.set_verbosity_info()
logger = logging.get_logger(__name__)

# Reference voice clip (default voice sample of gererator_speech) and the text
# file that main() reads.
speaker_phi0 = "/content/drive/MyDrive/data_src/clip_0_2.wav"
input_file = "/content/drive/MyDrive/data_src/from_epub.txt"
# Root folder: generated files go to "<output_dir>/output", and the model
# directory below is resolved under it as well.
output_dir = "/content/drive/MyDrive"

model_path = f"{output_dir}/pretrained_models/VibeVoice-1.5B"
# model_path = "pretrained_models/VibeVoice-7B"

# Maximum length, in characters, of each combined text chunk that main() passes
# to combine_to_max_length.
max_length_def = 3000
# Character replacement map (kept unchanged): maps full-width / CJK punctuation,
# quotes and brackets to a small set of plain punctuation marks. It is applied
# by process_line before sentence splitting.
char_rep_map = {
    "：": ",",
    "；": ",",
    ";": ",",
    "，": ",",
    "。": ".",
    "！": "!",
    "？": "?",
    # "\n": " ",
    "·": "-",
    "、": ",",
    "...": "…",
    ",,,": "…",
    "，，，": "…",
    "……": "…",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "（": "'",
    "）": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "～": "-",
    "~": "-",
    "「": "'",
    "」": "'",
    # ":": ",",
    "〇": "零",
    "○": "零",
}

# Pick the accelerator: Apple MPS first, then CUDA, otherwise CPU.
# torch.backends.mps.is_available() is the documented availability check;
# torch.mps.is_available() is not present in every PyTorch release.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"use device[{device}]")

# Load the processor and the inference model from the same checkpoint directory.
processor = VibeVoiceProcessor.from_pretrained(model_path)
model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        # torch_dtype=torch.bfloat16,
        device_map=device,)
model.eval()

# Number of diffusion (DDPM) inference steps used during generation.
model.set_ddpm_inference_steps(num_steps=10)

def replace_chars(full_script, char_rep_map):
    """Replace every occurrence of the map's keys in ``full_script`` in one pass.

    Keys may be longer than one character (for example ``"..."``). When several
    keys match at the same position the longest one wins. Replacement text is
    never re-scanned, so one substitution cannot trigger another.

    Args:
        full_script: Text to normalize.
        char_rep_map: Mapping from a substring to its replacement.

    Returns:
        The text with all mapped substrings replaced. The input is returned
        unchanged when it is empty or the map has no usable keys.
    """
    # An empty key would match at every position, so it is ignored.
    keys = [key for key in char_rep_map if key]
    if not full_script or not keys:
        return full_script

    # Longest keys first: regex alternation takes the first alternative that
    # matches, so "..." must be tried before any single-character key.
    keys.sort(key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in keys))
    return pattern.sub(lambda found: char_rep_map[found.group(0)], full_script)

def combine_to_max_length(combined_sentences: list, max_length: int = 400):
    """Pack consecutive sentences into chunks of at most ``max_length`` characters.

    Sentences inside one chunk are joined with ``"#"``; the separator counts
    towards the chunk length. Input order is always preserved.

    Args:
        combined_sentences: A list of string sentences.
        max_length: The maximum character length for each combined chunk.

    Returns:
        A list of combined strings. A sentence that is longer than
        ``max_length`` on its own is emitted as a separate, over-long item
        (a warning is logged).
    """
    result_list = []
    current_string = ""

    for sentence in combined_sentences or []:
        if len(sentence) > max_length:
            logger.warning(f"Warning: A single sentence exceeds the max_length ({len(sentence)} > {max_length}). It will be added as a separate item.")
            # Flush the pending chunk first so the output keeps the input order.
            if current_string:
                result_list.append(current_string)
                current_string = ""
            result_list.append(sentence)
            continue

        if not current_string:
            # Nothing pending: no separator is needed, so the sentence always
            # fits (its length was checked above).
            current_string = sentence
        elif len(current_string) + 1 + len(sentence) <= max_length:
            # +1 accounts for the "#" separator.
            current_string += "#" + sentence
        else:
            # Would overflow: close the current chunk and start a new one.
            result_list.append(current_string)
            current_string = sentence

    # Emit the last pending chunk.
    if current_string:
        result_list.append(current_string)

    return result_list

def process_line(s_line: str):
    """Normalize punctuation in one line and split it into sentences.

    The line is first passed through ``replace_chars`` with ``char_rep_map``.
    It is then split after each run of sentence terminators. Both ASCII and
    full-width terminators are recognized, in case the replacement step did
    not convert all of them. A run such as ``"?!"`` or ``"..."`` stays attached
    to the sentence it ends instead of becoming separate one-character items.

    Args:
        s_line: One line of input text.

    Returns:
        A list of sentences, each including its trailing terminator(s) when
        present. Newline characters are removed.
    """
    s_line = replace_chars(s_line, char_rep_map)

    # The capturing group makes re.split return the terminator runs as well:
    # even indices hold text, odd indices hold the terminators that follow it.
    parts = re.split('([?!.？！。]+)', s_line)

    combined_sentences = []
    temp_str = ""
    for position, part in enumerate(parts):
        part = part.replace("\n", "")
        if not part:
            continue
        if position % 2 == 1:
            # Terminator run: close the sentence collected so far.
            combined_sentences.append(temp_str + part)
            temp_str = ""
        else:
            temp_str = part

    # Trailing text without a terminator is still a sentence.
    if temp_str:
        combined_sentences.append(temp_str)

    return combined_sentences

def gererator_speech(to_tts_txt,
                     voice_samples=[speaker_phi0],
                     output_index=0):
    """Synthesize each text chunk to a wav file and store its transcript.

    For chunk number ``i`` (1-based) the files
    ``<output_dir>/output/xuese-<output_index>_<i>.txt`` and ``.wav`` are used.
    An existing transcript or wav file is left alone, so a re-run only
    produces what is still missing.

    Args:
        to_tts_txt: Iterable of chunks; sentences inside a chunk are separated
            by ``"#"``.
        voice_samples: Voice samples handed to the processor for every chunk.
        output_index: Index embedded in the output file names.
    """
    for _index, chunk_text in enumerate(to_tts_txt, start=1):
        output_path_wav = f"{output_dir}/output/xuese-{output_index}_{_index}.wav"
        transcript_path = Path(f"{output_dir}/output/xuese-{output_index}_{_index}.txt")

        print(f'processing index[{_index}]')

        # Write the transcript (without the "#" separators) once.
        if not transcript_path.exists():
            transcript_path.parent.mkdir(parents=True, exist_ok=True)
            transcript_path.write_text(chunk_text.replace("#", ""), encoding='utf-8')

        # Audio already rendered for this chunk: nothing left to do.
        if os.path.exists(output_path_wav):
            continue

        # One "Speaker 1: ..." line per sentence of the chunk.
        script = "\n".join("Speaker 1: " + piece for piece in chunk_text.split("#"))

        inputs = processor(
            text=[script],  # single-item batch
            voice_samples=[voice_samples],  # single-item batch
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

        outputs = model.generate(
            **inputs,
            max_new_tokens=None,
            cfg_scale=1.3,
            tokenizer=processor.tokenizer,
            # generation_config={'do_sample': True, 'temperature': 0.99, 'top_p': 0.99, 'top_k': 3},
            generation_config={'do_sample': False},
            verbose=True,
            max_length_times=3,  # default 2
        )

        # The batch holds exactly one item, so take the first output.
        processor.save_audio(
            outputs.speech_outputs[0],
            output_path=output_path_wav,
        )
        print(f'finish process ouput file : {output_path_wav}')

def main():
    """Run the text-to-speech pipeline over every non-blank line of ``input_file``.

    Each line is split into sentences, packed into chunks of at most
    ``max_length_def`` characters and synthesized. The zero-based line number
    is used as the output index, so blank lines still advance the numbering.
    """
    # Read explicitly as UTF-8 instead of relying on the platform default
    # encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        for line_index, line in enumerate(f):
            # Skip empty and whitespace-only lines.
            if not line.strip():
                continue
            to_tts_txt = combine_to_max_length(process_line(line), max_length=max_length_def)
            gererator_speech(to_tts_txt, output_index=line_index)

def test_input_example():
    """Synthesize a single hard-coded sample text, using output index 1."""
    sample_text = """you text""".replace('\n', '')
    gererator_speech([sample_text], output_index=1)

# Run the pipeline over input_file; swap in test_input_example() to synthesize
# the hard-coded sample instead.
main()
# test_input_example()

In [None]:
import transformers
# Show which transformers version is installed in this runtime.
print(transformers.__version__)

In [None]:
# Fine-tune VibeVoice-1.5B with the repository's training module on the
# tardigrade-doc/audio_book2 dataset. Results are written to the
# VibeVoice-1.5B-ft directory on Drive and metrics are reported to wandb.
!python -m vibevoice.finetune.train_vibevoice \
    --model_name_or_path /content/drive/MyDrive/pretrained_models/VibeVoice-1.5B \
    --dataset_name tardigrade-doc/audio_book2 \
    --text_column_name text \
    --audio_column_name audio \
    --voice_prompts_column_name audio \
    --output_dir /content/drive/MyDrive/pretrained_models/VibeVoice-1.5B-ft \
    --per_device_train_batch_size 8 \
    --gradient_accumulation_steps 16 \
    --learning_rate 2.5e-5 \
    --num_train_epochs 1 \
    --logging_steps 10 \
    --save_steps 100 \
    --eval_steps 100 \
    --report_to wandb \
    --remove_unused_columns False \
    --bf16 True \
    --do_train \
    --gradient_clipping \
    --gradient_checkpointing False \
    --ddpm_batch_mul 4 \
    --diffusion_loss_weight 1.4 \
    --train_diffusion_head True \
    --ce_loss_weight 0.04 \
    --voice_prompt_drop_rate 0.2 \
    --lora_target_modules q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj \
    --lr_scheduler_type cosine \
    --warmup_ratio 0.03 \
    --max_grad_norm 0.8

In [22]:
from vibevoice.modular.modeling_vibevoice import VibeVoiceForConditionalGeneration
from peft import PeftModel
import torch

# Merge the LoRA adapter of one fine-tuning checkpoint into the base model and
# save the merged weights.
# NOTE: this cell rebinds the module-level names `output_dir` and `model`,
# which the earlier inference cell also uses.

# --- Path settings ---
# Base checkpoint directory
base_checkpoint_dir = "/content/drive/MyDrive/pretrained_models/VibeVoice-1.5B-ft/checkpoint-79"

# Key change: point lora_dir at the actual subdirectory that contains adapter_config.json
lora_dir = f"{base_checkpoint_dir}/lora" # LoRA weights directory

output_dir = "/content/drive/MyDrive/VibeVoice-1.5B-ft-merged"
base_model_path = "/content/drive/MyDrive/pretrained_models/VibeVoice-1.5B"

# 1. Load the base model (using the custom model class)
print("1. 正在加载 VibeVoice 基础模型...")
# ... (the code for this step is unchanged)
base_model = VibeVoiceForConditionalGeneration.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# 2. Load the LoRA adapter
print("2. 正在加载 LoRA 适配器...")
# Note: the corrected lora_dir is used here
model = PeftModel.from_pretrained(
    base_model,
    lora_dir, # now points to checkpoint-79/lora/
)

# 3. Merge the LoRA weights
print("3. 正在合并 LoRA 权重到基础模型...")
merged_model = model.merge_and_unload()

# 4. Save the complete merged model
print(f"4. 正在保存完整的合并模型到: {output_dir}")
# Save the model
merged_model.save_pretrained(output_dir)

# Extra step: save the Processor (if needed)
# ... (the Processor-saving code is unchanged)

print(f"✅ 完整合并后的 VibeVoice 模型已成功保存到: {output_dir}")

1. 正在加载 VibeVoice 基础模型...
Tied input and output embeddings using standard assignment.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Tied input and output embeddings using standard assignment.
2. 正在加载 LoRA 适配器...




3. 正在合并 LoRA 权重到基础模型...
4. 正在保存完整的合并模型到: /content/drive/MyDrive/VibeVoice-1.5B-ft-merged
✅ 完整合并后的 VibeVoice 模型已成功保存到: /content/drive/MyDrive/VibeVoice-1.5B-ft-merged


In [25]:
from vibevoice.modular.modeling_vibevoice import VibeVoiceForConditionalGeneration
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor # 导入 Processor
from peft import PeftModel
import torch
import os

# Merge the LoRA adapter of one fine-tuning checkpoint into the base model, then
# save the merged weights together with the processor so the output folder can
# be loaded on its own.

# --- Path settings ---
# Base checkpoint directory
base_checkpoint_dir = "/content/drive/MyDrive/pretrained_models/VibeVoice-1.5B-ft/checkpoint-79"

# LoRA weights directory (the subdirectory that holds the adapter files)
lora_dir = f"{base_checkpoint_dir}/lora"

# Original base model path (also used to load the Processor)
base_model_path = "/content/drive/MyDrive/pretrained_models/VibeVoice-1.5B"

# Output path of the merged model
output_dir = "/content/drive/MyDrive/VibeVoice-1.5B-ft-merged"

# Make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)


# 1. Load the base model (using the custom model class)
print("1. 正在加载 VibeVoice 基础模型...")
base_model = VibeVoiceForConditionalGeneration.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# 2. Load the LoRA adapter
print("2. 正在加载 LoRA 适配器...")
model = PeftModel.from_pretrained(
    base_model,
    lora_dir,
)

# 3. Merge the LoRA weights
print("3. 正在合并 LoRA 权重到基础模型...")
merged_model = model.merge_and_unload()

# 4. Save the complete merged model weights and config
print(f"4. 正在保存完整的合并模型权重到: {output_dir}")
merged_model.save_pretrained(output_dir)


# 5. Save the Processor/Tokenizer next to the weights.
# This stays best-effort (a failure is reported, not raised), but the outcome
# is remembered so the final summary does not claim more than was written.
print("5. 正在保存 VibeVoiceProcessor...")
processor_saved = False
try:
    # Load the Processor from the original base model path
    processor = VibeVoiceProcessor.from_pretrained(base_model_path)

    # Save the Processor into the merged model directory
    processor.save_pretrained(output_dir)
    processor_saved = True
    print("✅ VibeVoiceProcessor 已成功保存。")
except Exception as e:
    print(f"⚠️ 保存 Processor 失败，请检查 VibeVoiceProcessor 类或路径：{e}")


print("\n--- 任务完成 ---")
print(f"🎉 完整合并后的 VibeVoice 模型已成功保存到: {output_dir}")
if processor_saved:
    print("这个文件夹现在包含模型权重、配置和处理器，可独立用于推理。")
else:
    # Only weights and config were written; say so instead of reporting a
    # complete, standalone model folder.
    print("⚠️ 该文件夹仅包含模型权重和配置；处理器未保存，推理前需单独提供。")

1. 正在加载 VibeVoice 基础模型...
Tied input and output embeddings using standard assignment.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Tied input and output embeddings using standard assignment.
2. 正在加载 LoRA 适配器...
3. 正在合并 LoRA 权重到基础模型...
4. 正在保存完整的合并模型权重到: /content/drive/MyDrive/VibeVoice-1.5B-ft-merged
5. 正在保存 VibeVoiceProcessor...


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Qwen2Tokenizer'. 
The class this function is called from is 'VibeVoiceTextTokenizerFast'.


✅ VibeVoiceProcessor 已成功保存。

--- 任务完成 ---
🎉 完整合并后的 VibeVoice 模型已成功保存到: /content/drive/MyDrive/VibeVoice-1.5B-ft-merged
这个文件夹现在包含模型权重、配置和处理器，可独立用于推理。


In [28]:
# Install zh_normalization; the next cell imports TextNormalizer from it.
!pip install zh_normalization

Collecting zh_normalization
  Downloading zh_normalization-0.0.2.tar.gz (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.1/51.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pypinyin (from zh_normalization)
  Downloading pypinyin-0.55.0-py2.py3-none-any.whl.metadata (12 kB)
Downloading pypinyin-0.55.0-py2.py3-none-any.whl (840 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.2/840.2 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: zh_normalization
  Building wheel for zh_normalization (setup.py) ... [?25l[?25hdone
  Created wheel for zh_normalization: filename=zh_normalization-0.0.2-py3-none-any.whl size=52243 sha256=53e73ac044dc755ade6bcfdd2ac4120afe47a3f984771834f5f231c8de80f5dd
  Stored in directory: /root/.cac

In [None]:
import string
from typing import Dict
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
import torch
from transformers.utils import logging
import re
import os
from pathlib import Path
from zh_normalization import TextNormalizer


logging.set_verbosity_info()
logger = logging.get_logger(__name__)

class BookAudioGenerator:
    """Synthesizes speech for text files with a VibeVoice model.

    The processor and model are loaded once in ``__init__``. The ``generate*``
    methods read a text file, build ``Speaker N: ...`` scripts from it and save
    the generated audio as wav files.
    """

    def __init__(self, tts_model, speaker_0, device) -> None:
        """Load the processor and the inference model.

        Args:
            tts_model: Path or identifier passed to ``from_pretrained`` for both
                the processor and the model.
            speaker_0: Voice sample used by the single-speaker methods.
            device: Value forwarded as ``device_map`` when loading the model.
        """
        self.processor = VibeVoiceProcessor.from_pretrained(
                tts_model,
                # language_model_pretrained_name=llm_model
            )
        model = VibeVoiceForConditionalGenerationInference.from_pretrained(
                tts_model,
                torch_dtype=torch.bfloat16,
                device_map=device,)
        model.eval()
        model.set_ddpm_inference_steps(num_steps=10)
        self.model=model
        self.voice_samples=[speaker_0]
        # Prefix put in front of every sentence in single-speaker scripts.
        self.prefix="Speaker 1:"
        self.tn = TextNormalizer()
        # Speaker name assumed for dialog lines that carry no speaker label.
        self.default_speaker = "旁白"

    def batch_process(self, i_file, batch_size, process_size):
        """Yield ``(batch, batch_index)`` pairs built from the lines of a file.

        Non-empty lines are normalized with the text normalizer and collected
        into groups until a group holds at least ``process_size`` characters.
        ``batch_size`` groups form one batch. Every line of a group is then
        split into sentences.

        Args:
            i_file: Path of the UTF-8 text file to read.
            batch_size: Number of groups per yielded batch.
            process_size: Character count at which a group is closed.

        Yields:
            Tuples ``(batch, batch_index)`` where ``batch`` is a list of groups
            and each group is a list of sentences.

        Raises:
            Exception: If the file has no lines.
        """

        def _read_file():
            _lines = []
            with open(i_file, 'r', encoding='utf-8') as f:
                _lines = f.read().splitlines()
            if not _lines:
                raise Exception(f'not content in {i_file}')

            results = []  # groups collected for the current batch
            current_lines = []  # lines of the group being accumulated
            current_length = 0  # total character count of the current group

            for _line in _lines:
                if _line:
                    _line = ''.join(self.tn.normalize(_line))
                    current_lines.append(_line)
                    current_length += len(_line)
                    if current_length >= process_size:
                        results.append(current_lines)
                        current_lines = []
                        current_length = 0
                        if len(results) == batch_size:
                            yield results
                            results = []
            # Flush the partially filled group and batch at end of file.
            if current_lines:
                results.append(current_lines)
            if results:
                yield results

        batch_index = 0
        for batch in _read_file():
            processed_batch = []
            for sub_list in batch:
                processed_sub_list = []

                for item in sub_list:
                    stripped_item = item.strip()
                    if stripped_item:
                        processed_sub_list.extend(self.split_sentence(stripped_item))

                processed_batch.append(processed_sub_list)
            yield processed_batch, batch_index
            batch_index += 1

    def split_sentence(self, sentence):
        """Split text into sentences after each run of terminators.

        ASCII (``. ? !``) and full-width (``。 ？ ！``) terminators are
        recognized. A run such as ``"?!"`` stays attached to the sentence it
        ends. Whitespace following a terminator run is dropped.

        Args:
            sentence: Text to split.

        Returns:
            List of stripped sentences; trailing text without a terminator is
            returned as the last item.
        """
        # The capturing group keeps the terminator runs in the result: even
        # indices hold text, odd indices hold the terminators that follow it.
        parts = re.split(r'([.?!。？！]+)\s*', sentence)
        sentences = []
        current_sentence = ""
        for position, part in enumerate(parts):
            if part is None or not part.strip():
                continue
            current_sentence += part
            if position % 2 == 1:
                sentences.append(current_sentence.strip())
                current_sentence = ""
        if current_sentence.strip():
            sentences.append(current_sentence.strip())
        return sentences

    def _run_tts(self, texts, voice_samples):
        """Run the processor and the model on a batch of scripts; returns the model output."""
        inputs = self.processor(
            text=texts,
            voice_samples=voice_samples,
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )
        return self.model.generate(
            **inputs,
            max_new_tokens=None,
            cfg_scale=1.3,
            tokenizer=self.processor.tokenizer,
            # generation_config={'do_sample': True, 'temperature': 0.99, 'top_p': 0.99, 'top_k': 3},
            generation_config={'do_sample': False},
            verbose=True,
            max_length_times=2.6,  # default 2
        )

    def _build_single_script(self, lines):
        """Join the non-empty ``lines`` into one script, each prefixed with ``self.prefix``."""
        return "\n".join(self.prefix + line for line in lines if line)

    def _build_dialog_script(self, lines, txt_speeker):
        """Rewrite ``name: text`` lines into one ``Speaker N: text`` script.

        The i-th name in ``txt_speeker`` maps to ``Speaker i+1``. A line without
        a ``name:`` label is attributed to the speaker of the previous labelled
        line (initially ``self.default_speaker``). Names missing from
        ``txt_speeker`` use the label of ``self.default_speaker``, or
        ``Speaker 1`` when that name is not listed either.
        """
        labels = [f"Speaker {i+1}" for i, _ in enumerate(txt_speeker)]
        speaker_map = dict(zip(txt_speeker, labels))
        fallback_label = speaker_map.get(self.default_speaker, "Speaker 1")

        speaker_pattern = re.compile(r'^([^:]+):')

        script_lines = []
        pre_speaker = self.default_speaker
        for item in lines:
            if not item:
                continue
            found = speaker_pattern.match(item)
            if found:
                speaker_name = found.group(1).strip()
                # Text after the colon is the spoken content.
                item_content = item[found.end():].strip()
                script_lines.append(speaker_map.get(speaker_name, fallback_label) + ": " + item_content)
                pre_speaker = speaker_name
            else:
                script_lines.append(speaker_map.get(pre_speaker, fallback_label) + ": " + item)
        return "\n".join(script_lines)

    def gererator_speech(
            self,
            chunk,
            batch_index,
            output_dir,
            project=None
            ):
        """Synthesize one batch and write a wav plus transcript per group.

        Files are named ``<output_dir>/output/<project>-<batch_index>_<i>.wav``
        and ``.txt``. The whole batch is skipped when the wav file of its first
        group already exists.

        Args:
            chunk: Batch as produced by ``batch_process``: a list of groups,
                each a list of sentences.
            batch_index: Index of the batch, embedded in the file names.
            output_dir: Root directory; files go into its ``output`` subfolder.
            project: Name used in the file names. When omitted, the
                module-level ``project_name`` is used.
        """
        if project is None:
            # Backward-compatible fallback for callers that rely on the global.
            project = project_name

        # One script per group: every sentence gets the speaker prefix.
        to_tts_batch = [
            "\n".join(self.prefix + item for item in row)
            for row in chunk
        ]

        output_path_wav = f"{output_dir}/output/{project}-{batch_index}_0.wav"
        if os.path.exists(output_path_wav):
            print(f'⚠️ file {output_path_wav} exists, so batch will not process.')
            return

        # The same voice sample list is used for every script in the batch.
        outputs = self._run_tts(to_tts_batch, [self.voice_samples] * len(chunk))
        for check in outputs.reach_max_step_sample.tolist():
            if check:
                print(f'⚠️ reach max length, audio may cut up, you may increase [max_length_times]')

        for _index, (output_speech, txt) in enumerate(zip(outputs.speech_outputs, chunk)):

            output_path_wav = f"{output_dir}/output/{project}-{batch_index}_{_index}.wav"
            output_path_txt = f"{output_dir}/output/{project}-{batch_index}_{_index}.txt"

            output_path = Path(output_path_txt)
            output_path.parent.mkdir(parents=True, exist_ok=True)

            self.processor.save_audio(
                output_speech,
                output_path=output_path_wav,
            )
            output_path.write_text("\n".join(txt), encoding='utf-8')
            print(f'finish process ouput file : {output_path_wav} \n {output_path_txt}')

    def generate(self, to_tts_file, output_dir, batch_size = 2, process_size = 6000):
        """Synthesize a whole text file batch by batch.

        Output files are named after the stem of ``to_tts_file``.

        Args:
            to_tts_file: Path of the UTF-8 text file to read.
            output_dir: Root directory for the generated files.
            batch_size: Number of groups synthesized per model call.
            process_size: Approximate number of characters per group.
        """
        project = Path(to_tts_file).stem
        for _b, _i in self.batch_process(to_tts_file, batch_size, process_size):
            self.gererator_speech(_b, _i, output_dir, project)

    def generate_single_dialog(self, to_tts_file, txt_speeker, speeker_voice):
        """Synthesize a multi-speaker dialog file into one wav next to it.

        Args:
            to_tts_file: UTF-8 text file with ``name: text`` lines.
            txt_speeker: Speaker names as they appear in the file; the i-th
                name becomes ``Speaker i+1``.
            speeker_voice: Voice samples passed to the processor
                (presumably one per entry of ``txt_speeker``, in the same
                order — TODO confirm).
        """
        with open(to_tts_file, 'r', encoding='utf-8') as f:
            _lines = f.read().splitlines()
        output_path_wav = Path(to_tts_file).with_suffix(".wav")

        script = self._build_dialog_script(_lines, txt_speeker)
        outputs = self._run_tts([script], speeker_voice)
        self.processor.save_audio(
            outputs.speech_outputs[0],
            output_path=output_path_wav,
        )

    def generate_single(self, to_tts_file):
        """Synthesize one text file with the default voice into a wav next to it.

        All non-empty lines of the file form a single script, so exactly one
        audio file is produced.

        Args:
            to_tts_file: Path of the UTF-8 text file to read.
        """
        with open(to_tts_file, 'r', encoding='utf-8') as f:
            _lines = f.read().splitlines()

        output_path_wav = Path(to_tts_file).with_suffix(".wav")
        # One script for the whole file, matching the single voice sample list.
        outputs = self._run_tts([self._build_single_script(_lines)], [self.voice_samples])
        self.processor.save_audio(
            outputs.speech_outputs[0],
            output_path=output_path_wav,
        )

local = "colab" # colab modelscope local

drive_dir = None
device = None

match local:
    case "local":
        drive_dir = "/Volumes/sw/MyDrive"
        model_dir = "/Volumes/sw"
        device = "mps"
    case "modelscope":
        drive_dir = "/mnt/workspace"
        model_dir = "/mnt/workspace"
        device = "cuda"
    case "colab":
        device = "cuda"
        model_dir = "/content/drive/MyDrive"
        drive_dir = "/content/drive/MyDrive"

input_file = f"{drive_dir}/data_src/zhengzhi1.txt"
input_file = f"{drive_dir}/data_src/test.txt"
speaker_phi0 = f"{drive_dir}/data_src/sample_zhongdong.wav"

input_file_path = Path(input_file)
project_name = input_file_path.stem

output_dir = f"{drive_dir}/{project_name}"

bookAudioGen = BookAudioGenerator(
    "/content/drive/MyDrive/VibeVoice-1.5B-ft-merged",
    # f"{model_dir}/pretrained_models/Qwen2.5-1.5B",
    speaker_phi0,
    device)
bookAudioGen.generate(input_file, output_dir, 2, 100)

# 针对某个已经经过上述批量处理后,某个txt对应的wav存在问题的重新生成.
# bookAudioGen.generate_single("/Volumes/sw/MyDrive/zhengzhi1/output/zhengzhi1-4_2.txt")

# bookAudioGen.generate_single_dialog(
#     "/Users/larry/github.com/tardigrade-dot/colab-script/data_src/sugeladizhisi_part1.txt",
#     ["旁白", "欧", "苏"],
#     [f"{drive_dir}/data_src/youyi.wav", f"{drive_dir}/data_src/sample_zhongdong.wav", f"{drive_dir}/data_src/gdg_voice_06.wav"])


loading file vocab.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/merges.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/tokenizer_config.json
loading file chat_template.jinja from cache at None
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load fr

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing VibeVoiceForConditionalGenerationInference.

All the weights of VibeVoiceForConditionalGenerationInference were initialized from the model checkpoint at /content/drive/MyDrive/VibeVoice-1.5B-ft-merged.
If your task is similar to the task the model of the checkpoint was trained on, you can already use VibeVoiceForConditionalGenerationInference for predictions without further training.
Generation config file not found, using a generation config created from the model config.
Generating (active: 1/1):  49%|████▉     | 562/1144 [01:31<01:32,  6.27it/s]