# VibeVoice Audio Creation Notebook

This notebook is designed for creating MP3 audio content using Microsoft's VibeVoice package from a Markdown file or a chapter in an epub file. It is intended to be executed on Google Colab with A100 GPUs.

In [None]:
# Install the third-party packages that the import cell below relies on.
!pip install ebooklib soundfile torch litellm pydub

In [None]:
# Fetch the VibeVoice sources and install them in editable mode from the clone.
!git clone https://github.com/microsoft/VibeVoice.git
!cd VibeVoice/ && pip install -e .

In [None]:
import os
from pathlib import Path

import ebooklib
import soundfile as sf
import torch
from ebooklib import epub
from google.colab import files, drive, userdata
from litellm import completion
from pydub import AudioSegment
from vibevoice.modular.modeling_vibevoice_inference import (
    VibeVoiceForConditionalGenerationInference,
)
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

**Note**: If you see `ModuleNotFoundError: No module named 'vibevoice'` even though the package installed successfully, open the `Runtime` menu in the menu bar and select `Restart session and run all`.

## Setup Requirements

Before proceeding with audio generation, ensure the following setup is complete:

### 1. Mount Google Drive
Execute the cell below to mount your Google Drive, which will be used for storing outputs and accessing voice samples.

### 2. Voice Samples Directory
Create a directory, say at `/content/drive/MyDrive/VibeVoice/voices/`, and upload the voice sample files found in the [repository](https://github.com/microsoft/VibeVoice/tree/main/demo/voices). These voice samples are required for VibeVoice to generate audio with the specified speaker characteristics.

### 3. Add secrets to Colab
Add the OpenAI API key to Colab by including it as a secret with the name `OPENAI_API_KEY`.

In [None]:
# Mount Google Drive at /content/drive.
drive.mount("/content/drive")

In [None]:
# Copy the Colab secret into the process environment.
# NOTE(review): presumably this is where litellm looks up the key - confirm.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

## 1. [Class] Read and Parse Data
- Read Markdown or epub files as input
- Filter and select the desired chapter from an epub file
- Check for the presence of speakers in the text
- Prepend "Speaker 1: " to every non-empty line that does not already begin with a speaker label
- Use GPT-5 mini (called through LiteLLM) to format the text

In [None]:
class VibeVoiceDataParser:
    def __init__(
        self, file_path: str, chapter_id: str | None = None, use_gpt: bool = True
    ) -> None:
        self.file_path = file_path
        self.chapter_id = chapter_id
        self.use_gpt = use_gpt
        self.text = ""
        self.ext = Path(file_path).suffix.lower()

    def read(self) -> str:
        if self.ext == ".md":
            self.text = self._read_md()
        elif self.ext == ".epub" and epub:
            self.text = self._read_epub()
        else:
            raise ValueError("Unsupported file type or missing epublib.")
        return self.text

    def _read_md(self) -> str:
        with open(self.file_path, encoding="utf-8") as f:
            return f.read()

    def _read_epub(self) -> str:
        book = epub.read_epub(self.file_path)
        items = [i for i in book.get_items() if i.get_type() == ebooklib.ITEM_DOCUMENT]
        if self.chapter_id is not None:
            for item in items:
                if getattr(item, "id", None) == self.chapter_id:
                    return item.get_content().decode("utf-8")
            raise ValueError(f"Chapter ID '{self.chapter_id}' not found")
        return "\n".join(i.get_content().decode("utf-8") for i in items)

    def list_chapter_ids(self) -> list[str]:
        """List available chapters in EPUB file"""
        if self.ext != ".epub":
            raise ValueError("Chapter listing is only available for EPUB files")

        book = epub.read_epub(self.file_path)
        items = [i for i in book.get_items() if i.get_type() == ebooklib.ITEM_DOCUMENT]
        ids = [str(getattr(item, "id", f"{i}")) for i, item in enumerate(items)]
        return ids

    def parse(self) -> str:
        if self.use_gpt:
            formatted = self._format_with_gpt(self.text)
            return self._prepend_speaker(formatted)
        else:
            return self._prepend_speaker(self.text)

    def _prepend_speaker(self, txt: str) -> str:
        lines = txt.splitlines()
        out = []
        for line in lines:
            if line.strip() and not line.strip().startswith("Speaker"):
                out.append(f"Speaker 1: {line}")
            else:
                out.append(line)
        return "\n".join(out)

    def _format_with_gpt(self, txt: str) -> str:
        prompt = (
            "You are a text formatting assistant for audio synthesis. "
            "Given the following text, perform these steps: "
            "1. If the text is a Markdown file and contains metadata in YAML front matter (e.g., lines between ---), extract the title and author fields, and replace the metadata block with '<title> by <author>' at the top of the text. "
            "2. If the text contains HTML or XML, convert it to Markdown first."
            "2. Remove all Markdown and HTML formatting (bold, italics, tags, etc). "
            "3. Convert unordered lists (-, *, +) to ordered lists with numbers. "
            "4. Replace Markdown headings (#, ##, etc) with hierarchical numbers (e.g., # becomes 1, ## becomes 1.1), but do not add numbers if already present. "
            "5. Do NOT add numbers to lines that are not part of a list or a heading."
            "6. If the text is from an EPUB and a chapter number is provided, only format that chapter. "
            "7. For any ordered list, including in headings, use the format '1. ', '1.1. ', etc."
            "8. In case of any URLs in the Markdown text, remove the link and only retain the text.\n"
            "Return only the formatted text, no explanations.\n\nText:\n" + txt
        )
        response = completion(
            model="gpt-5-mini",
            messages=[{"role": "user", "content": prompt}],
            reasoning_effort="minimal",
            max_completion_tokens=16384,
        )

        print(f"Cost: ${response._hidden_params['response_cost']}")
        print(f"Input tokens: {response.usage.prompt_tokens}")
        print(f"Output tokens: {response.usage.completion_tokens}")

        formatted = response.choices[0].message.content
        return formatted

## Format Data

In EPUB files, content is divided into separate chunks (cover, preface, introduction, chapters) with unique IDs that may not correspond to chapter names or numbers. Use `parser.list_chapter_ids()` to see available chapters and map IDs to your desired chapter before selection.

In [None]:
# Replace the placeholder with the path to the Markdown or EPUB file to narrate.
file_path = "<path_to_your_epub_file>"
# "x01.htm" is one of the document IDs shown in the listing below.
parser = VibeVoiceDataParser(file_path, chapter_id="x01.htm")
# Uncomment to print the document IDs available in the EPUB:
# parser.list_chapter_ids()

['titlepage',
 'IFC.htm',
 'title.htm',
 'copyright.htm',
 'dedication.htm',
 'ToC.htm',
 'FM.htm',
 'x01.htm',
 'x02.htm',
 'x03.htm',
 'x04.htm',
 'x05.htm',
 'x06.htm',
 'x07.htm',
 'x08.htm',
 'A.htm',
 'B.htm',
 'C.htm',
 'index.htm',
 'IBC.htm']

In [None]:
# Load the raw text, then format it and add speaker labels.
parser.read()
parsed_md = parser.parse()



SYNC kwargs[caching]: False; litellm.cache: None; kwargs.get('cache')['no-cache']: False
Final returned optional params: {'max_completion_tokens': 16384, 'reasoning_effort': 'minimal', 'extra_body': {}}
RAW RESPONSE:
{"id": "chatcmpl-CBog6D8nCXaVCPHltOGpztdeqN4p0", "choices": [{"finish_reason": "stop", "index": 0, "logprobs": null, "message": {"content": "1 Optimizing systems by experiment\n\nThis chapter covers\n\n1. Optimizing an engineered system\n2. Exploring what experiments are\n3. Learning why experiments are uniquely valuable\n\nThe past 20 years have seen a surge in interest in the development of experimental methods used to measure and improve engineered systems, such as web products, automated trading systems, and software infrastructure. Experimental methods have become more automated and more efficient. They have scaled up to large systems like search engines or social media sites. These methods generate continuous, automated performance improvement of live production syst

## 2. [Class] Set Up VibeVoice
- Load the VibeVoice processor and model.
- Generate speech from the parsed text and save it as a WAV file.

In [None]:
class VibeVoiceAudioGenerator:
    def __init__(
        self,
        voice: str = "Frank",
        model_size: str = "1.5b",
        output_dir: str = "./content/",
        voices_dir: str = "./content/voices/",
    ) -> None:
        self.voice = voice
        self.model_size = model_size
        self.output_dir = output_dir
        self.voices_dir = voices_dir

    def _setup_model(self) -> tuple:
        os.makedirs(self.output_dir, exist_ok=True)
        model_path: str = (
            f"microsoft/VibeVoice-{self.model_size}"
            if self.model_size == "1.5b"
            else "WestZhang/VibeVoice-Large-pt"
        )
        processor = VibeVoiceProcessor.from_pretrained(model_path)
        model = VibeVoiceForConditionalGenerationInference.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            device_map="auto",
            attn_implementation="flash_attention_2",
            low_cpu_mem_usage=True,
        )
        model.to(torch.device("cuda"))
        model.eval()
        model.set_ddpm_inference_steps(num_steps=10)
        return processor, model

    def _get_voice_path(self) -> str:
        if self.voice in ["Carter", "Frank"]:
            return os.path.join(self.voices_dir, f"en-{self.voice}_man.wav")
        return os.path.join(self.voices_dir, f"en-{self.voice}_woman.wav")

    def _process(self, processor, text: str) -> object:
        voice_path: str = self._get_voice_path()
        return processor(
            text=[text],
            voice_samples=[[voice_path]],
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

    def _save_audio(self, outputs: object) -> str | None:
        wav_path: str = os.path.join(self.output_dir, "output.wav")
        if outputs.speech_outputs and outputs.speech_outputs[0] is not None:
            sf.write(wav_path, outputs.speech_outputs[0].cpu().numpy(), 24000)
            return wav_path
        return None

    def convert(self, text: str) -> str | None:
        processor, model = self._setup_model()
        inputs = self._process(processor, text)
        with torch.no_grad():
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=None,
                    cfg_scale=1.3,
                    tokenizer=processor.tokenizer,
                    generation_config={"do_sample": False},
                    verbose=False,
                    use_cache=True,
                )
        return self._save_audio(outputs)

## Configure VibeVoice

In [None]:
# Drive is mounted at the absolute path /content/drive, so the voices
# directory must be absolute as well; a relative "./content/..." path is
# resolved against the current working directory instead of the mount point.
audio_gen = VibeVoiceAudioGenerator(
    voices_dir="/content/drive/MyDrive/VibeVoice/voices/"
)
audio_path = audio_gen.convert(parsed_md)

## 3. Generate Audio
- Convert the WAV file produced by VibeVoice to MP3 with pydub.
- Download the MP3 file to your local machine.

In [None]:
# convert() returns None when the model produced no speech; stop with a clear
# message instead of failing on the path handling below.
if audio_path is None:
    raise RuntimeError("VibeVoice produced no audio; nothing to convert to MP3.")

# Swap only the file extension; str.replace would also rewrite any ".wav"
# that appears earlier in the path.
mp3_path = str(Path(audio_path).with_suffix(".mp3"))
audio = AudioSegment.from_wav(audio_path)
audio.export(mp3_path, format="mp3")
files.download(mp3_path)