## Before running: select a T4 GPU runtime (Runtime → Change runtime type → T4 GPU)

# Podcast generator

In [2]:
# Install required packages
!pip install 'sphn<0.2'
!pip install --no-deps "moshi==0.2.7"
!pip install urllib3 soundfile numpy torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
# Import required libraries
import urllib.request
import os
import numpy as np
import torch
from moshi.models.loaders import CheckpointInfo
from moshi.models.tts import DEFAULT_DSM_TTS_REPO, DEFAULT_DSM_TTS_VOICE_REPO, TTSModel
from IPython.display import display, Audio
import soundfile as sf

In [4]:
# Download the Transformer paper ("Attention Is All You Need", arXiv 1706.03762)
paper_url = "https://arxiv.org/pdf/1706.03762.pdf"
pdf_path = "attention_is_all_you_need.pdf"
if not os.path.exists(pdf_path):
    print("Downloading paper...")
    # Download under a temporary name and rename only on success, so an
    # interrupted download never leaves a truncated file that the existence
    # check above would accept on the next run.
    partial_path = pdf_path + ".part"
    try:
        urllib.request.urlretrieve(paper_url, partial_path)
        os.replace(partial_path, pdf_path)
    finally:
        # After a successful rename the partial file is gone; after a failure
        # this removes whatever was written so far.
        if os.path.exists(partial_path):
            os.remove(partial_path)
print(f"Paper available at: {pdf_path}")

Downloading paper...
Paper available at: attention_is_all_you_need.pdf


In [5]:
# Create dialog text: one string per turn, in speaking order.
# NOTE(review): presumably the turns alternate between the two speakers
# configured later in the notebook; confirm against `prepare_script`.
dialog = [
    (
        "Let me tell you about an amazing paper that revolutionized machine learning - "
        "'Attention Is All You Need'."
    ),
    "Oh wow, that sounds intriguing! What makes it so special?",
    (
        "Well, imagine traditional neural networks as a game of telephone, "
        "where information gets distorted as it passes through. "
        "The transformer architecture they introduced is more like having "
        "a direct conversation with everyone at once!"
    ),
    "That's fascinating! So it can pay attention to everything simultaneously?",
    (
        "Exactly! And this was a game-changer. "
        "Before transformers, we were stuck with sequential processing using RNNs and LSTMs. "
        "The transformer's parallel processing and self-attention mechanism "
        "made everything faster and better."
    ),
]

In [6]:
# Setup TTS model
print(f"Available voices at: https://huggingface.co/{DEFAULT_DSM_TTS_VOICE_REPO}")

# Resolve the checkpoint description from the default TTS repository.
checkpoint_info = CheckpointInfo.from_hf_repo(DEFAULT_DSM_TTS_REPO)

# Use the GPU when torch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Build the model on the chosen device with the settings used throughout this notebook.
tts_model = TTSModel.from_checkpoint_info(checkpoint_info, n_q=32, temp=0.6, device=device)

Available voices at: https://huggingface.co/kyutai/tts-voices
Using device: cuda


In [25]:
# Print the Hugging Face URL of the voice repository for reference
# (same message as in the model setup cell, repeated so this cell's output is self-contained).
print(f"Available voices at: https://huggingface.co/{DEFAULT_DSM_TTS_VOICE_REPO}")

# Candidate voices, grouped by collection (French/cml voices are left out).
# Each entry maps a short label to the voice file path that gets verified below.
available_voices = {
    "expresso": dict(
        happy="expresso/ex03-ex01_happy_001_channel1_334s.wav",
        narration="expresso/ex03-ex02_narration_001_channel1_674s.wav",
    ),
    "vctk": dict(
        p226="vctk/p226_023_mic1.wav",  # Male speaker (if available)
        p225="vctk/p225_023_mic1.wav",  # Female speaker (if available)
    ),
}

# Helper function to verify if a voice path is available
def verify_voice(voice_path):
    """Return True if ``tts_model.get_voice_path`` can resolve ``voice_path``.

    Any exception raised by the lookup is treated as "voice not available":
    a message naming the path and the exception type is printed and False is
    returned, so the caller can keep checking the remaining voices.

    Args:
        voice_path: Voice identifier passed unchanged to
            ``tts_model.get_voice_path``.

    Returns:
        bool: True when the lookup succeeded, False when it raised.
    """
    try:
        tts_model.get_voice_path(voice_path)
    except Exception as e:
        # Deliberately broad (best-effort probe), but report the exception type
        # so e.g. a network or authentication failure is not mistaken for a
        # genuinely missing voice file.
        print(f"Voice not found: {voice_path} ({type(e).__name__})")
        return False
    return True

# Probe every candidate voice and keep, per collection, only the ones that
# verify_voice accepts. A collection with no usable voice maps to an empty dict.
working_voices = {}
for collection, voices in available_voices.items():
    print(f"\nTesting {collection} voices:")
    verified = working_voices.setdefault(collection, {})
    for name, path in voices.items():
        if not verify_voice(path):
            print(f"❌ {name}: {path}")
            continue
        verified[name] = path
        print(f"✅ {name}: {path}")

# Choose voices:
# 1. Prefer VCTK when it has at least two verified voices.
# 2. Otherwise use Expresso when it has at least two verified voices.
# 3. Otherwise use whatever verified voices exist across all collections.
# 4. If nothing verified at all, fall back to the Expresso "happy" voice for
#    both speakers (the later path lookup will report the error if it is
#    really unavailable).
# Dicts keep insertion order, so with the voices defined above the first two
# VCTK entries are p226/p225 and the first two Expresso entries are
# happy/narration.
vctk_paths = list(working_voices.get("vctk", {}).values())
expresso_paths = list(working_voices.get("expresso", {}).values())
all_verified_paths = [
    path for voices in working_voices.values() for path in voices.values()
]

if len(vctk_paths) >= 2:
    voice1, voice2 = vctk_paths[:2]
elif len(expresso_paths) >= 2:
    voice1, voice2 = expresso_paths[:2]
elif all_verified_paths:
    # Fewer than two verified voices in any single collection: use only voices
    # that passed verification. With a single verified voice both speakers
    # share it; with one per collection the speakers stay distinct.
    voice1 = all_verified_paths[0]
    voice2 = all_verified_paths[-1]
else:
    # Fallback: use the expresso happy voice for both speakers
    voice1 = voice2 = available_voices["expresso"]["happy"]

# Resolve both selected voices through the model, speaker 1 first.
voice1_path, voice2_path = [
    tts_model.get_voice_path(voice) for voice in (voice1, voice2)
]

# Turn the dialog (defined in an earlier cell) into script entries.
entries = tts_model.prepare_script(dialog, padding_between=1)

# Build the conditioning from the two voice files; cfg_coef is adjustable.
speaker_voice_paths = [voice1_path, voice2_path]
condition_attributes = tts_model.make_condition_attributes(
    speaker_voice_paths, cfg_coef=2.0
)

# Show the final selection so it can be checked against the test results above.
print(
    "\nSelected voices:",
    f"Speaker 1: {voice1}",
    f"Speaker 2: {voice2}",
    sep="\n",
)

Available voices at: https://huggingface.co/kyutai/tts-voices

Testing expresso voices:
✅ happy: expresso/ex03-ex01_happy_001_channel1_334s.wav
✅ narration: expresso/ex03-ex02_narration_001_channel1_674s.wav

Testing vctk voices:
Voice not found: vctk/p226_023_mic1.wav
❌ p226: vctk/p226_023_mic1.wav
Voice not found: vctk/p225_023_mic1.wav
❌ p225: vctk/p225_023_mic1.wav

Selected voices:
Speaker 1: expresso/ex03-ex01_happy_001_channel1_334s.wav
Speaker 2: expresso/ex03-ex02_narration_001_channel1_674s.wav


In [26]:
# Generate audio
# Start from an empty buffer: the frame callback defined below appends one
# decoded chunk per frame, and re-running this cell must not keep old chunks.
pcms = []
print("Generating podcast audio...")

def _on_frame(frame):
    if (frame != -1).all():
        pcm = tts_model.mimi.decode(frame[:, 1:, :]).cpu().numpy()
        pcms.append(np.clip(pcm[0, 0], -1, 1))

# The script and its conditioning are passed as one-element lists; the
# streaming context is sized to the number of scripts in that list.
all_entries, all_condition_attributes = [entries], [condition_attributes]

batch_size = len(all_entries)
with tts_model.mimi.streaming(batch_size):
    # _on_frame receives each frame and collects the decoded audio in `pcms`.
    tts_model.generate(
        all_entries,
        all_condition_attributes,
        on_frame=_on_frame,
    )

Generating podcast audio...


In [27]:
# Save and play audio
# Join the per-frame chunks collected during generation into one waveform.
audio = np.concatenate(pcms, axis=-1)
sample_rate = tts_model.mimi.sample_rate

# Save to file
output_file = "transformer_podcast.wav"
sf.write(output_file, audio, sample_rate)
print(f"Podcast saved as {output_file}")

# Play in notebook (starts automatically)
player = Audio(audio, rate=sample_rate, autoplay=True)
display(player)

Podcast saved as transformer_podcast.wav
