In [1]:
# Fast install, might break in the future.
!pip install 'safetensors<0.6'
!pip install 'sphn<0.2'
!pip install --no-deps "moshi==0.2.11"
# Slow install (will download torch and cuda), but future proof.
# !pip install "moshi==0.2.11"

Collecting safetensors<0.6
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: safetensors
  Attempting uninstall: safetensors
    Found existing installation: safetensors 0.6.2
    Uninstalling safetensors-0.6.2:
      Successfully uninstalled safetensors-0.6.2
Successfully installed safetensors-0.5.3
Collecting sphn<0.2
  Downloading sphn-0.1.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading sphn-0.1.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m138.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sphn
Successful

In [2]:
import numpy as np
import torch
from moshi.models.loaders import CheckpointInfo
from moshi.models.tts import DEFAULT_DSM_TTS_REPO, DEFAULT_DSM_TTS_VOICE_REPO, TTSModel

from IPython.display import display, Audio

In [7]:
# Configuration: what to say, and which voice to say it with.
# The script is written as adjacent string literals (one per sentence or
# clause); Python joins them into a single string at compile time.
text = (
    "That's a fantastic question! "
    "You're asking about reflex actions, and they are super important for keeping us safe. "
    "Here's how it works: Imagine you accidentally touch something really hot, like a pan on the stove. "
    "Before you can even consciously think, Ouch, that's hot!, your hand jerks away. "
    "That immediate, automatic reaction is a reflex action. "
    "It happens so quickly because it bypasses the part of your brain that handles conscious thought, "
    "going straight from detecting the danger to responding to it. "
    "This is crucial for protecting you from harm. "
    "Does that initial explanation make sense? "
    "I can go into more detail about how this rapid response happens if you'd like!"
)

# Voice sample identifier; the link printed below lists the alternatives.
voice = "expresso/ex03-ex01_happy_001_channel1_334s.wav"
print(f"See https://huggingface.co/{DEFAULT_DSM_TTS_VOICE_REPO} for available voices.")

See https://huggingface.co/kyutai/tts-voices for available voices.


In [8]:
# Model setup: resolve the default TTS checkpoint and instantiate the model
# on the CUDA device with the sampling settings used throughout this notebook.
checkpoint_info = CheckpointInfo.from_hf_repo(DEFAULT_DSM_TTS_REPO)
tts_model = TTSModel.from_checkpoint_info(
    checkpoint_info,
    n_q=32,
    temp=0.6,
    device=torch.device("cuda"),
)

# `prepare_script` receives a list of turns. Here there is a single turn; for a
# dialog, pass several: [text_speaker_1, text_speaker_2, text_2_speaker_1, ...]
entries = tts_model.prepare_script([text], padding_between=1)
voice_path = tts_model.get_voice_path(voice)

# The CFG coefficient is supplied as a conditioning attribute: the model was
# trained with CFG distillation, so no actual CFG is performed at inference time.
# One voice path per speaker -- a dialog needs two entries in this list.
condition_attributes = tts_model.make_condition_attributes(
    [voice_path],
    cfg_coef=2.0,
)

In [9]:
print("Generating audio...")

# Decoded audio chunks, appended in generation order by `_on_frame`.
pcms = []


def _on_frame(frame):
    """Per-frame callback: decode one generated frame and store its audio.

    Prints a running step counter (the number of chunks stored so far), then,
    unless the frame contains a -1 anywhere, decodes it and appends the clipped
    result to the module-level `pcms` list.

    Args:
        frame: rank-3 tensor of generated tokens. Index 0 along the second
            axis is dropped before decoding (presumably the text stream, with
            the remaining entries being audio codebooks -- confirm against
            the moshi API).
    """
    print("Step", len(pcms), end="\r")

    # A -1 anywhere means the frame is skipped (presumably a placeholder for
    # values that are not generated yet -- confirm against the moshi API).
    if (frame == -1).any():
        return

    decoded = tts_model.mimi.decode(frame[:, 1:, :])
    # Keep item [0, 0] of the decoded array (presumably batch 0, channel 0)
    # and clamp it to the [-1, 1] range.
    samples = decoded.cpu().numpy()[0, 0]
    pcms.append(np.clip(samples, -1, 1))


# A batch of one. To generate several audios in a single call, extend both
# lists below (one script and one set of condition attributes per audio).
all_entries = [entries]
all_condition_attributes = [condition_attributes]

# Generation runs inside mimi's streaming context, sized to the batch;
# `_on_frame` is invoked by `generate` and fills `pcms` as frames arrive.
with tts_model.mimi.streaming(len(all_entries)):
    result = tts_model.generate(
        all_entries,
        all_condition_attributes,
        on_frame=_on_frame,
    )

print("Done generating.")

# Join the collected chunks end to end (along the last axis) into one array.
audio = np.concatenate(pcms, axis=-1)

Generating audio...
Done generating.


In [10]:
# Render an inline audio player for the generated waveform, using the sample
# rate reported by the mimi codec; playback starts automatically.
display(
    Audio(
        audio,
        rate=tts_model.mimi.sample_rate,
        autoplay=True,
    )
)