## In this notebook, we look at streaming audio generation with the ParlerTTS model.

In [7]:
import torch
from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
from transformers import AutoTokenizer
from threading import Thread
from IPython.display import Audio, display
import numpy as np

In [2]:
# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still executes (more slowly) on machines without CUDA.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
# Checkpoint identifier used for both the tokenizer and the model below.
model_name = "parler-tts/parler_tts_mini_v0.1"

In [3]:
# NOTE(review): max_length is not referenced by any cell visible in this notebook;
# kept in case something outside this view relies on it.
max_length = 50

# The tokenizer and the model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the weights first, then move the module to the configured device.
model = ParlerTTSForConditionalGeneration.from_pretrained(model_name)
model = model.to(torch_device)

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
  WeightNorm.apply(module, name, dim)
Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "google/flan-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      

In [4]:
# Both rates come from the model's audio encoder configuration.
audio_encoder_config = model.audio_encoder.config
# Used as the playback rate and to convert sample counts into seconds.
sampling_rate = audio_encoder_config.sampling_rate
# Multiplied by a duration in seconds to obtain the streamer's play_steps.
frame_rate = audio_encoder_config.frame_rate

In [5]:
def generate(text, description, play_steps_in_s=0.5):
    """Synthesize speech for `text` and play it back chunk by chunk as it is generated.

    `model.generate` runs on a background thread and feeds a `ParlerTTSStreamer`.
    This function consumes the streamer, displays an autoplaying audio widget for
    every chunk, and finally displays a player for the whole clip.

    Relies on the notebook-level `model`, `tokenizer`, `torch_device`,
    `frame_rate` and `sampling_rate`.

    Args:
        text: The prompt; tokenized and passed to the model as `prompt_input_ids`.
        description: The voice/recording description; tokenized and passed as `input_ids`.
        play_steps_in_s: Chunk duration in seconds, converted to streamer steps
            with `frame_rate`.

    Returns:
        The concatenation (`np.concatenate`) of every streamed audio chunk.

    Raises:
        ValueError: If `play_steps_in_s` maps to fewer than one step, or if the
            streamer produced no audio at all.
    """
    play_steps = int(frame_rate * play_steps_in_s)
    if play_steps < 1:
        # Fail fast in the calling thread instead of handing the streamer a
        # step count it cannot chunk on.
        raise ValueError(
            f"play_steps_in_s={play_steps_in_s!r} is too small: it maps to "
            f"{play_steps} step(s) at a frame rate of {frame_rate}."
        )
    streamer = ParlerTTSStreamer(model, device=torch_device, play_steps=play_steps)

    inputs = tokenizer(description, return_tensors="pt").to(torch_device)
    prompt = tokenizer(text, return_tensors="pt").to(torch_device)

    generation_kwargs = dict(
        input_ids=inputs.input_ids,
        prompt_input_ids=prompt.input_ids,
        attention_mask=inputs.attention_mask,
        prompt_attention_mask=prompt.attention_mask,
        streamer=streamer,
        do_sample=True,
        temperature=1.0,
        min_new_tokens=10,
    )

    # Generation runs in the background so chunks can be consumed as they arrive.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    audio_chunks = []
    for new_audio in streamer:
        if new_audio.shape[0] == 0:
            # An empty chunk carries no audio to play or to keep.
            continue
        print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
        audio_widget = Audio(data=new_audio, rate=sampling_rate, autoplay=True)
        display(audio_widget)
        audio_chunks.append(new_audio)

    # Wait for the generation thread to finish so that a subsequent call never
    # overlaps with a generation still running on the same model.
    thread.join()

    if not audio_chunks:
        # Same exception type np.concatenate would raise on an empty list,
        # with a message that names the actual cause.
        raise ValueError("The streamer did not produce any audio for this input.")

    combined_audio = np.concatenate(audio_chunks)

    print("\nFull audio length:", round(combined_audio.shape[0] / sampling_rate, 2), "seconds")
    final_audio = Audio(data=combined_audio, rate=sampling_rate)
    display(final_audio)

    return combined_audio

## Dataset Examples

In [6]:
# Each entry is [text, description, play_steps_in_s], matching the positional
# parameters of generate(text, description, play_steps_in_s):
#   - text: the prompt that generate() tokenizes into prompt_input_ids
#   - description: the voice/recording description tokenized into input_ids
#   - play_steps_in_s: chunk duration in seconds for streaming
# NOTE(review): several `text` entries read like a description of the desired
# utterance rather than the literal sentence to speak — confirm this is intended.
examples = [
    [
        "Hello world.",
        "A female speaker delivers a warm and welcoming message with a slightly expressive and friendly tone. The speech has a moderate pace and a natural intonation, making it feel inviting. The recording is clear, with minimal background noise.",
        3.0,
    ],
    [
        "A futuristic AI assistant responding in a clear, robotic yet friendly manner",
        "A robotic AI voice speaks in a neutral yet slightly friendly manner. The speech is steady, with a consistent pitch and minimal variation in tone. The audio is crisp, resembling a synthesized assistant's response.",
        3.0,
    ],
    [
        "A dramatic voice-over for an action-packed movie trailer that builds suspense",
        "A deep male voice narrates with a dramatic and intense tone, building suspense with pauses and rising intensity. The speech is slow-paced, emphasizing key moments. The recording is cinematic, with a slight reverberation for a grand effect.",
        3.0,
    ],
    [
        "A casual and friendly conversation starter that feels natural and engaging",
        "A young adult male speaker talks in a casual and relaxed tone. The speech is natural, with slight variations in pitch and pauses that mimic real-life conversations. The recording is high quality, making it feel like a personal chat.",
        3.0,
    ],
    [
        "An enchanting introduction to a bedtime story that sparks imagination",
        "A soft-spoken female speaker introduces a bedtime story with a soothing and melodic tone. The pace is slow and gentle, making it easy to follow. The recording has a slight warmth, resembling a close-up microphone capture.",
        3.0,
    ],
    [
        "A detailed weather update that provides temperature, wind speed, and overall forecast",
        "A professional male voice delivers a clear and informative weather update. The tone is neutral but engaging, with a moderate pace. The articulation is precise, and the recording is high quality, with no distortions.",
        3.0,
    ],
    [
        "A robotic voice delivering a monotone yet precise system status update",
        "A synthetic robotic voice speaks in a completely monotone and even-paced manner. The pitch remains constant, with no emotional inflection. The recording is crisp and clean, resembling an automated system response.",
        3.0,
    ],
    [
        "A powerful and energetic motivational speech that inspires action and confidence",
        "A dynamic and energetic male speaker delivers an inspiring speech with a strong, enthusiastic tone. The pace is varied, with emphasis on key words to motivate the listener. The recording is sharp and immersive, with no background noise.",
        3.0,
    ],
]


## Audio Generation

In [105]:
# generate() returns the full waveform as one array (it is not a generator), so
# call it once and keep the result instead of looping over the returned array.
text_1, description_1, play_steps_in_s_1 = examples[2]
combined_audio_1 = generate(text_1, description_1, play_steps_in_s_1)

Sample of length: 2.41 seconds


Sample of length: 3.0 seconds


Sample of length: 1.81 seconds



Full audio length: 7.21 seconds
