In [20]:
import torch
from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
from transformers import AutoTokenizer
from threading import Thread

torch_device = "cuda:0" # Use "mps" for Mac 
torch_dtype = torch.bfloat16
model_name = "ai4bharat/indic-parler-tts"

# need to set padding max length
max_length = 50

# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name) 
model = ParlerTTSForConditionalGeneration.from_pretrained(
    model_name,
).to(torch_device, dtype=torch_dtype)

sampling_rate = model.audio_encoder.config.sampling_rate
frame_rate = model.audio_encoder.config.frame_rate


Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 32128
}

Config of the audio_encoder: <class 'transformers.models.dac.modelin

In [38]:

def generate(text, description, play_steps_in_s=0.5):
  play_steps = int(frame_rate * play_steps_in_s)
  streamer = ParlerTTSStreamer(model, device=torch_device, play_steps=play_steps)
  # tokenization
  inputs = tokenizer(description, return_tensors="pt").to(torch_device)
  prompt = tokenizer(text, return_tensors="pt").to(torch_device)
  # create generation kwargs
  generation_kwargs = dict(
    input_ids=inputs.input_ids,
    prompt_input_ids=prompt.input_ids,
    attention_mask=inputs.attention_mask,
    prompt_attention_mask=prompt.attention_mask,
    streamer=streamer,
    do_sample=True,
    temperature=1.0,
    min_new_tokens=10,
  )
  # initialize Thread
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
  thread.start()
  # iterate over chunks of audio
  for new_audio in streamer:
    if new_audio.shape[0] == 0:
      break
    print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 4)} seconds")
    yield sampling_rate, new_audio



In [50]:

# now you can do
text = "हाय अदिति, आज तुम बहुत खुश लग रही हो, क्या कोई खास बात है?"
description = "Rohan speaks at a fast pace with a slightly low-pitched voice, captured clearly in a close-sounding environment with excellent recording quality."


In [52]:
import time
import numpy as np
from IPython.display import Audio, display

start_time = time.time()

audio_chunks = []
final_sampling_rate = None

for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
    if audio_chunk.size == 0:
        continue

    audio_chunks.append(audio_chunk)
    final_sampling_rate = sampling_rate

# concatenate all chunks
final_audio = np.concatenate(audio_chunks)

end_time = time.time()

# stats
processing_time = round(end_time - start_time, 2)
audio_length_sec = round(final_audio.shape[0] / final_sampling_rate, 2)

print(f"Total processing time: {processing_time} seconds")
print(f"Final audio length: {audio_length_sec} seconds")
print(f"Real-time factor (RTF): {round(processing_time / audio_length_sec, 2)}")

# single audio player
display(Audio(final_audio, rate=final_sampling_rate))


Sample of length: 0.3297 seconds
Sample of length: 0.4992 seconds
Sample of length: 0.4992 seconds
Sample of length: 0.4992 seconds
Sample of length: 0.4992 seconds
Sample of length: 0.4992 seconds
Sample of length: 0.4992 seconds
Sample of length: 0.4992 seconds
Sample of length: 0.4992 seconds
Sample of length: 0.4992 seconds
Sample of length: 0.2508 seconds
Total processing time: 6.78 seconds
Final audio length: 5.07 seconds
Real-time factor (RTF): 1.34


In [None]:
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf

device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)


Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 32128
}

Config of the audio_encoder: <class 'transformers.models.dac.modelin

In [66]:

prompt = "हाय अदिति, आज तुम बहुत खुश लग रही हो, क्या कोई खास बात है?"
description = "Divya's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."

description_input_ids = description_tokenizer(description, return_tensors="pt").to(device)
prompt_input_ids = tokenizer(prompt, return_tensors="pt").to(device)

generation = model.generate(input_ids=description_input_ids.input_ids, attention_mask=description_input_ids.attention_mask, prompt_input_ids=prompt_input_ids.input_ids, prompt_attention_mask=prompt_input_ids.attention_mask)
audio_arr = generation.cpu().numpy().squeeze()
sf.write("indic_tts_out.wav", audio_arr, model.config.sampling_rate)


In [70]:
sentences = [
    "अरे, तुम आज कैसे हो?",
    "मैं आज बहुत अच्छा महसूस कर रहा हूँ।",   
    "क्या कोई खास बात है?"
]

prompt_inputs = tokenizer(sentences, padding=True, return_tensors="pt").to(device)
desc_inputs = description_tokenizer(
    [description] * len(sentences),
    padding=True,
    return_tensors="pt"
).to(device)

generation = model.generate(
    input_ids=desc_inputs.input_ids,
    attention_mask=desc_inputs.attention_mask,
    prompt_input_ids=prompt_inputs.input_ids,
    prompt_attention_mask=prompt_inputs.attention_mask
)



In [71]:

audios = generation.cpu().numpy()  # DO NOT squeeze

for i, audio in enumerate(audios):
    sf.write(
        f"indic_tts_out_{i}.wav",
        audio,
        model.config.sampling_rate
    )


In [53]:
from IPython.display import Audio, display
import numpy as np
audio_buffer = []
audio_display = None

# now you can do
text = "This is a test of the streamer class"
description = "Divya's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."

chunk_size_in_s = 1

for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
  # You can do everything that you need with the chunk now
  # For example: stream it, save it, play it.
  print(audio_chunk.shape) 
for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
    if audio_chunk.size == 0:
        continue

    audio_buffer.append(audio_chunk)

    # play every ~1 second
    if len(audio_buffer) == 2:
        combined = np.concatenate(audio_buffer)
        audio_buffer = []

        audio = Audio(combined, rate=sampling_rate, autoplay=True)

        if audio_display is None:
            audio_display = display(audio, display_id=True)
        else:
            audio_display.update(audio)


Sample of length: 0.7564 seconds
(33358,)
Sample of length: 1.0101 seconds
(44544,)
Sample of length: 1.0101 seconds
(44544,)
Sample of length: 1.0101 seconds
(44544,)
Sample of length: 0.3349 seconds
(14770,)
Sample of length: 0.7564 seconds
Sample of length: 1.0101 seconds


Sample of length: 0.7413 seconds


In [28]:

# now you can do
text = "This is a test of the streamer class"
description = "Divya's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."

chunk_size_in_s = 0.5

for (sampling_rate, audio_chunk) in generate(text, description, chunk_size_in_s):
  # You can do everything that you need with the chunk now
  # For example: stream it, save it, play it.
  print(audio_chunk.shape) 

Sample of length: 0.3297 seconds
(14541,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)
Sample of length: 0.4992 seconds
(22016,)


KeyboardInterrupt: 

In [23]:
print("Tokenizer vocab:", tokenizer.vocab_size)
print("Frame rate:", frame_rate)
print("Sampling rate:", sampling_rate)


Tokenizer vocab: 90714
Frame rate: 87
Sampling rate: 44100
