## In this notebook, we will analyze and identify the key bottlenecks affecting the model's latency and throughput.

In [2]:
import torch
from parler_tts import ParlerTTSForConditionalGeneration 
from transformers import AutoTokenizer
import soundfile as sf
import time
from torch.profiler import profile, record_function, ProfilerActivity

Flash attention 2 is not installed


In [14]:
prompts = [
    "Hello world.",
    "A futuristic AI assistant responding in a clear, robotic yet friendly manner",
    "A dramatic voice-over for an action-packed movie trailer that builds suspense",
    "A casual and friendly conversation starter that feels natural and engaging",
    "An enchanting introduction to a bedtime story that sparks imagination",
    "A detailed weather update that provides temperature, wind speed, and overall forecast",
    "A robotic voice delivering a monotone yet precise system status update",
    "A powerful and energetic motivational speech that inspires action and confidence"
]

descriptions = [
    "A female speaker delivers a warm and welcoming message with a slightly expressive and friendly tone. The speech has a moderate pace and a natural intonation, making it feel inviting. The recording is clear, with minimal background noise.",
    "A robotic AI voice speaks in a neutral yet slightly friendly manner. The speech is steady, with a consistent pitch and minimal variation in tone. The audio is crisp, resembling a synthesized assistant's response.",
    "A deep male voice narrates with a dramatic and intense tone, building suspense with pauses and rising intensity. The speech is slow-paced, emphasizing key moments. The recording is cinematic, with a slight reverberation for a grand effect.",
    "A young adult male speaker talks in a casual and relaxed tone. The speech is natural, with slight variations in pitch and pauses that mimic real-life conversations. The recording is high quality, making it feel like a personal chat.",
    "A soft-spoken female speaker introduces a bedtime story with a soothing and melodic tone. The pace is slow and gentle, making it easy to follow. The recording has a slight warmth, resembling a close-up microphone capture.",
    "A professional male voice delivers a clear and informative weather update. The tone is neutral but engaging, with a moderate pace. The articulation is precise, and the recording is high quality, with no distortions.",
    "A synthetic robotic voice speaks in a completely monotone and even-paced manner. The pitch remains constant, with no emotional inflection. The recording is crisp and clean, resembling an automated system response.",
    "A dynamic and energetic male speaker delivers an inspiring speech with a strong, enthusiastic tone. The pace is varied, with emphasis on key words to motivate the listener. The recording is sharp and immersive, with no background noise."
]


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler_tts_mini_v0.1").to(device)
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_mini_v0.1")

  WeightNorm.apply(module, name, dim)
Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "google/flan-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_

## Pytorch Profiler

In [19]:
def generate_audio_with_profiling(model, tokenizer, prompt, description, output_file="parler_tts_out.wav"):
    device = next(model.parameters()).device
    
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                profile_memory=True, record_shapes=True) as prof:
        
        with record_function("tokenization"):
            input_ids = tokenizer(description, return_tensors="pt").to(device)
            prompt_input_ids = tokenizer(prompt, return_tensors="pt").to(device)
        
        with record_function("generation"):
            generation = model.generate(input_ids=input_ids.input_ids,
                attention_mask=input_ids.attention_mask,
                prompt_input_ids=prompt_input_ids.input_ids,
                prompt_attention_mask=prompt_input_ids.attention_mask)
            audio_arr = generation.cpu().numpy().squeeze()
        
        sf.write(output_file, audio_arr, model.config.sampling_rate)

    print("\nCUDA and CPU Usage:")
    print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
    print("\nMemory Usage:")
    print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=10))
    
    return audio_arr


In [20]:
for i in range(2):
    generate_audio_with_profiling(model, tokenizer, prompts[i], descriptions[i])

`prompt_attention_mask` is specified but `attention_mask` is not. A full `attention_mask` will be created. Make sure this is the intended behaviour.



CUDA and CPU Usage:
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                             generation         0.00%       0.000us         0.00%       0.000us       0.000us        4.355s      1202.27%        4.355s        2.177s           0 b           0 b      

## Calculate RTFx and Latency

In [27]:
def measure_performance(model1, tokenizer, prompt, description):
    try:

        prompt_input_ids = tokenizer(prompt, return_tensors="pt").to(device)
        input_ids = tokenizer(description, return_tensors="pt").to(device)
        

        start_time = time.time()
        with torch.no_grad():

            audio = model1.generate(input_ids=input_ids.input_ids,
                attention_mask=input_ids.attention_mask,
                prompt_input_ids=prompt_input_ids.input_ids,
                prompt_attention_mask=prompt_input_ids.attention_mask) 
        end_time = time.time()
        
        
        generation_time = end_time - start_time
        audio_length = audio.shape[-1] / model.config.sampling_rate
        rtfx = generation_time / audio_length
        
        return generation_time, audio_length, rtfx
        
    except Exception as e:
        print(f"Error processing prompt: {e}")
        return None, None, None

In [28]:
results = []
for i in range(len(prompts)):
    prompt, description = prompts[i], descriptions[i]
    generation_time, audio_length, rtfx = measure_performance(model, tokenizer, prompt, description)
    if generation_time is not None:
        results.append({
            'prompt_length': len(prompt),
            'generation_time': generation_time,
            'audio_length': audio_length,
            'rtfx': rtfx
        })
        print(f"\nPrompt length: {len(prompt)} characters")
        print(f"Generation time: {generation_time:.2f} seconds")
        print(f"Audio length: {audio_length:.2f} seconds") 
        print(f"RTFx: {rtfx:.2f}")
    else:
        print(f"\nFailed to process prompt: {prompt}")



Prompt length: 12 characters
Generation time: 2.56 seconds
Audio length: 1.17 seconds
RTFx: 2.18

Prompt length: 76 characters
Generation time: 9.35 seconds
Audio length: 4.78 seconds
RTFx: 1.96

Prompt length: 77 characters
Generation time: 9.61 seconds
Audio length: 4.89 seconds
RTFx: 1.97

Prompt length: 74 characters
Generation time: 10.14 seconds
Audio length: 5.15 seconds
RTFx: 1.97

Prompt length: 69 characters
Generation time: 9.52 seconds
Audio length: 4.81 seconds
RTFx: 1.98

Prompt length: 85 characters
Generation time: 7.64 seconds
Audio length: 3.77 seconds
RTFx: 2.02

Prompt length: 70 characters
Generation time: 8.24 seconds
Audio length: 4.02 seconds
RTFx: 2.05

Prompt length: 80 characters
Generation time: 9.79 seconds
Audio length: 4.88 seconds
RTFx: 2.01


In [56]:
input_ids = tokenizer(descriptions, return_tensors="pt", padding="max_length",max_length=80).to(device)
prompt_input_ids = tokenizer(prompts, return_tensors="pt", padding="max_length",max_length=80).to(device)

## Generation when batching is done

In [31]:
start_time = time.time()
with torch.no_grad():
    audio = model.generate(input_ids=input_ids.input_ids,
        attention_mask=input_ids.attention_mask,
        prompt_input_ids=prompt_input_ids.input_ids,
        prompt_attention_mask=prompt_input_ids.attention_mask) 
end_time = time.time()
generation_time = end_time - start_time
print(f"Generation time: {generation_time:.2f} seconds")

Generation time: 18.75 seconds


In [None]:
output_file = "generated_audio.wav"
audio_arr = audio[1].cpu().to(torch.float32).numpy().squeeze()
sf.write(output_file, audio_arr, model.config.sampling_rate)

## When we increase the num of tokens, the generation time increases.

In [32]:
start_time = time.time()
with torch.no_grad():
    audio = model.generate(input_ids=input_ids.input_ids,
        attention_mask=input_ids.attention_mask,
        prompt_input_ids=prompt_input_ids.input_ids,
        prompt_attention_mask=prompt_input_ids.attention_mask,
        min_new_tokens=100) 
end_time = time.time()
generation_time = end_time - start_time
print(f"Generation time: {generation_time:.2f} seconds")

Generation time: 24.57 seconds


In [None]:
output_file = "generated_audio.wav"
audio_arr = audio[1].cpu().to(torch.float32).numpy().squeeze()
sf.write(output_file, audio_arr, model.config.sampling_rate)

## Synchronize the GPU, When we increase the num of tokens, the generation time increases 

In [59]:
torch.cuda.synchronize()
start_time = time.time()
with torch.no_grad():
    audio = model.generate(input_ids=input_ids.input_ids,
        attention_mask=input_ids.attention_mask,
        prompt_input_ids=prompt_input_ids.input_ids,
        prompt_attention_mask=prompt_input_ids.attention_mask,
        min_new_tokens=100) 
end_time = time.time()
torch.cuda.synchronize()
generation_time = end_time - start_time
print(f"Generation time: {generation_time:.2f} seconds")

Generation time: 19.78 seconds


In [None]:
output_file = "generated_audio.wav"
audio_arr = audio[1].cpu().to(torch.float32).numpy().squeeze()
sf.write(output_file, audio_arr, model.config.sampling_rate)