In [1]:
import os
import librosa
import numpy as np
import torch
import matplotlib.pyplot as plt

from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
import torchvision.transforms.functional as TF

Load audio file example and compute mel spectrogram
Will plot it as well

In [17]:
# Load audio file example
audio_file = '84_121123_000069_000000.wav'

audio, sr = librosa.load(audio_file, sr=22050)  # Adjust sr as needed
mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=1024, hop_length=256, n_mels=5)
mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

print(f'{mel_spectrogram.shape=}')
plt.figure(figsize=(7, 3))
plt.imshow(mel_spectrogram, aspect='auto', origin='lower')
plt.colorbar()
plt.savefig('mel_spectrogram.png')

mel_spectrogram.shape=(5, 1201)



Make encoder and decoder using Tacotron2 model
Taken as "Tacotron2().encoder" and "Tacotron2().decoder"

In [63]:
current_path = os.path.dirname(os.getcwd())
output_path = os.path.join(current_path, "runs")

# define model config
config = Tacotron2Config(
    batch_size=4,
    eval_batch_size=4,
    num_loader_workers=0,
    num_eval_loader_workers=0,
    precompute_num_workers=0,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1,
    print_step=1,
    print_eval=True,
    mixed_precision=False,
    output_path=output_path,
    # datasets=[dataset_config],
    use_speaker_embedding=True,
    min_text_len=0,
    max_text_len=500,
    min_audio_len=0,
    max_audio_len=500000,
)

ap = AudioProcessor.init_from_config(config, verbose=False)
tokenizer, config = TTSTokenizer.init_from_config(config)
tacotron2_model = Tacotron2(config, ap, tokenizer, speaker_manager=SpeakerManager())

 > Init speaker_embedding layer.


# Encoder analysis

In [69]:
# %%
mel_spectrogram_length = mel_spectrogram.shape[1]
encoder = tacotron2_model.encoder

# Assuming mel_spectrogram is your preprocessed mel spectrogram
reshaped_mel_spectrogram = torch.tensor(mel_spectrogram)
print(f'{mel_spectrogram.shape=}')

# Reshape mel spectrogram to (512, n) using interpolation
target_shape = (512, mel_spectrogram_length)
reshaped_mel_spectrogram = reshaped_mel_spectrogram.unsqueeze(0).permute(0, 2, 1)
reshaped_mel_spectrogram = TF.resize(reshaped_mel_spectrogram, size=target_shape, interpolation=TF.InterpolationMode.BILINEAR)

input_lengths = torch.tensor([reshaped_mel_spectrogram.shape[-1]])
encoder.in_out_channels = input_lengths.detach().cpu().numpy()[0]
print(f'{reshaped_mel_spectrogram.shape=}, {input_lengths=}')

# Pass mel spectrogram and input lengths through the encoder
memories = encoder(reshaped_mel_spectrogram, input_lengths)
print(f'{memories.shape=}, {memories.dtype=}')

plt.figure(figsize=(10, 4))
image = memories.permute(2, 1, 0).detach().numpy()
image = np.reshape(image, (image.shape[1], image.shape[0])).T
print(f'{image.shape=}')
plt.imshow(image, aspect='auto', origin='lower')
plt.colorbar()
plt.savefig('embedding2.png')

mel_spectrogram.shape=(5, 1201)
reshaped_mel_spectrogram.shape=torch.Size([1, 512, 1201]), input_lengths=tensor([1201])
memories.shape=torch.Size([1, 1201, 512]), memories.dtype=torch.float32
image.shape=(512, 1201)



# Decoder analysis

In [57]:
import torch

e = 512
m = 64

# Create dummy input tensors
dummy_embedding = torch.randn(1, e, 1024)
dummy_memories = torch.randn(1, m, 160)

# Print shapes of dummy input tensors
print("Shapes of dummy input tensors:")
print("Embedding:", dummy_embedding.shape)
print("Memories:", dummy_memories.shape)
print()

# Pass the dummy input tensors to the decoder
dummy_decoder_outputs = tacotron2_model.decoder(
    dummy_embedding,
    memories=dummy_memories,
    mask=None,
)

for i, content in enumerate(dummy_decoder_outputs):
    print(f'Element {i}')
    example = content.detach().cpu().numpy().flatten()
    print('Content:', example[:5], '...' if len(example) > 1 else '')
    print('Shape:', content.shape)
    print()


Shapes of dummy input tensors:
Embedding: torch.Size([1, 512, 1024])
Memories: torch.Size([1, 64, 160])

Element 0
Content: [-0.01196379  0.12897922 -0.02115425  0.14590442 -0.03939753] ...
Shape: torch.Size([1, 80, 128])

Element 1
Content: [0.00066258 0.00251465 0.00038548 0.00262291 0.00081881] ...
Shape: torch.Size([1, 64, 512])

Element 2
Content: [-0.00600709 -0.0101943  -0.0421669  -0.08888654 -0.08210397] ...
Shape: torch.Size([1, 64])



In [76]:
# %%

dummy_embedding = torch.randn(1, 1, 1024)
embedding = dummy_embedding
memories = memories[:, :, :160]

# Print shapes of input tensors
print("Shapes of input tensors:")
print("Embedding:", embedding.shape)
print("Memories:", memories.shape)
print()

# Pass the input tensors to the decoder
decoder_outputs = tacotron2_model.decoder(
    dummy_embedding,
    memories=memories,
    mask=None,
)

print(tacotron2_model.decoder.decode(embedding))

for i, content in enumerate(decoder_outputs):
    print(f'Element {i}')
    example = content.detach().cpu().numpy().flatten()
    print('Content:', example[:5], '...' if len(example) > 1 else '')
    print('Shape:', content.shape)
    print()

Shapes of input tensors:
Embedding: torch.Size([1, 1, 1024])
Memories: torch.Size([1, 1201, 160])



RuntimeError: Tensors must have same number of dimensions: got 3 and 2

# Stand alone Decoder analysis

In [74]:
from TTS.tts.layers.tacotron.tacotron2 import Decoder

e = 512
m = 64

# Create dummy input tensors
dummy_embedding = torch.randn(1, e, 1024)

# Print shapes of dummy input tensors
print("Shapes of dummy input tensors:")
print("Embedding:", dummy_embedding.shape)
print()

# Pass the dummy input tensors to the decoder
decoder = Decoder()
decoder_outputs = decoder.decode(dummy_embedding)



Shapes of dummy input tensors:
Embedding: torch.Size([1, 512, 1024])



TypeError: Decoder.__init__() missing 15 required positional arguments: 'in_channels', 'frame_channels', 'r', 'attn_type', 'attn_win', 'attn_norm', 'prenet_type', 'prenet_dropout', 'forward_attn', 'trans_agent', 'forward_attn_mask', 'location_attn', 'attn_K', 'separate_stopnet', and 'max_decoder_steps'

In [None]:
# load audio processor and speaker encoder
ap = AudioProcessor(**config.audio)

manager = SpeakerManager(encoder_model_path=encoder_model_path, encoder_config_path=encoder_config_path)

# load a sample audio and compute embedding
waveform = ap.load_wav(sample_wav_path)

mel = ap.melspectrogram(waveform)

d_vector = manager.compute_embeddings(mel.T)

In [None]:

# Assuming mel_spectrogram is your preprocessed mel spectrogram
reshaped_mel_spectrogram = torch.tensor(mel_spectrogram)

# Reshape mel spectrogram to (512, 512) using interpolation
target_shape = (512, 512)
reshaped_mel_spectrogram = reshaped_mel_spectrogram.unsqueeze(0).permute(0, 2, 1)
reshaped_mel_spectrogram = TF.resize(reshaped_mel_spectrogram, size=target_shape, interpolation=TF.InterpolationMode.BILINEAR)

input_lengths = torch.tensor([reshaped_mel_spectrogram.shape[-1]])
encoder_output_tensor  = tacotron2_model.encoder(reshaped_mel_spectrogram, input_lengths)
print(encoder_output_tensor.shape)

# # Extract the embedding
# embedding = encoder_output_tensor["encoder_out"]
dummy_embedding = encoder_output_tensor
print(dummy_embedding.shape)

# Pass the embedding through the decoder
decoder_outputs, _ = tacotron2_model.decoder(
    dummy_embedding, memories=encoder_output_tensor[:, :160, :160], mask=None)

# Get the predicted mel spectrogram from the decoder outputs
predicted_mel_spectrogram = decoder_outputs["mel_outputs"]

# You can also get the stop token predictions if needed
stop_token_predictions = decoder_outputs["stop_token_predictions"]

print(predicted_mel_spectrogram.shape)