# SV2TTS Tacotron-2 single-speaker fine-tuning

The main advantage of `SV2TTS` is that it allows very fast training on a single speaker with only a small amount of data.

This notebook shows how to fine-tune a pretrained `multi-speaker SV2TTS` model on a `single speaker`, based on an `identification dataset`.

## Imports + model creation

In [1]:
import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf

from models.tts import SV2TTSTacotron2, PtWaveGlow
from custom_architectures import get_architecture
from datasets import get_dataset, train_test_split, filter_dataset
from utils import plot_spectrogram, select_embedding, add_speaker_embedding
from utils.text import default_french_encoder
from utils.audio import display_audio, load_audio, embed_annotation_dataset

# Notebook-wide settings reused by the cells below
rate       = 22050                  # sampling rate passed to `display_audio` / `write_audio`
model_name = 'sv2tts_fine_tuned'    # name of the fine-tuned model

# Environment check : tensorflow version + GPUs visible to tensorflow
gpus = tf.config.list_physical_devices('GPU')

print("Tensorflow version : {}".format(tf.__version__))
print("Available GPU's : {}".format(gpus))

Tensorflow version : 2.3.2
Available GPU's : [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# Create the model `model_name` from the pretrained `sv2tts_tacotron2_256` model
# (per the cell output below : the pretrained model is restored, then its weights are transferred
# to a new model saved under `pretrained_models/<model_name>`)
# The model is not compiled here (`compile = False`) : compilation is done in the next section
model = SV2TTSTacotron2.build_from_sv2tts_pretrained(pretrained_name = 'sv2tts_tacotron2_256', nom = model_name, compile = False)
print(model)

Model restoration...
Initializing submodel : tts_model !
Optimizer 'tts_model_optimizer' initilized successfully !
Submodel tts_model compiled !
  Loss : {'reduction': 'none', 'name': 'tacotron_loss', 'mel_loss': 'mse', 'mask_mel_padding': True, 'label_smoothing': 0, 'finish_weight': 1.0, 'not_finish_weight': 1.0, 'from_logits': False}
  Optimizer : {'name': 'Adam', 'learning_rate': 0.0005000000237487257, 'decay': 0.0, 'beta_1': 0.8999999761581421, 'beta_2': 0.9990000128746033, 'epsilon': 1e-07, 'amsgrad': False}
  Metrics : []
Successfully restored tts_model from pretrained_models/sv2tts_tacotron2_256/saving/tts_model.json !
Model sv2tts_tacotron2_256 initialized successfully !
Initializing submodel : tts_model !
Submodel tts_model saved in pretrained_models\sv2tts_fine_tuned\saving\tts_model.json !
Model sv2tts_fine_tuned initialized successfully !
Weights transfered successfully !
Submodel tts_model saved in pretrained_models\sv2tts_fine_tuned\saving\tts_model.json !

Sub model tts_

## Model initialization

In [None]:
# Load the model by its name, then compile it with a scheduled learning rate
model = SV2TTSTacotron2(nom = model_name)

# Learning-rate scheduler configuration, forwarded as the optimizer's `lr`
lr = {
    'name'         : 'WarmupScheduler',
    'maxval'       : 5e-4,
    'minval'       : 25e-5,
    'factor'       : 256,
    'warmup_steps' : 256
}

model.compile(
    optimizer        = 'adam',
    optimizer_config = {'lr' : lr}
)

print(model)

## Dataset creation / loading

In [None]:
# Root directory of the annotated dataset (placeholder : replace it by your local path)
path = 'D:/datasets/...'

# Embed the audios of the dataset with the model's speaker encoder
# `speaker_embedding_dim` is the attribute used by the loading cells below, so the embedding
# dimension given here is guaranteed to match the one used when loading the dataset
# NOTE(review): presumably the embeddings are stored in the `embedding_name` file so that
# `get_dataset` can reload them — confirm in `embed_annotation_dataset`
dataset = embed_annotation_dataset(
    path,
    embed_fn       = model.speaker_encoder.embed,
    embedding_dim  = model.speaker_embedding_dim,
    rate           = model.speaker_encoder.audio_rate,
    embedding_name = 'embeddings_256_mel_lstm.csv'
)


In [3]:
# Configuration forwarded to `get_dataset` : dataset location, annotation type
# and the embedding file / dimension to load with it
kwargs = {
    'directory'      : 'D:/datasets/...',
    'type_annots'    : 'identification',
    'embedding_dim'  : model.speaker_embedding_dim,
    'embedding_name' : 'embeddings_256_mel_lstm.csv'
}

# '...' is a placeholder for the dataset name
dataset = get_dataset(
    '...', ds_type = 'custom', ** kwargs
)

# Keep only the rows of the target speaker ('...' is a placeholder for its id),
# then drop the `indexes` entry (its value is not used in this notebook)
dataset = filter_dataset(dataset,id = '...')
dataset.pop('indexes')

# Sanity check : number of samples and number of distinct speaker ids
print("Dataset length : {} ({} speakers)".format(
    len(dataset), len(dataset['id'].unique())
))

Loading dataset SAO...
Dataset length : 489 (1 speakers)


## Training

In [None]:
# General training configuration
epochs          = 15
batch_size      = 16
valid_batch_size    = batch_size

# Validation set : 10% of the dataset, capped to 256 validation batches
max_valid_size  = min(int(0.1 * len(dataset)), 256 * valid_batch_size)

# Training set : what is not reserved for validation, capped to 1024 training batches
train_size      = min(1024 * batch_size, len(dataset) - max_valid_size)
valid_size      = min(len(dataset) - train_size, max_valid_size)

shuffle_size    = batch_size * 8
# NOTE(review): -1 presumably disables the periodic prediction during training — confirm in `model.train`
pred_step       = -1

""" Custom training hparams """
augment_prct        = 0.2
augment_speaker_embedding   = False

# Audio trimming options (forwarded as-is to `model.train`)
trim_audio      = True
reduce_noise    = False
trim_threshold  = 0.025
max_silence     = 0.1
trim_method     = 'remove'
trim_mode       = 'start_end'

# Mel-spectrogram trimming options (disabled : `trim_mel = False`)
trim_mel     = False
trim_factor  = 0.6
trim_mel_method  = 'max_start_end'

# Seems to be interesting for single-speaker fine-tuning
# and for a better generalization but seems to slow down convergence 
use_utterance_embedding = True

# Maximal lengths forwarded to `model.train`
# NOTE(review): presumably longer samples are filtered out — confirm in `model.train`
max_input_length = 300
max_output_length = 2048

""" Training """

train, valid = train_test_split(
    dataset, train_size = train_size, valid_size = valid_size, shuffle = True
)

# The number of batches displayed is a floor division (a last partial batch is not counted)
print("Training samples   : {} - {} batches - {} speakers".format(
    len(train), len(train) // batch_size, len(train['id'].unique())
))
print("Validation samples : {} - {} batches - {} speakers".format(
    len(valid), len(valid) // valid_batch_size, len(valid['id'].unique())
))

# Progressive freezing, which seems interesting for single-speaker fine-tuning :
# - `model.epochs >= 5`  : the postnet's `trainable` flag is set to `trainable`
# - `model.epochs >= 10` : the encoder's `trainable` flag is set to `trainable`
# `trainable = False` (the current value) enables the freezing, `trainable = True` disables it
# `model.epochs` is read once, before `model.train` is called : the freezing therefore only
# applies when this cell is re-executed on a model whose `epochs` already reached the threshold
trainable = False
if model.epochs >= 5:
    model.tts_model.postnet.trainable = trainable
if model.epochs >= 10:
    model.tts_model.encoder.trainable = trainable

# All the hyper-parameters defined above are forwarded unchanged to `model.train`
model.train(
    train, validation_data = valid, 
    epochs = epochs, batch_size = batch_size, valid_batch_size = valid_batch_size,

    max_input_length = max_input_length, max_output_length = max_output_length,
    shuffle_size = shuffle_size, pred_step = pred_step,
    augment_prct = augment_prct, augment_speaker_embedding = augment_speaker_embedding,
    
    trim_audio = trim_audio, reduce_noise = reduce_noise, trim_threshold = trim_threshold,
    max_silence = max_silence, trim_method = trim_method, trim_mode = trim_mode,
    
    trim_mel = trim_mel, trim_factor = trim_factor, trim_mel_method = trim_mel_method,
    
    use_utterance_embedding = use_utterance_embedding
)



In [None]:
# Display the training history of the model
# NOTE(review): presumably plots the losses / metrics tracked during `model.train` — confirm
model.plot_history()

## Complete inference

These cells allow you to test your model with a complete inference pipeline

Note that you have to restart your kernel, execute the 1st cell (imports), then execute the cells below. You **must** instantiate the `PtWaveGlow` model first: it calls `limit_gpu_memory` to reduce the GPU memory visible to tensorflow (to allow a better coexistence of both libraries). 

In [None]:
# The order matters : the vocoder (`PtWaveGlow`) has to be instantiated before the
# tensorflow model, as explained in the note above (GPU memory shared by both libraries)
waveglow = PtWaveGlow()
model    = SV2TTSTacotron2(nom = model_name)

In [None]:
def full_inference(text, embedding, n = 1):
    """
        Runs the complete pipeline (text -> mel-spectrogram -> audio) for a single text
        
        Arguments :
            - text      : the text to synthesize, encoded with `model.encode_text`
            - embedding : the speaker embedding forwarded to `model.infer`
            - n         : not used by the function
        Return :
            - audio : the audio produced by `waveglow.infer`
        
        The predicted spectrogram (with its attention) is plotted and the audio is displayed
    """
    # Encode the text and add a leading batch dimension (batch of 1 sentence)
    tokens  = model.encode_text(text)
    batch   = tf.expand_dims(tokens, axis = 0)
    lengths = [tf.shape(batch)[1]]

    # Only the 2nd (mel) and 4th (attention) outputs of the synthesizer are used
    _, mel, _, attn = model.infer(batch, lengths, embedding)

    # Remove the batch dimension before plotting / vocoding
    mel = np.squeeze(mel, 0)
    plot_spectrogram(inference = mel, attention = attn)

    audio = waveglow.infer(mel)
    display_audio(audio, rate = rate)

    return audio

# Same dataset configuration as for training : the kernel has been restarted,
# so the dataset has to be loaded again to get the speaker embeddings
kwargs = {
    'directory'      : 'D:/datasets/...',
    'type_annots'    : 'identification',
    'embedding_dim'  : model.speaker_embedding_dim,
    'embedding_name' : 'embeddings_256_mel_lstm.csv'
}

# '...' is a placeholder for the dataset name
dataset = get_dataset(
    '...', ds_type = 'custom', ** kwargs
)

# Keep only the target speaker ('...' is a placeholder for its id)
dataset = filter_dataset(dataset, id = '...')
# NOTE(review): presumably adds to each row the mean embedding of its 'emotion' group — confirm
# in `add_speaker_embedding`
dataset = add_speaker_embedding(dataset, 'emotion', 'mean')


In [None]:
# Text to synthesize : either a single sentence or a list of sentences
text = "Bonjour tout le monde ! Voici une démonstration du modèle en français."

# Pick a random sample : play its audio and use it to select the speaker embedding
# NOTE(review): `x` is a position in [0, len(dataset)) whereas `dataset.at` selects by index label :
# this assumes the dataset has a default 0..len-1 index after filtering — confirm
x = random.randrange(0, len(dataset))
display_audio(dataset.at[x, 'filename'])
embedding = select_embedding(dataset, mode = x)

# 0.15 second of silence (at `rate`) inserted after each sentence
silence = np.zeros((int(rate * 0.15),))
audios = []
if not isinstance(text, list): text = [text]

for p in text:
    # Each sentence is synthesized twice (both results are displayed by `full_inference`)
    # but only the audio of the last run is kept for the concatenation
    for _ in range(2):
        audio = full_inference(p, embedding)
    audios.append(audio)
    audios.append(silence)

# With several sentences, also display the concatenation of all the audios
if len(text) > 1:
    audios = np.concatenate(audios)
    _ = display_audio(audios, rate)

## Waveglow inference on training generated audios

This is a demonstration on predictions saved at step 500, so it is expected that the inference quality is still poor ;)

In [None]:
def infer_with_target(model_name, step, n, mode, save = False, display = True):
    """
        Vocodes the target / predicted / inferred mel-spectrograms saved for one sample,
        then plots the 3 spectrograms
        
        Arguments :
            - model_name : name of the model (sub-directory of `pretrained_models`)
            - step       : training step of the prediction (only used if `mode == 'train'`)
            - n          : index of the sample in the file names
            - mode       : 'train' reads `training-logs/eval/mels`, any other value reads `outputs/mels`
            - save       : whether to write the 3 audios (in an `audios` directory next to `mels`)
            - display    : whether to display the 3 audio players
        Return : None
        
        Nothing is done if the target file does not exist
    """
    if mode == 'train':
        directory = os.path.join('pretrained_models', model_name, 'training-logs', 'eval', 'mels')
        prefix    = 'pred_step-{:06d}_{}'.format(step, n)
    else:
        directory = os.path.join('pretrained_models', model_name, 'outputs', 'mels')
        prefix    = 'pred_{}'.format(n)
    
    # The 3 files of a sample share the same prefix (insertion order : target, pred, infer)
    filenames = {
        kind : '{}_{}.npy'.format(prefix, kind) for kind in ('target', 'pred', 'infer')
    }

    if not os.path.exists(os.path.join(directory, filenames['target'])): return
    
    mels   = {kind : np.load(os.path.join(directory, file)) for kind, file in filenames.items()}
    audios = {kind : waveglow.infer(mel) for kind, mel in mels.items()}

    # `display` was previously ignored : the players are now only shown when it is True (default)
    if display:
        for audio in audios.values():
            _ = display_audio(audio, rate = rate)
    
    if save:
        # `write_audio` is not imported by the notebook : import it locally so that `save = True`
        # does not raise a NameError
        # NOTE(review): assumes `write_audio` is exposed by `utils.audio` — confirm
        from utils.audio import write_audio
        
        # Only replace the last path component (a model name containing 'mels' stays untouched)
        save_dir = os.path.join(os.path.dirname(directory), 'audios')
        os.makedirs(save_dir, exist_ok = True)
        for kind, audio in audios.items():
            # 'xxx.npy' -> 'xxx.mp3'
            write_audio(audio, os.path.join(save_dir, filenames[kind][:-3] + 'mp3'), rate = rate)
    
    plot_spectrogram(
        target = mels['target'], prediction = mels['pred'], inference = mels['infer']
    )

# Vocoder used by `infer_with_target`
# NOTE(review): this rebinds the global `waveglow`, replacing the `PtWaveGlow` instance created
# in the "Complete inference" section if that cell was executed in the same kernel — confirm
# that both objects expose a compatible `infer` method
waveglow = get_architecture('nvidia_waveglow')

In [None]:
# Training step and mode of the predictions to vocode
step, mode = 500, 'train'

# Samples 0 to 4 of this step (a sample whose target file does not exist is skipped)
for n in range(5):
    infer_with_target(model_name, step, n, mode)


## Tests

In [None]:
from custom_train_objects.optimizers import WarmupScheduler

# Quick check of the learning-rate scheduler : print its configuration then plot it
# NOTE(review): the argument of `plot` is presumably the number of steps to plot — confirm
lr = WarmupScheduler(maxval = 75e-5, minval = 25e-5, factor = 1024)
print(lr.get_config())
lr.plot(1024 * 10)