<a href="https://colab.research.google.com/github/wojtekk23/tacotron2/blob/embed_at_each_step/demos/VoiceCloning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Voice Cloning

# Dataset

# Architecture

# Preparations

In [None]:
!git clone https://github.com/wojtekk23/tacotron2.git
!cd tacotron2; git checkout embed_at_each_step; git submodule init; git submodule update

Cloning into 'tacotron2'...
remote: Enumerating objects: 478, done.[K
remote: Counting objects: 100% (75/75), done.[K
remote: Compressing objects: 100% (53/53), done.[K
remote: Total 478 (delta 43), reused 50 (delta 22), pack-reused 403[K
Receiving objects: 100% (478/478), 3.00 MiB | 10.73 MiB/s, done.
Resolving deltas: 100% (247/247), done.
Branch 'embed_at_each_step' set up to track remote branch 'embed_at_each_step' from 'origin'.
Switched to a new branch 'embed_at_each_step'
Submodule 'waveglow' (https://github.com/NVIDIA/waveglow) registered for path 'waveglow'
Cloning into '/content/tacotron2/waveglow'...
Submodule path 'waveglow': checked out '5bc2a53e20b3b533362f974cfa1ea0267ae1c2b1'


In [None]:
!pip install torch torchvision torchaudio tqdm resemblyzer
!pip install -r /content/tacotron2/requirements.txt

Collecting tensorflow==1.15.2
  Downloading tensorflow-1.15.2-cp37-cp37m-manylinux2010_x86_64.whl (110.5 MB)
[K     |████████████████████████████████| 110.5 MB 33 kB/s 
Collecting Unidecode
  Downloading Unidecode-1.3.3-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 17.7 MB/s 
Collecting tensorboard<1.16.0,>=1.15.0
  Downloading tensorboard-1.15.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 38.0 MB/s 
Collecting tensorflow-estimator==1.15.1
  Downloading tensorflow_estimator-1.15.1-py2.py3-none-any.whl (503 kB)
[K     |████████████████████████████████| 503 kB 48.1 MB/s 
Collecting gast==0.2.2
  Downloading gast-0.2.2.tar.gz (10 kB)
Collecting keras-applications>=1.0.8
  Downloading Keras_Applications-1.0.8-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.8 MB/s 
Building wheels for collected packages: gast
  Building wheel for gast (setup.py) ... [?25l[?25hdone
  Created wheel for gast: filename

# Training

## Training parameters

In [None]:
import sys
sys.path.append('/content/tacotron2/')
import tensorflow as tf
from text import symbols


def create_hparams(hparams_string=None, verbose=False):
    """Build the default hyperparameters, optionally overridden from a string.

    Args:
        hparams_string: optional string handed to ``HParams.parse`` to override
            the defaults below; skipped when falsy.
        verbose: when true, the final values are reported via ``tf.logging.info``.

    Returns:
        The populated ``tf.contrib.training.HParams`` object.
    """
    # Experiment parameters
    experiment = dict(
        epochs=500,
        iters_per_checkpoint=1000,
        seed=1234,
        dynamic_loss_scaling=True,
        fp16_run=False,
        distributed_run=False,
        dist_backend="nccl",
        dist_url="tcp://localhost:54321",
        cudnn_enabled=True,
        cudnn_benchmark=False,
        ignore_layers=['embedding.weight'],
    )

    # Data parameters
    # (alternative embedding list:
    #  '/content/tacotron2/filelists/vctk_speaker_embeds.txt')
    data = dict(
        load_mel_from_disk=False,
        training_files='/content/tacotron2/filelists/vctk_filelist_train.txt',
        validation_files='/content/tacotron2/filelists/vctk_filelist_valid.txt',
        embedding_files='/content/tacotron2/filelists/vctk_embeddings.txt',
        text_cleaners=['english_cleaners'],
    )

    # Audio parameters
    audio = dict(
        max_wav_value=32768.0,
        sampling_rate=22050,
        filter_length=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=80,
        mel_fmin=0.0,
        mel_fmax=8000.0,
    )

    # Model parameters
    network = dict(
        n_symbols=len(symbols),
        symbols_embedding_dim=512,
        # Encoder
        encoder_kernel_size=5,
        encoder_n_convolutions=3,
        encoder_embedding_dim=512,
        # Decoder
        n_frames_per_step=1,  # currently only 1 is supported
        decoder_rnn_dim=1024,
        prenet_dim=256,
        max_decoder_steps=1000,
        gate_threshold=0.5,
        p_attention_dropout=0.1,
        p_decoder_dropout=0.1,
        # Attention
        attention_rnn_dim=1024,
        attention_dim=128,
        # Location layer
        attention_location_n_filters=32,
        attention_location_kernel_size=31,
        # Mel post-processing network
        postnet_embedding_dim=512,
        postnet_kernel_size=5,
        postnet_n_convolutions=5,
    )

    # Optimization hyperparameters
    optimization = dict(
        use_saved_learning_rate=False,
        learning_rate=1e-3,
        weight_decay=1e-6,
        grad_clip_thresh=1.0,
        batch_size=16,
        mask_padding=True,  # set model's padded outputs to padded values
    )

    # Merge the groups in declaration order so the keyword order is unchanged.
    defaults = {**experiment, **data, **audio, **network, **optimization}
    hparams = tf.contrib.training.HParams(**defaults)

    if hparams_string:
        tf.logging.info('Parsing command line hparams: %s', hparams_string)
        hparams.parse(hparams_string)

    if verbose:
        tf.logging.info('Final parsed hparams: %s', hparams.values())

    return hparams

## Download the dataset

**Warning**: the download takes a long time (approx. 1.5-2h)

In [None]:
!wget https://datashare.ed.ac.uk/download/DS_10283_2651.zip
!unzip DS_10283_2651.zip -d vctk/
!unzip vctk/VCTK-Corpus.zip -d vctk/

--2022-03-09 19:11:10--  https://datashare.ed.ac.uk/download/DS_10283_2651.zip
Resolving datashare.ed.ac.uk (datashare.ed.ac.uk)... 192.41.117.26
Connecting to datashare.ed.ac.uk (datashare.ed.ac.uk)|192.41.117.26|:443... connected.
HTTP request sent, awaiting response... 200 200
Length: 11166618959 (10G) [application/zip]
Saving to: ‘DS_10283_2651.zip’

DS_10283_2651.zip     1%[                    ] 153.14M  1.73MB/s    eta 1h 42m ^C


Create a train/valid/test split: 10 speakers are held out for the test set, and the remaining utterances are divided 80:20 into the train and valid sets.

In [None]:
# Both scripts take the extracted corpus directory and a filelist path as arguments.
# NOTE(review): create_hparams reads vctk_filelist_train.txt / vctk_filelist_valid.txt,
# so the first script presumably derives those names from vctk_filelist.txt — confirm.
!/content/tacotron2/prepare_vctk_train_valid_split.sh /content/vctk/VCTK-Corpus/ /content/tacotron2/filelists/vctk_filelist.txt
!/content/tacotron2/prepare_vctk_test.sh /content/vctk/VCTK-Corpus/ /content/tacotron2/filelists/vctk_filelist_test.txt

realpath: /content/vctk/VCTK-Corpus/: No such file or directory
ls: cannot access '/wav48': No such file or directory
realpath: /content/vctk/VCTK-Corpus/: No such file or directory
ls: cannot access '/wav48': No such file or directory


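The dataloader can calculate utterance embeddings on the fly, but keep in mind that this *significantly* slows down the training process. To precalculate the embeddings instead, run one of the two cells below: utterance embeddings (recommended) or speaker embeddings. By default `create_hparams` reads the utterance embeddings list (`vctk_embeddings.txt`); to train on speaker embeddings, point `embedding_files` at `vctk_speaker_embeds.txt` instead.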

### Calculate utterance embeddings (recommended)

In [None]:
!mkdir /content/utt_embeds

import sys
sys.path.append('/content/tacotron2/')
from resemblyzer.audio import preprocess_wav, trim_long_silences
from resemblyzer import VoiceEncoder
from utils import load_wav_to_torch, load_filepaths_and_text
from tqdm import tqdm
import os

OUTPUT_DIRECTORY="/content/utt_embeds"

hparams = create_hparams(None, True)
    
speaker_encoder = VoiceEncoder().cuda()
train_paths = load_filepaths_and_text(hparams.training_files)
valid_paths = load_filepaths_and_text(hparams.validation_files)
newlines = []

print("Training files:")
for filename, _ in tqdm(train_paths):
    audio = trim_long_silences(preprocess_wav(filename))
    embed = speaker_encoder.embed_utterance(audio)
    embed_path = os.path.join(OUTPUT_DIRECTORY, os.path.basename(filename).rsplit('.', 1)[0])
    torch.save(embed, embed_path)
    newlines.append(f"{filename}|{embed_path}")

print("Validation files:")
for filename, _ in tqdm(valid_paths):
    audio = trim_long_silences(preprocess_wav(filename))
    embed = speaker_encoder.embed_utterance(audio)
    embed_path = os.path.join(OUTPUT_DIRECTORY, os.path.basename(filename).rsplit('.', 1)[0])
    torch.save(embed, embed_path)
    newlines.append(f"{filename}|{embed_path}")

with open('/content/tacotron2/filelists/vctk_embeddings.txt', 'w') as f:
    f.write('\n'.join(newlines))

### Calculate speaker embeddings

In [None]:
!mkdir /content/speaker_embeds

import sys
sys.path.append('/content/tacotron2/')
from resemblyzer.audio import preprocess_wav, trim_long_silences
from resemblyzer import VoiceEncoder
from utils import load_wav_to_torch, load_filepaths_and_text
from tqdm import tqdm
from pathlib import Path
import numpy as np
import os

OUTPUT_DIRECTORY="/content/speaker_embeds"

hparams = create_hparams(None, True)
    
speaker_encoder = VoiceEncoder().cuda()
train_paths = load_filepaths_and_text(hparams.training_files)
valid_paths = load_filepaths_and_text(hparams.validation_files)
newlines = []

vctk_path = Path('/content/vctk/VCTK-Corpus/')
speakers_path = vctk_path / 'wav48'
for item in tqdm(speakers_path.iterdir()):
    if item.is_dir():
        utts = np.random.choice(list(item.glob('**/*.wav')), size=10, replace=False)
        wavs = np.array([preprocess_wav(filename) for filename in utts])
        embed = speaker_encoder.embed_speaker(wavs)
        torch.save(embed, os.path.join(OUTPUT_DIRECTORY, item.name))

for dataset in [train_paths, valid_paths]:
    for filename, _ in tqdm(dataset):
        speaker_id = filename.rsplit('/', 1)[1].split('_')[0]
        newlines.append(f"{filename}|{OUTPUT_DIRECTORY}/{speaker_id}")

with open('/content/tacotron2/filelists/vctk_speaker_embeds.txt', 'w') as f:
    f.write('\n'.join(newlines))

## Download the vocoder (WaveGlow) and the pretrained Tacotron 2 checkpoint

In [None]:
!mkdir /root/tacotron2/models
!gdown --id 1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF -O /root/tacotron2/models/waveglow_256channels_universal_v5.pt
!gdown --id 1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA -O /root/tacotron2/models/tacotron2_statedict.pt

Access denied with the following error:

 	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses. 

You may still be able to access the file from the browser:

	 https://drive.google.com/uc?id=1rpK8CzAAirq9sWZhe9nlfvxMF1dRgFbF 



## Synthesizer (Tacotron 2)

In [None]:
import sys
sys.path.append('/content/tacotron2/')
from math import sqrt
import torch
from torch.autograd import Variable
from torch import nn
from torch.nn import functional as F
from layers import ConvNorm, LinearNorm
from utils import to_gpu, get_mask_from_lengths
from model import Prenet, Attention, Encoder, Postnet, Decoder

class MultiSpeakerTacotron2(nn.Module):
    """Tacotron 2 whose encoder memory is conditioned on a speaker embedding.

    The speaker embedding is concatenated to every encoder time step and the
    result is projected back to the encoder width before it reaches the decoder.
    """

    def __init__(self, hparams):
        super(MultiSpeakerTacotron2, self).__init__()
        self.mask_padding = hparams.mask_padding
        self.fp16_run = hparams.fp16_run
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.embedding = nn.Embedding(
            hparams.n_symbols, hparams.symbols_embedding_dim)
        std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
        val = sqrt(3.0) * std  # uniform bounds for std
        self.embedding.weight.data.uniform_(-val, val)
        self.encoder = Encoder(hparams)
        self.decoder = Decoder(hparams)
        self.postnet = Postnet(hparams)

        # Maps [encoder output ; speaker embedding] back to the encoder width.
        # With the default hparams (encoder width 512) and the default speaker
        # embedding width of 256 this is the previous Linear(768, 512).
        # `speaker_embedding_dim` is an optional hparam.
        speaker_embedding_dim = getattr(hparams, 'speaker_embedding_dim', 256)
        self.predecoder_projection = nn.Linear(
            hparams.encoder_embedding_dim + speaker_embedding_dim,
            hparams.encoder_embedding_dim)

    def parse_batch(self, batch):
        """Unpack a collated batch and move the text/mel tensors to the GPU.

        Returns:
            ``(inputs, targets, embeds)`` where ``inputs`` is the tuple consumed
            by ``forward``, ``targets`` is ``(mel_padded, gate_padded)`` and
            ``embeds`` is passed through untouched (the caller moves it to the
            right device).
        """
        text_padded, input_lengths, mel_padded, gate_padded, \
            output_lengths, embeds = batch
        text_padded = to_gpu(text_padded).long()
        input_lengths = to_gpu(input_lengths).long()
        max_len = torch.max(input_lengths.data).item()
        mel_padded = to_gpu(mel_padded).float()
        gate_padded = to_gpu(gate_padded).float()
        output_lengths = to_gpu(output_lengths).long()

        return (
            (text_padded, input_lengths, mel_padded, max_len, output_lengths),
            (mel_padded, gate_padded), embeds)

    def parse_output(self, outputs, output_lengths=None):
        """Overwrite padded frames of the outputs in place when masking is on.

        Masking is applied only when ``self.mask_padding`` is set and
        ``output_lengths`` is given; otherwise ``outputs`` is returned as is.
        """
        if self.mask_padding and output_lengths is not None:
            mask = ~get_mask_from_lengths(output_lengths)
            mask = mask.expand(self.n_mel_channels, mask.size(0), mask.size(1))
            mask = mask.permute(1, 0, 2)

            outputs[0].data.masked_fill_(mask, 0.0)
            outputs[1].data.masked_fill_(mask, 0.0)
            outputs[2].data.masked_fill_(mask[:, 0, :], 1e3)  # gate energies

        return outputs

    def _condition_on_speaker(self, encoder_outputs, wavs):
        """Append the speaker embedding to every encoder step and project.

        Args:
            encoder_outputs: tensor indexed as (batch, steps, channels).
            wavs: speaker/utterance embeddings indexed as (batch, embedding_dim).

        Raises:
            ValueError: if ``wavs`` is None.
        """
        if wavs is None:
            raise ValueError(
                "MultiSpeakerTacotron2 requires speaker embeddings; "
                "pass them through the `wavs` argument.")
        num_chars = encoder_outputs.size(1)
        # Broadcast each embedding along the time axis: (batch, steps, embedding_dim).
        embeds = wavs.unsqueeze(1).expand(-1, num_chars, -1)
        conditioned = torch.cat((encoder_outputs, embeds), 2)
        return self.predecoder_projection(conditioned)

    def forward(self, inputs, wavs=None):
        """Teacher-forced pass used for training and validation.

        Args:
            inputs: the first element returned by ``parse_batch``.
            wavs: speaker embeddings, one row per batch item (required).

        Returns:
            ``[mel_outputs, mel_outputs_postnet, gate_outputs, alignments]``
            after ``parse_output`` masking.
        """
        text_inputs, text_lengths, mels, max_len, output_lengths = inputs
        text_lengths, output_lengths = text_lengths.data, output_lengths.data

        embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
        encoder_outputs = self.encoder(embedded_inputs, text_lengths)
        encoder_outputs = self._condition_on_speaker(encoder_outputs, wavs)

        mel_outputs, gate_outputs, alignments = self.decoder(
            encoder_outputs, mels, memory_lengths=text_lengths)

        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        return self.parse_output(
            [mel_outputs, mel_outputs_postnet, gate_outputs, alignments],
            output_lengths)

    def inference(self, inputs, wavs=None):
        """Free-running synthesis for already-encoded text.

        Args:
            inputs: symbol-id tensor fed to the text embedding.
            wavs: speaker embeddings, one row per batch item (required).

        Returns:
            ``[mel_outputs, mel_outputs_postnet, gate_outputs, alignments]``.
        """
        embedded_inputs = self.embedding(inputs).transpose(1, 2)
        encoder_outputs = self.encoder.inference(embedded_inputs)
        encoder_outputs = self._condition_on_speaker(encoder_outputs, wavs)

        mel_outputs, gate_outputs, alignments = self.decoder.inference(
            encoder_outputs)

        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        outputs = self.parse_output(
            [mel_outputs, mel_outputs_postnet, gate_outputs, alignments])

        return outputs

## Training loop

In [None]:
import sys
sys.path.append('/content/tacotron2/')
import os
import time
import argparse
import math
from numpy import finfo

import torch
from torch._C import device
from torch.nn.modules.loss import BCELoss
from tacotron2.distributed import apply_gradient_allreduce
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader

from model import Tacotron2
from model_multi_tts import MultiSpeakerPostnet, MultiSpeakerTacotron2, MultiSpeakerDecoder
from data_utils import TextMelEmbedLoader, TextMelEmbedCollate
from loss_function import Tacotron2Loss
from logger import Tacotron2Logger

def reduce_tensor(tensor, n_gpus):
    """Return the mean of ``tensor`` across all processes.

    A clone of ``tensor`` is summed over the process group and divided by
    ``n_gpus``; the input tensor itself is left unmodified.
    """
    rt = tensor.clone()
    # `dist.ReduceOp` is the supported spelling; `dist.reduce_op` is deprecated.
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n_gpus
    return rt


def init_distributed(hparams, n_gpus, rank, group_name):
    """Bind this process to a GPU and join the distributed process group.

    The backend and init URL come from ``hparams``; ``n_gpus`` is used as the
    world size.
    """
    assert torch.cuda.is_available(), "Distributed mode requires CUDA."
    print("Initializing Distributed")

    # Pin the process to one device so all later work lands on the right GPU.
    device_index = rank % torch.cuda.device_count()
    torch.cuda.set_device(device_index)

    # Set up the communication channel between processes.
    process_group_options = dict(
        backend=hparams.dist_backend,
        init_method=hparams.dist_url,
        world_size=n_gpus,
        rank=rank,
        group_name=group_name,
    )
    dist.init_process_group(**process_group_options)

    print("Done initializing distributed")


def prepare_dataloaders(hparams):
    """Create the training loader plus the pieces needed to validate later.

    Returns:
        ``(train_loader, valset, collate_fn)``; the validation set is returned
        unwrapped, without a loader around it.
    """
    trainset = TextMelEmbedLoader(hparams.training_files, hparams)
    valset = TextMelEmbedLoader(hparams.validation_files, hparams)
    collate_fn = TextMelEmbedCollate(hparams.n_frames_per_step)

    # In distributed runs the sampler is responsible for the ordering, so the
    # loader's own shuffling is switched off.
    distributed = hparams.distributed_run
    train_sampler = DistributedSampler(trainset) if distributed else None
    shuffle = False if distributed else True

    train_loader = DataLoader(
        trainset,
        num_workers=1,
        shuffle=shuffle,
        sampler=train_sampler,
        batch_size=hparams.batch_size,
        pin_memory=False,
        drop_last=True,
        collate_fn=collate_fn,
    )
    return train_loader, valset, collate_fn


def prepare_directories_and_logger(output_directory, log_directory, rank):
    """Create the output directory and the logger on rank 0 only.

    Returns:
        A ``Tacotron2Logger`` writing under ``output_directory/log_directory``
        for rank 0, and ``None`` for every other rank.
    """
    if rank != 0:
        return None

    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)

    log_path = os.path.join(output_directory, log_directory)
    return Tacotron2Logger(log_path)


def load_model(hparams):
    """Instantiate the multi-speaker synthesizer on the GPU.

    For fp16 runs the attention score mask value is set to the float16 minimum;
    for distributed runs the model is wrapped for gradient all-reduce.
    """
    # NOTE(review): `MultiSpeakerTacotron2` resolves to whichever definition was
    # bound last — the notebook cell or the `model_multi_tts` import.
    model = MultiSpeakerTacotron2(hparams).cuda()

    if hparams.fp16_run:
        fp16_min = finfo('float16').min
        model.decoder.attention_layer.score_mask_value = fp16_min

    return apply_gradient_allreduce(model) if hparams.distributed_run else model


def warm_start_model(checkpoint_path, model, ignore_layers):
    """Initialise ``model`` from the weights stored in a checkpoint.

    When ``ignore_layers`` is non-empty, only checkpoint entries that exist in
    the model and are not listed in ``ignore_layers`` are copied; every other
    parameter keeps its current value. With an empty ``ignore_layers`` the
    checkpoint state dict is loaded as is.

    Returns:
        The same ``model`` instance, with the weights loaded.
    """
    assert os.path.isfile(checkpoint_path)
    print("Warm starting model from checkpoint '{}'".format(checkpoint_path))
    state = torch.load(checkpoint_path, map_location='cpu')['state_dict']

    if len(ignore_layers) > 0:
        merged = model.state_dict()
        kept = {
            name: tensor for name, tensor in state.items()
            if name in merged and name not in ignore_layers
        }
        print(f'Checkpoint dictionary length: {len(kept)}')
        # Start from the model's own weights and overlay the accepted entries.
        merged.update(kept)
        state = merged

    model.load_state_dict(state)
    return model


def load_checkpoint(checkpoint_path, model, optimizer):
    """Restore model and optimizer state from a training checkpoint.

    Returns:
        ``(model, optimizer, learning_rate, iteration)`` where the last two
        values are read from the checkpoint.
    """
    assert os.path.isfile(checkpoint_path)
    print("Loading checkpoint '{}'".format(checkpoint_path))

    saved = torch.load(checkpoint_path, map_location='cpu')
    model.load_state_dict(saved['state_dict'])
    optimizer.load_state_dict(saved['optimizer'])
    learning_rate, iteration = saved['learning_rate'], saved['iteration']

    print("Loaded checkpoint '{}' from iteration {}" .format(
        checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration


def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    """Write the model/optimizer state, learning rate and iteration to ``filepath``."""
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    snapshot = {
        'iteration': iteration,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'learning_rate': learning_rate,
    }
    torch.save(snapshot, filepath)


def validate(model, criterion, valset, iteration, batch_size, n_gpus,
             collate_fn, logger, distributed_run, rank):
    """Compute the mean validation loss and log it on rank 0.

    Args:
        model: model exposing ``parse_batch`` and callable as ``model(x, wavs=...)``.
        criterion: loss callable taking ``(y_pred, y)``.
        valset: validation dataset.
        iteration: current training iteration, used for printing and logging.
        batch_size: validation batch size.
        n_gpus: number of processes the loss is averaged over when distributed.
        collate_fn: collate function for the validation loader.
        logger: object with ``log_validation``; only used when ``rank == 0``.
        distributed_run: whether to shard the set and all-reduce the loss.
        rank: rank of the current process.

    Returns:
        The loss averaged over all validation batches.

    Raises:
        ValueError: if the validation loader yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    y = y_pred = None
    try:
        with torch.no_grad():
            val_sampler = DistributedSampler(valset) if distributed_run else None
            val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                    shuffle=False, batch_size=batch_size,
                                    pin_memory=False, collate_fn=collate_fn)

            for batch in val_loader:
                x, y, embeds = model.parse_batch(batch)
                embeds = embeds.cuda()

                y_pred = model(x, wavs=embeds)
                loss = criterion(y_pred, y)
                if distributed_run:
                    reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
                else:
                    reduced_val_loss = loss.item()
                total_loss += reduced_val_loss
                num_batches += 1
    finally:
        # Always hand the model back in training mode, even if a batch failed.
        model.train()

    if num_batches == 0:
        # Previously this surfaced as an unbound-variable error on the batch
        # counter; fail with an explicit message instead.
        raise ValueError("Validation set produced no batches; "
                         "cannot compute a validation loss.")

    val_loss = total_loss / num_batches
    if rank == 0:
        print("Validation loss {}: {:9f}  ".format(iteration, val_loss))
        # The logger receives the targets/predictions of the last batch.
        logger.log_validation(val_loss, model, y, y_pred, iteration)
    return val_loss

def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory, relative to output_directory, for tensorboard logs
    checkpoint_path (string or None): checkpoint to start from, if any
    warm_start (bool): if True, only model weights are taken from the checkpoint
        (filtered by hparams.ignore_layers) and the encoder trains at half the
        base learning rate; otherwise model, optimizer and iteration are resumed
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    group_name (string): process group name used for distributed runs
    hparams (object): hyperparameters, as returned by create_hparams
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(
        output_directory, log_directory, rank)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    model.cuda()
    if warm_start:
        # The encoder trains at half the base rate; every other parameter
        # (decoder, postnet, projection AND the text embedding) uses the base
        # rate. Building the second group from model.parameters() guarantees
        # that no parameter is silently left out of the optimizer.
        encoder_params = list(model.encoder.parameters())
        encoder_param_ids = {id(param) for param in encoder_params}
        other_params = [param for param in model.parameters()
                        if id(param) not in encoder_param_ids]
        optimizer = torch.optim.Adam([
            {"params": other_params},
            {"params": encoder_params, "lr": learning_rate / 2}
        ], lr=learning_rate, weight_decay=hparams.weight_decay)
    else:
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=learning_rate,
                                     weight_decay=hparams.weight_decay)
    if checkpoint_path is not None:
        if warm_start:
            print("WARM START")
            model = warm_start_model(
                checkpoint_path, model, hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(
            model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    model.train()
    is_overflow = False
    print(model)

    print(warm_start)
    print(iteration)

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, threshold=1e-2)
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            # The scheduler may have lowered the rate, so report the value the
            # optimizer actually uses (first group = base learning rate).
            current_lr = optimizer.param_groups[0]['lr']

            # Clear gradients left over from the previous step; without this
            # they accumulate across iterations.
            model.zero_grad()
            x, y, embeds = model.parse_batch(batch)
            embeds = embeds.cuda()

            y_pred = model(x, wavs=embeds)
            loss = criterion(y_pred, y)

            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                    iteration, reduced_loss, grad_norm, duration))
                logger.log_training(
                    reduced_loss, grad_norm, current_lr, duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0):
                val_loss = validate(model, criterion, valset, iteration,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_file = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, current_lr, iteration,
                                    checkpoint_file)

                # Lower the learning rate when the validation loss plateaus.
                scheduler.step(val_loss)

            iteration += 1


In [None]:
HPARAMS = None
OUTPUT_DIR = "/root/tacotron2/output"
LOG_DIR = "logdir"
# To start from the pretrained checkpoint instead, use:
# CHECKPOINT_PATH = "/root/tacotron2/models/tacotron2_statedict.pt"
CHECKPOINT_PATH = None
WARM_START = True


hparams = create_hparams(HPARAMS, True)

# Apply the cuDNN settings from hparams, so the values reported below are the
# ones actually in effect.
torch.backends.cudnn.enabled = hparams.cudnn_enabled
torch.backends.cudnn.benchmark = hparams.cudnn_benchmark

print("FP16 Run:", hparams.fp16_run)
print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling)
print("Distributed Run:", hparams.distributed_run)
print("cuDNN Enabled:", hparams.cudnn_enabled)
print("cuDNN Benchmark:", hparams.cudnn_benchmark)

# Single-process run: one GPU, rank 0.
train(OUTPUT_DIR, LOG_DIR, CHECKPOINT_PATH or None,
      WARM_START, 1, 0, "group_name", hparams)

# Results

You can download the pretrained synthesizer [here](https://drive.google.com/drive/folders/1UYSR7FUaN0aMrgw4hi6_kJScC1qtJots?usp=sharing)

In [None]:
import gdown
# Download the shared Drive folder; the result is iterated in the next cell as
# a collection of audio file paths.
# NOTE(review): the return value on a failed download is not visible here — confirm.
demo_examples = gdown.download_folder('https://drive.google.com/drive/folders/1UYSR7FUaN0aMrgw4hi6_kJScC1qtJots?usp=sharing')

In [None]:
from IPython.display import Audio, display

# NOTE(review): gdown.download_folder appears to return None when the download
# fails — confirm. Guard against it so this cell reports the problem instead of
# failing with a TypeError while iterating.
if not demo_examples:
    print("No demo examples were downloaded.")

for sound_file in demo_examples or []:
    print(sound_file)
    display(Audio(sound_file, autoplay=False))

/content/examples/p285_valid_bad.wav


/content/examples/p294_valid.wav


/content/examples/p312_test.wav


/content/examples/p334_valid.wav


/content/examples/p345_test_2.wav


/content/examples/p345_test.wav


/content/examples/p347_test.wav


/content/examples/p361_test.wav


/content/examples/p362_test.wav


/content/examples/p363_test.wav


/content/examples/p374_test_bad.wav


/content/examples/p374_test.wav
