In [1]:

# Standard-library and PyTorch imports used throughout the notebook.
import json
import os
import torch
# Keep cuDNN enabled but leave its benchmark mode off.
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = False
# `amp` provides autocast / GradScaler for the training loop;
# `SummaryWriter` is the TensorBoard logger returned by
# prepare_output_folders_and_logger.
from torch.cuda import amp
from torch.utils.tensorboard import SummaryWriter
%load_ext autoreload
%autoreload 2 
# import radtts
# NOTE(review): `os`, `json`, `torch` and `AttrDict` are imported more than
# once in this cell; the repeats are harmless but redundant.
import os
import json
from timeit import default_timer as timer
# Switch the working directory to the RADTTS checkout before the imports
# below -- presumably so the project modules resolve from the current
# directory; keep this line ahead of them.
# NOTE(review): hard-coded absolute path; only valid inside this container.
os.chdir('/usr/src/app/radtts')
from distributed import (init_distributed, apply_gradient_allreduce,
                         reduce_tensor)

from radtts import RADTTS
from train import RADTTSLoss, AttentionBinarizationLoss, prepare_dataloaders, prepare_model_weights, parse_data_from_batch, compute_validation_loss
import torch
# from radtts.inference import load_vocoder
from hifigan_env import AttrDict
from data import Data
from hifigan_models import Generator
from hifigan_env import AttrDict
from hifigan_denoiser import Denoiser
from radam import RAdam
# Make CUDA device index 1 the current device for every later `.cuda()` call.
# NOTE(review): hard-coded index; this fails on a single-GPU machine.
torch.cuda.set_device(1)

# Needs to be here cause of implicit "config" argument
def prepare_output_folders_and_logger(output_directory):
    """Prepare the run's output directory and return a TensorBoard writer.

    Creates ``output_directory`` if it does not exist, writes the current
    configuration to ``config.json`` inside it, archives the ``*.py`` files of
    the current working directory into ``code.tar.gz``, and opens a
    ``SummaryWriter`` on the ``logs`` sub-directory.

    The configuration is read from the module-level ``config`` variable (it
    is not a parameter), so ``config`` must be defined before this is called.

    Args:
        output_directory: Directory that receives ``config.json``,
            ``code.tar.gz`` and the ``logs`` folder.

    Returns:
        The ``SummaryWriter`` writing to ``<output_directory>/logs``.
    """
    # Local import: only needed here, to quote the archive path for the shell.
    import shlex

    # Get shared output_directory ready
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)
        os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    output_config_path = os.path.join(output_directory, 'config.json')
    print("saving current configuration in output dir")
    # `with` guarantees the handle is closed even if json.dump raises
    # (e.g. on a non-serialisable value in the config).
    with open(output_config_path, 'w') as config_fp:
        json.dump(config, config_fp, indent=4)

    output_code_path = os.path.join(output_directory, 'code.tar.gz')
    # Quote the destination so paths containing spaces or shell
    # metacharacters do not break (or alter) the command. `*.py` is left
    # unquoted on purpose: the shell has to expand the glob.
    os.system('tar -czvf %s *.py' % shlex.quote(output_code_path))

    tboard_out_path = os.path.join(output_directory, 'logs')
    print("setting up tboard log in %s" % (tboard_out_path))
    logger = SummaryWriter(tboard_out_path)
    return logger



In [2]:
# ---- Experiment configuration ---------------------------------------------
config_path = '/usr/src/app/radtts/configs/2_22_23.json'
with open(config_path) as f:
    config = json.load(f)

train_config = config['train_config']
model_config = config['model_config']
data_config = config['data_config']

# ---- Seeding (done before the model is constructed further down) ----------
seed = 1234
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# ---- Values fixed in the notebook instead of being read from the config ---
n_gpus = 1
rank = 0
batch_size = 4

# ---- Training hyper-parameters taken from train_config --------------------
(sigma, loss_weights, learning_rate, unfreeze_modules, weight_decay,
 output_directory, use_amp, iters_per_checkpoint, grad_clip_val, epochs,
 binarization_start_iter) = (
    train_config[key] for key in (
        'sigma', 'loss_weights', 'learning_rate', 'unfreeze_modules',
        'weight_decay', 'output_directory', 'use_amp',
        'iters_per_checkpoint', 'grad_clip_val', 'epochs',
        'binarization_start_iter'))

# ---- Losses and model -----------------------------------------------------
# The positional arguments keep their original order: sigma, group size, then
# the duration / f0 / energy sub-model configs.
criterion = RADTTSLoss(
    sigma,
    *(model_config[key] for key in (
        'n_group_size', 'dur_model_config', 'f0_model_config',
        'energy_model_config')),
    vpred_model_config=model_config['v_model_config'],
    loss_weights=loss_weights
)
attention_kl_loss = AttentionBinarizationLoss()
model = RADTTS(**model_config).cuda()


Applying spectral norm to text encoder LSTM
Applying spectral norm to context encoder LSTM


The boolean parameter 'some' has been replaced with a string parameter 'mode'.
Q, R = torch.qr(A, some)
should be replaced with
Q, R = torch.linalg.qr(A, 'reduced' if some else 'complete') (Triggered internally at ../aten/src/ATen/native/BatchLinearAlgebra.cpp:2349.)
  W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
LU, pivots = torch.lu(A, compute_pivots)
should be replaced with
LU, pivots = torch.linalg.lu_factor(A, compute_pivots)
and
LU, pivots, info = torch.lu(A, compute_pivots, get_infos=True)
should be replaced with
LU, pivots, info = torch.linalg.lu_factor_ex(A, compute_pivots) (Triggered internally at ../aten/src/ATen/native/BatchLinearAlgebra.cpp:1915.)
  return torch._lu_with_info(A, pivot=pivot, check_errors=(not get_infos))


In [3]:
# Counters: global step, and the epoch the loop starts from.
iteration, epoch_offset = 0, 0

# Optimiser over all model parameters; every param group is then explicitly
# (re)set to the configured learning rate.
optimizer = RAdam(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)
for param_group in optimizer.param_groups:
    param_group['lr'] = learning_rate

# Gradient scaler for mixed precision; inactive when use_amp is False.
scaler = amp.GradScaler(enabled=use_amp)

# Data pipeline, then output folder + TensorBoard logger.
train_loader, valset, collate_fn = prepare_dataloaders(
    data_config, n_gpus, batch_size
)
logger = prepare_output_folders_and_logger(output_directory)

# Select which modules are trainable, then switch the model to training mode.
prepare_model_weights(model, unfreeze_modules)
model.train()

initializing training dataloader
Number of speakers: 28
Speaker IDS {'0': 0, '1': 1, '10': 2, '11': 3, '12': 4, '13': 5, '14': 6, '15': 7, '16': 8, '17': 9, '19': 10, '2': 11, '20': 12, '21': 13, '22': 14, '23': 15, '24': 16, '25': 17, '27': 18, '28': 19, '29': 20, '3': 21, '4': 22, '5': 23, '6': 24, '7': 25, '8': 26, '9': 27}
Number of files 21223
Number of files after duration filtering 21223
Dataloader initialized with no augmentations
initializing validation dataloader
Number of files 21223
Number of files after duration filtering 21223
Dataloader initialized with no augmentations
saving current configuration in output dir
setting up tboard log in /usr/src/app/radtts/outputs/2_22_23/logs
Training everything


In [4]:
# ================ MAIN TRAINING LOOP! ===================
# `save_checkpoint` is called at every checkpoint below but was never imported
# by the notebook, so the first checkpoint raised a NameError.
# NOTE(review): assumed to be defined in train.py next to the other helpers
# this notebook imports from that module -- confirm.
from train import save_checkpoint

# `kl_loss_start_iter` was read in the loop but never assigned anywhere in the
# notebook, so the loop raised a NameError as soon as binarization started.
# NOTE(review): the -1 fallback (KL loss active from the moment binarization
# starts) is an assumption -- confirm the key and default against the config.
kl_loss_start_iter = train_config.get('kl_loss_start_iter', -1)

# Per-batch debugging dumps (batch index, audio paths, text tensor). They are
# kept on because they identify the batch that crashes; set to False to quiet
# the output.
debug_batches = True

for epoch in range(epoch_offset, epochs):
    print("Epoch: {}".format(epoch))
    for i, batch in enumerate(train_loader):
        if debug_batches:
            print(i)
            print(batch['audiopaths'])
        tic = timer()
        model.zero_grad()
        (mel, speaker_ids, text, in_lens, out_lens, attn_prior,
         f0, voiced_mask, p_voiced, energy_avg,
         audiopaths) = parse_data_from_batch(batch)

        # Soft alignments only until binarization_start_iter, binarized after.
        binarize = iteration >= binarization_start_iter

        with amp.autocast(use_amp):
            outputs = model(
                mel, speaker_ids, text, in_lens, out_lens,
                binarize_attention=binarize, attn_prior=attn_prior,
                f0=f0, energy_avg=energy_avg,
                voiced_mask=voiced_mask, p_voiced=p_voiced)
            if debug_batches:
                print(text)
            loss_outputs = criterion(outputs, in_lens, out_lens)

            # Weighted sum of every loss term whose weight is positive.
            loss = None
            for k, (v, w) in loss_outputs.items():
                if w > 0:
                    loss = v * w if loss is None else loss + v * w
            if loss is None:
                # Without this the failure surfaces later as an opaque
                # TypeError on `None`.
                raise ValueError(
                    "no loss term has a positive weight; "
                    "check train_config['loss_weights']")

            w_bin = criterion.loss_weights.get('binarization_loss_weight', 1.0)
            if binarize and iteration >= kl_loss_start_iter:
                binarization_loss = attention_kl_loss(
                    outputs['attn'], outputs['attn_soft'])
                loss += binarization_loss * w_bin
            else:
                binarization_loss = torch.zeros_like(loss)
            loss_outputs['binarization_loss'] = (binarization_loss, w_bin)

        scaler.scale(loss).backward()
        if grad_clip_val > 0:
            # Gradients must be unscaled before clipping so the threshold
            # applies to their true magnitude.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(
                model.parameters(), grad_clip_val)
        scaler.step(optimizer)
        scaler.update()

        toc = timer()
        current_lr = optimizer.param_groups[0]['lr']
        print_list = ["iter: {}  ({:.2f} s)  |  lr: {}".format(
            iteration, toc-tic, current_lr)]

        # Replace each (value, weight) pair by its reduced scalar. Only
        # existing keys are reassigned, so the dict size is unchanged while
        # iterating.
        for k, (v, w) in loss_outputs.items():
            reduced_v = reduce_tensor(v, n_gpus, 0).item()
            loss_outputs[k] = reduced_v
            if rank == 0:
                # Print the same reduced value that is sent to TensorBoard.
                print_list.append('  |  {}: {:.3f}'.format(k, reduced_v))
                logger.add_scalar('train/'+k, reduced_v, iteration)

        if rank == 0:
            print(''.join(print_list), flush=True)

        # Validate and checkpoint every iters_per_checkpoint iterations
        # (including iteration 0).
        if iteration > -1 and iteration % iters_per_checkpoint == 0:

            val_loss_outputs = compute_validation_loss(
                iteration, model, criterion, valset, collate_fn,
                batch_size, n_gpus, logger=logger,
                train_config=train_config)
            checkpoint_path = "{}/model_{}".format(
                output_directory, iteration)
            save_checkpoint(model, optimizer, learning_rate, iteration,
                            checkpoint_path)
            print('Validation loss:', val_loss_outputs)


        iteration += 1

# train(n_gpus, rank, **train_config)

Epoch: 0
saving f0 to data_cache/JRodriguesOldTimerDataset_wavs_82_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/LJSpeech-1.1_wavs_LJ017-0027_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/LJSpeech-1.1_wavs_LJ033-0124_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/LJSpeech-1.1_wavs_LJ008-0152_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/LJSpeech-1.1_wavs_LJ028-0241_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/LJSpeech-1.1_wavs_LJ018-0346_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/jr_base_wavs_698_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/LJSpeech-1.1_wavs_LJ008-0211_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.p

  result = _VF.lstm(input, batch_sizes, hx, self._flat_weights, self.bias,


tensor([[  8, 125, 102, 167, 156, 131, 170,   8,  79,  93,  94,  83,  87,  75,
          94,  79,  78,   8,  94,  82,  75,  94,   8, 110, 166, 167, 103, 153,
         168, 110, 155, 110, 170, 154, 149,   8, 171, 167, 150,   8, 155, 146,
         156, 110, 170, 168,   8, 145, 154, 107, 166, 168, 170,   8,  76,  79,
          94,  97,  79,  79,  88,   8,  94,  82,  79,   8, 170, 123, 155,   8,
         143, 150,   8, 143, 135, 127,   8,  94,  82,  79,   8, 154, 107, 168,
         170,   8,  89,  80,   8,  94,  82,  79,   8, 169, 103, 170, 168,   8,
          75,  88,  78,   8,  94,  82,  79,   8, 170, 123, 155,   8, 143, 150,
           8,  93,  94,  75,  92,  94,  79,  78,   8, 142, 103, 167, 127, 145,
         157,   8,  94,  82,  79,   8, 141, 167, 111, 156, 170,   8, 127, 115,
         167,   4,   8],
        [  8, 170, 131, 168, 170, 145, 141, 124, 127,   8,  94,  82,  75,  94,
           8, 170, 181, 131, 156, 170, 149,   8, 122, 127, 131, 156, 170, 110,
         141, 124, 110, 125

	addcmul_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
	addcmul_(Tensor tensor1, Tensor tensor2, *, Number value) (Triggered internally at ../torch/csrc/utils/python_arg_parser.cpp:1420.)
  exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)


saving f0 to data_cache/dataset_wavs_99_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/LJSpeech-1.1_wavs_LJ046-0178_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/LJSpeech-1.1_wavs_LJ016-0111_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/LJSpeech-1.1_wavs_LJ033-0176_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/Shrek_Data_wavs_304_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/LJSpeech-1.1_wavs_LJ004-0155_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/LJSpeech-1.1_wavs_LJ015-0074_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_25_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_29_22k

  result = _VF.lstm(input, batch_sizes, hx, self._flat_weights, self.bias,


saving f0 to data_cache/TAG_wavs_63_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.ptsaving f0 to data_cache/TAG_wavs_39_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt

saving f0 to data_cache/jay-z_wavs_jayz-4_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_48_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_60_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_34_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_43_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_55_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_77_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cach

saving f0 to data_cache/TAG_wavs_119_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.ptsaving f0 to data_cache/TAG_wavs_112_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt

saving f0 to data_cache/TAG_wavs_139_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_128_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_134_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_124_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_115_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_135_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_141_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_c

saving f0 to data_cache/TAG_wavs_202_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_199_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_163_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_219_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_209_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_214_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_222_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_206_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/TAG_wavs_203_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_c

saving f0 to data_cache/TAG_wavs_228_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_21_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_35_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_26_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_12_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_29_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_34_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_36_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_06_22k_normalized.wav_f0_sr22050_fl1024_hl256_

saving f0 to data_cache/eminem_wav22050_138_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_134_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_124_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_128_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_120_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_109_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_80_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_72_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_139_22k_normalized.wav_f0_sr22050_

saving f0 to data_cache/eminem_wav22050_205_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_213_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_149_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_190_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_200_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_198_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_150_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_191_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_206_22k_normalized.wav_f0_sr2205

saving f0 to data_cache/eminem_wav22050_266_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_270_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_253_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_316_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_276_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_290_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_294_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_267_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_280_22k_normalized.wav_f0_sr2205

saving f0 to data_cache/eminem_wav22050_360_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_346_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_335_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_382_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_321_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_363_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_386_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_366_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_341_22k_normalized.wav_f0_sr2205

saving f0 to data_cache/eminem_wav22050_431_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_458_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_464_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_415_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_392_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_440_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_436_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_459_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_432_22k_normalized.wav_f0_sr2205

saving f0 to data_cache/eminem_wav22050_521_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_471_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_491_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_511_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_512_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_527_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_541_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_504_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_522_22k_normalized.wav_f0_sr2205

saving f0 to data_cache/eminem_wav22050_596_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_589_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_581_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_624_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_603_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_616_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_609_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_590_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_600_22k_normalized.wav_f0_sr2205

saving f0 to data_cache/big-gay-rapping_wavs_56_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/big-gay-rapping_wavs_12_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/big-gay-rapping_wavs_51_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/big-gay-rapping_wavs_48_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/big-gay-rapping_wavs_36_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/eminem_wav22050_629_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/big-gay-rapping_wavs_52_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/big-gay-rapping_wavs_42_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/big-gay-rapping_wavs

saving f0 to data_cache/big-gay-rapping_wavs_96_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/big-gay-rapping_wavs_90_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/sam-lachow_wavs_51_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/sam-lachow_wavs_10_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/sam-lachow_wavs_43_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/sam-lachow_wavs_16_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/sam-lachow_wavs_5_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/big-gay-rapping_wavs_98_22k_normalized.wav_f0_sr22050_fl1024_hl256_f0min80.0_f0max640.0_log0.pt
saving f0 to data_cache/sam-lachow_wavs_52_22k_normalized.wav_f0_s

RuntimeError: cuDNN error: CUDNN_STATUS_MAPPING_ERROR

In [5]:
# Show the audio file paths of the batch currently held in `batch` (the loop
# variable of the training loop), to see which files were being processed.
batch['audiopaths']

['/usr/src/app/radtts/data/lj_data/LJSpeech-1.1/wavs/LJ035-0176_22k_normalized.wav',
 '/usr/src/app/radtts/data/lj_data/LJSpeech-1.1/wavs/LJ034-0022_22k_normalized.wav',
 '/usr/src/app/radtts/data/lj_data/LJSpeech-1.1/wavs/LJ033-0124_22k_normalized.wav',
 '/usr/src/app/radtts/data/lj_data/LJSpeech-1.1/wavs/LJ037-0142_22k_normalized.wav']

In [46]:
            # Debug: repeat the training loop's forward pass by hand, using
            # the tensors and the `binarize` flag that are still in the
            # kernel namespace.
            outputs = model(
                mel, speaker_ids, text, in_lens, out_lens,
                binarize_attention=binarize, attn_prior=attn_prior,
                f0=f0, energy_avg=energy_avg,
                voiced_mask=voiced_mask, p_voiced=p_voiced)

> [0;32m/usr/src/app/radtts/radtts.py[0m(376)[0;36mforward[0;34m()[0m
[0;32m    374 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    375 [0;31m        [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 376 [0;31m        [0mtext_enc[0m[0;34m,[0m [0mtext_embeddings[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mencode_text[0m[0;34m([0m[0mtext[0m[0;34m,[0m [0min_lens[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    377 [0;31m[0;34m[0m[0m
[0m[0;32m    378 [0;31m        [0mlog_s_list[0m[0;34m,[0m [0mlog_det_W_list[0m[0;34m,[0m [0mz_mel[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m,[0m [0;34m[[0m[0;34m][0m[0;34m,[0m [0;34m[[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> text_enc, text_embeddings = self.encode_text(text, in_lens)
*** RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR
ipdb> text
tensor([[         0,          0,          0, 

--KeyboardInterrupt--

KeyboardInterrupt: Interrupted by user


RuntimeError: cuDNN error: CUDNN_STATUS_INTERNAL_ERROR

In [48]:
            # Debug: unpack the current `batch` again into the individual
            # inputs that the forward pass takes.
            (mel, speaker_ids, text, in_lens, out_lens, attn_prior,
             f0, voiced_mask, p_voiced, energy_avg,
             audiopaths) = parse_data_from_batch(batch)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [51]:
# NOTE(review): `os` is already imported in the first cell; this re-import is
# redundant.
import os
# Size in bytes of each audio file referenced by the current batch.
[os.path.getsize(filepath) for filepath in batch['audiopaths']]
# attn_prior

RuntimeError: numel: integer multiplication overflow

In [21]:
# List the field names carried by the current batch.
batch.keys()

dict_keys(['mel', 'speaker_ids', 'text', 'input_lengths', 'output_lengths', 'audiopaths', 'attn_prior', 'f0', 'p_voiced', 'voiced_mask', 'energy_avg'])

In [42]:
# Show the audio file paths of the batch currently held in `batch`.
# NOTE(review): execution counts are out of order, so this may not be the same
# batch as in the neighbouring cells.
batch['audiopaths']

['/usr/src/app/radtts/data/lj_data/LJSpeech-1.1/wavs/LJ033-0161_22k_normalized.wav',
 '/usr/src/app/radtts/data/lj_data/LJSpeech-1.1/wavs/LJ019-0312_22k_normalized.wav',
 '/usr/src/app/radtts/data/eminem/wav22050/261_22k_normalized.wav',
 '/usr/src/app/radtts/data/Carolyn_Singing/wavs/261_22k_normalized.wav']

In [41]:
# Show the `text` entry of the current batch.
# NOTE(review): presumably encoded token ids, zero-padded to a common length
# -- confirm against the collate function.
batch['text']

tensor([[  8,  63,  88,   8,  94,  82,  79,   8, 127, 139,   8,  89,  80,   8,
          94,  82,  79,   8, 110, 168, 108, 168, 110, 156, 139, 169, 110, 156,
           3,   8,  94,  82,  79,   8, 127, 107, 154, 110, 168,   8, 166, 110,
         154, 150, 168,   8, 110, 125, 170, 139, 156, 127,   8,  75,   8, 168,
         107, 155, 166, 110, 154,   8,  89,  80,   8, 167, 107, 166, 145, 157,
           8, 166, 139, 166, 134,   8,  75,  88,  78,   8, 170, 139, 166,   8],
        [  8,  97,  83,  94,  82,   8,  94,  82,  79,   8, 154, 160, 153, 110,
         154,   8, 152, 175, 167, 145, 168, 127, 146, 153, 169, 110, 156, 183,
           3,   8, 116, 154, 128, 160,   8, 168, 170, 146, 154,   8, 180, 131,
         167, 149,   8, 154, 150, 156, 182, 110, 156, 170, 154, 149,   8, 127,
         145, 168, 166, 160, 183, 127,   4,   8,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
        [  8, 123, 154,   8, 166, 167, 103, 125, 1

In [18]:
# Copy the batch's text and input-length entries to the current CUDA device,
# overwriting the `text` / `in_lens` names used by the forward pass.
text  = batch['text'].cuda()
in_lens = batch['input_lengths'].cuda()

In [None]:
# Sort the batch's input lengths in descending order, keeping the permutation.
# Everywhere else in this notebook `batch` is the collated dict (indexed with
# string keys such as 'audiopaths' and 'input_lengths'), so iterating over it
# yields key strings and `x["text_encoded"]` would raise a TypeError. Use the
# lengths the batch already carries instead.
# NOTE(review): assumes batch['input_lengths'] is a 1-D integer tensor of
# per-sample text lengths -- confirm against the collate function.
input_lengths, ids_sorted_decreasing = torch.sort(
    batch['input_lengths'],
    dim=0,
    descending=True,
)