<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [None]:
# default_exp utils.utils

In [None]:
# export

import sys
import os
import soundfile as sf
import pandas as pd

import numpy as np

import soundfile as sf
import librosa


def load_filepaths_and_text(filename: str, split: str = "|"):
    """Read a delimited filelist into a list of field lists.

    Every line of ``filename`` is stripped of surrounding whitespace and then
    split on ``split``.

    Args:
        filename: Path of the UTF-8 encoded text file to read.
        split: Separator placed between the fields of a line.

    Returns:
        A list with one entry per line; each entry is the list of that
        line's fields, as strings.
    """
    rows = []
    with open(filename, encoding="utf-8") as handle:
        for raw_line in handle:
            rows.append(raw_line.strip().split(split))
    return rows


def synthesize_speakerids2(filelists, fix_indices_index=None):
    """Load several filelists and make their speaker ids mutually disjoint.

    Each filelist is read with ``load_filepaths_and_text`` and its third
    column is interpreted as an integer speaker id. The lists are then
    visited in order; whenever a list uses an id that is already reserved by
    the pinned list or by a previously visited list, that id is replaced by
    the smallest non-negative integer that is neither reserved nor used in
    the current list.

    Args:
        filelists: Sequence of filelist paths. Every line must provide at
            least three ``|``-separated fields, the third being the speaker id.
        fix_indices_index: Position in ``filelists`` of the list whose
            speaker ids must stay untouched. With ``None`` no list is pinned
            and the first list simply keeps its ids.

    Returns:
        A dict mapping each filelist path to its ``pandas.DataFrame`` with the
        (possibly renumbered) integer speaker ids in the third column.

    Raises:
        ValueError: If a speaker id cannot be converted to ``int``.
    """
    data_dict = {}
    for filelist in filelists:
        frame = pd.DataFrame(load_filepaths_and_text(filelist))
        speaker_col = frame.columns[2]
        # Ids come from text; they must be integers to be comparable with the
        # integer candidates generated below.
        frame[speaker_col] = frame[speaker_col].astype(int)
        data_dict[filelist] = frame

    if fix_indices_index is None:
        reserved_speakers = np.array([], dtype=int)
    else:
        if fix_indices_index < 0:
            # Normalise so the "is this the pinned list?" test below matches.
            fix_indices_index += len(filelists)
        reserved_speakers = np.unique(
            data_dict[filelists[fix_indices_index]].iloc[:, 2]
        )

    data_dict_out = {}
    for position, source_file in enumerate(filelists):
        data = data_dict[source_file]
        if position != fix_indices_index:
            speaker_col = data.columns[2]
            speakers = np.unique(data[speaker_col])
            # Speaker *values* (not their positions) that clash with reserved ids.
            overlap = speakers[np.isin(speakers, reserved_speakers)]
            if overlap.size:
                taken = np.union1d(speakers, reserved_speakers)
                # This range always holds at least len(overlap) ids outside `taken`.
                candidates = np.arange(len(reserved_speakers) + len(speakers))
                new_ids = np.setdiff1d(candidates, taken)[: len(overlap)]
                remap = dict(zip(overlap.tolist(), new_ids.tolist()))
                data[speaker_col] = data[speaker_col].map(
                    lambda spk: remap.get(spk, spk)
                )
            reserved_speakers = np.union1d(
                np.unique(data[speaker_col]), reserved_speakers
            )
        data_dict_out[source_file] = data
    return data_dict_out


def parse_vctk(root: str):
    """Parse VCTK dataset and return a dict representation.

    Speakers are the entries present in both ``<root>/wav48_silence_trimmed``
    and ``<root>/txt``. Inside a speaker folder, files are matched on the
    first 8 characters of their names, and only audio files whose 13th
    character is ``"1"`` are kept.
    NOTE(review): this presumably selects the ``mic1`` recordings of names
    such as ``p225_001_mic1.flac`` -- confirm against the dataset layout.

    Args:
        root: Path of the dataset root folder.

    Returns:
        A dict mapping each speaker name to a list of ``(text, wav_path)``
        tuples sorted by utterance name. Speakers without any matched pair
        are omitted.
    """
    wav_dir = os.path.join(root, "wav48_silence_trimmed")
    txt_dir = os.path.join(root, "txt")
    speakers = set(os.listdir(wav_dir)) & set(os.listdir(txt_dir))
    output_dict = {}
    for speaker in speakers:
        speaker_wav_dir = os.path.join(wav_dir, speaker)
        speaker_txt_dir = os.path.join(txt_dir, speaker)
        # Stray regular files sharing a name in both folders are not speakers.
        if not (os.path.isdir(speaker_wav_dir) and os.path.isdir(speaker_txt_dir)):
            continue

        # Key both listings by the 8-character utterance prefix so that text
        # and audio are paired by name rather than by list position.
        txt_by_utterance = {
            name[:8]: name for name in os.listdir(speaker_txt_dir)
        }
        wav_by_utterance = {
            name[:8]: name
            for name in os.listdir(speaker_wav_dir)
            # Length guard: short stray names must not raise IndexError.
            if len(name) > 12 and name[12] == "1"
        }

        pairs = []
        for utterance in sorted(txt_by_utterance.keys() & wav_by_utterance.keys()):
            text_file = os.path.join(speaker_txt_dir, txt_by_utterance[utterance])
            with open(text_file, encoding="utf-8") as f:
                contents = f.read().strip("\n")
            wav_file = os.path.join(speaker_wav_dir, wav_by_utterance[utterance])
            pairs.append((contents, wav_file))

        if pairs:
            output_dict[speaker] = pairs
    return output_dict


def parse_libritts_mellotron(source_folder, mellotron_filelist):
    """Read a ``|``-separated filelist with pandas and re-root its audio paths.

    The first 17 characters of every entry in column 0 are dropped and
    ``source_folder`` is prepended to what remains. Malformed lines are
    skipped instead of raising.
    NOTE(review): the 17-character prefix is presumably the dataset root used
    in the original Mellotron filelists -- confirm.
    NOTE: a later definition with the same name in this file replaces this
    one when the module is imported.

    Args:
        source_folder: String prepended to every trimmed path.
        mellotron_filelist: Path of the headerless ``|``-separated filelist.

    Returns:
        The ``pandas.DataFrame`` read from the filelist with column 0 rewritten.
    """
    try:
        data = pd.read_csv(
            mellotron_filelist, sep="|", header=None, on_bad_lines="skip"
        )
    except TypeError:
        # pandas < 1.3 does not know ``on_bad_lines``; use the older spelling.
        data = pd.read_csv(
            mellotron_filelist, sep="|", header=None, error_bad_lines=False
        )

    data[0] = data[0].str[17:]

    data[0] = source_folder + data[0].astype(str)
    return data


def add_speakerid(data, speaker_key=0):
    """Ensure column ``2`` of ``data`` holds integer speaker ids.

    * Two columns: a third column filled with ``speaker_key`` is added.
    * Three columns: column ``2`` is kept when it already has an integer
      dtype, otherwise it is overwritten with ``speaker_key``.
    * Any other number of columns: ``data`` is left untouched.

    Args:
        data: ``pandas.DataFrame`` whose columns are labelled ``0, 1, (2)``.
            It is modified in place.
        speaker_key: Speaker id written to every row when ids are missing or
            not integers.

    Returns:
        The same ``data`` object.
    """
    n_columns = data.shape[1]
    # The dtype of the column has to be inspected: the column object itself is
    # a Series, so comparing its type with ``int`` can never succeed.
    if n_columns == 3 and pd.api.types.is_integer_dtype(data[2]):
        return data
    if n_columns in (2, 3):
        data[2] = np.full(data.shape[0], speaker_key, dtype=int)
    return data


def parse_libritts_mellotron(source_folder, mellotron_filelist):
    """Load a Mellotron-style filelist and re-root its audio paths.

    The filelist is read with ``load_filepaths_and_text``. The first 17
    characters of every entry in column 0 are dropped and ``source_folder``
    is prepended to the remainder.
    NOTE(review): the 17-character prefix is presumably the dataset root used
    in the original filelists -- confirm.

    Args:
        source_folder: String prepended to every trimmed path.
        mellotron_filelist: Path of the filelist to read.

    Returns:
        A ``pandas.DataFrame`` of the filelist with column 0 rewritten.
    """
    rows = load_filepaths_and_text(mellotron_filelist)
    frame = pd.DataFrame(rows)
    trimmed_paths = frame[0].str[17:]
    frame[0] = trimmed_paths
    frame[0] = source_folder + frame[0].astype(str)
    return frame


def parse_uberduck(source_folder):
    """Load ``<source_folder>/all.txt`` and rewrite every listed audio file.

    Column 0 of the filelist is prefixed with ``source_folder + "/"`` and
    speaker ids are filled in through ``add_speakerid`` with key 0. Each audio
    file is then loaded with ``librosa.load`` (default arguments) and written
    back to the same path with ``soundfile.write`` -- the files on disk are
    overwritten.

    Args:
        source_folder: Folder containing ``all.txt`` and the audio files.

    Returns:
        The ``pandas.DataFrame`` returned by ``add_speakerid``.
    """
    listing = load_filepaths_and_text(source_folder + "/all.txt")
    frame = pd.DataFrame(listing)
    frame[0] = source_folder + "/" + frame[0].astype(str)
    output = add_speakerid(frame, speaker_key=0)

    for row in range(output.shape[0]):
        samples, sample_rate = librosa.load(output.iloc[row, 0])
        sf.write(output.iloc[row, 0], samples, sample_rate)

    return output


def parse_ljspeech(source_folder):
    """Load ``<source_folder>/metadata.csv`` into a filelist DataFrame.

    Column 0 is turned into ``<source_folder>/wavs/<entry>.wav`` and speaker
    ids are filled in through ``add_speakerid`` with key 0.

    Args:
        source_folder: Folder containing ``metadata.csv`` and ``wavs/``.

    Returns:
        The ``pandas.DataFrame`` returned by ``add_speakerid`` with the
        ``.wav`` suffix appended to every path in column 0.
    """
    rows = load_filepaths_and_text(source_folder + "/metadata.csv")
    frame = pd.DataFrame(rows)

    frame[0] = source_folder + "/wavs/" + frame[0].astype(str)
    output = add_speakerid(frame, speaker_key=0)
    # Append the extension to the whole path column at once.
    output.iloc[:, 0] = output.iloc[:, 0] + ".wav"

    return output


def get_alignment_metrics(alignments, average_across_batch=True):
    """Compute diagonalness and mean peak attention for a batch of alignments.

    See https://github.com/NVIDIA/tacotron2/pull/284,
    https://github.com/CookiePPP/cookietts/blob/c871f5f7b5790656d5b57bcd9e63946a2da52f0f/CookieTTS/utils/model/utils.py#L59

    Args:
        alignments: Attention tensor laid out as ``[B, dec, enc]``.
        average_across_batch: When True both metrics are averaged over the
            batch; otherwise one value per batch item is returned.

    Returns:
        A dict with the keys ``"diagonalness"`` (length of the path traced by
        the per-step attention peaks divided by the length of the straight
        diagonal) and ``"max"`` (mean over decoder steps of the peak attention
        value).
    """
    alignments = alignments.transpose(1, 2)  # [B, dec, enc] -> [B, enc, dec]
    batch_size = alignments.size(0)
    input_lengths = torch.ones(batch_size, device=alignments.device) * (
        alignments.shape[1] - 1
    )  # [B]
    output_lengths = torch.ones(batch_size, device=alignments.device) * (
        alignments.shape[2] - 1
    )  # [B]
    # Length of the ideal straight diagonal for every batch item.
    optimums = torch.sqrt(
        input_lengths.double().pow(2) + output_lengths.double().pow(2)
    ).view(batch_size)

    # [B, enc, dec] -> [B, dec], [B, dec]
    _, cur_idxs = torch.max(alignments, 1)

    cur_idxs = cur_idxs.float()
    # Previous peak position per decoder step (the first step is its own predecessor).
    prev_indx = torch.cat((cur_idxs[:, 0][:, None], cur_idxs[:, :-1]), dim=1)
    dist = ((prev_indx - cur_idxs).pow(2) + 1).pow(0.5)  # [B, dec]
    dist.masked_fill_(
        ~get_mask_from_lengths(output_lengths, max_len=dist.size(1)), 0.0
    )  # set dist of padded to zero
    dist = dist.sum(dim=(1))  # get total dist for each B
    diagonalness = (dist + 1.4142135) / optimums  # dist / optimal dist

    # Per-item value by default, so the name is bound on both code paths.
    max_ = alignments.max(dim=1)[0].mean(dim=1)
    if average_across_batch:
        diagonalness = diagonalness.mean()
        max_ = max_.mean()

    output = {}
    output["diagonalness"] = diagonalness
    output["max"] = max_
    return output

In [None]:
# export

import torch
import numpy as np
from scipy.signal import get_window
import librosa.util as librosa_util


def window_sumsquare(
    window,
    n_frames,
    hop_length=200,
    win_length=800,
    n_fft=800,
    dtype=np.float32,
    norm=None,
):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`

    n_frames : int > 0
        The number of analysis frames

    hop_length : int > 0
        The number of samples to advance between frames

    win_length : [optional]
        The length of the window function.  When `None`, this matches `n_fft`.

    n_fft : int > 0
        The length of each analysis frame.

    dtype : np.dtype
        The data type of the output

    norm : [optional]
        Forwarded as `norm` to `librosa.util.normalize` before the window
        is squared.

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    # `size` is passed by keyword: recent librosa releases reject it
    # positionally, while older releases accept the keyword as well.
    win_sq = librosa_util.pad_center(win_sq, size=n_fft)

    # Fill the envelope
    for i in range(n_frames):
        sample = i * hop_length
        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
    return x


def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """
    Estimate a signal from spectrogram magnitudes by iterating phase estimates.

    Starting from random phases, the signal is repeatedly inverted and
    re-analysed, keeping ``magnitudes`` fixed and replacing only the phases.

    PARAMS
    ------
    magnitudes: spectrogram magnitudes
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
    n_iters: number of transform/inverse refinement rounds

    RETURNS
    -------
    The output of ``stft_fn.inverse`` with dimension 1 squeezed.
    """
    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    # Keep the phases on the same device as the magnitudes so that the
    # inverse transform also works for non-CPU inputs.
    angles = torch.from_numpy(angles.astype(np.float32)).to(magnitudes.device)
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)

    for _ in range(n_iters):
        _unused_magnitudes, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """
    Log-compress ``x`` after flooring it at ``clip_val``.

    PARAMS
    ------
    x: tensor to compress
    C: compression factor
    clip_val: lower bound applied to ``x`` before the logarithm
    """
    floored = torch.clamp(x, min=clip_val)
    scaled = floored * C
    return torch.log(scaled)


def dynamic_range_decompression(x, C=1):
    """
    Undo the log compression: exponentiate ``x`` and divide by ``C``.

    PARAMS
    ------
    x: compressed tensor
    C: compression factor used to compress
    """
    expanded = torch.exp(x)
    return expanded / C

In [None]:
# export
def to_gpu(x):
    """Make ``x`` contiguous and move it to CUDA when CUDA is available.

    The result is wrapped in ``torch.autograd.Variable`` before being returned.
    """
    contiguous = x.contiguous()
    if torch.cuda.is_available():
        placed = contiguous.cuda(non_blocking=True)
    else:
        placed = contiguous
    return torch.autograd.Variable(placed)

In [None]:
to_gpu(torch.randn(10, 10))

tensor([[-1.0952,  1.5098,  1.7472,  1.4762,  0.3244,  0.2247,  1.3386,  1.1365,
         -0.5723, -1.5680],
        [-0.4840, -0.4451, -0.8086,  0.2919, -0.4676,  2.4247, -0.0489, -0.0794,
          1.0378, -1.1225],
        [-0.3659, -0.6850, -1.2807, -0.0546, -0.7408,  0.1756,  0.1937, -0.5494,
          0.6397,  0.4977],
        [-2.4095, -1.1598,  0.1977, -0.8104,  1.2652, -0.4722,  0.8372, -0.2998,
         -0.4856, -1.7819],
        [-1.0124, -1.1938,  0.4984,  1.8554,  1.4143, -0.4297, -0.1580, -1.2449,
         -0.1589,  0.1825],
        [ 0.8220,  1.0507,  0.3190,  0.9669,  0.7793, -2.0233,  0.4074,  0.1102,
          1.1830, -0.3872],
        [ 0.5741, -1.8157,  0.8897,  0.9545, -1.3521,  1.3975, -1.0372, -0.3366,
          0.9322, -0.4041],
        [-0.4368, -1.1928,  0.7727, -0.1491,  0.5363, -1.0173,  0.5298,  1.5333,
          1.1553, -1.1345],
        [-0.5358,  0.1390,  0.8813,  0.6781,  0.9553,  0.0167,  0.7211,  0.2378,
         -0.1051, -1.2321],
        [-1.6739,  

In [None]:
# export


def get_mask_from_lengths(lengths: torch.Tensor, max_len: int = 0):
    """Return a boolean mask matrix; positions below each length are True.

    Args:
        lengths: 1-D tensor with one length per row of the mask.
        max_len: Number of mask columns. ``0`` means "use the largest value
            in ``lengths``".

    Returns:
        A bool tensor with one row per entry of ``lengths``.
    """
    width = max_len if max_len != 0 else int(torch.max(lengths).item())
    positions = torch.arange(0, width, device=lengths.device, dtype=torch.long)
    per_row_limit = lengths.unsqueeze(1)
    return (positions < per_row_limit).bool()

In [None]:
import torch

assert (
    get_mask_from_lengths(torch.LongTensor([1, 3, 2, 1]))
    == torch.Tensor(
        [
            [True, False, False],
            [True, True, True],
            [True, True, False],
            [True, False, False],
        ]
    )
).all()

In [None]:
# export
import torch.distributed as dist


def reduce_tensor(tensor, n_gpus):
    """Return the all-reduced sum of ``tensor`` divided by ``n_gpus``.

    The input is cloned first, so ``tensor`` itself is not modified. The
    reduction goes through ``torch.distributed.all_reduce`` with ``SUM``.
    """
    summed = tensor.clone()
    dist.all_reduce(summed, op=dist.ReduceOp.SUM)
    summed.div_(n_gpus)
    return summed

In [None]:
# export
def subsequent_mask(length):
    """Return a lower-triangular matrix of ones shaped ``[1, 1, length, length]``."""
    lower_triangle = torch.ones(length, length).tril()
    return lower_triangle[None, None]

In [None]:
assert (subsequent_mask(2) == torch.tensor([[[1, 0], [1, 1]]])).all()

In [None]:
# export
def convert_pad_shape(pad_shape):
    """Flatten a list of lists after reversing the order of the outer list."""
    flattened = []
    for group in pad_shape[::-1]:
        flattened.extend(group)
    return flattened

In [None]:
convert_pad_shape([[1, 2], [3, 4], [5, 6, 7]]) == [5, 6, 7, 3, 4, 1, 2]

True

In [None]:
# export
def sequence_mask(length, max_length=None):
    """Build a boolean mask with True at positions below each length.

    Does the same job as ``get_mask_from_lengths``; ``max_length`` defaults to
    the largest value in ``length``.
    """
    limit = length.max() if max_length is None else max_length
    positions = torch.arange(limit, dtype=length.dtype, device=length.device)
    return positions[None, :] < length[:, None]

In [None]:
assert (
    sequence_mask(torch.tensor([1, 3, 2, 1]))
    == torch.Tensor(
        [
            [True, False, False],
            [True, True, True],
            [True, True, False],
            [True, False, False],
        ]
    )
).all()
assert (
    sequence_mask(torch.tensor([1, 3, 2, 1]), 4)
    == torch.Tensor(
        [
            [True, False, False, False],
            [True, True, True, False],
            [True, True, False, False],
            [True, False, False, False],
        ]
    )
).all()

In [None]:
# export
def generate_path(duration, mask):
    """
    Expand per-token durations into a token-to-frame alignment path.

    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]

    Returns a tensor shaped like ``mask``. Each column ``t_x`` marks the
    frames between the cumulative duration of the previous tokens and its own
    cumulative duration, multiplied by ``mask``.
    """
    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)

    cum_duration_flat = cum_duration.view(b * t_x)
    # Row i is True for all frames before the cumulative end of token i.
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    # Shift by one token and subtract, which leaves only each token's own
    # frames. The fully-qualified name is used so that the block does not
    # depend on an ``F`` alias being imported elsewhere.
    shifted = torch.nn.functional.pad(
        path, convert_pad_shape([[0, 0], [1, 0], [0, 0]])
    )[:, :-1]
    path = path - shifted
    path = path.unsqueeze(1).transpose(2, 3) * mask
    return path

In [None]:
# export
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Cut one randomly positioned segment out of every batch item of ``x``.

    ``x`` is unpacked as ``(b, d, t)``. Start indices are drawn uniformly from
    ``[0, x_lengths - segment_size + 1)``, with ``x_lengths`` defaulting to
    ``t``, and the slicing itself is delegated to ``slice_segments``.

    Returns:
        A tuple ``(segments, start_indices)``.
    """
    batch, _, frames = x.size()
    lengths = frames if x_lengths is None else x_lengths
    max_start = lengths - segment_size + 1
    uniform = torch.rand([batch]).to(device=x.device)
    ids_str = (uniform * max_start).to(dtype=torch.long)
    return slice_segments(x, ids_str, segment_size), ids_str

In [None]:
# export
def init_weights(m, mean=0.0, std=0.01):
    """Re-draw ``m.weight`` from N(mean, std) when the class name of ``m`` contains "Conv"."""
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)

In [None]:
# export
def get_padding(kernel_size, dilation=1):
    """Return ``(kernel_size * dilation - dilation) / 2`` truncated to an int."""
    dilated_span = kernel_size * dilation - dilation
    return int(dilated_span / 2)

In [None]:
# export
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Add both inputs, then gate the tanh of the first channels with the sigmoid of the rest.

    ``n_channels[0]`` is the index on dimension 1 where the summed tensor is
    split into the tanh part and the sigmoid part.
    """
    split_at = n_channels[0]
    summed = input_a + input_b
    tanh_part = torch.tanh(summed[:, :split_at, :])
    sigmoid_part = torch.sigmoid(summed[:, split_at:, :])
    return tanh_part * sigmoid_part