<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [None]:
# default_exp utils.utils

In [None]:
# export

import sys
import os
import soundfile as sf
import pandas as pd

import numpy as np

import soundfile as sf
import librosa
from torch.nn import functional as F


def load_filepaths_and_text(filename: str, split: str = "|"):
    """Read a delimited UTF-8 text file into a list of field lists.

    Each line is stripped of surrounding whitespace and then cut on
    ``split``; one list of fields is produced per line, in file order.
    """
    rows = []
    with open(filename, encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(split)
            rows.append(fields)
    return rows

In [None]:
# export

import torch
import numpy as np
from scipy.signal import get_window
import librosa.util as librosa_util


def window_sumsquare(
    window,
    n_frames,
    hop_length=200,
    win_length=800,
    n_fft=800,
    dtype=np.float32,
    norm=None,
):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`

    n_frames : int > 0
        The number of analysis frames

    hop_length : int > 0
        The number of samples to advance between frames

    win_length : [optional]
        The length of the window function.  Defaults to 800; when `None`
        is passed explicitly it falls back to `n_fft`.

    n_fft : int > 0
        The length of each analysis frame.

    dtype : np.dtype
        The data type of the output

    norm : [optional]
        Normalization mode forwarded to `librosa.util.normalize`, applied
        to the window before it is squared.

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
    # Pass `size` by keyword: newer librosa releases make it keyword-only,
    # while older releases accept the same keyword, so this works on both.
    win_sq = librosa_util.pad_center(win_sq, size=n_fft)

    # Fill the envelope by overlap-adding the squared window at every hop
    for i in range(n_frames):
        sample = i * hop_length
        x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
    return x


def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """
    Estimate a signal from magnitudes by repeatedly re-deriving the phase.

    PARAMS
    ------
    magnitudes: spectrogram magnitudes
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
    n_iters: number of transform/inverse refinement rounds run after the
        initial random-phase inversion

    RETURNS
    -------
    signal: result of the last `stft_fn.inverse` call with dim 1 squeezed
    """
    # Start from uniformly random phases; np.angle wraps them into (-pi, pi].
    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    # Keep the phases on the same device as the magnitudes so the inverse
    # transform does not mix devices.
    angles = torch.from_numpy(angles.astype(np.float32)).to(magnitudes.device)
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)

    for _ in range(n_iters):
        # Keep the target magnitudes, take only the phase of the current guess.
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """
    Log-compress `x` after flooring it at `clip_val`.

    PARAMS
    ------
    C: compression factor (multiplies the floored input before the log)
    clip_val: lower bound applied to `x` before scaling
    """
    floored = torch.clamp(x, min=clip_val)
    scaled = floored * C
    return torch.log(scaled)


def dynamic_range_decompression(x, C=1):
    """
    Undo the log compression: exponentiate `x` and divide by `C`.

    PARAMS
    ------
    C: compression factor used to compress
    """
    exponentiated = torch.exp(x)
    return exponentiated / C

In [None]:
# export
def to_gpu(x):
    """Return a contiguous copy of `x`, moved to CUDA when CUDA is available.

    The result is wrapped in `torch.autograd.Variable` in both cases.
    """
    tensor = x.contiguous()
    if not torch.cuda.is_available():
        return torch.autograd.Variable(tensor)
    return torch.autograd.Variable(tensor.cuda(non_blocking=True))

In [None]:
# Smoke check: run to_gpu on a random 10x10 tensor and display the result.
to_gpu(torch.randn(10, 10))

tensor([[ 1.0415e+00, -3.7111e-01, -3.2838e-01, -6.2205e-01, -2.6169e-01,
         -9.8103e-01, -1.8675e+00, -7.7150e-01, -1.7760e-01, -6.8241e-01],
        [-5.1025e-01, -1.1788e+00,  1.2456e+00,  1.1683e+00, -1.5050e+00,
          3.8878e-01,  3.4331e-01, -4.6678e-02, -4.9398e-01, -1.3587e+00],
        [-2.0366e+00, -2.1385e-01,  1.2572e-01,  1.0080e+00, -4.1069e-01,
         -1.1727e+00, -5.8472e-01, -1.5301e-01, -7.7718e-01, -1.4033e-01],
        [-8.1851e-01,  1.1898e+00,  3.7606e-01, -2.1779e+00,  2.1037e-01,
         -1.0227e+00,  7.8290e-01,  9.2825e-01,  1.7582e+00, -6.0930e-02],
        [-2.2755e+00, -1.9536e+00,  2.1239e-01, -6.8103e-01, -1.1368e-02,
          5.7643e-01, -6.6171e-01, -3.3325e-01, -8.7990e-01,  1.0076e+00],
        [-3.9319e-01,  8.9259e-01, -6.4939e-01, -4.6909e-01, -8.9252e-01,
          9.5496e-01,  5.3429e-02, -1.4436e-01, -1.6097e-01, -3.0059e-01],
        [-3.0589e-01, -2.9385e-02, -1.1399e+00,  1.7461e+00,  1.1013e-01,
          4.2333e-01, -1.7093e+0

In [None]:
# export


def get_mask_from_lengths(lengths: torch.Tensor, max_len: int = 0):
    """Build a boolean mask from sequence lengths; unmasked entries are True.

    Row ``i`` is True at positions ``0 .. lengths[i] - 1``. When ``max_len``
    is 0 the mask width is taken from the largest value in ``lengths``.
    """
    width = max_len
    if width == 0:
        width = int(lengths.max().item())
    positions = torch.arange(width, dtype=torch.long, device=lengths.device)
    # (1, width) against (batch, 1) broadcasts to (batch, width).
    return positions.unsqueeze(0).lt(lengths.unsqueeze(1)).bool()

In [None]:
import torch

# Row i must be True for the first lengths[i] positions; max_len is left at
# its default, so the expected width is the largest length (3).
assert (
    get_mask_from_lengths(torch.LongTensor([1, 3, 2, 1]))
    == torch.Tensor(
        [
            [True, False, False],
            [True, True, True],
            [True, True, False],
            [True, False, False],
        ]
    )
).all()

In [None]:
# export
import torch.distributed as dist


def reduce_tensor(tensor, n_gpus):
    """Sum a copy of `tensor` across processes and divide it by `n_gpus`.

    The input is cloned first, so the caller's tensor is left untouched.
    """
    averaged = tensor.clone()
    dist.all_reduce(averaged, op=dist.ReduceOp.SUM)
    return averaged.div_(n_gpus)

In [None]:
# export
def subsequent_mask(length):
    """Return a lower-triangular matrix of ones shaped (1, 1, length, length)."""
    lower_triangle = torch.ones(length, length).tril()
    return lower_triangle[None, None]

In [None]:
# For length 2 the mask is lower-triangular: row 0 sees only column 0.
assert (subsequent_mask(2) == torch.tensor([[[1, 0], [1, 1]]])).all()

In [None]:
# export
def convert_pad_shape(pad_shape):
    """Flatten a list of lists, visiting the sub-lists last to first.

    Items inside each sub-list keep their original order.
    """
    flattened = []
    for group in pad_shape[::-1]:
        flattened.extend(group)
    return flattened

In [None]:
# Sub-lists come out in reverse order, each keeping its own item order.
convert_pad_shape([[1, 2], [3, 4], [5, 6, 7]]) == [5, 6, 7, 3, 4, 1, 2]

True

In [None]:
# export
def sequence_mask(length, max_length=None):
    """Build a mask that is True where the position index is below `length`.

    The same as get_mask_from_lengths, except that a missing `max_length`
    is signalled with None and defaults to `length.max()`.
    """
    width = length.max() if max_length is None else max_length
    positions = torch.arange(width, dtype=length.dtype, device=length.device)
    # (1, width) against (batch, 1) broadcasts to (batch, width).
    return positions[None, :] < length[:, None]

In [None]:
# Without max_length the width follows the largest length (3).
assert (
    sequence_mask(torch.tensor([1, 3, 2, 1]))
    == torch.Tensor(
        [
            [True, False, False],
            [True, True, True],
            [True, True, False],
            [True, False, False],
        ]
    )
).all()
# An explicit max_length of 4 adds an all-False trailing column.
assert (
    sequence_mask(torch.tensor([1, 3, 2, 1]), 4)
    == torch.Tensor(
        [
            [True, False, False, False],
            [True, True, True, False],
            [True, True, False, False],
            [True, False, False, False],
        ]
    )
).all()

In [None]:
# export
def generate_path(duration, mask):
    """
    Expand per-position durations into a frame-assignment tensor.

    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]

    Returns a tensor of shape [b, 1, t_y, t_x] with the dtype of `mask`,
    already multiplied by `mask`.
    """
    b, _, t_y, t_x = mask.shape
    # Running total of durations along t_x: where each position's span ends.
    cum_duration = torch.cumsum(duration, -1)

    cum_duration_flat = cum_duration.view(b * t_x)
    # One row per (batch, position): True for every frame index below the
    # cumulative duration of that position.
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    # Shift the rows down by one along t_x (zero row in front, last row cut)
    # and subtract, leaving only the frames added by the current position.
    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    # Reorder to [b, 1, t_y, t_x] and zero out everything outside the mask.
    path = path.unsqueeze(1).transpose(2, 3) * mask
    return path

In [None]:
# export


def slice_segments(x, ids_str, segment_size=4):
    """Cut one window of `segment_size` steps from the last axis of each item.

    `ids_str[i]` is the start offset used for batch item `i`. The output has
    the shape of `x[:, :, :segment_size]` and the dtype/device of `x`.
    """
    segments = torch.zeros_like(x[:, :, :segment_size])
    for batch_idx in range(x.size(0)):
        start = ids_str[batch_idx]
        segments[batch_idx] = x[batch_idx, :, start : start + segment_size]
    return segments


def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Slice one randomly positioned window of `segment_size` per batch item.

    `x` is unpacked as (batch, channels, time). When `x_lengths` is None the
    full time extent of `x` is used as the length for every item.

    Returns the sliced segments together with the start offsets that were
    drawn.

    NOTE(review): torch.rand samples from [0, 1) and the product is truncated
    to long, so a start equal to `x_lengths - segment_size` is never drawn
    when that difference is positive -- confirm whether `+ 1` was intended.
    """
    batch, _, total_steps = x.size()
    lengths = total_steps if x_lengths is None else x_lengths
    max_start = lengths - segment_size
    uniform = torch.rand([batch]).to(device=x.device)
    ids_str = (uniform * max_start).to(dtype=torch.long)
    return slice_segments(x, ids_str, segment_size), ids_str

In [None]:
# export
def init_weights(m, mean=0.0, std=0.01):
    """Re-draw `m.weight` from N(mean, std) when the class name contains "Conv".

    Modules whose class name does not contain "Conv" are left untouched.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)

In [None]:
# export
def get_padding(kernel_size, dilation=1):
    """Return `(kernel_size * dilation - dilation) / 2` truncated to an int."""
    dilated_extent = kernel_size * dilation - dilation
    return int(dilated_extent / 2)

In [None]:
# export
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Add both inputs, then gate tanh(first part) with sigmoid(remainder).

    The sum is split along dim 1 at index `n_channels[0]`.
    """
    split_at = n_channels[0]
    summed = input_a + input_b
    gate = torch.sigmoid(summed[:, split_at:, :])
    signal = torch.tanh(summed[:, :split_at, :])
    return signal * gate

In [None]:
# export
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients element-wise in place and return their total norm.

    `parameters` may be a single tensor or an iterable of tensors; entries
    without a gradient are ignored. Each gradient's norm is measured before
    that gradient is clamped, so the returned value is the pre-clipping
    norm. Passing `clip_value=None` skips the clamping and only measures.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    with_grad = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    limit = None if clip_value is None else float(clip_value)

    accumulated = 0
    for param in with_grad:
        grad = param.grad.data
        accumulated += grad.norm(norm_type).item() ** norm_type
        if limit is not None:
            grad.clamp_(min=-limit, max=limit)
    return accumulated ** (1.0 / norm_type)

In [None]:
# export


def intersperse(lst, item):
    """Return a new list with `item` before, between and after the elements."""
    woven = [item]
    for element in lst:
        woven.append(element)
        woven.append(item)
    return woven


def intersperse_emphases(emphases):
    """Remap each [start, end] pair in place to [2 * start, 2 * end + 1].

    The input container is mutated and also returned.
    """
    for span in emphases:
        span[0] = 2 * span[0]
        span[1] = 2 * span[1] + 1
    return emphases

In [None]:
# The filler 0 appears before, between and after the original elements.
intersperse([1, 2, 3, 4], 0) == [0, 1, 0, 2, 0, 3, 0, 4, 0]

True