Why does the RAVE-style decoder produce metallic artifacts? Could it be overfitting to the loss?

In [None]:
%load_ext autoreload
%autoreload 2
import gin

gin.enter_interactive_mode()

import tensorflow as tf
from thesis.notebook_util import play_audio, audio_bytes_to_np, specplot
import tqdm.notebook as tqdm
import numpy as np
import matplotlib.pyplot as plt

from ddsp.losses import SpectralLoss

In [None]:
def read_wav(path):
    """Load an audio file and split it into two equal-length halves.

    The file at ``path`` is read as raw bytes and decoded with
    ``audio_bytes_to_np``. The decoded signal is then cut in the middle.

    Args:
        path: Location of the audio file to read.

    Returns:
        A tuple ``(audio_synth, audio_true)`` holding the first and the second
        half of the decoded signal. Both halves always have the same length;
        when the signal has an odd number of samples, the final sample is
        dropped.
    """
    with open(path, "rb") as f:
        audio = audio_bytes_to_np(f.read())

    # Slice both halves with one shared length. Using `audio[half:]` for the
    # second half would make it one sample longer than the first whenever the
    # total length is odd, and the halves are later compared sample-for-sample.
    half = len(audio) // 2
    audio_synth = audio[:half]
    audio_true = audio[half:2 * half]

    return audio_synth, audio_true


def compare_losses(audio_synth, audio_true, n=256, crop_back=False, max_random_crop=0):
    """Evaluate the spectral loss for ``n`` successive start offsets.

    For every offset ``i`` in ``0 .. n - 1`` both signals are sliced from
    sample ``i`` onwards and passed to a ``SpectralLoss`` instance.

    Args:
        audio_synth: First signal handed to the loss.
        audio_true: Second signal handed to the loss.
        n: Number of start offsets to evaluate.
        crop_back: When true, each slice is limited to ``2 ** 14`` samples
            starting at the offset; otherwise the slice runs to the end of
            the signal.
        max_random_crop: Forwarded unchanged to ``SpectralLoss``.

    Returns:
        ``np.array`` built from the ``n`` loss values, in offset order.
    """
    window = 2 ** 14

    # Weights and loss type were labelled "Defaults from DDSP" by the original author.
    spectral_loss = SpectralLoss(
        logmag_weight=1.0,
        mag_weight=1.0,
        loss_type="L1",
        max_random_crop=max_random_crop,
    )

    def loss_at(offset):
        # A stop of None means "slice to the end of the signal".
        stop = offset + window if crop_back else None
        return spectral_loss(audio_synth[offset:stop], audio_true[offset:stop])

    return np.array([loss_at(offset) for offset in tqdm.trange(n)])

In [None]:
def rave_vs_ddsp(**kwargs):
    """Plot the offset-dependent loss curves of the RAVE and DDSP recordings.

    Each recording is loaded with ``read_wav`` and its two halves are passed
    to ``compare_losses``. Both resulting curves are drawn into one matplotlib
    figure (RAVE first, then DDSP) and the figure is shown.

    Args:
        **kwargs: Forwarded unchanged to ``compare_losses``.

    Returns:
        A tuple ``(losses_rave, losses_ddsp)`` with the two loss arrays.
    """
    recordings = (
        "/Users/vaclav/prog/thesis/data/audio/reconstruction_rave.wav",
        "/Users/vaclav/prog/thesis/data/audio/reconstruction_ddsp.wav",
    )

    curves = []
    for recording in recordings:
        synth, true = read_wav(recording)
        curves.append(compare_losses(synth, true, **kwargs))

    for curve in curves:
        plt.plot(curve)
    plt.show()

    losses_rave, losses_ddsp = curves
    return losses_rave, losses_ddsp

In [None]:
# Compare both recordings over 128 start offsets with max_random_crop=512
# passed through to the spectral loss.
rave_vs_ddsp(n=128, max_random_crop=512)

In [None]:
# Same comparison over 128 start offsets, but with max_random_crop=0.
rave_vs_ddsp(n=128, max_random_crop=0)

In [None]:
# NOTE(review): within the visible notebook, the globals `loss_op`, `x` and `y`
# are only assigned in the following cell, so this cell depends on that cell
# having been executed first.
loss_op(x, y)

In [None]:
# Spectral loss with the same weights and loss type as in compare_losses,
# here with max_random_crop=512.
loss_op = SpectralLoss(
    logmag_weight=1.0, mag_weight=1.0, loss_type="L1", max_random_crop=512
)
# Only audio_synth is used below; audio_true is unpacked but not read in this cell.
audio_synth, audio_true = read_wav("/Users/vaclav/prog/thesis/data/audio/reconstruction_rave.wav")

# Record the loss between a signal `x` and a scaled copy `y = x * a` so that it
# can be differentiated with respect to the scalar scale factor `a`.
with tf.GradientTape() as tape:
    a = tf.Variable(0.5)
    x = tf.constant(audio_synth)
    y = x * a
    loss = loss_op(x, y)

# Gradient of the loss with respect to `a`. Nothing is clipped or applied here;
# the value is only displayed as the cell output.

grads = tape.gradient(loss, a)
grads

In [None]:
# Reload both halves of the RAVE recording into the notebook globals
# `audio_synth` and `audio_true`, which the cells below read.
audio_synth, audio_true = read_wav("/Users/vaclav/prog/thesis/data/audio/reconstruction_rave.wav")

In [None]:
from random import random
from scipy.signal import lfilter

def random_angle(min_f=20, max_f=8000, sr=24000):
    """Draw a random frequency on a log scale and return it in radians per sample.

    A single call to ``random()`` picks a point uniformly between
    ``log(min_f)`` and ``log(max_f)``; the exponential of that point is the
    frequency, which is then scaled by ``2 * pi / sr``.

    Args:
        min_f: Lower frequency bound (same unit as ``sr``).
        max_f: Upper frequency bound (same unit as ``sr``).
        sr: Sample rate used to normalise the frequency.

    Returns:
        The angular frequency ``2 * pi * f / sr`` of the drawn frequency ``f``.
    """
    log_low, log_high = np.log(min_f), np.log(max_f)
    frequency = np.exp(random() * (log_high - log_low) + log_low)
    return 2 * np.pi * frequency / sr


def pole_to_z_filter(omega, amplitude=.9):
    """Build second-order filter coefficients from a complex pole.

    The pole is ``z0 = amplitude * exp(1j * omega)``. The denominator is
    ``[1, -2 * Re(z0), |z0| ** 2]``, i.e. the real-coefficient polynomial with
    roots ``z0`` and its conjugate, and the numerator is the same list in
    reverse order (the coefficient layout of a second-order all-pass section).

    Args:
        omega: Pole angle in radians.
        amplitude: Pole radius.

    Returns:
        A tuple ``(b, a)`` of numerator and denominator coefficient lists.
    """
    pole = amplitude * np.exp(1j * omega)
    radius_squared = abs(pole) ** 2

    denominator = [1, -2 * np.real(pole), radius_squared]
    # Reversing the denominator yields the numerator coefficients.
    numerator = denominator[::-1]
    return numerator, denominator


def random_phase_mangle(x, min_f, max_f, amp, sr, angle=None):
    """Filter ``x`` with a second-order section whose pole angle is random.

    The pole angle is drawn with ``random_angle(min_f, max_f, sr)``, turned
    into filter coefficients by ``pole_to_z_filter`` and applied to ``x`` with
    ``scipy.signal.lfilter``.

    Previously the drawn angle was printed and then unconditionally replaced
    by ``-np.pi / 2``, so ``min_f``, ``max_f`` and ``sr`` had no effect on the
    result. The drawn angle is now actually used; pass ``angle=-np.pi / 2`` to
    reproduce the earlier fixed-pole behaviour.

    Args:
        x: Signal to filter.
        min_f: Lower bound of the random pole frequency.
        max_f: Upper bound of the random pole frequency.
        amp: Pole radius passed to ``pole_to_z_filter``.
        sr: Sample rate used to convert the frequency to an angle.
        angle: Optional fixed pole angle in radians. When given, no random
            angle is drawn and ``min_f``, ``max_f`` and ``sr`` are ignored.

    Returns:
        The filtered signal as returned by ``lfilter``.
    """
    if angle is None:
        angle = random_angle(min_f, max_f, sr)
    # Show which pole angle was used so a run can be reproduced via `angle=`.
    print(angle)
    b, a = pole_to_z_filter(angle, amp)
    return lfilter(b, a, x)

In [None]:
# Filter the second half of the recording with random_phase_mangle and overlay
# the first 100 samples of the unfiltered and filtered signals.
audio_true_2 = random_phase_mangle(audio_true, 20, 2000, .99, 16000)
plt.plot(audio_true[:100])
plt.plot(audio_true_2[:100])

In [None]:
import librosa
# Pitch-shifted copy of audio_true (n_steps=1, sr=16000) for listening below.
audio_true_3 = librosa.effects.pitch_shift(audio_true, sr=16000, n_steps=1)

In [None]:
# Listen to the pitch-shifted signal.
play_audio(audio_true_3)

In [None]:
# Listen to the signal produced by random_phase_mangle.
play_audio(audio_true_2)