In [None]:
import thesis.rave
%load_ext autoreload
%autoreload 2

In [None]:
import copy

import os
import time

import crepe
import ddsp
import ddsp.training
from ddsp.colab.colab_utils import (
    auto_tune, get_tuning_factor, download,
    play, record, specplot, upload, audio_bytes_to_np,
    DEFAULT_SAMPLE_RATE)
from ddsp.training.postprocessing import (
    detect_notes, fit_quantile_transform
)
import gin
# from google.colab import files
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pickle
# import tensorflow.compat.v2 as tf
import tensorflow as tf
import tensorflow_datasets as tfds
import einops

# Helper Functions
# sample_rate = DEFAULT_SAMPLE_RATE  # 16000
sample_rate = 16000


def play_audio(audio, rate=sample_rate):
    """Show an inline audio player for `audio`.

    Args:
        audio: Array-like of samples (anything `np.array` accepts). Size-1
            axes are removed with `np.squeeze` before playback.
        rate: Playback sample rate in Hz. Defaults to `sample_rate`.
    """
    # Imported locally: the import block of this cell does not import
    # IPython, so the helper previously only worked once some later cell had
    # executed `import IPython.display`.
    import IPython.display

    samples = np.squeeze(np.array(audio))
    IPython.display.display(IPython.display.Audio(samples, rate=rate))


print('Done!')

In [None]:
# Read the encoded file inside a context manager so the handle is closed as
# soon as the bytes are in memory (the bare open() left it open for the
# lifetime of the kernel).
with open("../data/audio/violin/II. Double.mp3", "rb") as input_f:
    wav_bytes = input_f.read()
audio = audio_bytes_to_np(wav_bytes)
# Keep only the first `sample_rate * 10` samples.
audio = audio[:sample_rate * 10]

# Give 1-D audio a leading axis; later cells index `audio.shape[1]`.
if len(audio.shape) == 1:
    audio = audio[np.newaxis, :]

In [None]:
# Spectrogram of the loaded clip.
specplot(audio)

# This import binds the name `IPython`, which later cells use through
# `IPython.display.display` / `IPython.display.Audio`.
import IPython.display

# Last expression of the cell, so the notebook renders it as an audio player.
IPython.display.Audio(audio, rate=DEFAULT_SAMPLE_RATE)

In [None]:
# Turn on TensorFlow's NumPy-compatible tensor behavior.
# NOTE(review): which later calls depend on this is not visible here -- confirm.
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

# Setup the session.
ddsp.spectral_ops.reset_crepe()

# Compute features.
start_time = time.time()
audio_features = ddsp.training.metrics.compute_audio_features(audio)
# audio_features['loudness_db'] = audio_features['loudness_db'].astype(np.float32)
# Replace the loudness entry with a float32 NumPy array (the `.numpy()` call
# implies the value returned above is a tensor).
audio_features['loudness_db'] = audio_features["loudness_db"].numpy().astype("float32")
# Reset any previously modified conditioning; the resynthesis cell falls back
# to `audio_features` while this is None.
audio_features_mod = None
print('Audio features took %.1f seconds' % (time.time() - start_time))

# Slice end used by every plot in this notebook: `[:TRIM]` drops the last
# 15 frames of each curve.
TRIM = -15

# Three stacked panels sharing the frame axis: loudness, pitch, confidence.
fig, ax = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(6, 8))

panels = [
    ('loudness_db', audio_features['loudness_db'][:TRIM]),
    # f0 is shown in MIDI note numbers rather than Hz.
    ('f0 [midi]', librosa.hz_to_midi(audio_features['f0_hz'][:TRIM])),
    ('f0 confidence', audio_features['f0_confidence'][:TRIM]),
]
for panel_ax, (panel_label, curve) in zip(ax, panels):
    panel_ax.plot(curve)
    panel_ax.set_ylabel(panel_label)

# Assigning to `_` keeps the Text object from being echoed as cell output.
_ = ax[2].set_xlabel('Time step [frame]')



In [None]:
# 109s

In [None]:
# Set path here: directory of the trained model, and the operative gin
# config stored inside it.
model_dir = "../data/models/0323-halfrave-1"
gin_file = os.path.join(model_dir, "operative_config-0.gin")

#foo (1, 64000) (1, 2500, 128)

In [None]:
# for the original DDSP model
# model_dir ="../data/train"
# gin_file = os.path.join(model_dir, "operative_config-2700.gin")

In [None]:
# Put gin into its interactive mode before any config is parsed in this cell.
# NOTE(review): presumably needed so cells can be re-run in a notebook -- confirm.
gin.enter_interactive_mode()

# Load the dataset statistics.
# DATASET_STATS stays None when the file is missing or cannot be read; the
# "Modify conditioning" cell then skips its automatic adjustments.
DATASET_STATS = None
dataset_stats_file = os.path.join(model_dir, 'dataset_statistics.pkl')
print(f'Loading dataset statistics from {dataset_stats_file}')
try:
    if tf.io.gfile.exists(dataset_stats_file):
        with tf.io.gfile.GFile(dataset_stats_file, 'rb') as f:
            # SECURITY: unpickling can execute arbitrary code. Only point
            # `model_dir` at model folders you trust.
            DATASET_STATS = pickle.load(f)
except Exception as err:
    # Deliberately best-effort: report the failure and continue without stats.
    print('Loading dataset statistics from pickle failed: {}.'.format(err))

# Parse gin config,
with gin.unlock_config():
    gin.parse_config_file(gin_file, skip_unknown=True)

# Assumes only one checkpoint in the folder, 'ckpt-[iter]`.
ckpt_files = [f for f in tf.io.gfile.listdir(model_dir) if 'ckpt' in f]
if not ckpt_files:
    # Without this guard an empty folder surfaced as a bare
    # "IndexError: list index out of range" on the next line.
    raise FileNotFoundError(
        f"No checkpoint files (names containing 'ckpt') found in {model_dir}")
# Drop everything from the first '.' on, so the remaining name is the
# checkpoint prefix handed to `model.restore`.
ckpt_name = ckpt_files[0].split('.')[0]
ckpt = os.path.join(model_dir, ckpt_name)

# Ensure dimensions and sampling rates are equal
# Number of conditioning frames the model was configured with for training.
time_steps_train = gin.query_parameter('F0LoudnessPreprocessor.time_steps')

# n_samples_train = gin.query_parameter('n_samples')
# TODO: How to read this from gin? `gin.query_parameter("NEWTHarmonic.samples_train")`
# returns "%n_samples" rather than the value of the `n_samples` macro.
# Hard-coded until the TODO above is resolved; must match the training config.
n_samples_train = 64000
# print(time_steps_train, n_samples_train)
# Audio samples per conditioning frame.
hop_size = int(n_samples_train / time_steps_train)


# Whole number of frames that fit in the input clip, and the matching sample
# count (a multiple of hop_size).
time_steps = int(audio.shape[1] / hop_size)
n_samples = time_steps * hop_size

# print("===Trained model===")
# print("Time Steps", time_steps_train)
# print("Samples", n_samples_train)
# print("Hop Size", hop_size)
# print("\n===Resynthesis===")
# print("Time Steps", time_steps)
# print("Samples", n_samples)
# print('')

# Override the length-related bindings so the model is built for the length
# of this clip instead of the training length.
gin_params = [
    'RAVEWaveformGenerator.n_samples = {}'.format(n_samples),
    'MultibandFilteredNoise.n_samples = {}'.format(n_samples),
    'F0LoudnessPreprocessor.time_steps = {}'.format(time_steps),
    'oscillator_bank.use_angular_cumsum = True',  # Avoids cumsum accumulation errors.
]

with gin.unlock_config():
    gin.parse_config(gin_params)

# Crop the frame-wise features to `time_steps` frames and the waveform to
# `n_samples` samples so they match the lengths the model is configured for.
audio_features.update({
    feature_name: audio_features[feature_name][:time_steps]
    for feature_name in ('f0_hz', 'f0_confidence', 'loudness_db')
})
audio_features['audio'] = audio_features['audio'][:, :n_samples]

# Create the autoencoder and load the weights of the checkpoint found above.
model = ddsp.training.models.Autoencoder()
model.restore(ckpt)

# One forward pass builds the model; its output is discarded.
start_time = time.time()
_ = model(audio_features, training=False)
print('Restoring model took %.1f seconds' % (time.time() - start_time))

In [None]:
#@title Modify conditioning

#@markdown These models were not explicitly trained to perform timbre transfer, so they may sound unnatural if the incoming loudness and frequencies are very different than the training data (which will always be somewhat true).


#@markdown ## Note Detection

#@markdown You can leave this at 1.0 for most cases
threshold = 1.5  #@param {type:"slider", min: 0.0, max:2.0, step:0.01}

#@markdown ## Automatic

ADJUST = True  #@param{type:"boolean"}

#@markdown Quiet parts without notes detected (dB)
quiet = 20  #@param {type:"slider", min: 0, max:60, step:1}

#@markdown Force pitch to nearest note (amount)
autotune = 0  #@param {type:"slider", min: 0.0, max:1.0, step:0.1}

#@markdown ## Manual


#@markdown Shift the pitch (octaves)
pitch_shift = -1  #@param {type:"slider", min:-2, max:2, step:1}

#@markdown Adjust the overall loudness (dB)
loudness_shift = 0  #@param {type:"slider", min:-20, max:20, step:1}

# Work on per-key copies so the adjustments below leave `audio_features`
# untouched (it is plotted as the "Original" curve further down).
audio_features_mod = {k: v.copy() for k, v in audio_features.items()}


## Helper functions.
def shift_ld(audio_features, ld_shift=0.0):
    """Add `ld_shift` (in dB) to the loudness curve.

    Args:
        audio_features: Dict holding a 'loudness_db' entry. The entry is
            updated with `+=`, so a NumPy array is modified in place.
        ld_shift: Offset added to every loudness value.

    Returns:
        The same `audio_features` dict that was passed in.
    """
    loudness = audio_features['loudness_db']
    loudness += ld_shift
    audio_features['loudness_db'] = loudness
    return audio_features


def shift_f0(audio_features, pitch_shift=0.0):
    """Transpose the f0 curve by `pitch_shift` octaves, then clip it.

    Args:
        audio_features: Dict holding an 'f0_hz' entry. The entry is scaled
            with `*=` and then replaced by the clipped result.
        pitch_shift: Number of octaves to shift by; negative values shift down.

    Returns:
        The same `audio_features` dict, with 'f0_hz' limited to the range
        from 0 Hz to the frequency of MIDI note 110.
    """
    octave_ratio = 2.0 ** pitch_shift
    f0_ceiling_hz = librosa.midi_to_hz(110.0)

    audio_features['f0_hz'] *= octave_ratio
    audio_features['f0_hz'] = np.clip(audio_features['f0_hz'], 0.0, f0_ceiling_hz)
    return audio_features


# Stays None when the automatic adjustment is skipped; the plotting code below
# uses that to decide whether to draw the note-on panel.
mask_on = None

if ADJUST and DATASET_STATS is not None:
    # Detect sections that are "on".
    mask_on, note_on_value = detect_notes(audio_features['loudness_db'],
                                          audio_features['f0_confidence'],
                                          threshold)

    if np.any(mask_on):
        # Shift the pitch register.
        # Compare the mean MIDI pitch of the "on" frames with the training-set
        # mean and shift by a whole number of octaves (12 semitones each).
        target_mean_pitch = DATASET_STATS['mean_pitch']
        pitch = ddsp.core.hz_to_midi(audio_features['f0_hz'])
        mean_pitch = np.mean(pitch[mask_on])
        p_diff = target_mean_pitch - mean_pitch
        p_diff_octave = p_diff / 12.0
        # Round down only when the gap exceeds 1.5 octaves, otherwise round up.
        round_fn = np.floor if p_diff_octave > 1.5 else np.ceil
        p_diff_octave = round_fn(p_diff_octave)
        audio_features_mod = shift_f0(audio_features_mod, p_diff_octave)

        # Quantile shift the note_on parts.
        _, loudness_norm = fit_quantile_transform(
            audio_features['loudness_db'],
            mask_on,
            inv_quantile=DATASET_STATS['quantile_transform'])

        # Turn down the note_off parts.
        # Attenuation is up to `quiet` dB, scaled by (1 - note_on_value).
        # NOTE(review): the `[:, np.newaxis]` assumes `loudness_norm` has a
        # trailing axis at this point -- confirm against fit_quantile_transform.
        mask_off = np.logical_not(mask_on)
        loudness_norm[mask_off] -= quiet * (1.0 - note_on_value[mask_off][:, np.newaxis])
        loudness_norm = np.reshape(loudness_norm, audio_features['loudness_db'].shape)

        audio_features_mod['loudness_db'] = loudness_norm

        # Auto-tune.
        # Skipped when `autotune` is 0.
        if autotune:
            f0_midi = np.array(ddsp.core.hz_to_midi(audio_features_mod['f0_hz']))
            tuning_factor = get_tuning_factor(f0_midi, audio_features_mod['f0_confidence'], mask_on)
            f0_midi_at = auto_tune(f0_midi, tuning_factor, mask_on, amount=autotune)
            audio_features_mod['f0_hz'] = ddsp.core.midi_to_hz(f0_midi_at)

    else:
        # Reached only when detect_notes marked no frame as "on".
        print('\nSkipping auto-adjust (no notes detected or ADJUST box empty).')

else:
    print('\nSkipping auto-adujst (box not checked or no dataset statistics found).')

# Manual Shifts.
# Applied whether or not the automatic adjustment ran.
audio_features_mod = shift_ld(audio_features_mod, loudness_shift)
audio_features_mod = shift_f0(audio_features_mod, pitch_shift)

# audio_features_mod["f0_hz"] = np.sin(np.linspace(0, 20, audio_features_mod["f0_hz"].shape[0])) * 100 + 600. + np.linspace(0, 500, audio_features_mod["f0_hz"].shape[0])

# Plot Features.
# `has_mask` is 0 or 1 and doubles as the row offset of the feature panels.
has_mask = int(mask_on is not None)
n_plots = 2 + has_mask
fig, axes = plt.subplots(nrows=n_plots, ncols=1, sharex=True,
                         figsize=(2 * n_plots, 8))

if has_mask:
    # Top panel: detection threshold, note-on likelihood and resulting mask.
    ax = axes[0]
    threshold_line = np.ones_like(mask_on[:TRIM]) * threshold
    ax.plot(threshold_line, 'k:')
    ax.plot(note_on_value[:TRIM])
    ax.plot(mask_on[:TRIM])
    ax.set_ylabel('Note-on Mask')
    ax.set_xlabel('Time step [frame]')
    ax.legend(['Threshold', 'Likelihood', 'Mask'])

# One panel per feature, each overlaying the original and adjusted curves.
feature_panels = (
    ('loudness_db', 'loudness_db', lambda values: values),
    ('f0_hz', 'f0 [midi]', librosa.hz_to_midi),
)
for row, (feature_key, y_label, to_plot_units) in enumerate(feature_panels):
    ax = axes[row + has_mask]
    for features in (audio_features, audio_features_mod):
        ax.plot(to_plot_units(features[feature_key][:TRIM]))
    ax.set_ylabel(y_label)
    _ = ax.legend(['Original', 'Adjusted'])

In [None]:
#@title #Resynthesize Audio

# Use the modified conditioning when it exists, otherwise the raw features.
af = audio_features_mod if audio_features_mod is not None else audio_features

# Run a batch of predictions.
start_time = time.time()
outputs = model(af, training=False)
audio_gen = model.get_audio_from_outputs(outputs)
print('Prediction took %.1f seconds' % (time.time() - start_time))

clips = (('Original', audio), ('Resynthesis', audio_gen[:,:]))

# Audio players first ...
for clip_title, clip in clips:
    print(clip_title)
    IPython.display.display(IPython.display.Audio(clip, rate=DEFAULT_SAMPLE_RATE))

# ... then one titled spectrogram per clip.
for clip_title, clip in clips:
    specplot(clip)
    _ = plt.title(clip_title)

In [None]:
# Echo the generated audio object as the cell output.
audio_gen

In [None]:
# List the entries the model returned; the cells below index into some of them.
outputs.keys()

In [None]:
from thesis.util import resample
from thesis.pqmf import PQMFBank
# 16-band PQMF bank; the cells below call its `synthesis` on the multiband
# signals in `outputs`.
pqmf = PQMFBank(n_bands=16, attenuation=100)

In [None]:
# Run the waveform generator's signal through PQMF synthesis, then listen to
# it and plot its spectrogram.
waveform = pqmf.synthesis(outputs["rave_waveform_generator"]["signal"])
play_audio(waveform)
# NOTE(review): `[:,:,0]` assumes a trailing channel axis -- confirm.
specplot(waveform[:,:,0])

In [None]:
# Same inspection for the filtered-noise signal: synthesize, play, plot.
noise = pqmf.synthesis(outputs["multiband_filtered_noise"]["signal"])
play_audio(noise)
# NOTE(review): `[:,:,0]` assumes a trailing channel axis -- confirm.
specplot(noise[:,:,0])

In [None]:
# Plot every 100th step of indices 50-54 on the last axis, for the first item
# of the "crop" signal.
# NOTE(review): axis meaning (batch, time, channel) is assumed -- confirm.
plt.plot(outputs["crop"]["signal"][0,::100,50:55])

In [None]:
# Convert the generator signal to NumPy and resample it to 16x its length
# along axis 1.
# NOTE(review): the factor 16 presumably mirrors the 16 PQMF bands -- confirm.
temp0 = outputs["rave_waveform_generator"]["signal"].numpy()
a2 = resample(temp0, temp0.shape[1] * 16)

In [None]:
# Check the shape of the resampled signal.
a2.shape

In [None]:
# Listen to index 0 of the last axis of the resampled signal.
play_audio(a2[:,:,0])

In [None]:
# Repeats the import from the first cell so this cell also runs on its own.
import thesis.rave
# Grab the waveform generator from the model's processor group.
# NOTE(review): index 1 is assumed to be the RAVEWaveformGenerator -- confirm
# against the gin config.
waveform_gen: thesis.rave.RAVEWaveformGenerator = model.processor_group.processors[1]

In [None]:
# The "crop" signal is fed to the generator's sub-networks in the cells below.
control_embedding = outputs["crop"]["signal"]

In [None]:
# Run only the generator's `loud_gen` sub-network on the embedding.
loudness = waveform_gen.loud_gen(control_embedding)

In [None]:
# Run only the `wave_gen` sub-network and squash its output with tanh.
# Note: this rebinds `waveform`, which an earlier cell used for the
# PQMF-synthesized generator signal.
waveform = tf.tanh(waveform_gen.wave_gen(control_embedding))
# Pass the tanh output through PQMF synthesis.
waveform_single = pqmf.synthesis(waveform)

In [None]:
# Overlay every 100th step of the loudness output and of the synthesized
# waveform (first item, index 0 of the last axis).
plt.plot(loudness[0,::100,0])
plt.plot(waveform_single[0,::100,0])

In [None]:
# Steps 1000-1049 of the first four entries on the last axis of the tanh output.
# NOTE(review): the last axis is presumably the PQMF band -- confirm.
plt.plot(waveform[0,1000:1050,:4])

In [None]:
# Check the shape of the control embedding.
control_embedding.shape

In [None]:
# Spectrogram of index 0 on the last axis of the tanh output (first item).
# `waveform` here is the value rebound by the `tf.tanh(...)` cell above.
specplot(waveform[0,:,0])

In [None]:
# Listen to the PQMF-synthesized output of `wave_gen`.
play_audio(waveform_single)

In [None]:
# `waveform_gen_input` was not assigned in any visible cell, so on a fresh
# kernel this call raised NameError. The generator's sub-networks
# (`loud_gen`, `wave_gen`) are fed `control_embedding` above.
# NOTE(review): presumably the full generator takes the same tensor -- confirm.
waveform_gen_input = control_embedding
waveform_gen.call(waveform_gen_input)