In [None]:
import thesis.rave
%load_ext autoreload
%autoreload 2

In [None]:
import copy

import os
import time

import crepe
import ddsp
import ddsp.training
from ddsp.colab.colab_utils import (
    auto_tune, get_tuning_factor, download,
    play, record, specplot, upload, audio_bytes_to_np,
    DEFAULT_SAMPLE_RATE)
from ddsp.training.postprocessing import (
    detect_notes, fit_quantile_transform
)
import gin
# from google.colab import files
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pickle
# import tensorflow.compat.v2 as tf
import tensorflow as tf
import tensorflow_datasets as tfds
import einops

# Helper Functions
# sample_rate = DEFAULT_SAMPLE_RATE  # 16000
sample_rate = 16000


def play_audio(audio, rate=sample_rate):
    """Show an inline audio player for `audio` in the notebook output.

    Args:
        audio: Array-like of samples. It is converted with `np.array` and
            then squeezed, so size-1 axes (for example a leading batch axis
            of length 1) are removed before playback.
        rate: Playback sample rate in Hz. Defaults to the value `sample_rate`
            had when this cell was executed (16000).
    """
    # Imported locally so the helper does not depend on a later cell having
    # imported IPython.display first (fresh-kernel safe).
    import IPython.display

    samples = np.squeeze(np.array(audio))
    IPython.display.display(IPython.display.Audio(samples, rate=rate))


print('Done!')

In [None]:
# Read the raw bytes of the input recording. The context manager closes the
# file handle as soon as the bytes are in memory.
# Alternative inputs (swap the path below):
#   "../data/audio/violin/III. Corrente.mp3"
#   "../data/audio/flute/3 Fantaisies for Solo Flute, Op. 38 - Fantaisie no. 1.mp3"
with open("../data/audio/violin/II. Double.mp3", "rb") as input_f:
    wav_bytes = input_f.read()

# Decode the bytes into a sample array.
audio = audio_bytes_to_np(wav_bytes)
# Keep the first `sample_rate * 10` samples
# (10 s if the decoder resamples to `sample_rate` -- TODO confirm).
audio = audio[:sample_rate * 10]

# Add a leading axis when the decoded audio is one-dimensional.
if len(audio.shape) == 1:
    audio = audio[np.newaxis, :]

In [None]:
import IPython.display

# Spectrogram of the loaded clip.
specplot(audio)

# Final expression of the cell, so the notebook renders it as an audio player.
IPython.display.Audio(audio, rate=DEFAULT_SAMPLE_RATE)

In [None]:
from tensorflow.python.ops.numpy_ops import np_config

# Switch on TensorFlow's NumPy-behaviour mode before touching the features.
np_config.enable_numpy_behavior()

# Setup the session.
ddsp.spectral_ops.reset_crepe()

# Compute features (timed).
start_time = time.time()
audio_features = ddsp.training.metrics.compute_audio_features(audio)
# Store loudness as a float32 NumPy array.
audio_features['loudness_db'] = (
    audio_features['loudness_db'].numpy().astype('float32')
)
# No adjusted features yet; a later cell fills this in.
audio_features_mod = None
print('Audio features took %.1f seconds' % (time.time() - start_time))

# Negative stop index: the last 15 frames are left out of every plot.
TRIM = -15

# Plot Features: three stacked panels sharing the frame axis.
fig, ax = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(6, 8))
curves = (
    audio_features['loudness_db'][:TRIM],
    librosa.hz_to_midi(audio_features['f0_hz'][:TRIM]),
    audio_features['f0_confidence'][:TRIM],
)
y_labels = ('loudness_db', 'f0 [midi]', 'f0 confidence')
for panel, curve, y_label in zip(ax, curves, y_labels):
    panel.plot(curve)
    panel.set_ylabel(y_label)
_ = ax[2].set_xlabel('Time step [frame]')



In [None]:
# Snapshot of the extracted features: each value is duplicated via its .copy().
af = {name: feature.copy() for name, feature in audio_features.items()}

In [None]:
# Second snapshot of the same features, kept under its own name.
af0 = dict((key, value.copy()) for key, value in audio_features.items())

In [None]:
# Set path here
# `model_dir` is the folder the following cells read from; the gin config
# path is derived from it.
model_dir = "../data/models/0323-halfrave-1"
gin_file = os.path.join(model_dir, "operative_config-0.gin")

# NOTE(review): leftover scratch note; its meaning is not evident from this cell.
#foo (1, 64000) (1, 2500, 128)

In [None]:
# for the original DDSP model
# model_dir ="../data/train"
# gin_file = os.path.join(model_dir, "operative_config-2700.gin")

In [None]:
gin.enter_interactive_mode()

# Load the dataset statistics.
# DATASET_STATS stays None when the file is missing or cannot be unpickled;
# both cases are reported below instead of passing silently.
DATASET_STATS = None
dataset_stats_file = os.path.join(model_dir, 'dataset_statistics.pkl')
print(f'Loading dataset statistics from {dataset_stats_file}')
try:
    if tf.io.gfile.exists(dataset_stats_file):
        with tf.io.gfile.GFile(dataset_stats_file, 'rb') as f:
            # SECURITY: pickle.load can execute arbitrary code from the file.
            # Only point model_dir at directories you trust.
            DATASET_STATS = pickle.load(f)
    else:
        print(f'Dataset statistics file not found: {dataset_stats_file}. '
              'DATASET_STATS stays None.')
except Exception as err:
    # Deliberately best-effort: report the failure and continue with None.
    print('Loading dataset statistics from pickle failed: {}.'.format(err))

In [None]:
from thesis.timbre_transfer_util import adjust_batch

# Hand adjust_batch a dict of copied values rather than audio_features itself.
af = dict((key, value.copy()) for key, value in audio_features.items())
audio_features_mod = adjust_batch(af, DATASET_STATS)

In [None]:
# Fetch the note-on mask if the adjusted features carry one. The batch axis is
# only indexed after the None check, so a missing/None mask falls back to the
# two-panel layout instead of raising.
try:
    mask_on = audio_features_mod["mask_on"]
except KeyError:
    mask_on = None
if mask_on is not None:
    mask_on = mask_on[0]

# Plot Features.
has_mask = int(mask_on is not None)
n_plots = 3 if has_mask else 2
fig, axes = plt.subplots(nrows=n_plots,
                         ncols=1,
                         sharex=True,
                         figsize=(2 * n_plots, 8))

if has_mask:
    threshold = 1
    ax = axes[0]
    # Dotted reference line at the threshold value.
    ax.plot(np.ones_like(mask_on[:TRIM]) * threshold, 'k:')
    # ax.plot(note_on_value[:TRIM])
    ax.plot(mask_on[:TRIM])
    ax.set_ylabel('Note-on Mask')
    ax.set_xlabel('Time step [frame]')
    # Exactly two curves are drawn above, so exactly two labels: with the
    # likelihood curve disabled, a third label would shift 'Likelihood' onto
    # the mask curve.
    ax.legend(['Threshold', 'Mask'])

# Loudness: original vs. adjusted (adjusted features carry a leading axis).
ax = axes[0 + has_mask]
ax.plot(audio_features['loudness_db'][:TRIM])
ax.plot(audio_features_mod['loudness_db'][0,:TRIM])
ax.set_ylabel('loudness_db')
ax.legend(['Original', 'Adjusted'])

# Pitch in MIDI note numbers: original vs. adjusted.
ax = axes[1 + has_mask]
ax.plot(librosa.hz_to_midi(audio_features['f0_hz'][:TRIM]))
ax.plot(librosa.hz_to_midi(audio_features_mod['f0_hz'][0,:TRIM]))
ax.set_ylabel('f0 [midi]')
_ = ax.legend(['Original', 'Adjusted'])

In [None]:

# Use the adjusted features when they exist, otherwise the raw ones.
if audio_features_mod is None:
    af = audio_features
else:
    af = audio_features_mod

# Run a batch of predictions.
# NOTE(review): `model` is not created in any cell visible here -- confirm it
# is built/restored before this cell runs.
start_time = time.time()
outputs = model(af, training=False)
audio_gen = model.get_audio_from_outputs(outputs)
print('Prediction took %.1f seconds' % (time.time() - start_time))

# Listen: the input clip first, then the model output.
for caption, clip in (('Original', audio), ('Resynthesis', audio_gen[:, :])):
    print(caption)
    IPython.display.display(IPython.display.Audio(clip, rate=DEFAULT_SAMPLE_RATE))

# Plot: one spectrogram per clip, titled accordingly.
for caption, clip in (("Original", audio), ("Resynthesis", audio_gen[:, :])):
    specplot(clip)
    _ = plt.title(caption)