<a href="https://colab.research.google.com/github/yakirhelets/ddsp_demo/blob/master/DDSPDemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#@title #Install and Import

#@markdown Install ddsp, define some helper functions, and clone the sources required for the demo. This transfers a lot of data and _should take a few minutes_.
#@markdown * Make sure to use a GPU runtime, click:  __Runtime >> Change Runtime Type >> GPU__
#@markdown * Press the ▶️ button on the left of each of the cells

# Colab line magic requesting the TensorFlow 2.x runtime (only meaningful on Colab).
%tensorflow_version 2.x

print('Installing from pip package...')
# -q: quiet output, -U: upgrade to the newest published ddsp (version is not pinned).
!pip install -qU ddsp
print('\nHang on! Copying sources...')
# Later cells read examples and models from /content/ddsp_demo/..., so this
# clone is assumed to run with /content as the working directory -- TODO confirm.
!git clone https://github.com/yakirhelets/ddsp_demo.git

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Ignore a bunch of deprecation warnings
import warnings
warnings.filterwarnings("ignore")

import copy
import os
import time

import crepe
import ddsp
import ddsp.training
from ddsp.colab.colab_utils import (download, play, record, specplot, upload,
                                    DEFAULT_SAMPLE_RATE, audio_bytes_to_np)
import gin
from google.colab import files
import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds

# Sample rate shared by the notebook, taken from ddsp's Colab utilities.
# (No helper functions are defined in this cell.)
sample_rate = DEFAULT_SAMPLE_RATE  # 16000


# Signals that the install/import cell ran to completion.
print('Done!')

In [0]:
#@title Upload Audio File
#@markdown * Either use one of the provided examples or upload audio from file (.mp3 or .wav) 
#@markdown * Audio should be monophonic (single instrument / voice)
#@markdown * Extracts fundamental frequency (f0) and loudness features.

example_or_upload = "Singing - Somwhere over the rainbow"  #@param ["Singing - Somwhere over the rainbow", "Singing - Easy", "Guitar - Hysteria", "Upload (.mp3 or .wav)"]

# Folder holding the bundled example recordings (inside the cloned repository).
examples_path = "/content/ddsp_demo/sources/audio_examples/"

# Compare by value. The previous `is not` tested object identity, which is not
# guaranteed to match for two equal string literals, so the upload option could
# fall into the examples branch and fail with a KeyError in get_example().
if example_or_upload != "Upload (.mp3 or .wav)":
  def get_example(example):
    """Return the bundled file name for a dropdown label.

    Args:
      example: One of the example labels offered by the form above.

    Returns:
      File name (relative to `examples_path`) of the matching recording.

    Raises:
      KeyError: If `example` is not one of the bundled example labels.
    """
    return {
        "Singing - Somwhere over the rainbow": 'singing-somewhere_over_the_rainbow.mp3',
        "Singing - Easy": 'singing-commodores_easy.mp3',
        "Guitar - Hysteria": 'guitar-muse_hysteria.mp3'
    }[example]

  example_path = examples_path + get_example(example_or_upload)

  # Read the raw file bytes and let the ddsp Colab helper decode them.
  with open(example_path, "rb") as file_audio:
    audio_as_bytes = file_audio.read()
  audio = audio_bytes_to_np(audio_as_bytes)
else:
  # Load audio sample here (.mp3 or .wav file)
  # Just use the first file.
  filenames, audios = upload()
  audio = audios[0]
# Add a leading axis; later cells index the sample axis as audio.shape[1].
audio = audio[np.newaxis, :]
print('\nExtracting audio features...')

# Plot.
# specplot/play come from ddsp.colab.colab_utils; presumably they render a
# spectrogram and an inline audio player for the loaded clip.
specplot(audio)
play(audio)

# Setup the session.
# NOTE(review): reset_crepe() is opaque from here; presumably it clears the
# cached CREPE pitch model so the cell can be re-run -- confirm in ddsp.
ddsp.spectral_ops.reset_crepe()

# Compute features.
start_time = time.time()
audio_features = ddsp.training.eval_util.compute_audio_features(audio)
# Store loudness as float32.
audio_features['loudness_db'] = audio_features['loudness_db'].astype(np.float32)
# Reset any earlier adjustments: the resynthesis cell falls back to
# `audio_features` while this is None.
audio_features_mod = None
print('Audio features took %.1f seconds' % (time.time() - start_time))


# Draw the three extracted feature tracks in stacked panels sharing the frame axis.
fig, ax = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(6, 8))

# One (values, y-axis label) pair per panel, top to bottom.
panels = [
    (audio_features['loudness_db'], 'loudness_db'),
    (librosa.hz_to_midi(audio_features['f0_hz']), 'f0 [midi]'),
    (audio_features['f0_confidence'], 'f0 confidence'),
]
for axis, (series, label) in zip(ax, panels):
  axis.plot(series)
  axis.set_ylabel(label)

# Only the bottom panel carries the shared x-axis label.
_ = ax[2].set_xlabel('Time step [frame]')

In [0]:
#@title Choose a model

# Label of the pretrained model chosen in the form dropdown.
model = 'Double Bass (34,800 steps)' #@param ['Double Bass (34,800 steps)', 'Double Bass (18,000 steps)', 'Acoustic Guitar (91,800 steps)', 'Electric Guitar (11,400 steps)']
# Keep the chosen label: `model` is rebound to the network object later in this cell.
MODEL = model

# Root folder of the pretrained models inside the cloned repository.
models_path = '/content/ddsp_demo/sources/models'

def get_model_dir(x):
  """Translate a model label from the dropdown into its sub-directory.

  Args:
    x: Model label exactly as it appears in the form.

  Returns:
    Sub-directory name with a leading slash, ready to be appended to the
    models root path.

  Raises:
    KeyError: If `x` is not one of the known model labels.
  """
  subdir_by_label = dict([
      ('Double Bass (18,000 steps)', '/dbass_18000'),
      ('Double Bass (34,800 steps)', '/dbass_34800'),
      ('Acoustic Guitar (91,800 steps)', '/acoustic_91800'),
      ('Electric Guitar (11,400 steps)', '/electric_11400'),
  ])
  return subdir_by_label[x]

# get_model_dir() returns a name with a leading slash, so plain concatenation
# yields '<models_path>/<subdir>'.
model_dir = models_path + get_model_dir(model)
gin_file = os.path.join(model_dir, 'operative_config-0.gin')

# Parse gin config,
# (skip_unknown=True is passed so that, presumably, bindings for names not
# registered in this runtime are ignored -- confirm in the gin docs.)
with gin.unlock_config():
  gin.parse_config_file(gin_file, skip_unknown=True)

# Assumes only one checkpoint in the folder, 'ckpt-[iter]`.
# Every file whose name contains 'ckpt' is a candidate; the first listed one is
# used and everything after its first '.' is dropped to get the checkpoint name.
# NOTE: ckpt_files[0] raises IndexError when no such file exists.
ckpt_files = [f for f in tf.io.gfile.listdir(model_dir) if 'ckpt' in f]
ckpt_name = ckpt_files[0].split('.')[0]
ckpt = os.path.join(model_dir, ckpt_name)

# Ensure dimensions and sampling rates are equal
# Read the sizes the checkpoint was trained with and derive the number of audio
# samples per feature frame (hop size) from them.
time_steps_train = gin.query_parameter('DefaultPreprocessor.time_steps')
n_samples_train = gin.query_parameter('Additive.n_samples')
hop_size = int(n_samples_train / time_steps_train)

# Number of whole hops that fit in the loaded audio, and the sample count
# rounded down to a whole number of hops.
time_steps = int(audio.shape[1] / hop_size)
n_samples = time_steps * hop_size

# Override the training-time sizes with the sizes of this clip.
gin_params = [
    'Additive.n_samples = {}'.format(n_samples),
    'FilteredNoise.n_samples = {}'.format(n_samples),
    'DefaultPreprocessor.time_steps = {}'.format(time_steps),
]

with gin.unlock_config():
  gin.parse_config(gin_params)


# Trim all input vectors to correct lengths
# NOTE: this overwrites `audio_features` in place, so re-running this cell
# operates on the already trimmed features from the previous run.
for key in ['f0_hz', 'f0_confidence', 'loudness_db']:
  audio_features[key] = audio_features[key][:time_steps]
# 'audio' is sliced along its second axis, i.e. it is treated as 2-D here.
audio_features['audio'] = audio_features['audio'][:, :n_samples]


# Set up the model just to predict audio given new conditioning
# From here on `model` is the network object, no longer the label string
# selected in the form (that label is kept in MODEL).
model = ddsp.training.models.Autoencoder()
model.restore(ckpt)

# Build model by running a batch through it.
# The output is discarded; the printed time covers only this forward pass,
# not the restore() call above.
start_time = time.time()
_ = model(audio_features, training=False)
print('Restoring model took %.1f seconds' % (time.time() - start_time))

In [0]:
#@title Modify conditioning

#@markdown These models were not explicitly trained to perform timbre transfer, so they may sound unnatural if the incoming loudness and frequencies are very different from the training data (which will always be somewhat true).

#@markdown Manual adjustments:
#@markdown * <b>f0_octave_shift</b> - Shift the fundamental frequency to a more natural register.
#@markdown * <b>f0_confidence_threshold</b> - Silence audio below a threshold.
#@markdown * <b>loudness_db_shift</b> - Adjust the overall loudness level.
f0_octave_shift =  0 #@param {type:"slider", min:-2, max:2, step:1}
f0_confidence_threshold =  0 #@param {type:"slider", min:0.0, max:1.0, step:0.05}
loudness_db_shift = 0 #@param {type:"slider", min:-20, max:20, step:1}

#@markdown You might get more realistic sounds by shifting a few dB, or try going extreme and see what weird sounds you can make...

# Work on copies: the helper functions below modify the arrays they are given
# in place, and `audio_features` must stay intact for the "Original" curves.
# Relies on every value in `audio_features` providing a .copy() method.
audio_features_mod = {k: v.copy() for k, v in audio_features.items()}


## Helper functions.
def shift_ld(audio_features, ld_shift=0.0):
  """Add a constant offset to the loudness track.

  Args:
    audio_features: Dict of features; its 'loudness_db' entry is updated
      with the augmented-assignment operator (in place for NumPy arrays).
    ld_shift: Amount added to every loudness value.

  Returns:
    The same `audio_features` dict that was passed in.
  """
  loudness = audio_features['loudness_db']
  loudness += ld_shift
  audio_features['loudness_db'] = loudness
  return audio_features


def shift_f0(audio_features, f0_octave_shift=0.0):
  """Transpose the pitch track by whole octaves and clamp it.

  Args:
    audio_features: Dict of features; its 'f0_hz' entry is scaled with the
      augmented-assignment operator and then replaced by the clipped result.
    f0_octave_shift: Number of octaves to shift; the track is multiplied
      by 2 ** f0_octave_shift.

  Returns:
    The same `audio_features` dict that was passed in.
  """
  scaled = audio_features['f0_hz']
  scaled *= 2.0 ** (f0_octave_shift)
  # Upper bound is the frequency of MIDI note 110; lower bound is 0 Hz.
  ceiling_hz = librosa.midi_to_hz(110.0)
  audio_features['f0_hz'] = np.clip(scaled, 0.0, ceiling_hz)
  return audio_features


def mask_by_confidence(audio_features, confidence_level=0.1):
  """Zero the pitch wherever the pitch confidence is below a threshold.

  Only 'f0_hz' is changed; loudness is left as it is (the loudness masking
  is commented out below). Original author's note: for the violin model, the
  masking causes fast dips in loudness, and this quick transient is
  interpreted by the model as the "plunk" sound.

  Args:
    audio_features: Dict of features holding 'f0_confidence' and 'f0_hz';
      'f0_hz' is modified in place.
    confidence_level: Frames with confidence strictly below this value are
      masked.

  Returns:
    The same `audio_features` dict that was passed in.
  """
  low_confidence = audio_features['f0_confidence'] < confidence_level
  pitch = audio_features['f0_hz']
  pitch[low_confidence] = 0.0
  # audio_features['loudness_db'][mask_idx] = -ddsp.spectral_ops.LD_RANGE
  return audio_features


# Run the three adjustments in order (loudness, pitch, confidence mask),
# feeding each one the result of the previous step.
for adjust, amount in ((shift_ld, loudness_db_shift),
                       (shift_f0, f0_octave_shift),
                       (mask_by_confidence, f0_confidence_threshold)):
  audio_features_mod = adjust(audio_features_mod, amount)


# Compare original and adjusted features in three stacked panels.
fig, ax = plt.subplots(nrows=3, ncols=1, sharex=True, figsize=(6, 8))

# The top two panels overlay the original and the adjusted track of one feature:
# (panel, feature key, y-axis label, conversion applied before plotting).
comparisons = (
    (ax[0], 'loudness_db', 'loudness_db', lambda values: values),
    (ax[1], 'f0_hz', 'f0 [midi]', librosa.hz_to_midi),
)
for axis, key, ylabel, convert in comparisons:
  axis.plot(convert(audio_features[key]))
  axis.plot(convert(audio_features_mod[key]))
  axis.set_ylabel(ylabel)
  axis.legend(['Original', 'Adjusted'])

# Bottom panel: confidence track with the chosen threshold as a flat line.
confidence = audio_features_mod['f0_confidence']
ax[2].plot(confidence)
ax[2].plot(np.ones_like(confidence) * f0_confidence_threshold)
ax[2].set_ylabel('f0 confidence')
_ = ax[2].set_xlabel('Time step [frame]')

In [0]:
#@title #Resynthesize Audio

# Use the adjusted features when the "Modify conditioning" cell has produced
# them; otherwise fall back to the unmodified features.
af = audio_features if audio_features_mod is None else audio_features_mod

# Run a batch of predictions.
# `model` is the network restored in the "Choose a model" cell.
start_time = time.time()
audio_gen = model(af, training=False)
print('Prediction took %.1f seconds' % (time.time() - start_time))

# Listen: a captioned audio player for the input and for the model output.
for caption, clip in (('Original', audio), ('Resynthesis', audio_gen)):
  print(caption)
  play(clip)

# Look: one titled spectrogram for each of the two clips, in the same order.
for heading, clip in (("Original", audio), ("Resynthesis", audio_gen)):
  specplot(clip)
  _ = plt.title(heading)