In [None]:
%load_ext autoreload
%autoreload 2
import tensorflow as tf
import ddsp
import ddsp.training
import numpy as np
import IPython.display

import gin
# Switch gin to interactive mode as soon as it is imported.
# NOTE(review): presumably needed so gin configurables can be re-registered when
# cells are re-run / modules are autoreloaded — confirm against the gin docs.
gin.enter_interactive_mode()

def play_audio(audio, sample_rate=16000):
    """Show an inline audio player for a waveform.

    Args:
        audio: Array-like or tensor of samples. It is converted with
            `np.array` and every size-1 axis is squeezed away before playback.
        sample_rate: Playback rate in Hz, forwarded to `IPython.display.Audio`
            as `rate`. Defaults to 16000, the value that used to be hard-coded.
    """
    samples = np.squeeze(np.array(audio))
    player = IPython.display.Audio(samples, rate=sample_rate)
    IPython.display.display(player)

In [None]:
# Glob for the violin2 TFRecord file(s).
# NOTE(review): absolute path on one machine — not portable; consider deriving it
# from a configurable data directory.
VIOLIN2_TFRECORD_PATTERN = "/Users/vaclav/prog/thesis/data/violin2/violin2.tfrecord*"

# Dataset provider for the files matched above (frame_rate=50, centered=True).
# NOTE(review): `data_provider` is not referenced by any later cell visible here.
data_provider = ddsp.training.data.TFRecordProvider(
    file_pattern=VIOLIN2_TFRECORD_PATTERN, frame_rate=50, centered=True
)

In [None]:
# Location of the exported TFLite model (mounted volume).
# Alternative location on the cluster scratch space:
#   /cluster/scratch/vvolhejn/models/0503-ddspae-vst-cnn-2/export/tflite/model.tflite
TFLITE_FILE_PATH = "/Volumes/euler/export/tflite/model.tflite"

# Load the model and obtain a callable runner. No signature key is given, so the
# interpreter's default signature selection is used.
interpreter = tf.lite.Interpreter(model_path=TFLITE_FILE_PATH)
my_signature = interpreter.get_signature_runner()

In [None]:

# Length of the synthetic test signal, in samples. Every tensor built in this
# cell derives its length from this single value, so changing it cannot leave
# the signal and the model input out of sync.
n_samples = 64000

# Test input: a sine whose phase is the sum of a linear ramp (0 -> 2000) and a
# quadratic ramp (0 -> 2000), i.e. a sweep whose frequency rises over time.
linear_phase = tf.linspace(0, 2000, n_samples)
quadratic_phase = tf.linspace(0, 1, n_samples) ** 2 * 2000
audio = tf.cast(tf.sin(linear_phase + quadratic_phase), tf.float32)

# The signature runner is called with keyword arguments named after the model
# inputs; `audio` is already a float32 tensor of shape (n_samples,).
output = my_signature(audio=audio)

In [None]:
# Display the raw result returned by the signature runner; the cells below
# index it by string key.
output

In [None]:
# Harmonic part: rebuilt from the model's f0, amplitude and harmonic-distribution
# outputs, starting from an all-zero phase of shape (1, 1, 1). The phase returned
# by the synth is kept in `final_phase`.
harm_audio, final_phase = ddsp.core.streaming_harmonic_synthesis(
    frequencies=output["f0_hz"],
    amplitudes=output["amplitudes"],
    harmonic_distribution=output["harmonic_distribution"],
    initial_phase=tf.zeros((1, 1, 1), dtype=tf.float32),
    n_samples=n_samples,
    sample_rate=16000,
    amp_resample_method="linear",
)

# Noise part: the noise magnitudes get a new leading axis before being handed to
# the synth (presumably the batch axis it expects — TODO confirm).
noise_magnitudes_batched = tf.expand_dims(output["noise_magnitudes"], axis=0)
filtered_noise = ddsp.synths.FilteredNoise(n_samples=n_samples, window_size=0)
noise_audio = filtered_noise.get_signal(noise_magnitudes_batched)

# Final signal is the plain sum of both components.
audio_out = harm_audio + noise_audio

In [None]:
# Listen to the resynthesized signal (`audio_out` is the sum of the harmonic and
# noise components computed in the synthesis cell).
play_audio(audio_out)

In [None]:
# Inspect one raw model output.
# NOTE(review): the synthesis cell reads named keys ("f0_hz", "amplitudes", ...)
# from `output`, whereas this uses a generic "output_3" key. Confirm this key
# exists for the loaded export, otherwise the lookup fails on Restart & Run All.
output["output_3"]

In [None]:
# Check the shape of one raw model output.
# NOTE(review): "output_1" is a generic key, unlike the named keys ("f0_hz", ...)
# used for synthesis — confirm it exists for the loaded export.
output["output_1"].shape

In [None]:
# Inspect one raw model output.
# NOTE(review): "output_0" is a generic key, unlike the named keys ("f0_hz", ...)
# used for synthesis — confirm it exists for the loaded export.
output["output_0"]

In [None]:
# List the inputs the signature expects (the inference cell passes a single
# keyword input named `audio`).
my_signature.get_input_details()

In [None]:
def test(layers_per_stack, kernel_size, stacks):
    dec = ddsp.training.decoders.DilatedConvDecoder(
        ch=128,
        layers_per_stack=layers_per_stack,
        kernel_size=kernel_size,
        norm_type="layer",
        input_keys=("pw_scaled", "f0_scaled"),
        stacks=stacks,
        conditioning_keys = None,  # Nothing else than a latent, so no need to consider this separately
        precondition_stack = None,  # Not relevant since `conditioning_keys = None`
    #    output_splits = (('control_embedding', %decoder_output_channels),)
        output_splits = (('amps', 1),
                                  ('harmonic_distribution', 60),
                                  ('noise_magnitudes', 65)),
        resample_after_convolve = False,
    )

    n = 500
    y = dec({
        "pw_scaled": tf.constant([[0.5] * n], shape=(1,n,1), dtype=tf.float32),
        "f0_scaled": tf.constant([[0.5] * n], shape=(1,n,1), dtype=tf.float32)
    })
    dropped_actual = n - y["amps"].shape[1]

    stacks_correction = (kernel_size - 1) * (stacks - 1)
    dropped_predicted = (kernel_size - 1) * (stacks * 2 ** layers_per_stack) - stacks_correction

    msg = (f"predicted {dropped_predicted} and got {dropped_actual} "
        f"({layers_per_stack} {kernel_size} {stacks}) -> {dropped_predicted - dropped_actual}")
    print(msg)
    # assert dropped_predicted == dropped_actual, msg

In [None]:
# Sweep all decoder geometries: 3 layer counts x 2 kernel sizes x 4 stack counts.
# The flattened list keeps the nested-loop order (stacks varies fastest).
geometries = [
    (layers_per_stack, kernel_size, stacks)
    for layers_per_stack in (1, 2, 3)
    for kernel_size in (2, 3)
    for stacks in (1, 2, 3, 4)
]
for layers_per_stack, kernel_size, stacks in geometries:
    test(layers_per_stack, kernel_size, stacks)