In [14]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt

import soundfile as sf

def load_audio(file_path, sr=22050, duration=5, target_shape=(128, 128)):
    """Load an audio file and return a fixed-size, normalized log-mel spectrogram.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        sr: Sampling rate used for loading and for the mel spectrogram.
        duration: Maximum number of seconds read from the file.
        target_shape: ``(frames, n_mels)`` of the returned array;
            ``target_shape[1]`` is used as the number of mel bands and
            ``target_shape[0]`` as the number of time frames.

    Returns:
        A 2-D array with ``target_shape[0]`` rows (time frames) and
        ``target_shape[1]`` columns (mel bands). Values are min-max scaled
        over the whole clip before cropping/padding; padded rows are zeros.
    """
    n_frames, n_mels = target_shape[0], target_shape[1]

    waveform, _ = librosa.load(file_path, sr=sr, duration=duration)
    mel_power = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=n_mels)
    # Transpose so that time runs along axis 0 and mel bands along axis 1.
    mel_db = librosa.power_to_db(mel_power).T

    # Min-max scale to [0, 1]; the epsilon guards against a constant input.
    lowest = np.min(mel_db)
    highest = np.max(mel_db)
    scaled = (mel_db - lowest) / (highest - lowest + 1e-10)

    # Force the time axis to exactly n_frames rows.
    missing = n_frames - scaled.shape[0]
    if missing < 0:
        # Too long: keep only the first n_frames frames.
        scaled = scaled[:n_frames, :]
    elif missing > 0:
        # Too short: append zero-valued frames at the end.
        scaled = np.pad(scaled, ((0, missing), (0, 0)), mode='constant')

    return scaled

# Define the VAE
# Size of the latent vector. Read as a module-level global by build_encoder
# (width of z_mean / z_log_var / z) and build_decoder (width of its input),
# and by the generation code when drawing a random latent sample.
latent_dim = 16

# Encoder
def build_encoder(input_shape):
    """Build the encoder model of the VAE.

    The input is flattened, passed through one 128-unit ReLU layer, and
    projected to the mean and log-variance of the latent distribution. A
    latent vector is then drawn from them with the module-level ``sampling``
    function.

    Args:
        input_shape: Shape of one input sample (without the batch axis).

    Returns:
        A ``tf.keras.Model`` named ``"encoder"`` whose outputs are
        ``[z_mean, z_log_var, z]``, each of width ``latent_dim``.
    """
    encoder_input = layers.Input(shape=input_shape)

    flattened = layers.Flatten()(encoder_input)
    hidden = layers.Dense(128, activation='relu')(flattened)

    # Two parallel linear heads parameterize the latent Gaussian.
    mean_head = layers.Dense(latent_dim, name='z_mean')
    log_var_head = layers.Dense(latent_dim, name='z_log_var')
    z_mean = mean_head(hidden)
    z_log_var = log_var_head(hidden)

    # Draw z via the sampling function, wrapped in a Lambda layer.
    sampler = layers.Lambda(sampling, output_shape=(latent_dim,), name='z')
    z = sampler([z_mean, z_log_var])

    return tf.keras.Model(encoder_input, [z_mean, z_log_var, z], name="encoder")

# Sampling function
def sampling(args):
    """Draw a latent vector from a Gaussian given its mean and log-variance.

    Args:
        args: A pair ``(z_mean, z_log_var)`` of 2-D tensors (the code indexes
            axes 0 and 1 of ``z_mean``'s shape).

    Returns:
        ``z_mean + exp(0.5 * z_log_var) * epsilon`` where ``epsilon`` is
        random normal noise with the same shape as ``z_mean``.
    """
    mean, log_var = args

    # Use the dynamic shape so the batch size need not be known up front.
    mean_shape = tf.shape(mean)
    noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))

    # exp(0.5 * log_var) is the standard deviation.
    std_dev = tf.exp(0.5 * log_var)
    return mean + std_dev * noise

# Decoder
def build_decoder(output_shape):
    """Build the decoder model of the VAE.

    A latent vector of width ``latent_dim`` goes through one 128-unit ReLU
    layer and a sigmoid layer with one unit per output element, and is then
    reshaped to ``output_shape``.

    Args:
        output_shape: Shape of one reconstructed sample (without the batch
            axis).

    Returns:
        A ``tf.keras.Model`` named ``"decoder"`` mapping latent vectors to
        tensors of shape ``output_shape`` with values in (0, 1).
    """
    n_outputs = np.prod(output_shape)

    latent_inputs = layers.Input(shape=(latent_dim,))
    hidden = layers.Dense(128, activation='relu')(latent_inputs)
    # Sigmoid keeps every output element between 0 and 1.
    flat_output = layers.Dense(n_outputs, activation='sigmoid')(hidden)
    reshaped = layers.Reshape(output_shape)(flat_output)

    return tf.keras.Model(latent_inputs, reshaped, name="decoder")

# Update the VAE loss function to work with symbolic tensors
class VAELossLayer(tf.keras.layers.Layer):
    """Pass-through layer that registers the VAE loss via ``add_loss``.

    The loss is the mean squared reconstruction error (averaged over every
    element) plus the batch mean of the per-sample KL divergence (summed over
    the last axis of the latent tensors).
    """

    def call(self, inputs):
        """Register the loss and return the reconstruction unchanged.

        Args:
            inputs: A 4-element sequence
                ``(original, reconstruction, z_mean, z_log_var)``.

        Returns:
            The ``reconstruction`` tensor, untouched.
        """
        original, reconstruction, z_mean, z_log_var = inputs

        # Reconstruction term: mean squared error over all elements.
        squared_error = tf.square(original - reconstruction)
        reconstruction_loss = tf.reduce_mean(squared_error)

        # KL term: the log-variance is clipped to [-10, 10] so that exp()
        # stays in a numerically safe range.
        bounded_log_var = tf.clip_by_value(z_log_var, -10.0, 10.0)
        kl_terms = 1 + bounded_log_var - tf.square(z_mean) - tf.exp(bounded_log_var)
        kl_loss = -0.5 * tf.reduce_sum(kl_terms, axis=-1)

        # A tiny epsilon is added to the per-sample KL before averaging.
        self.add_loss(reconstruction_loss + tf.reduce_mean(kl_loss + 1e-10))
        return reconstruction

# Build and compile the VAE
# (128, 128) equals the default target_shape of load_audio, so spectrograms
# produced with the defaults fit the model without reshaping.
input_shape = (128, 128)  # Example spectrogram shape
encoder = build_encoder(input_shape)
decoder = build_decoder(input_shape)

# Wire encoder and decoder together: the decoder reconstructs from the
# sampled latent vector z, not from z_mean.
inputs = layers.Input(shape=input_shape)
z_mean, z_log_var, z = encoder(inputs)
outputs = decoder(z)

# Add the custom loss layer
# VAELossLayer returns `outputs` unchanged and registers the loss through
# add_loss, which is why compile() below is given no `loss` argument.
vae_outputs = VAELossLayer()([inputs, outputs, z_mean, z_log_var])
vae = tf.keras.Model(inputs, vae_outputs, name="vae")
vae.compile(optimizer='adam')

# Load data and train
# The training set is a single spectrogram from one hard-coded file.
file_path = "../data/raw/07282016HFUUforum_SLASH_07-28-2016_HFUUforum_DOT_mp3_00000.wav"
spectrogram = load_audio(file_path)
spectrogram = np.expand_dims(spectrogram, axis=0)  # Add batch dimension

# NOTE(review): the model was compiled without a `loss`; the loss comes from
# VAELossLayer, which compares the model input with its reconstruction. The
# second argument (targets) therefore looks unused by the loss - confirm.
vae.fit(spectrogram, spectrogram, epochs=10, batch_size=1)

# Generate new audio
# Decode one random standard-normal latent vector into a spectrogram.
latent_sample = tf.random.normal(shape=(1, latent_dim))
generated_spectrogram = decoder(latent_sample).numpy().squeeze()

# Debugging the generated spectrogram
print("Generated Spectrogram Stats:")
print(f"Min: {np.min(generated_spectrogram)}, Max: {np.max(generated_spectrogram)}")
print(f"Contains NaN: {np.isnan(generated_spectrogram).any()}, Contains Inf: {np.isinf(generated_spectrogram).any()}")

# Ensure the spectrogram has valid values. The decoder ends in a sigmoid and
# was trained on data min-max scaled to [0, 1], so that is the valid range.
generated_spectrogram = np.nan_to_num(generated_spectrogram, nan=0.0, posinf=1.0, neginf=0.0)
generated_spectrogram = np.clip(generated_spectrogram, 0.0, 1.0)

# Undo the [0, 1] normalization before converting dB to power.
# Feeding the normalized values straight into db_to_power would treat them as
# 0..1 dB, i.e. a power ratio of only ~1.0..1.26, which yields an almost flat
# spectrum. load_audio does not keep the clip's original dB min/max, so the
# values are mapped onto an assumed [-ASSUMED_DB_RANGE, 0] dB span instead;
# 80 dB is the default `top_db` dynamic range of librosa.power_to_db.
# The absolute level is therefore approximate; relative dynamics are restored.
ASSUMED_DB_RANGE = 80.0
generated_db = generated_spectrogram * ASSUMED_DB_RANGE - ASSUMED_DB_RANGE

# Convert spectrogram back to audio
try:
    # Transpose back to librosa's (n_mels, frames) layout; sr matches the
    # load_audio default and the sample rate written to disk below.
    generated_audio = librosa.feature.inverse.mel_to_audio(librosa.db_to_power(generated_db.T), sr=22050)
    sf.write("../output/generated_audio_VAE.wav", generated_audio, samplerate=22050)
    print("Audio successfully generated and saved.")
except Exception as e:
    # Best-effort step at the top level of the script: report and continue.
    print(f"Error during audio generation: {e}")

Epoch 1/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 551ms/step - loss: 3.1527
Epoch 2/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 27ms/step - loss: 770.2202
Epoch 3/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 27ms/step - loss: 128.5314
Epoch 4/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 26ms/step - loss: 184.7474
Epoch 5/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 28ms/step - loss: 250.3753
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - loss: 250.3753
Ep