In [38]:
import pandas as pd
import librosa
import numpy as np
import tensorflow as tf

In [39]:
# Read the clip index; the preview that follows shows a `path` column plus
# `lat` / `long` coordinates for each recording.
DATASET_CSV = 'dataset.csv'
df = pd.read_csv(DATASET_CSV)

In [40]:
# Preview the first rows to check which columns were parsed from the CSV.
df.head()

Unnamed: 0,path,lat,long
0,sounds/169884.mp3,45.094262,7.524798
1,sounds/169885.mp3,45.094716,7.525034
2,sounds/697381.mp3,16.910815,96.148424
3,sounds/187893.mp3,41.386662,2.17966
4,sounds/788102.mp3,21.307358,-157.858465


In [41]:
# Scale coordinates by their maximum magnitude: latitude by 90, longitude by 180.
#
# The raw degree values are stashed once in `*_deg` columns so that re-running
# this cell recomputes from the originals instead of dividing the already
# scaled values a second time (the previous in-place `/=` was not idempotent).
if 'lat_deg' not in df.columns:
    df['lat_deg'] = df['lat']
    df['long_deg'] = df['long']

df['lat'] = df['lat_deg'] / 90
df['long'] = df['long_deg'] / 180

In [42]:
# Load every clip as a fixed-length vector of 44100 samples at 44.1 kHz.
#
# librosa.load resamples to 22050 Hz unless `sr` is passed, but the rest of the
# notebook treats each 44100-sample vector as 44.1 kHz audio (generated clips
# are written out with samplerate 44100). Requesting the rate explicitly keeps
# the training data and the exported files on the same time base.
SAMPLE_RATE = 44100
CLIP_SAMPLES = 44100  # fixed model input length: one second at SAMPLE_RATE

audio = list()

for path in df['path']:
    y, sr = librosa.load(path, sr=SAMPLE_RATE)

    # Zero-pad short clips on the right, then trim, so every entry has
    # exactly CLIP_SAMPLES samples.
    y = np.pad(y, (0, max(0, CLIP_SAMPLES - len(y))), mode="constant")[:CLIP_SAMPLES]

    audio.append(y)

In [43]:
# Stack the per-clip waveforms into a single 2-D array (one row per clip)
# and display its shape.
y = np.asarray(audio)
y.shape

(210, 44100)

In [44]:
# Coordinate matrix: one (lat, long) row per clip, in dataframe order.
coord_columns = ['lat', 'long']
X = df.loc[:, coord_columns].to_numpy()

In [45]:
# Sanity check: expect one row per clip and two columns (lat, long).
X.shape

(210, 2)

In [46]:
import tensorflow as tf
from tensorflow.keras import layers, models

In [None]:
# Convolutional autoencoder over raw waveforms.
#
# Despite the variable name `vae`, this is a plain deterministic autoencoder:
# there is no sampling layer and no KL term. The bottleneck is the
# `latent_dim`-unit Dense layer that ends the encoder, not a 2-D (lat, long)
# vector. The unused `latitude` / `longitude` Dense heads, the unused random
# `audio_data` array and a no-op Reshape on the decoder output that the
# previous version created have been removed; none of them were connected to
# the trained model.
audio_shape = (44100,)  # samples per clip
latent_dim = 64         # width of the bottleneck the decoder consumes

# Encoder: raw audio -> latent_dim-dimensional code
audio_input = layers.Input(shape=audio_shape)

x = layers.Reshape((audio_shape[0], 1))(audio_input)  # add a channel axis for Conv1D
x = layers.Conv1D(32, 3, activation='relu')(x)
x = layers.MaxPooling1D(2)(x)
x = layers.Conv1D(64, 3, activation='relu')(x)
x = layers.MaxPooling1D(2)(x)
x = layers.Flatten()(x)
x = layers.Dense(latent_dim, activation='tanh')(x)

encoder = models.Model(audio_input, x)

# Decoder: latent code -> raw audio
latent_input = layers.Input(shape=(latent_dim,))
x = layers.Dense(128, activation='relu')(latent_input)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dense(audio_shape[0], activation='tanh')(x)  # tanh keeps samples in [-1, 1]

decoder = models.Model(latent_input, x)

# End-to-end model: encoder followed by decoder
encoding = encoder(audio_input)
decoded_audio = decoder(encoding)

vae = models.Model(audio_input, decoded_audio)

# Train to reconstruct each clip from itself (input == target).
vae.compile(optimizer='adam', loss='mean_squared_error')

vae.fit(y, y, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100


In [56]:
# Run every training clip through the encoder; row k of the result is the
# encoder output for clip k. The bare name on the last line displays the array.
latent_vectors = encoder.predict(y)
latent_vectors



array([[2.2794321e+00, 9.8866418e-02, 0.0000000e+00, ..., 4.0609298e+00,
        5.5978352e-01, 0.0000000e+00],
       [1.1439066e+00, 2.0595675e+00, 0.0000000e+00, ..., 9.8381352e-01,
        2.9910696e-01, 0.0000000e+00],
       [5.6432319e+00, 2.2177808e+00, 0.0000000e+00, ..., 1.9751073e-01,
        2.9049700e-01, 0.0000000e+00],
       ...,
       [2.3500656e-01, 2.2003040e-01, 0.0000000e+00, ..., 6.8368632e-03,
        3.4436703e-02, 6.3412517e-02],
       [3.6616412e-01, 3.8946578e-01, 0.0000000e+00, ..., 0.0000000e+00,
        4.5692548e-05, 1.8286514e-01],
       [1.5054739e-01, 1.2567942e-01, 0.0000000e+00, ..., 2.1402521e-02,
        3.6460686e-02, 1.4780518e-02]], dtype=float32)

In [60]:
# Small MLP that regresses the encoder's 64-D latent vector from (lat, long).
mlp_input = layers.Input(shape=(2,))
x = layers.Dense(16, activation='relu')(mlp_input)
# Chain from the previous layer's output. The earlier version passed
# `mlp_input` here again, which left the first Dense layer disconnected from
# the model so that only one hidden layer was actually trained.
x = layers.Dense(16, activation='relu')(x)
x = layers.Dense(64, activation='linear')(x)  # linear head for regression
mlp = models.Model(mlp_input, x, name="MLP")

# Compile MLP
mlp.compile(optimizer='adam', loss='mse')

# Train MLP (map lat/long to the latent space)
mlp.fit(X, latent_vectors, epochs=500, batch_size=32)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x7ff850102800>

In [64]:
# Keep training the MLP for 100 further rounds of 500 epochs each, turning on
# Keras progress output only for the final round.
n_rounds = 100
for i in range(n_rounds):
    verbose = (i == n_rounds - 1)
    mlp.fit(X, latent_vectors, epochs=500, batch_size=32, verbose=verbose)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [32]:
from tensorflow.keras.models import load_model

# Restore a previously saved decoder from disk; this rebinds `decoder`,
# replacing whatever model the name referred to before.
decoder_path = 'decoder_model.h5'
decoder = load_model(decoder_path)



In [22]:
# Sweep the first latent coordinate from -5.0 to 4.8 in steps of 0.2, decode
# each point, and concatenate the decoded clips into one long sample list.
#
# Fixes relative to the previous version of this cell:
#   * `j` was only defined by the commented-out inner loop, so on a fresh
#     kernel the cell raised NameError; the second coordinate is now pinned
#     explicitly.
#   * the decoded clip was assigned to `y`, overwriting the training matrix
#     that later cells pass to `fit`; it now has its own name.
j = 0  # second latent coordinate, held fixed -- NOTE(review): confirm intended value
audio = []
for i in range(-25, 25):
    latent_point = np.array([[i / 5, j / 5]])  # shape (1, 2)

    clip = decoder.predict(latent_point, verbose=False)[0]
    audio.extend(clip.tolist())

In [23]:
import soundfile as sf
# Export the samples collected in `audio` as a WAV file at 44.1 kHz.
sf.write(file='generated_audio3.wav', data=audio, samplerate=44100)

In [15]:
# Persist the decoder to 'decoder_model.h5' so it can be restored with load_model.
# NOTE(review): this file is loaded earlier in the notebook than it is saved
# here, so a fresh top-to-bottom run needs the file to exist already.
decoder.save('decoder_model.h5')



In [33]:
# Freeze the decoder so that fitting a model built on top of it leaves the
# decoder's weights unchanged.
decoder.trainable = False

In [34]:
# MLP mapping a (latitude, longitude) pair to a 2-D latent vector:
# two ReLU hidden layers (64 then 128 units) followed by a linear 2-unit head.
mlp_input = tf.keras.Input(shape=(2,))

x = mlp_input
for units in (64, 128):
    x = tf.keras.layers.Dense(units, activation='relu')(x)
x = tf.keras.layers.Dense(2, activation='linear')(x)

mlp_model = tf.keras.Model(inputs=mlp_input, outputs=x, name="MLP")

In [35]:
# Chain the decoder onto the MLP's output so coordinates map straight to audio.
audio_output = decoder(mlp_model.output)

# End-to-end model: (lat, long) -> MLP -> decoder -> waveform
full_model = tf.keras.Model(
    inputs=mlp_input,
    outputs=audio_output,
    name="MLP_Decoder",
)

# Mean-squared-error objective with the Adam optimizer.
full_model.compile(loss='mse', optimizer='adam')

In [58]:
# Train the MLP + decoder stack to reproduce each clip from its coordinates.
full_model.fit(x=X, y=y, batch_size=32, epochs=500)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x7ff8cc16cd30>

In [29]:
# Query point (lat 28.59, long 78.96), divided by 90 and 180 respectively,
# each held as a (1, 1) array.
la = np.full((1, 1), 28.59 / 90)
lo = np.full((1, 1), 78.96 / 180)

In [30]:
# Join the two (1, 1) arrays into one (1, 2) query row.
test_latlong = np.concatenate((la, lo), axis=1)

# Run the query through the combined MLP + decoder model.
generated_audio = full_model.predict(test_latlong)

# Write the first (only) generated clip to disk at 44.1 kHz.
import soundfile as sf
sf.write(file='india.wav', data=generated_audio[0], samplerate=44100)



In [24]:
# Query point (lat 40.46, long 3.74), scaled by 90 and 180 and packed into a
# single (1, 2) row.
la = np.full((1, 1), 40.46 / 90)
lo = np.full((1, 1), 3.74 / 180)
test_latlong = np.concatenate((la, lo), axis=1)

# Run the query through the combined MLP + decoder model.
generated_audio = full_model.predict(test_latlong)

# Write the first (only) generated clip to disk at 44.1 kHz.
import soundfile as sf
sf.write(file='spain.wav', data=generated_audio[0], samplerate=44100)



In [66]:
# Bangalore example coordinates, scaled by 90 (latitude) and 180 (longitude)
# and packed into a single (1, 2) query row.
test_lat = np.full((1, 1), 12.97 / 90)
test_long = np.full((1, 1), 77.59 / 180)
test_latlong = np.concatenate((test_lat, test_long), axis=1)

# Two-stage inference: coordinates -> latent vector (MLP) -> waveform (decoder).
predicted_latent_vector = mlp.predict(test_latlong)
generated_audio = decoder.predict(predicted_latent_vector)

# Write the first (only) generated clip to disk at 44.1 kHz.
import soundfile as sf
sf.write(file='banglore.wav', data=generated_audio[0], samplerate=44100)

