---

# Training Multilayer perception (Geotag - Latent space mapping)

---

In [1]:
import pandas as pd
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras import layers, models
import IPython.display as ipd
from tensorflow.keras.models import load_model

## Dataset

### Load audio file path-geotag mappings

In [2]:
df = pd.read_csv('../corpus/mappings.csv')

In [3]:
df.head()

Unnamed: 0,path,lat,long
0,../corpus/audio/169884.mp3,0.501047,0.041804
1,../corpus/audio/169885.mp3,0.501052,0.041806
2,../corpus/audio/697381.mp3,0.187898,0.534158
3,../corpus/audio/187893.mp3,0.459852,0.012109
4,../corpus/audio/788102.mp3,0.236748,-0.876991


### Load audio files
- Crop audio to 44100 samples.
- Pad audio to 44100 samples.

In [4]:
audio = list()

for i, row in df.iterrows():
    y, sr = librosa.load(row['path'])

    y = np.pad(y, (0, max(0, 44100 - len(y))), mode="constant")[:44100]

    audio.append(y)

In [5]:
y = np.array(audio)
y.shape

(105, 44100)

### Get input vectors

In [6]:
X = df[['lat', 'long']].to_numpy()

## Get geotag-latent mapping

In [7]:
encoder = load_model('../models/encoder.h5')



In [8]:
latents = encoder.predict(y)



## Multilayer perceptron

In [38]:
mlp = tf.keras.Sequential()
mlp.add(layers.Input(shape=(2,)))
mlp.add(layers.Dense(64, activation='relu'))
mlp.add(layers.Dense(128, activation='relu'))
mlp.add(layers.Dense(256, activation='relu'))
mlp.add(layers.Dense(128, activation='relu'))
mlp.add(layers.Dense(64, activation='relu'))
mlp.add(layers.Dense(64, activation='tanh'))

In [39]:
mlp.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_13 (Dense)            (None, 64)                192       
                                                                 
 dense_14 (Dense)            (None, 128)               8320      
                                                                 
 dense_15 (Dense)            (None, 256)               33024     
                                                                 
 dense_16 (Dense)            (None, 128)               32896     
                                                                 
 dense_17 (Dense)            (None, 64)                8256      
                                                                 
 dense_18 (Dense)            (None, 64)                4160      
                                                                 
Total params: 86,848
Trainable params: 86,848
Non-trai

In [40]:
mlp.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), loss='mean_squared_error')

In [45]:
mlp.fit(X, latents, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f8a305a8a00>

## Test

In [49]:
decoder = load_model('../models/decoder.h5')



In [50]:
example_geotag = np.array([X[0]])

In [51]:
latent = mlp.predict(example_geotag)
audio = decoder.predict(latent)



In [52]:
ipd.display(ipd.Audio(data=audio, rate=44100))

In [54]:
mlp.save('../models/mlp.h5')