---

# Training Auto-encoder

>This notebook contains code for training the auto-encoder with a reconstruction loss.
>Audio is cropped to 1 second (44100 samples), i.e. the model generates 1 second of audio per inference.

---

In [1]:
import pandas as pd
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import IPython.display as ipd

# Dataset

### Load audio file path-geotag mappings

In [2]:
# Load the audio-path / geotag table. The CSV's first column is the row index
# written out when the file was saved; read it as the index so it does not
# show up as a spurious "Unnamed: 0" data column.
df = pd.read_csv('../corpus/mappings.csv', index_col=0)

In [3]:
# Preview the first five rows of the path/geotag mapping table.
df.head(n=5)

Unnamed: 0,path,lat,long
0,../corpus/audio/169884.mp3,0.501047,0.041804
1,../corpus/audio/169885.mp3,0.501052,0.041806
2,../corpus/audio/697381.mp3,0.187898,0.534158
3,../corpus/audio/187893.mp3,0.459852,0.012109
4,../corpus/audio/788102.mp3,0.236748,-0.876991


### Load audio files
- Crop clips longer than 44100 samples down to 44100 samples.
- Zero-pad clips shorter than 44100 samples up to 44100 samples.

In [4]:
# Every clip is forced to exactly one second of audio: 44100 samples at 44.1 kHz.
SAMPLE_RATE = 44100
N_SAMPLES = 44100

audio = list()

for path in df['path']:
    # librosa.load resamples to 22050 Hz unless `sr` is given; request 44.1 kHz
    # explicitly so that 44100 samples really is one second and playback at
    # rate=44100 runs at the correct speed.
    y, sr = librosa.load(path, sr=SAMPLE_RATE)

    # Zero-pad clips that are too short, then crop clips that are too long,
    # so every entry has exactly N_SAMPLES samples.
    y = np.pad(y, (0, max(0, N_SAMPLES - len(y))), mode="constant")[:N_SAMPLES]

    audio.append(y)

In [5]:
# Stack the equal-length clips into one 2-D training matrix (one row per clip).
y = np.asarray(audio)
y.shape

(105, 44100)

## Auto-encoder

### Encoder

In [6]:
# Each training example is a flat waveform of 44100 samples.
audio_shape = (44100,)

audio_input = layers.Input(shape=audio_shape)

# Encoder stack, applied in order: add a channel axis for Conv1D, two
# conv + max-pool stages, then flatten and project to a 64-dim latent vector.
encoder_layers = [
    layers.Reshape((audio_shape[0], 1)),
    layers.Conv1D(32, 3, activation='relu'),
    layers.MaxPooling1D(2),
    layers.Conv1D(64, 3, activation='relu'),
    layers.MaxPooling1D(2),
    layers.Flatten(),
    layers.Dense(64, activation='tanh'),
]

x = audio_input
for encoder_layer in encoder_layers:
    x = encoder_layer(x)

encoder = models.Model(audio_input, x)

### Decoder

In [7]:
# The decoder maps a 64-dim latent vector back to a 44100-sample waveform
# through a stack of fully connected layers.
latent_input = layers.Input(shape=(64,))

hidden_small = layers.Dense(128, activation='relu')(latent_input)
hidden_large = layers.Dense(256, activation='relu')(hidden_small)
# tanh keeps every generated sample within [-1, 1].
waveform = layers.Dense(44100, activation='tanh')(hidden_large)
# The Dense output is already (None, 44100); this reshape leaves the shape unchanged.
x = layers.Reshape((44100,))(waveform)

decoder = models.Model(latent_input, x)

### Combine encoder and decoder

In [8]:
# Chain the two sub-models on the shared audio input: waveform -> latent -> waveform.
encoding = encoder(audio_input)
decoder_output = decoder(encoding)

# End-to-end auto-encoder; training it updates the encoder and decoder weights.
ae = models.Model(inputs=audio_input, outputs=decoder_output)

In [9]:
# Reconstruction objective: mean squared error between input and output
# waveforms, optimised with Adam.
ae.compile(loss='mean_squared_error', optimizer='adam')
ae.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 44100)]           0         
                                                                 
 model (Functional)          (None, 64)                45156608  
                                                                 
 model_1 (Functional)        (None, 44100)             11375044  
                                                                 
Total params: 56,531,652
Trainable params: 56,531,652
Non-trainable params: 0
_________________________________________________________________


### Train model

Train the auto-encoder with a reconstruction loss (mean squared error between the input audio and its reconstruction).

In [26]:
# Auto-encoder training: the target is the input itself.
ae.fit(x=y, y=y, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fb1281f5c00>

### Test model

In [27]:
# Take the first clip of the training matrix as the listening example.
y_test = y[0]

# Render an inline player for the original (un-reconstructed) clip.
original_player = ipd.Audio(data=y_test, rate=44100)
ipd.display(original_player)

In [28]:
# The models expect a batch axis, so wrap the single clip in a batch of one.
test_batch = np.array([y_test])
example_vector = encoder.predict(test_batch)

# Decode the latent vector and drop the batch axis again.
recon_batch = decoder.predict(example_vector, verbose=False)
y_recon = recon_batch[0]



In [30]:
# Render an inline player for the reconstructed clip.
recon_player = ipd.Audio(data=y_recon, rate=44100)
ipd.display(recon_player)

### Save models

In [31]:
# Persist the encoder and decoder as separate HDF5 files so each can be
# loaded on its own later.
for trained_model, target_path in (
    (encoder, '../models/encoder.h5'),
    (decoder, '../models/decoder.h5'),
):
    trained_model.save(target_path)

