---

# Training Auto-encoder

>This notebook contains code for training the auto-encoder with reconstruction loss.
>Audio is cropped to 1 second (44100 samples). ie. the model generates 1 second of audio for each inference.

---

In [1]:
import pandas as pd
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import IPython.display as ipd

# Dataset

### Load audio file path-geotag mappings

In [2]:
df = pd.read_csv('../corpus/mappings.csv')

In [3]:
df.head()

Unnamed: 0,path,lat,long
0,../corpus/audio/169884.mp3,0.501047,0.041804
1,../corpus/audio/169885.mp3,0.501052,0.041806
2,../corpus/audio/697381.mp3,0.187898,0.534158
3,../corpus/audio/187893.mp3,0.459852,0.012109
4,../corpus/audio/788102.mp3,0.236748,-0.876991


### Load audio files
- Crop audio to 44100 samples.
- Pad audio to 44100 samples.

In [4]:
audio = list()

for i, row in df.iterrows():
    y, sr = librosa.load(row['path'])

    y = np.pad(y, (0, max(0, 44100 - len(y))), mode="constant")[:44100]

    audio.append(y)

In [5]:
y = np.array(audio)
y.shape

(105, 44100)

## Auto-encoder

### Encoder

In [51]:
class Sampling(layers.Layer):
  def call(self, inputs):
    mean, log_var = inputs
    return mean + tf.math.exp(0.5 * log_var) * tf.random.normal(shape = (tf.shape(mean)[0], tf.shape(mean)[1])) 

In [81]:
audio_shape = (44100,)

audio_input = layers.Input(shape=audio_shape)

x = layers.Reshape((audio_shape[0], 1))(audio_input)
x = layers.Conv1D(32, 3, activation='relu')(x)
x = layers.MaxPooling1D(2)(x)
x = layers.Conv1D(64, 3, activation='relu')(x)
x = layers.MaxPooling1D(2)(x)
x = layers.Flatten()(x)
x = layers.Dense(LATENT_DIM, activation='relu')(x)

mean = layers.Dense(64,)(x)
log_var = layers.Dense(64,)(x)

z = Sampling()([mean, log_var])

encoder = models.Model(audio_input, [z, mean, log_var])

encoder.summary()

Model: "model_19"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_27 (InputLayer)          [(None, 44100)]      0           []                               
                                                                                                  
 reshape_18 (Reshape)           (None, 44100, 1)     0           ['input_27[0][0]']               
                                                                                                  
 conv1d_20 (Conv1D)             (None, 44098, 32)    128         ['reshape_18[0][0]']             
                                                                                                  
 max_pooling1d_20 (MaxPooling1D  (None, 22049, 32)   0           ['conv1d_20[0][0]']              
 )                                                                                         

### Decoder

In [82]:
latent_input = layers.Input(shape=(64,))
x = layers.Dense(128, activation='relu')(latent_input)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dense(44100, activation='tanh')(x)
x = layers.Reshape((44100,))(x)

decoder = models.Model(latent_input, x)

decoder.summary()

Model: "model_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_28 (InputLayer)       [(None, 64)]              0         
                                                                 
 dense_54 (Dense)            (None, 128)               8320      
                                                                 
 dense_55 (Dense)            (None, 256)               33024     
                                                                 
 dense_56 (Dense)            (None, 44100)             11333700  
                                                                 
 reshape_19 (Reshape)        (None, 44100)             0         
                                                                 
Total params: 11,375,044
Trainable params: 11,375,044
Non-trainable params: 0
_________________________________________________________________


### Combine encoder and decoder

In [83]:
vae_input = layers.Input(shape=audio_shape)

z, mean, log_var = encoder(vae_input)
output = decoder(z)

vae = models.Model(vae_input, output)

vae.summary()

Model: "model_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_29 (InputLayer)       [(None, 44100)]           0         
                                                                 
 model_19 (Functional)       [(None, 64),              45164928  
                              (None, 64),                        
                              (None, 64)]                        
                                                                 
 model_20 (Functional)       (None, 44100)             11375044  
                                                                 
Total params: 56,539,972
Trainable params: 56,539,972
Non-trainable params: 0
_________________________________________________________________


### Train model

Train the auto-encoder with reconstruction loss

In [84]:
def vae_loss(y_true, y_pred, mean, log_var):
    # Reconstruction loss using Mean Squared Error (MSE)
    loss_rec = tf.reduce_mean(tf.reduce_sum(tf.square(y_true - y_pred), axis=1))  

    # KL divergence regularization term for Variational Autoencoder (VAE)
    loss_reg = -0.5 * (1 + log_var - tf.square(mean) - tf.exp(log_var))
    loss_reg = tf.reduce_mean(tf.reduce_sum(loss_reg, axis=1))

    return loss_rec + loss_reg

In [85]:
vae.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss=vae_loss)

In [86]:
@tf.function
def training_block(y):
    with tf.GradientTape() as recorder:
        z, mean, log_var = encoder(y)
        y_pred = decoder(z)
        y_true = y
        loss = vae_loss(y_true,y_pred, mean, log_var)
    
    partial_derivatives = recorder.gradient(loss, vae.trainable_weights)
    OPTIMIZER.apply_gradients(zip(partial_derivatives, vae.trainable_weights))
    
    return loss

In [87]:
def train(epochs):
    for epoch in range(epochs):
        loss = training_block(y)

        if not (epoch + 1) % 10:
            print('Epoch: {} Loss: {}'.format(epoch + 1, loss))

In [88]:
train(500)

Epoch: 10 Loss: 170.60691833496094
Epoch: 20 Loss: 151.23997497558594
Epoch: 30 Loss: 138.16128540039062
Epoch: 40 Loss: 125.87285614013672
Epoch: 50 Loss: 124.82721710205078
Epoch: 60 Loss: 118.64783477783203
Epoch: 70 Loss: 115.10376739501953
Epoch: 80 Loss: 110.66278839111328
Epoch: 90 Loss: 110.56299591064453
Epoch: 100 Loss: 108.45604705810547
Epoch: 110 Loss: 108.53882598876953
Epoch: 120 Loss: 108.39986419677734
Epoch: 130 Loss: 108.38907623291016
Epoch: 140 Loss: 108.71841430664062
Epoch: 150 Loss: 108.51921081542969
Epoch: 160 Loss: 107.92121887207031
Epoch: 170 Loss: 107.97645568847656
Epoch: 180 Loss: 108.18423461914062
Epoch: 190 Loss: 108.53057098388672
Epoch: 200 Loss: 108.09125518798828
Epoch: 210 Loss: 108.6771011352539
Epoch: 220 Loss: 107.97303009033203
Epoch: 230 Loss: 107.92562103271484
Epoch: 240 Loss: 108.8448715209961
Epoch: 250 Loss: 108.30684661865234
Epoch: 260 Loss: 108.11012268066406
Epoch: 270 Loss: 107.7735824584961
Epoch: 280 Loss: 108.13029479980469
Epoc

### Test model

In [92]:
y_test = y[0]

ipd.display(ipd.Audio(data=y_test, rate=44100))

In [93]:
z, _, _ = encoder.predict(np.array([y_test]))

y_recon = decoder.predict(z, verbose=False)[0]



In [95]:
ipd.display(ipd.Audio(data=y_recon, rate=44100))

### Save models

In [96]:
encoder.save('../models/encoder.h5')
decoder.save('../models/decoder.h5')



## Saving latent vectors

In [97]:
latents, _, _ = encoder.predict(y)



In [98]:
np.save('../models/latents.npy', latents)