In [1]:
import csv
import gc
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import soundfile as sf
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio
from IPython.display import Audio
from scipy.io import wavfile
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from data import fetch_metadata,load_wav_for_map, load_wav_16k_mono, bird_mapping_idx_to_name
from yamnet import Yamnet

%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
# Fail fast when TensorFlow cannot see a GPU, and say why instead of raising a
# bare AssertionError.
assert len(tf.config.list_physical_devices('GPU')) >= 1, (
    "No GPU visible to TensorFlow; this notebook expects at least one GPU device."
)

In [3]:
# Instantiate the project-local Yamnet wrapper (imported from yamnet.py); later
# cells use its `extract_embedding` method and `model_path` attribute.
yamnet_base = Yamnet()
# Keep a handle on the class-name collection exposed by the wrapper.
# NOTE(review): presumably the labels of the pretrained YAMNet model — confirm in yamnet.py.
yamnet_class_names = yamnet_base.yamnet_class_names

In [4]:
# Load the recording metadata and work on a copy so the fetched frame is not mutated.
birdsong_metadata = fetch_metadata()
dataset = birdsong_metadata.copy()

# Stratified 80/20 split on the label column; the fixed seed makes the split reproducible.
train_audio_paths, valid_audio_paths, train_labels, valid_labels = train_test_split(
    dataset[["filepath"]], dataset["label"], stratify=dataset["label"], test_size=0.2, random_state=101
)

# Stack the two splits back together and tag every row with a fold id
# (0 = training, 1 = validation) so a single tf.data pipeline can process both.
filepaths = pd.concat([train_audio_paths, valid_audio_paths])
folds = pd.Series([0] * len(train_audio_paths) + [1] * len(valid_audio_paths))
targets = pd.concat([train_labels, valid_labels])

# One (filepath, label, fold) element per recording. `.values` drops the pandas
# index, so the default RangeIndex on `folds` does not need to match the others.
main_ds = tf.data.Dataset.from_tensor_slices((filepaths.values.squeeze(), targets.values.squeeze(), folds.values.squeeze()))
# NOTE(review): load_wav_for_map is defined in data.py; presumably it replaces the
# path with the decoded waveform and passes label/fold through — confirm there.
main_ds = main_ds.map(load_wav_for_map)

# Turn each element into embeddings, then unbatch so that every dataset element
# is a single (embedding, label, fold) row, as the filters below expect.
main_ds = main_ds.map(yamnet_base.extract_embedding).unbatch()
# Cache the embedding rows so both splits below read from the same cached dataset.
cached_ds = main_ds.cache()
train_ds = cached_ds.filter(lambda embedding, label, fold: fold == 0)
val_ds = cached_ds.filter(lambda embedding, label, fold: fold == 1)

# The fold id was only needed for splitting; the model consumes (embedding, label).
remove_fold_column = lambda embedding, label, fold: (embedding, label)

train_ds = train_ds.map(remove_fold_column)
val_ds = val_ds.map(remove_fold_column)

# Cache the per-split rows, shuffle only the training rows, and batch both splits.
train_ds = train_ds.cache().shuffle(1000).batch(32).prefetch(tf.data.experimental.AUTOTUNE)
val_ds = val_ds.cache().batch(32).prefetch(tf.data.experimental.AUTOTUNE)

In [16]:
# One-off diagnostic, kept commented out: it tries to load and resample every
# file and prints the ones that raise. The recorded output below shows four
# files failing inside tfio.audio.resample ("unable to initialize resampler").
# NOTE(review): the scan took over an hour in the recorded run; consider moving
# it to a standalone script and excluding the failing files from the metadata.

# def load_wav_16k_mono(filename):
#     """ read in a waveform file and convert to 16 kHz mono """
#     file_contents = tf.io.read_file(filename)
#     wav, sample_rate = tf.audio.decode_wav(
#         file_contents,
#         desired_channels=1)
#     wav = tf.squeeze(wav, axis=-1)
#     sample_rate = tf.cast(sample_rate, dtype=tf.int64)
#     wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
#     return wav

# shapes = []
# for f in tqdm(filepaths.values.squeeze()):
#     try:
#         tensor = load_wav_16k_mono(f)
#         shapes.append(tensor.shape[0])
#     except Exception as e:
#         print(e)
#         print(f)
#         print('error!!')

 44%|████▎     | 8360/19114 [28:26<25:39,  6.98it/s]  

unable to initialize resampler: 3 [Op:IO>AudioResample]
/media/wwymak/Storage2/birdsong_dataset/xeno_canto_eu_cleaned/willow_warbler/465780.wav
error!!


 70%|██████▉   | 13335/19114 [45:10<08:27, 11.39it/s]  

unable to initialize resampler: 3 [Op:IO>AudioResample]
/media/wwymak/Storage2/birdsong_dataset/xeno_canto_eu_cleaned/common_reed_bunting/459261.wav
error!!


 92%|█████████▏| 17548/19114 [1:00:28<03:51,  6.77it/s]

unable to initialize resampler: 3 [Op:IO>AudioResample]
/media/wwymak/Storage2/birdsong_dataset/xeno_canto_eu_cleaned/common_chiffchaff/465779.wav
error!!


 98%|█████████▊| 18813/19114 [1:04:51<01:22,  3.63it/s]

unable to initialize resampler: 3 [Op:IO>AudioResample]
/media/wwymak/Storage2/birdsong_dataset/xeno_canto_eu_cleaned/dunnock/560551.wav
error!!


100%|██████████| 19114/19114 [1:05:48<00:00,  4.84it/s]


In [5]:
# Dense classification head over 1024-dimensional input vectors. The final layer
# has no activation, so it outputs one raw logit per distinct label in the metadata.
ym_finetune_model = tf.keras.Sequential([
    # `shape` must be a tuple: `(1024)` is just the int 1024, which only some
    # Keras versions silently coerce into a one-element shape.
    tf.keras.layers.Input(shape=(1024,), dtype=tf.float32,
                          name='input_embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(birdsong_metadata.label.nunique())
], name='ym_finetune_model')

ym_finetune_model.summary()

Model: "ym_finetune_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               524800    
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 84)                21588     
Total params: 677,716
Trainable params: 677,716
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Report top-5 and top-1 accuracy during training and validation.
metrics = [
    tf.keras.metrics.SparseTopKCategoricalAccuracy(
        k=5, name="sparse_top_k_categorical_accuracy", dtype=None
    ),
    'accuracy'
]
optimiser = tf.keras.optimizers.Adam(
    learning_rate=5e-4,
)
# from_logits=True: the loss applies the softmax itself, so it expects raw logits
# from the model rather than probabilities.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
ym_finetune_model.compile(loss=loss,
                 optimizer=optimiser,
                 metrics=metrics)

callbacks = [
    # Keep on disk only the model with the lowest validation loss seen so far.
    tf.keras.callbacks.ModelCheckpoint(
        filepath="test_model_all_birds",
        save_best_only=True,  # Only save a model if `val_loss` has improved.
        monitor="val_loss",
        verbose=1,
    ),
    tf.keras.callbacks.TensorBoard(
        log_dir="./logs",
        # How often to write logs (default: once per epoch).
        # NOTE(review): per-batch logging can slow training noticeably; confirm it is needed.
        update_freq="batch",
    )
]
# NOTE(review): in the recorded run val_loss last improved at epoch 16 of 30;
# an EarlyStopping callback could save time if the last-epoch weights are not needed.
history = ym_finetune_model.fit(train_ds,
                       epochs=30,
                       validation_data=val_ds,
                       callbacks=callbacks)

Epoch 1/30
Instructions for updating:
use `tf.profiler.experimental.stop` instead.


Instructions for updating:
use `tf.profiler.experimental.stop` instead.






  80955/Unknown - 9932s 123ms/step - loss: 2.1994 - sparse_top_k_categorical_accuracy: 0.7117 - accuracy: 0.4727
Epoch 00001: val_loss improved from inf to 4.28632, saving model to test_model_all_birds
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


INFO:tensorflow:Assets written to: test_model_all_birds/assets


INFO:tensorflow:Assets written to: test_model_all_birds/assets


Epoch 2/30
Epoch 00002: val_loss improved from 4.28632 to 3.90840, saving model to test_model_all_birds
INFO:tensorflow:Assets written to: test_model_all_birds/assets


INFO:tensorflow:Assets written to: test_model_all_birds/assets


Epoch 3/30
Epoch 00003: val_loss improved from 3.90840 to 3.87687, saving model to test_model_all_birds
INFO:tensorflow:Assets written to: test_model_all_birds/assets


INFO:tensorflow:Assets written to: test_model_all_birds/assets


Epoch 4/30
Epoch 00004: val_loss improved from 3.87687 to 3.77116, saving model to test_model_all_birds
INFO:tensorflow:Assets written to: test_model_all_birds/assets


INFO:tensorflow:Assets written to: test_model_all_birds/assets


Epoch 5/30
Epoch 00005: val_loss did not improve from 3.77116
Epoch 6/30
Epoch 00006: val_loss did not improve from 3.77116
Epoch 7/30
Epoch 00007: val_loss did not improve from 3.77116
Epoch 8/30
Epoch 00008: val_loss did not improve from 3.77116
Epoch 9/30
Epoch 00009: val_loss did not improve from 3.77116
Epoch 10/30
Epoch 00010: val_loss did not improve from 3.77116
Epoch 11/30
Epoch 00011: val_loss did not improve from 3.77116
Epoch 12/30
Epoch 00012: val_loss did not improve from 3.77116
Epoch 13/30
Epoch 00013: val_loss did not improve from 3.77116
Epoch 14/30
Epoch 00014: val_loss did not improve from 3.77116
Epoch 15/30
Epoch 00015: val_loss did not improve from 3.77116
Epoch 16/30
Epoch 00016: val_loss improved from 3.77116 to 3.72799, saving model to test_model_all_birds
INFO:tensorflow:Assets written to: test_model_all_birds/assets


INFO:tensorflow:Assets written to: test_model_all_birds/assets


Epoch 17/30
Epoch 00017: val_loss did not improve from 3.72799
Epoch 18/30
Epoch 00018: val_loss did not improve from 3.72799
Epoch 19/30
Epoch 00019: val_loss did not improve from 3.72799
Epoch 20/30
Epoch 00020: val_loss did not improve from 3.72799
Epoch 21/30
Epoch 00021: val_loss did not improve from 3.72799
Epoch 22/30
Epoch 00022: val_loss did not improve from 3.72799
Epoch 23/30
Epoch 00023: val_loss did not improve from 3.72799
Epoch 24/30
Epoch 00024: val_loss did not improve from 3.72799
Epoch 25/30
Epoch 00025: val_loss did not improve from 3.72799
Epoch 26/30
Epoch 00026: val_loss did not improve from 3.72799
Epoch 27/30
Epoch 00027: val_loss did not improve from 3.72799
Epoch 28/30
Epoch 00028: val_loss did not improve from 3.72799
Epoch 29/30
Epoch 00029: val_loss did not improve from 3.72799
Epoch 30/30
Epoch 00030: val_loss did not improve from 3.72799


In [7]:
# Persist the in-memory model as it stands after the last training epoch (this
# is separate from the best-val_loss checkpoint written by the callback).
# NOTE(review): absolute, machine-specific path — consider a configurable model directory.
ym_finetune_model.save(
    filepath='/media/wwymak/Storage2/birdsong_dataset/models/ym_finetune_baseline'
)

INFO:tensorflow:Assets written to: /media/wwymak/Storage2/birdsong_dataset/models/ym_finetune_baseline/assets


INFO:tensorflow:Assets written to: /media/wwymak/Storage2/birdsong_dataset/models/ym_finetune_baseline/assets


In [3]:
class ReduceMeanLayer(tf.keras.layers.Layer):
    """Keras layer that averages its input tensor along a fixed axis.

    Args:
        axis: Axis passed to `tf.math.reduce_mean`. Defaults to 0.
        **kwargs: Forwarded unchanged to `tf.keras.layers.Layer` (e.g. `name`).
    """

    def __init__(self, axis=0, **kwargs):
        # Zero-argument super() does not look the class up by its global name,
        # so it keeps working if a notebook cell later redefines ReduceMeanLayer.
        super().__init__(**kwargs)
        self.axis = axis

    def call(self, input):
        """Return the mean of `input` taken over `self.axis`."""
        return tf.math.reduce_mean(input, axis=self.axis)

    def get_config(self):
        """Return the layer config including `axis`.

        Without this, a saved model would not record `axis` and a reloaded
        layer would silently fall back to the default.
        """
        config = super().get_config()
        config.update({'axis': self.axis})
        return config

In [4]:
class ReduceMeanLayer(tf.keras.layers.Layer):
    """Keras layer that averages its input tensor along a fixed axis.

    This cell redefines the class, so this is the definition in effect for the
    cells below. It accepts an optional `axis` (default 0, the value that was
    previously hard-coded) so that `ReduceMeanLayer(axis=0, name=...)` is valid.

    Args:
        axis: Axis passed to `tf.math.reduce_mean`. Defaults to 0.
        **kwargs: Forwarded unchanged to `tf.keras.layers.Layer` (e.g. `name`).
    """

    def __init__(self, axis=0, **kwargs):
        super().__init__(**kwargs)
        self.axis = axis

    def call(self, input):
        """Return the mean of `input` taken over `self.axis`."""
        return tf.math.reduce_mean(input, axis=self.axis)

    def get_config(self):
        """Return the layer config including `axis`, so a save/load round trip keeps it."""
        config = super().get_config()
        config.update({'axis': self.axis})
        return config
# Reload the trained classification head from disk.
# NOTE(review): the ModelCheckpoint callback in the training cell writes to
# "test_model_all_birds" (no "models/" prefix) — confirm this is the intended copy.
ym_finetune_model = tf.keras.models.load_model(filepath='models/test_model_all_birds')

# End-to-end serving graph: audio tensor -> YAMNet hub layer -> classification
# head -> mean over axis 0, giving one score vector per input.
input_segment = tf.keras.layers.Input(shape=(), dtype=tf.float32, name='audio')
embedding_extraction_layer = hub.KerasLayer(
    yamnet_base.model_path,
    trainable=False,
    name='yamnet',
)
# The hub layer returns three outputs; only the middle one (embeddings) is used.
_, embeddings_output, _ = embedding_extraction_layer(input_segment)
serving_outputs = ym_finetune_model(embeddings_output)
# Collapse the per-row head outputs into a single vector (layer default: axis 0).
serving_outputs = ReduceMeanLayer(name='classifier')(serving_outputs)
serving_model = tf.keras.Model(inputs=input_segment, outputs=serving_outputs)

# Export without optimizer state; the file is later reloaded via `saved_model_path`.
saved_model_path = './models/model_all_birds_v1.h5'
serving_model.save(filepath=saved_model_path, include_optimizer=False)

In [5]:
# Drop the in-memory serving model and force a collection pass, so the next
# cell's `load_model` result is the only serving model held by the kernel.
# (`gc` is imported with the other modules in the imports cell.)
del serving_model
gc.collect()

8138

In [11]:

# Reload the exported serving model; the custom layer classes have to be passed
# explicitly so Keras can rebuild them from the saved config.
reloaded_model = tf.keras.models.load_model(
    filepath=saved_model_path,
    custom_objects={
        'ReduceMeanLayer': ReduceMeanLayer,
        'KerasLayer': hub.KerasLayer,
    },
)





In [12]:
# Smoke test: run a single recording through the reloaded serving model.
# NOTE(review): absolute, machine-specific path — consider deriving it from a data directory setting.
testing_wav_data = load_wav_16k_mono(
    '/media/wwymak/Storage2/birdsong_dataset/xeno_canto_eu_cleaned/willow_ptarmigan/139115.wav'
)
# `results` holds the model's score vector for this clip; it is decoded into
# bird names further down using bird_mapping_idx_to_name.
results = reloaded_model(testing_wav_data)

In [20]:
# Render an inline player for the test clip; 16000 matches the sample rate
# implied by the loader's name (load_wav_16k_mono).
Audio(data=testing_wav_data, rate=16000)

In [14]:
# Five highest-scoring classes for the test clip, best first (top_k sorts by default).
# NOTE(review): despite the name, `top_probs` are not probabilities — the
# classifier's last Dense layer has no softmax, so these are averaged logits.
top_probs, top_idxs = tf.math.top_k(results, k=5)
# Index of the single best class, kept as a one-element tensor.
_, most_likely_idx = tf.math.top_k(results, k=1)
# Translate class indices into bird names via the mapping imported from data.py.
top_labels = [bird_mapping_idx_to_name[idx] for idx in top_idxs.numpy()]
top_labels

['common_blackbird',
 'european_green_woodpecker',
 'common_nightingale',
 'willow_ptarmigan',
 'tawny_owl']

In [16]:
# Name of the top-1 prediction: most_likely_idx comes from top_k with k=1, so
# element 0 is its only entry.
bird_mapping_idx_to_name[most_likely_idx.numpy()[0]]

'common_blackbird'

In [18]:
# IPython `??` introspection: shows the signature and source of tf.math.top_k.
# NOTE(review): exploratory leftover — consider removing this cell before sharing the notebook.
tf.math.top_k??

[0;31mSignature:[0m [0mtf[0m[0;34m.[0m[0mmath[0m[0;34m.[0m[0mtop_k[0m[0;34m([0m[0minput[0m[0;34m,[0m [0mk[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0msorted[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mname[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;34m@[0m[0mtf_export[0m[0;34m([0m[0;34m"math.top_k"[0m[0;34m,[0m [0;34m"nn.top_k"[0m[0;34m)[0m[0;34m[0m
[0;34m[0m[0;34m@[0m[0mdispatch[0m[0;34m.[0m[0madd_dispatch_support[0m[0;34m[0m
[0;34m[0m[0;32mdef[0m [0mtop_k[0m[0;34m([0m[0minput[0m[0;34m,[0m [0mk[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m [0msorted[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mname[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m:[0m  [0;31m# pylint: disable=redefined-builtin[0m[0;34m[0m
[0;34m[0m  [0;34m"""Finds values and indices of the `k` largest entries for the last dimension.[0m
[0;34m[0m
[0;34m  If the input is a vector (rank=1), finds the

In [22]:
# Show the class names taken from the Yamnet wrapper, in alphabetical order.
# NOTE(review): this displays the entire list; consider slicing (e.g. `[:20]`) to keep the output short.
sorted(yamnet_class_names)

['A capella',
 'Accelerating, revving, vroom',
 'Accordion',
 'Acoustic guitar',
 'Afrobeat',
 'Air brake',
 'Air conditioning',
 'Air horn, truck horn',
 'Aircraft',
 'Aircraft engine',
 'Alarm',
 'Alarm clock',
 'Ambient music',
 'Ambulance (siren)',
 'Angry music',
 'Animal',
 'Applause',
 'Arrow',
 'Artillery fire',
 'Babbling',
 'Baby cry, infant cry',
 'Baby laughter',
 'Background music',
 'Bagpipes',
 'Bang',
 'Banjo',
 'Bark',
 'Basketball bounce',
 'Bass drum',
 'Bass guitar',
 'Bathtub (filling or washing)',
 'Beatboxing',
 'Bee, wasp, etc.',
 'Beep, bleep',
 'Bell',
 'Bellow',
 'Belly laugh',
 'Bicycle',
 'Bicycle bell',
 'Bird',
 'Bird flight, flapping wings',
 'Bird vocalization, bird call, bird song',
 'Biting',
 'Bleat',
 'Blender',
 'Bluegrass',
 'Blues',
 'Boat, Water vehicle',
 'Boiling',
 'Boing',
 'Boom',
 'Bouncing',
 'Bow-wow',
 'Bowed string instrument',
 'Brass instrument',
 'Breaking',
 'Breathing',
 'Burping, eructation',
 'Burst, pop',
 'Bus',
 'Busy signal'