In [1]:
import os

from IPython import display
import matplotlib.pyplot as plot
import numpy as np
import pandas as pd
import csv
from functools import reduce

import tensorflow as tf
import tensorflow_hub as tfhub
import tensorflow_io as tfio
import tensorflow_model_optimization as tfmot

import params as yamnet_params
import yamnet as yamnet_model

In [2]:
# Load the YAMNet model: build the frames model from a default Params object,
# then restore its weights from the local 'yamnet.h5' file.
params = yamnet_params.Params()
yamnet = yamnet_model.yamnet_frames_model(params)
yamnet.load_weights('yamnet.h5')
# Class names are read from the class-map CSV that sits next to the notebook
# (presumably one display name per YAMNet output class — confirm in yamnet.py).
yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

# Build representative dataset

In [3]:
# Utility function for loading audio files and making sure the sample rate is correct.
@tf.function
def load_wav_16k_mono(filename):
    """Load a WAV file as a single-channel float waveform resampled to 16 kHz.

    This function only decodes and resamples; cutting the waveform into
    fixed-size frames is done by `frame_16k_mono`.

    Args:
        filename: Path of the WAV file to read (string or scalar string tensor).

    Returns:
        A 1-D float tensor containing the waveform at a 16 kHz sample rate.
    """
    file_contents = tf.io.read_file(filename)
    # desired_channels=1 makes decode_wav return a [samples, 1] tensor.
    wav, sample_rate = tf.audio.decode_wav(file_contents, desired_channels=1)
    # Drop the trailing channel axis so the waveform is 1-D.
    wav = tf.squeeze(wav, axis=-1)
    # tfio.audio.resample is given the input rate as an int64 tensor.
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    return tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)

@tf.function
def frame_16k_mono(filename):
    """Load a WAV file with `load_wav_16k_mono` and split it into frames.

    The waveform is cut into consecutive, non-overlapping windows of
    15600 samples (frame length and frame step are both 15600).

    Args:
        filename: Path of the WAV file to read.

    Returns:
        The framed waveform produced by `tf.signal.frame`.
    """
    return tf.signal.frame(
        load_wav_16k_mono(filename),
        frame_length=15600,
        frame_step=15600,
    )

In [4]:
# Read the ESC-50 metadata table (one row per audio clip, as the preview shows).
# `esc50_csv` is the metadata file; `base_data_path` is the directory holding
# the audio clips, which a later cell joins onto the `filename` column.
esc50_csv = './datasets/ESC-50-master/meta/esc50.csv'
base_data_path = './datasets/ESC-50-master/audio/'

pd_data = pd.read_csv(esc50_csv)
# Preview the first rows to check the columns that were read.
pd_data.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


In [5]:
# Keep only the dog and cat clips, relabel them 0/1, and expand file names
# into paths under `base_data_path`.
my_classes = ['dog', 'cat']
map_class_to_id = {'dog':0, 'cat':1}

# Boolean mask selecting the rows whose category is one of `my_classes`.
is_selected = pd_data['category'].isin(my_classes)

# New integer target (dog -> 0, cat -> 1) for the selected rows.
class_id = pd_data.loc[is_selected, 'category'].map(map_class_to_id)
# File name joined onto the audio directory for the selected rows.
full_path = pd_data.loc[is_selected, 'filename'].map(
    lambda name: os.path.join(base_data_path, name)
)

# Overwrite the original `target` and `filename` columns in one step.
filtered_pd = pd_data.loc[is_selected].assign(target=class_id, filename=full_path)
filtered_pd.head(10)

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,./datasets/ESC-50-master/audio/1-100032-A-0.wav,1,0,dog,True,100032,A
14,./datasets/ESC-50-master/audio/1-110389-A-0.wav,1,0,dog,True,110389,A
157,./datasets/ESC-50-master/audio/1-30226-A-0.wav,1,0,dog,True,30226,A
158,./datasets/ESC-50-master/audio/1-30344-A-0.wav,1,0,dog,True,30344,A
170,./datasets/ESC-50-master/audio/1-32318-A-0.wav,1,0,dog,True,32318,A
175,./datasets/ESC-50-master/audio/1-34094-A-5.wav,1,1,cat,False,34094,A
176,./datasets/ESC-50-master/audio/1-34094-B-5.wav,1,1,cat,False,34094,B
229,./datasets/ESC-50-master/audio/1-47819-A-5.wav,1,1,cat,False,47819,A
230,./datasets/ESC-50-master/audio/1-47819-B-5.wav,1,1,cat,False,47819,B
231,./datasets/ESC-50-master/audio/1-47819-C-5.wav,1,1,cat,False,47819,C


In [6]:
# Build a tf.data pipeline of (filename, target, fold) triples from the
# filtered metadata table.
filenames, targets, folds = (
    filtered_pd[column] for column in ('filename', 'target', 'fold')
)

main_ds = tf.data.Dataset.from_tensor_slices((filenames, targets, folds))

In [7]:
def load_frames_for_map(filename, label, fold):
    """Map function: replace a file name with the framed audio it points to.

    Args:
        filename: Path of the WAV file for this dataset element.
        label: Class id of the clip, passed through unchanged.
        fold: Fold number of the clip, passed through unchanged.

    Returns:
        A `(frames, label, fold)` tuple where `frames` comes from
        `frame_16k_mono(filename)`.
    """
    return frame_16k_mono(filename), label, fold

# Swap the file names in the dataset for the actual audio frames.
main_ds = main_ds.map(load_frames_for_map)



In [8]:
def unbatch_frames(frames, label, fold):
    """Prepare a clip's frames so that `.unbatch()` yields one frame per element.

    The frame count is derived from the tensor itself instead of being fixed
    at 5, so clips of any length are handled.

    Args:
        frames: Framed audio for one clip; reshaped to `[num_frames, 15600]`.
        label: Scalar class id of the clip.
        fold: Scalar fold number of the clip.

    Returns:
        A `(frames, labels, folds)` tuple where `labels` and `folds` repeat the
        clip's label and fold once per frame, so all three components share
        the same leading dimension.
    """
    # -1 lets TensorFlow infer the number of 15600-sample frames.
    frames = tf.reshape(frames, [-1, 15600])
    num_frames = tf.shape(frames)[0]
    return (
        frames,
        tf.repeat(label, num_frames),
        tf.repeat(fold, num_frames)
    )

# One dataset element per frame, each carrying its clip's label and fold.
main_ds = main_ds.map(unbatch_frames).unbatch()

In [None]:
def extract_embedding(frame, label, fold):
    '''Run YAMNet on one waveform frame and return its embeddings.

    Args:
        frame: Waveform frame passed to the `yamnet` model.
        label: Scalar class id of the frame.
        fold: Scalar fold number of the frame.

    Returns:
        A `(embeddings, labels, folds)` tuple where `labels` and `folds`
        repeat the inputs once per embedding row.
    '''
    # The model returns (scores, embeddings, spectrogram); only the
    # embeddings are used here.
    _, embeddings, _ = yamnet(frame)
    num_embeddings = tf.shape(embeddings)[0]
    return (
        embeddings,
        tf.repeat(label, num_embeddings),
        tf.repeat(fold, num_embeddings)
    )

# Store the embeddings under their own name instead of overwriting `main_ds`.
# The cells below build the converter's representative dataset from `main_ds`
# and must feed YAMNet waveform frames, not embeddings; overwriting `main_ds`
# here would break that on a fresh "Restart & Run All".
embedding_ds = main_ds.map(extract_embedding).unbatch()

In [9]:
# Cache the pipeline, keep the training folds, and drop the fold column.
cached_ds = main_ds.cache()


def _is_training_fold(embedding, label, fold):
    """Return True for elements whose fold number is below 4."""
    return fold < 4


def remove_fold_column(embedding, label, fold):
    """Drop the fold component, keeping only (embedding, label)."""
    return (embedding, label)


train_ds = cached_ds.filter(_is_training_fold).map(remove_fold_column)

In [10]:
def representative_dataset():
    """Yield calibration inputs for the TFLite converter.

    Takes at most 100 elements from `train_ds` and yields the first component
    of each one wrapped in a single-element list; the label is ignored.
    """
    calibration_samples = train_ds.take(100)
    for sample, _ in calibration_samples:
        yield [sample]

In [11]:
# Convert the Keras YAMNet model to a quantized TFLite model.
converter = tf.lite.TFLiteConverter.from_keras_model(yamnet)
# Enable the converter's default optimizations.
converter.optimizations = [tf.lite.Optimize.DEFAULT]
# Generator of sample inputs defined in the previous cell.
converter.representative_dataset = representative_dataset
# Restrict the target op set to the INT8 builtin ops.
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
# Request int8 tensors at the model's input and output.
converter.inference_input_type = tf.int8  # or tf.uint8
converter.inference_output_type = tf.int8  # or tf.uint8
# convert() returns the model content that the next cell writes to disk.
tflite_quant_model = converter.convert()






In [12]:
# Save the model.
# The path is built with os.path.join: the previous backslash literal relied on
# invalid escape sequences (\m, \y, \q) and was only a directory path on
# Windows. On Windows the resulting path is unchanged.
output_path = os.path.join('models', 'me', 'yamnet', 'quant_test.tflite')
# Create the output directory if it is missing so open() does not fail.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as f:
    f.write(tflite_quant_model)
