In [1]:
# Mount Google Drive; the helper library imported later lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install tensorflow_io

Collecting tensorflow_io
  Downloading tensorflow_io-0.36.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_io
Successfully installed tensorflow_io-0.36.0


In [3]:
from typing import Tuple, Optional

import tensorflow as tf
from tensorflow import keras
import tensorflow_io as tfio

import numpy as np
import pandas as pd

import os
import sys
import tarfile
import hashlib
import re
import glob

import random
import math

import IPython.display as ipd
from tensorflow.python.util import compat

In [4]:
# Directory on Drive that holds the project-local helper module `utils`.
LIB_PATH = '/content/drive/MyDrive/GSC/GSC_helper'
# Guard the append so re-running this cell does not keep adding duplicates.
if LIB_PATH not in sys.path:
    sys.path.append(LIB_PATH)
from utils import _download

## Google Speech Commands (GSC)

In [5]:
# Training archives, indexed by `version - 1` in SpeechCommands12.
DATA_URL = ['http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz',
            'http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz']

# Official test-set archives, indexed the same way as DATA_URL.
OFFICIAL_TEST_URL = ['http://download.tensorflow.org/data/speech_commands_test_set_v0.01.tar.gz',
                     'http://download.tensorflow.org/data/speech_commands_test_set_v0.02.tar.gz']

# The ten keywords passed as `wanted_words`; they receive class indices 2..11.
WORDS = ['down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'up', 'yes']

# Modulus used by which_set() to turn a filename hash into a percentage.
MAX_NUM_WAVS_PER_CLASS = 2**27 - 1  # ~134M
# Reserved labels and their fixed class indices.
SILENCE_LABEL = '_silence_'
SILENCE_INDEX = 0
UNKNOWN_WORD_LABEL = '_unknown_'
UNKNOWN_WORD_INDEX = 1
# Folder that prepare_data_index() skips when indexing labelled samples.
BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
# Seed applied to `random` inside prepare_data_index().
RANDOM_SEED = 59185
# Sample rate in Hz; also the number of samples every clip is padded/trimmed to.
SR = 16000

def prepare_words_list(wanted_words: list) -> list:
    """Builds the full label list: reserved tokens first, then the keywords.

    Args:
        wanted_words: List of keyword strings to recognise.

    Returns:
        A new list starting with the silence and unknown labels, followed by
        every entry of `wanted_words` in its original order.
    """
    reserved_tokens = [SILENCE_LABEL, UNKNOWN_WORD_LABEL]
    return reserved_tokens + wanted_words

def which_set(filename: str,
              validation_percentage: int,
              testing_percentage: int) -> str:
    """Assigns a file to 'training', 'validation' or 'testing' by name hash.

    The partition depends only on the file's base name and the requested
    proportions, so a file keeps its partition when other files are added.
    Everything from '_nohash_' onwards is stripped before hashing, so
    'bobby_nohash_0.wav' and 'bobby_nohash_1.wav' always land in the same
    partition (useful for keeping one speaker's clips together).

    Args:
        filename: File path of the data sample.
        validation_percentage: How much of the data set to use for validation.
        testing_percentage: How much of the data set to use for testing.

    Returns:
        String, one of 'training', 'validation', or 'testing'.
    """
    # Only the part of the base name before '_nohash_' takes part in hashing.
    grouping_key = re.sub(r'_nohash_.*$', '', os.path.basename(filename))
    digest = hashlib.sha1(compat.as_bytes(grouping_key)).hexdigest()
    # Fold the SHA-1 digest into a stable pseudo-percentage for this key.
    bucket = int(digest, 16) % (MAX_NUM_WAVS_PER_CLASS + 1)
    percentage_hash = bucket * (100.0 / MAX_NUM_WAVS_PER_CLASS)

    if percentage_hash < validation_percentage:
        return 'validation'
    if percentage_hash < (testing_percentage + validation_percentage):
        return 'testing'
    return 'training'

def prepare_data_index(data_dir: str,
                       silence_percentage: int,
                       unknown_percentage: int,
                       wanted_words: list,
                       validation_percentage: int,
                       testing_percentage: int) -> Tuple[dict, dict]:
    """Prepares a list of the samples organized by set and label.

    Scans `<data_dir>/<label>/*.wav`, labels every file by the (lower-cased)
    name of its parent folder and uses `which_set` to assign it to a data set
    partition. Silence entries and a random selection of non-wanted words are
    then added to each partition.

    Args:
      data_dir: Root folder that contains one sub-folder per spoken word.
      silence_percentage: How much of the resulting data should be background.
      unknown_percentage: How much should be audio outside the wanted classes.
      wanted_words: Labels of the classes we want to be able to recognize.
      validation_percentage: How much of the data set to use for validation.
      testing_percentage: How much of the data set to use for testing.

    Returns:
      A tuple `(data_index, word_to_index)`. `data_index` maps 'training',
      'validation' and 'testing' to shuffled lists of
      `{'label': ..., 'file': ...}` dicts; `word_to_index` maps every label
      that was found (plus the silence label) to its numeric class index.

    Raises:
      Exception: If no .wav files are found, a wanted word has no folder, or
        no wanted-word sample falls into the training partition.
    """
    # Make sure the shuffling and picking of unknowns is deterministic.
    random.seed(RANDOM_SEED)
    wanted_words_index = {
        wanted_word: index + 2 for index, wanted_word in enumerate(wanted_words)
    }

    data_index = {'validation': [], 'testing': [], 'training': []}
    unknown_index = {'validation': [], 'testing': [], 'training': []}
    all_words = {}

    # Look through all the subfolders to find audio samples.
    search_pattern = os.path.join(data_dir, '*', '*.wav')
    # glob returns matches in arbitrary order; sort them so the seeded
    # shuffles below give the same result on every machine and run.
    for wav_path in sorted(glob.glob(search_pattern)):
        _, word = os.path.split(os.path.dirname(wav_path))
        word = word.lower()
        # Treat the '_background_noise_' folder as a special case, since we
        # expect it to contain long audio samples we mix in to improve training.
        if word == BACKGROUND_NOISE_DIR_NAME:
            continue
        all_words[word] = True
        set_index = which_set(wav_path, validation_percentage, testing_percentage)
        # If it's a known class, store its detail, otherwise add it to the list
        # we'll use to train the unknown label.
        target = data_index if word in wanted_words_index else unknown_index
        target[set_index].append({'label': word, 'file': wav_path})

    if not all_words:
        # Report the glob pattern (a string); the match list cannot be
        # concatenated to a string and is empty here anyway.
        raise Exception('No .wavs found at ' + search_pattern)

    for wanted_word in wanted_words:
        if wanted_word not in all_words:
            raise Exception('Expected to find ' + wanted_word +
                            ' in labels but only found ' +
                            ', '.join(all_words.keys()))

    if not data_index['training']:
        raise Exception('No training samples for the wanted words at ' +
                        search_pattern)

    # We need an arbitrary file to load as the input for the silence samples.
    # The loader replaces it with zeros, so the content doesn't matter.
    silence_wav_path = data_index['training'][0]['file']
    for set_index in ['validation', 'testing', 'training']:
        set_size = len(data_index[set_index])
        silence_size = int(math.ceil(set_size * silence_percentage / 100))
        data_index[set_index].extend(
            {'label': SILENCE_LABEL, 'file': silence_wav_path}
            for _ in range(silence_size)
        )

        # Pick some unknowns to add to each partition of the data set.
        random.shuffle(unknown_index[set_index])
        unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
        data_index[set_index].extend(unknown_index[set_index][:unknown_size])

    # Make sure the ordering is random.
    for set_index in ['validation', 'testing', 'training']:
        random.shuffle(data_index[set_index])

    # Every label seen on disk maps to its wanted index or to "unknown".
    word_to_index = {
        word: wanted_words_index.get(word, UNKNOWN_WORD_INDEX)
        for word in all_words
    }
    word_to_index[SILENCE_LABEL] = SILENCE_INDEX

    return data_index, word_to_index

def prepare_official_test(data_dir: str,
                          wanted_words: list) -> Tuple[list, dict]:
    """Indexes the official test set using the same label-to-index scheme.

    Args:
        data_dir: Root folder containing one sub-folder per label.
        wanted_words: Keyword labels; they are numbered from 2 upwards in the
            order given.

    Returns:
        A tuple `(test_data, wanted_words_index)`: a list of
        `{'label': ..., 'file': ...}` dicts (label is the lower-cased parent
        folder name) and the label-to-class-index mapping, which also
        contains the silence and unknown labels.
    """
    label_to_index = {
        keyword: position + 2 for position, keyword in enumerate(wanted_words)
    }
    label_to_index[SILENCE_LABEL] = SILENCE_INDEX
    label_to_index[UNKNOWN_WORD_LABEL] = UNKNOWN_WORD_INDEX

    samples = [
        {'label': os.path.basename(os.path.dirname(path)).lower(), 'file': path}
        for path in glob.glob(os.path.join(data_dir, '*', '*.wav'))
    ]

    return samples, label_to_index

In [6]:
class SpeechCommands12(keras.utils.Sequence):
    """Keras `Sequence` serving batches of the 12-class Speech Commands task.

    Usage mirrors the SPEECHCOMMANDS dataset of PyTorch. The setup follows the
    original paper <https://arxiv.org/pdf/1804.03209.pdf> (section 7): the ten
    keywords ['down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'up',
    'yes'] plus the two extra classes '_silence_' and '_unknown_'. It is based
    on the reference implementation at
    <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/speech_commands/input_data.py#L188>

    Args:
        root: Directory used for downloading and reading the data.
        download: Whether to download the archive for `version` into `root`.
        version: Version of the Google Speech Commands dataset, 1 or 2.
        subset: One of 'training', 'validation', 'testing',
            'official_testing'.
        transform: Optional callable `transform(wav, label)` applied to every
            waveform before batching.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the sample order after every epoch.
        drop_remainder: If True (default, the historical behaviour) a final
            batch smaller than `batch_size` is dropped. Pass False to also
            serve that partial batch, e.g. so evaluation sees every sample.

    Raises:
        ValueError: If `subset`, `version` or `batch_size` is invalid.
    """

    _SUBSETS = ('training', 'validation', 'testing', 'official_testing')

    def __init__(self,
                 root: str,
                 download: bool = True,
                 version: int = 2,
                 subset: str = 'training',
                 transform = None,
                 batch_size: int = 1,
                 shuffle: bool = True,
                 drop_remainder: bool = True) -> None:
        super().__init__()
        # Fail early and clearly. Previously version=0 silently selected the
        # v2 URL through negative indexing and a bad subset raised a KeyError.
        if subset not in self._SUBSETS:
            raise ValueError('subset must be one of %s, got %r'
                             % (self._SUBSETS, subset))
        if version not in (1, 2):
            raise ValueError('version must be 1 or 2, got %r' % (version,))
        if batch_size < 1:
            raise ValueError('batch_size must be >= 1, got %r' % (batch_size,))

        self.transform = transform

        if subset != 'official_testing':
            if download:
                self._fetch(DATA_URL[version - 1], root)
            data_index, self.word_to_index = prepare_data_index(
                root,
                silence_percentage = 10,
                unknown_percentage = 10,
                wanted_words = WORDS,
                validation_percentage = 10,
                testing_percentage = 10)
            self.dataset = data_index[subset]
        else:
            if download:
                self._fetch(OFFICIAL_TEST_URL[version - 1], root)
            self.dataset, self.word_to_index = prepare_official_test(
                root,
                wanted_words = WORDS)
        self.subset = subset
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_remainder = drop_remainder
        self.on_epoch_end()

    @staticmethod
    def _fetch(url: str, root: str) -> None:
        """Announces and downloads the archive at `url` into `root`."""
        filename = os.path.split(url)[-1]
        print('>> Downloading %s' % filename)
        _download(url, root)

    def __len__(self):
        """Denotes the number of batches per epoch."""
        full_batches, leftover = divmod(len(self.dataset), self.batch_size)
        if leftover and not self.drop_remainder:
            return full_batches + 1
        return full_batches

    def __getitem__(self, index):
        """Generates one batch of data as an `(X, y)` pair."""
        # Positions (into self.dataset) of the samples in this batch.
        indexes = self.indexes[index*self.batch_size: (index+1)*self.batch_size]
        return self.__data_generation(indexes)

    def on_epoch_end(self):
        """Rebuilds (and optionally shuffles) the sample order after an epoch."""
        self.indexes = np.arange(len(self.dataset))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, indexes):
        """Loads the samples at `indexes` and stacks them into `(X, y)`."""
        X = []
        y = []

        for i in indexes:
            row = self.dataset[i]
            label = row['label']
            if label == SILENCE_LABEL:
                # Silence has no audio of its own: start from SR zeros.
                wav = tf.zeros((SR,))
            else:
                file_contents = tf.io.read_file(row['file'])
                wav, _ = tf.audio.decode_wav(
                    file_contents,
                    desired_channels = 1
                )
                wav = tf.squeeze(wav, axis = -1)
            if self.transform:
                wav = self.transform(wav, label)
            # Transforms may return NumPy float64 arrays or float32 tensors;
            # cast so every element of the batch has the same dtype.
            X.append(tf.cast(wav, tf.float32))
            y.append(self.word_to_index[label])
        X = tf.stack(X, axis = 0)
        y = tf.stack(y, axis = 0)
        return X, y

In [7]:
def wav_pad(waveform, sr):
    """Pads with trailing zeros or truncates `waveform` to exactly `sr` samples.

    Args:
        waveform: 1-D sequence of samples (NumPy array, eager tensor, ...).
        sr: Target number of samples.

    Returns:
        `waveform` itself when it already has `sr` samples, its first `sr`
        samples when it is longer, otherwise a NumPy array of length `sr`
        holding the samples followed by zeros. The padded array keeps the
        dtype of the input instead of being promoted to float64.
    """
    length = len(waveform)
    if length == sr:
        return waveform
    if length > sr:
        return waveform[:sr]
    samples = np.asarray(waveform)
    # Allocate the buffer in the input's dtype so padded and unpadded clips
    # can later be stacked without mixing float32 and float64.
    padded = np.zeros(sr, dtype=samples.dtype)
    padded[:length] = samples
    return padded

In [8]:
def normalizeNoise(wav,
                   noise,
                   max_length: int = 16000):
    """Fits `noise` to the length of `wav` at a random offset.

    Args:
        wav: Reference signal; only its length (and dtype/shape when the
            noise must be embedded) is used.
        noise: Noise signal to crop or embed.
        max_length: Unused; kept for interface compatibility.

    Returns:
        `noise` unchanged when both lengths match; a random window of `noise`
        with `len(wav)` samples when the noise is longer; otherwise a
        zero array shaped like `wav` with `noise` written at a random offset.
    """
    surplus = len(wav) - len(noise)
    if surplus > 0:
        # Noise is shorter: drop it somewhere inside a silent canvas.
        canvas = np.zeros_like(wav)
        offset = int(surplus * random.uniform(0, 1))
        canvas[offset: offset + len(noise)] = noise
        return canvas
    if surplus < 0:
        # Noise is longer: cut out a window as long as the signal.
        offset = int(-surplus * random.uniform(0, 1))
        return noise[offset: offset + len(wav)]
    return noise

In [9]:
def add_noise(waveform, noise):
    """Mixes length-matched `noise` into `waveform` at a random gain.

    The noise is first fitted to the waveform with `normalizeNoise`, scaled
    by a gain drawn uniformly from [0, 0.5], added, and the sum is clipped
    to the range [-1, 1].
    """
    fitted = normalizeNoise(waveform, noise)
    gain = random.uniform(0, 0.5)
    mixed = waveform + gain * fitted
    return np.clip(mixed, -1, 1)

In [10]:
def time_shift(wav,
               shift: list,
               sr: int = 16000):
    """Shifts `wav` in time by a random amount, filling the gap with zeros.

    Args:
        wav: 1-D sequence of samples.
        shift: `[low, high]` bounds, in seconds, of the random shift. A
            negative value delays the signal (zeros in front), a positive
            value advances it (zeros at the end).
        sr: Sample rate used to convert seconds to samples.

    Returns:
        NumPy array with the same length and dtype as `wav`.
    """
    samples = np.asarray(wav)
    total = len(samples)
    x_shift = int(random.uniform(*shift)*sr)
    # A shift longer than the clip would otherwise lengthen the output.
    x_shift = max(-total, min(total, x_shift))
    # Pad in the input dtype so float32 audio is not promoted to float64.
    padding = np.zeros(abs(x_shift), dtype=samples.dtype)
    if x_shift < 0:
        return np.concatenate([padding, samples[:x_shift]], axis = 0)
    return np.concatenate([samples[x_shift:], padding], axis = 0)

In [11]:
class Preprocessing:
    """Waveform preprocessing and augmentation, callable as `f(wav, label)`.

    Every clip is padded/truncated to `SR` samples. When `augment` is set,
    training clips may additionally be time-shifted and mixed with background
    noise, and silence clips are replaced by scaled background noise.

    Args:
        noise_dir: Folder whose `*.wav` files are used as background noise.
        noise_prob: Probability of adding noise to a training clip.
        shift: Optional `[low, high]` time-shift bounds in seconds.
        is_train: Enables time shifting and random noise mixing.
        augment: Master switch for all augmentation.
        transform: Optional callable applied to the waveform at the end.
    """

    def __init__(self,
                 noise_dir: str,
                 noise_prob: float,
                 shift: list = None,
                 is_train: bool = False,
                 augment: bool = True,
                 transform = None) -> None:
        self.noise_paths = glob.glob(os.path.join(noise_dir, '*.wav'))
        self.is_train = is_train
        self.noise_prob = noise_prob
        self.augment = augment
        self.transform = transform
        self.add_noise = lambda x, noise: add_noise(x, noise)
        self.pad_trunc = lambda x: wav_pad(x, SR)
        self.shift = shift
        if shift:
            self.time_shift = lambda x: time_shift(x, shift)
        # Decoded noise clips keyed by path, filled lazily on first use.
        self._noise_cache = {}

    def _random_noise(self):
        """Returns a randomly chosen noise clip, decoding each file only once."""
        path = random.choice(self.noise_paths)
        if path not in self._noise_cache:
            noise_contents = tf.io.read_file(path)
            # Request a single channel (as the dataset loader does) so the
            # squeeze below is valid even for multi-channel noise files.
            noise, _ = tf.audio.decode_wav(noise_contents,
                                           desired_channels = 1)
            self._noise_cache[path] = tf.squeeze(noise, axis = -1)
        return self._noise_cache[path]

    def __call__(self,
                 wav,
                 label: str):
        """Preprocesses one waveform; `label` selects the silence handling."""
        # padding to SR
        wav = self.pad_trunc(wav)

        if self.augment:
            # time shifting for training
            if self.is_train and self.shift:
                wav = self.time_shift(wav)

            p = random.random()
            if label == SILENCE_LABEL or (self.is_train and p <= self.noise_prob):
                noise = self._random_noise()
                if label == SILENCE_LABEL:
                    # Silence becomes background noise at a random volume.
                    p = random.random()
                    wav = normalizeNoise(wav, noise*p)
                else:
                    wav = self.add_noise(wav, noise)

        if self.transform:
            wav = self.transform(wav)

        return wav

In [None]:
def f_pre(is_train, augment):
    """Builds a Preprocessing pipeline with the shared noise/shift settings."""
    return Preprocessing(
        noise_dir = '/content/GSC_12/_background_noise_',
        noise_prob = 0.8,
        shift = [-0.1, 0.1],
        is_train = is_train,
        augment = augment,
    )

# Training: shift + noise. Validation: augmentation on but not in train mode.
# Test: padding/truncation only.
train_pre = f_pre(True, True)
val_pre = f_pre(False, True)
test_pre = f_pre(False, False)

In [21]:
# All three loaders only pad/truncate clips to SR samples (no augmentation).
train_dataloader = SpeechCommands12(
    '/content/GSC_12',
    download = False,
    subset = 'training',
    batch_size = 100,
    shuffle = True,
    transform = lambda x, _: wav_pad(x, SR),
)
val_dataloader = SpeechCommands12(
    '/content/GSC_12',
    download = False,
    subset = 'validation',
    batch_size = 100,
    shuffle = False,
    transform = lambda x, _: wav_pad(x, SR),
)
test_dataloader = SpeechCommands12(
    '/content/GSC_12_test',
    download = False,
    subset = 'official_testing',
    batch_size = 10,
    shuffle = False,
    transform = lambda x, _: wav_pad(x, SR),
)

## Test Audio

In [None]:
def load_wav_16k_mono(filename):
    """Reads a WAV file as a mono float tensor resampled to 16 kHz.

    Args:
        filename: Path of the WAV file.

    Returns:
        1-D float tensor with the resampled samples.
    """
    decoded, source_rate = tf.audio.decode_wav(
        tf.io.read_file(filename),
        desired_channels = 1
    )
    # Drop the trailing channel axis left by the single-channel decode.
    mono = tf.squeeze(decoded, axis = -1)
    return tfio.audio.resample(
        mono,
        rate_in = tf.cast(source_rate, dtype = tf.int64),
        rate_out = 16000
    )

In [None]:
# Load one test clip and keep at most its first 17000 samples.
wavv = load_wav_16k_mono('/content/GSC_12_test/left/022cd682_nohash_0.wav')[:17000]

In [None]:
import IPython.display as ipd

In [None]:
# Listen to the unprocessed clip.
ipd.Audio(wavv, rate = 16000)

In [None]:
# Listen to the clip after the training preprocessing. The waveform itself
# must be passed to Audio (not its `.shape`), and the label is a string.
ipd.Audio(train_pre(wavv, 'left'), rate = SR)

In [None]:
# Load one background-noise recording for the augmentation demos below.
noise = load_wav_16k_mono('/content/GSC_12/_background_noise_/doing_the_dishes.wav')

In [None]:
# Mix the noise into the clip at a random gain and listen to the result.
added_noise = add_noise(wavv, noise)
ipd.Audio(added_noise, rate = SR)

In [None]:
# Shift the clip by a random offset of up to 0.1 s in either direction.
time_shifted = time_shift(wavv, [-0.1, 0.1])
ipd.Audio(time_shifted, rate = SR)

In [None]:
# Run the full training preprocessing on the clip and listen to the result.
train_pred = train_pre(wavv, 'left')
ipd.Audio(train_pred, rate = SR)

In [None]:
# Fetch one training batch explicitly so `y` is defined on a fresh kernel
# instead of relying on leftover state, then show its labels.
X, y = train_dataloader[0]
y

<tf.Tensor: shape=(100,), dtype=int32, numpy=
array([ 7,  5,  1, 11,  2, 10,  9,  6, 10,  7,  6,  1,  1,  8,  5,  2,  7,
        1,  2,  9,  6, 10,  5, 10,  1, 10,  6,  6,  4,  9,  9,  8, 11,  6,
        3,  6,  8,  8,  9,  7,  8,  7,  4, 11,  6,  2,  0, 11,  8, 11,  4,
        8,  0,  6,  7,  6,  7,  4,  1,  7,  4,  6,  9,  0,  6,  3, 11,  0,
        7,  7, 10,  0,  0, 11,  4,  2,  1,  7,  8,  6, 10,  4,  5, 11, 10,
       10,  8,  8,  5,  1,  6,  3,  8,  3, 10,  7,  2,  0,  2,  4],
      dtype=int32)>

## Test Training E2E model

In [13]:
class Spectrogram(keras.Model):
    """Computes an STFT magnitude (or power) spectrogram from waveforms.

    Args:
        sample_rate: Sample rate of the audio; stored but not used here.
        n_fft: FFT size.
        win_length: Window (frame) length; defaults to `n_fft`.
        hop_length: Step between frames; defaults to half the window length.
        pad_end: Passed to `tf.signal.stft` as `pad_end`.
        power: Exponent applied to the magnitude. 1 (or None) returns the
            magnitude, 2 the power spectrogram.
    """

    def __init__(self,
                 sample_rate: int = 16000,
                 n_fft: int = 400,
                 win_length: Optional[int] = None,
                 hop_length: Optional[int] = None,
                 pad_end: bool = False,
                 power: float = 2.0) -> None:
        super().__init__()
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.win_length = win_length if win_length is not None else n_fft
        # Derive the default hop from the resolved window length; the raw
        # `win_length` argument may be None, which made `None // 2` fail.
        self.hop_length = hop_length if hop_length is not None else self.win_length//2
        self.pad_end = pad_end
        self.power = power

    def call(self, waveform: tf.Tensor) -> tf.Tensor:
        """Returns `|STFT(waveform)| ** power`."""
        magnitude = tf.abs(tf.signal.stft(
                signals = waveform,
                frame_length = self.win_length,
                frame_step = self.hop_length,
                fft_length = self.n_fft,
                pad_end = self.pad_end
            ))
        if self.power is None or self.power == 1:
            return magnitude
        if self.power == 2:
            return magnitude*magnitude
        # Honour other exponents instead of silently returning the magnitude.
        return tf.pow(magnitude, self.power)

In [14]:
class MelSpectrogram(keras.Model):
    """Turns waveforms into (optionally log-scaled) mel spectrograms.

    A `Spectrogram` is computed first and then projected onto `n_mels` mel
    bands between `f_min` and `f_max` Hz. With `power_to_db` enabled the
    natural logarithm of the result (plus a small offset) is returned; note
    that this is a plain log, not a decibel conversion.
    """

    def __init__(self,
                 sample_rate: int = 16000,
                 n_fft: int = 400,
                 win_length: Optional[int] = None,
                 hop_length: Optional[int] = 160,
                 f_min: float = 0.0,
                 f_max: float = 3800,
                 pad_end: bool = False,
                 n_mels: int = 128,
                 power: float = 2.0,
                 power_to_db: bool = True) -> None:
        super().__init__()
        self.spec = Spectrogram(sample_rate,
                                n_fft,
                                win_length,
                                hop_length,
                                pad_end,
                                power)
        # A real FFT of size n_fft yields n_fft//2 + 1 frequency bins.
        self.linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins = n_mels,
            num_spectrogram_bins = n_fft//2+1,
            sample_rate = sample_rate,
            lower_edge_hertz = f_min,
            upper_edge_hertz = f_max
        )
        self.power_to_db = power_to_db

    def call(self, waveform: tf.Tensor) -> tf.Tensor:
        """Returns the mel spectrogram of `waveform` (log-scaled if enabled)."""
        mel = tf.matmul(self.spec(waveform), self.linear_to_mel_weight_matrix)
        if not self.power_to_db:
            return mel
        # The small offset keeps the logarithm finite for empty bands.
        return tf.math.log(mel + 1e-6)

In [15]:
model = keras.models.Sequential([
    # One second of raw audio. `shape` must be a tuple: `(16000)` is just
    # the integer 16000.
    keras.layers.Input(shape = (16000,)),
    MelSpectrogram(16000,
                       512,
                       480,
                       160,
                       pad_end = True,
                       n_mels = 40,
                       power = 1,
                       power_to_db = True),
    # Add a trailing channel axis so the (100, 40) features suit Conv2D.
    keras.layers.Reshape((100, 40, 1)),
    keras.layers.Conv2D(3, 5, padding = 'same'),
    keras.layers.BatchNormalization(), keras.layers.ReLU(),
    # Grouped 3x3 convolution with stride 2, followed by a 1x1 convolution.
    keras.layers.Conv2D(3, 3, strides = 2, padding = 'same', groups = 3),
    keras.layers.Conv2D(32, kernel_size = 1),
    keras.layers.BatchNormalization(), keras.layers.ReLU(),
    # NOTE(review): kernel size 32 exceeds one spatial axis of the (50, 20)
    # feature map; confirm that 3 was not intended here.
    keras.layers.Conv2D(32, 32, strides = 2, padding = 'same', groups = 32),
    keras.layers.Conv2D(64, kernel_size = 1),
    keras.layers.BatchNormalization(), keras.layers.ReLU(),
    keras.layers.AveragePooling2D(),
    keras.layers.Flatten(),
    keras.layers.Dropout(0.5),
    # 12 raw logits: silence, unknown and the ten keywords.
    keras.layers.Dense(12)
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 mel_spectrogram (MelSpectr  (None, 100, 40)           0         
 ogram)                                                          
                                                                 
 reshape (Reshape)           (None, 100, 40, 1)        0         
                                                                 
 conv2d (Conv2D)             (None, 100, 40, 3)        78        
                                                                 
 batch_normalization (Batch  (None, 100, 40, 3)        12        
 Normalization)                                                  
                                                                 
 re_lu (ReLU)                (None, 100, 40, 3)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 50, 20, 3)         3

In [16]:
# The final Dense(12) layer has no activation, so the loss is configured to
# receive raw logits; labels are integer class indices (sparse).
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate = 0.0015),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [22]:
# Train for at most EPOCHS epochs; EarlyStopping (default monitor) stops
# sooner after 2 epochs without improvement.
EPOCHS = 5
history = model.fit(
    train_dataloader,
    validation_data=val_dataloader,
    epochs=EPOCHS,
    callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=2),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
# Returns [loss, accuracy] on the official test set.
model.evaluate(test_dataloader)



[0.45065000653266907, 0.8511247634887695]

## BCResNet1 E2E

In [None]:
from BCResNet_tf import BCResNet

In [None]:
model = keras.models.Sequential([
    # One second of raw audio. `shape` must be a tuple: `(16000)` is just
    # the integer 16000.
    keras.layers.Input(shape = (16000,)),
    MelSpectrogram(16000,
                       512,
                       480,
                       160,
                       pad_end = True,
                       n_mels = 40,
                       power = 1,
                       power_to_db = True),
    # Swap to (mel, time) and add a trailing channel axis for BCResNet.
    keras.layers.Permute((2, 1)),
    keras.layers.Reshape((40, 100, 1)),
    BCResNet(1, 12, False, 1.5)
])

model.summary()

Model: "sequential_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 mel_spectrogram (MelSpectr  (None, 100, 40)           0         
 ogram)                                                          
                                                                 
 permute (Permute)           (None, 40, 100)           0         
                                                                 
 reshape (Reshape)           (None, 40, 100, 1)        0         
                                                                 
 bc_res_net (BCResNet)       (None, 12)                20058     
                                                                 
Total params: 20058 (78.35 KB)
Trainable params: 16794 (65.60 KB)
Non-trainable params: 3264 (12.75 KB)
_________________________________________________________________


In [None]:
# Same loss/metric setup as the first model, with Adam at its default
# learning rate. NOTE(review): from_logits=True assumes BCResNet outputs raw
# logits; confirm against BCResNet_tf.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train the BCResNet model with the same schedule as the first model.
EPOCHS = 5
history = model.fit(
    train_dataloader,
    validation_data=val_dataloader,
    epochs=EPOCHS,
    callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=2),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Returns [loss, accuracy] of the BCResNet model on the official test set.
model.evaluate(test_dataloader)



[0.3680238425731659, 0.8770961165428162]

## Converting BCResNet to TFLite

In [27]:
# Official test set served one clip per batch, padded/truncated to SR samples.
test_ds = SpeechCommands12(
    '/content/GSC_12_test',
    download = False,
    subset = 'official_testing',
    batch_size = 1,
    shuffle = False,
    transform = lambda x, _: wav_pad(x, SR),
)

In [28]:
from tqdm import tqdm

test_specs = []
test_labels = []

# Index the loader explicitly so exactly len(test_ds) batches are read, and
# show progress with the tqdm import that was previously unused.
for batch_index in tqdm(range(len(test_ds))):
    x, y = test_ds[batch_index]
    test_specs.append(x)
    test_labels.append(y)

# Join the batches along the sample axis. Unlike an axis-less squeeze this
# keeps the sample axis even when there is only a single sample.
test_specs = tf.concat(test_specs, axis = 0)
test_labels = tf.concat(test_labels, axis = 0)

In [29]:
# Expect (number of test clips, samples per clip).
test_specs.shape

TensorShape([4890, 16000])

In [30]:
# Expect one label per test clip.
test_labels.shape

TensorShape([4890])

In [31]:
def representative_data_gen():
    """Yields 100 single-clip batches used to calibrate quantization."""
    calibration_clips = (
        tf.data.Dataset.from_tensor_slices(test_specs).batch(1).take(100)
    )
    for clip in calibration_clips:
        yield [clip]

# Integer quantization: default optimizations calibrated with the
# representative dataset defined above.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_data_gen
# Ensure that if any ops can't be quantized, the converter throws an error
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
# Left disabled: would set the input and output tensors to uint8 (APIs added
# in r2.3).
#converter.inference_input_type = tf.uint8
#converter.inference_output_type = tf.uint8

tflite_model_quant = converter.convert()



In [32]:
# Plain conversion without any optimization, used as the baseline.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

In [33]:
import pathlib

# Output folder for the converted models (created if missing).
tflite_models_dir = pathlib.Path("/tmp/gsc_tflite_models/")
tflite_models_dir.mkdir(exist_ok=True, parents=True)

# Save the integer-quantized model (calibrated with a representative
# dataset); write_bytes returns the number of bytes written.
tflite_model_quant_file = tflite_models_dir/"gsc_tflite_backup_model_quant.tflite"
tflite_model_quant_file.write_bytes(tflite_model_quant)

113592

In [34]:
# Save the unoptimized model next to the quantized one.
tflite_model_file = tflite_models_dir/"gsc_model_backup.tflite"
tflite_model_file.write_bytes(tflite_model)

383828

In [35]:
# Helper function to run inference on a TFLite model
def run_tflite_model(tflite_file, test_image_indices):
    """Runs a TFLite model on selected rows of the global `test_specs`.

    Args:
        tflite_file: Path of the `.tflite` model file.
        test_image_indices: Indices into `test_specs` of the samples to run.

    Returns:
        Integer NumPy array with the arg-max class of every requested sample.
    """
    # Initialize the interpreter
    interpreter = tf.lite.Interpreter(model_path = str(tflite_file))
    interpreter.allocate_tensors()

    input_details = interpreter.get_input_details()[0]
    output_details = interpreter.get_output_details()[0]
    input_dtype = input_details['dtype']
    # Integer inputs may be uint8 or int8; both need the float data rescaled.
    is_quantized_input = input_dtype in (np.uint8, np.int8)

    predictions = np.zeros((len(test_image_indices), ), dtype = int)
    for i, test_image_index in enumerate(test_image_indices):
        test_image = np.asarray(test_specs[test_image_index])

        if is_quantized_input:
            input_scale, input_zero_point = input_details['quantization']
            test_image = np.round(test_image/input_scale + input_zero_point)
            # Keep values inside the integer range so the cast cannot wrap.
            limits = np.iinfo(input_dtype)
            test_image = np.clip(test_image, limits.min, limits.max)

        test_image = np.expand_dims(test_image, axis = 0).astype(input_dtype)
        interpreter.set_tensor(input_details['index'], test_image)
        interpreter.invoke()
        output = interpreter.get_tensor(output_details['index'])[0]

        predictions[i] = output.argmax()

    return predictions

# Helper function to evaluate a TFLite model on all test samples
def evaluate_model(tflite_file, model_type):
    """Prints the accuracy of a TFLite model over the whole test set.

    Reads the global `test_specs` / `test_labels` tensors.

    Args:
        tflite_file: Path of the `.tflite` model file.
        model_type: Name used for the model in the printed message.
    """
    sample_count = test_specs.shape[0]
    predictions = run_tflite_model(tflite_file, range(sample_count))

    correct = np.sum(test_labels== predictions)
    accuracy = (correct * 100) / len(test_specs)

    report = '%s model accuracy is %.4f%% (Number of test samples=%d)'
    print(report % (model_type, accuracy, len(test_specs)))

In [36]:
# Accuracy of the quantized model on the collected test tensors.
evaluate_model(tflite_model_quant_file, model_type="Quantized")

Quantized model accuracy is 60.8384% (Number of test samples=4890)


In [None]:
# Accuracy of the unoptimized model, for comparison with the quantized one.
evaluate_model(tflite_model_file, model_type="Float32")

In [None]:
import shutil
# Copy the quantized model into /content, keeping its file name.
shutil.copy2(str(tflite_model_quant_file), '/content')

'/content/gsc_tflite_backup_model_quant.tflite'

In [None]:
!xxd -i /content/gsc_tflite_model_quant.tflite > backup_model_data.cc

xxd: /content/gsc_tflite_model_quant.tflite: No such file or directory
