In [1]:
# Mount Google Drive: later cells import helper modules from it and write the
# finished archives and CSV manifests back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install tensorflow_io

Collecting tensorflow_io
  Downloading tensorflow_io-0.37.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow_io
Successfully installed tensorflow_io-0.37.0


In [3]:
# The TFLite micro-frontend op is treated as optional: when it cannot be imported,
# `frontend_op` is set to None instead of failing here.
# NOTE(review): GSC_preprocessing uses frontend_op.audio_microfrontend whenever a
# transform is supplied, so a None here makes that step fail.
try:
  from tensorflow.lite.experimental.microfrontend.python.ops import audio_microfrontend_op as frontend_op  # pylint:disable=g-import-not-at-top
except ImportError:
  frontend_op = None

In [4]:
from typing import Tuple, Optional

import tensorflow as tf
from tensorflow import keras
import tensorflow_io as tfio

import numpy as np
import pandas as pd

import os
import sys
import tarfile
import hashlib
import re
import glob

import random
import math

import IPython.display as ipd
from tensorflow.python.util import compat

In [11]:
# Make the helper modules stored on Google Drive importable.
LIB_PATH = '/content/drive/MyDrive/GSC/GSC_helper'
if LIB_PATH not in sys.path:  # re-running this cell must not pile up duplicate entries
    sys.path.append(LIB_PATH)
from utils import _download, unzipzip

## Alexa

In [6]:
import gdown

In [7]:
def get_id(drive_url: str) -> str:
    """Extract the file id from a Google Drive URL.

    Recognised forms:
      * ``.../file/d/<id>/view?...`` (any ``/d/<id>`` path component)
      * ``.../uc?id=<id>`` and ``.../open?id=<id>`` (``id`` query parameter)

    For any other URL the sixth '/'-separated segment is returned when it
    exists, which is what the previous implementation always did.

    Args:
        drive_url: Share or download URL of a Drive file.

    Returns:
        The file id as a string.

    Raises:
        ValueError: If no id can be located in ``drive_url``.
    """
    id_chars = r'([A-Za-z0-9_-]+)'
    match = (re.search(r'/d/' + id_chars, drive_url)
             or re.search(r'[?&]id=' + id_chars, drive_url))
    if match is not None:
        return match.group(1)

    # Legacy behaviour: positional lookup for URL shapes not covered above.
    segments = drive_url.split('/')
    if len(segments) > 5 and segments[5]:
        return segments[5]
    raise ValueError(f'Could not find a Google Drive file id in {drive_url!r}')

In [9]:
# Google Drive share link of the Alexa audio archive; get_id() pulls the file id
# out of it for the download below.
ALEXA_PATH = 'https://drive.google.com/file/d/1E2DSial2FtrvLeESGsISFVnG0L50lwCr/view?usp=drive_link'

In [12]:
# Download the Alexa archive from Google Drive to local Colab storage.
path = '/content/Alexa/alexa.zip'
# Create the target folder up front so the download does not rely on a directory
# left behind by an earlier session.
os.makedirs(os.path.dirname(path), exist_ok=True)
gdown.download(f"https://drive.google.com/uc?id={get_id(ALEXA_PATH)}", path, quiet = False)

Downloading...
From (original): https://drive.google.com/uc?id=1E2DSial2FtrvLeESGsISFVnG0L50lwCr
From (redirected): https://drive.google.com/uc?id=1E2DSial2FtrvLeESGsISFVnG0L50lwCr&confirm=t&uuid=470d33b3-2cd9-4bae-a986-fa7d462cdbf8
To: /content/Alexa/alexa.zip
100%|██████████| 30.0M/30.0M [00:01<00:00, 26.9MB/s]


'/content/Alexa/alexa.zip'

In [13]:
# Unpack the downloaded archive into /content/Alexa (unzipzip is imported from the
# Drive helper module `utils`).
unzipzip('/content/Alexa/alexa.zip', '/content/Alexa')

Extracted /content/Alexa/alexa.zip


In [19]:
import shutil

In [21]:
# Flatten the extracted Alexa tree into one folder. Every copy is prefixed with
# the name of the folder it came from, so files from different folders cannot
# overwrite each other.
all_alexa_dir = '/content/All_Alexa'
os.makedirs(all_alexa_dir, exist_ok=True)  # shutil.copy2 does not create missing folders
for wav_root, _, wav_files in os.walk('/content/Alexa/alexa/alexa'):
    idx_name = os.path.basename(wav_root)
    for wav_file in wav_files:
        shutil.copy2(os.path.join(wav_root, wav_file),
                     os.path.join(all_alexa_dir, idx_name + '_' + wav_file))

## GSC

In [22]:
# Download locations of the Speech Commands archives; index 0 is v0.01, index 1 is v0.02.
DATA_URL = ['http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz',
            'http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz']

# Companion test-set archives, listed in the same version order as DATA_URL.
OFFICIAL_TEST_URL = ['http://download.tensorflow.org/data/speech_commands_test_set_v0.01.tar.gz',
                     'http://download.tensorflow.org/data/speech_commands_test_set_v0.02.tar.gz']

# NOTE(review): WORDS is not referenced by any cell visible in this notebook.
WORDS = ['down', 'left', 'off', 'on', 'right', 'up']

# Modulus used by which_set() to map a filename hash onto a 0-100 percentage.
MAX_NUM_WAVS_PER_CLASS = 2**27 - 1  # ~134M
# Reserved labels and their fixed class indices; wanted words are numbered from 2.
SILENCE_LABEL = '_silence_'
SILENCE_INDEX = 0
UNKNOWN_WORD_LABEL = '_unknown_'
UNKNOWN_WORD_INDEX = 1
# Folder that prepare_data_index() skips when collecting labelled samples.
BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
# Seed applied to `random` inside prepare_data_index().
RANDOM_SEED = 59185
# Number of samples every clip is padded/truncated to by the preprocessing below.
SR = 16000

def prepare_words_list(wanted_words: list) -> list:
    """Build the full label list: reserved tokens first, then the custom words.

    Args:
        wanted_words: List of the custom word labels.

    Returns:
        A new list starting with the silence and unknown tokens, followed by
        ``wanted_words`` in their original order.
    """
    reserved_tokens = [SILENCE_LABEL, UNKNOWN_WORD_LABEL]
    return reserved_tokens + wanted_words

def which_set(filename: str,
              validation_percentage: int,
              testing_percentage: int) -> str:
    """Assigns a file to the 'training', 'validation' or 'testing' partition.

    The decision is a pure function of the file's base name and the two
    percentages, so a file keeps its partition no matter which other files are
    added later. That stability keeps test samples from drifting into the
    training set between runs.

    Everything from '_nohash_' onwards is dropped before hashing. Files that
    only differ after that marker (for example 'bobby_nohash_0.wav' and
    'bobby_nohash_1.wav') therefore always share a partition.

    Args:
        filename: File path of the data sample.
        validation_percentage: How much of the data set to use for validation.
        testing_percentage: How much of the data set to use for testing.

    Returns:
        String, one of 'training', 'validation', or 'testing'.
    """
    # Only the base name takes part; the '_nohash_' suffix is a grouping aid.
    grouping_key = re.sub(r'_nohash_.*$', '', os.path.basename(filename))

    # Turn the SHA-1 of the key into a stable pseudo-random value in [0, 100].
    digest_hex = hashlib.sha1(compat.as_bytes(grouping_key)).hexdigest()
    bucket = int(digest_hex, 16) % (MAX_NUM_WAVS_PER_CLASS + 1)
    percentage_hash = bucket * (100.0 / MAX_NUM_WAVS_PER_CLASS)

    # Validation occupies the lowest slice, testing the next one, training the rest.
    if percentage_hash < validation_percentage:
        return 'validation'
    if percentage_hash < (testing_percentage + validation_percentage):
        return 'testing'
    return 'training'

def prepare_data_index(data_dir: str,
                       silence_percentage: int,
                       unknown_percentage: int,
                       wanted_words: list,
                       validation_percentage: int,
                       testing_percentage: int) -> Tuple[dict, dict]:
    """Prepares a list of the samples organized by set and label.

    The training loop needs a list of all the available data, organized by
    which partition it should belong to, and with ground truth labels attached.
    This function analyzes the folders below `data_dir`, figures out the right
    label for each file based on the name of the subdirectory it belongs to,
    and uses a stable hash to assign it to a data set partition.

    Args:
      data_dir: Directory whose sub-folders (one per label) contain the .wav files.
      silence_percentage: How much of the resulting data should be background.
      unknown_percentage: How much should be audio outside the wanted classes.
      wanted_words: Labels of the classes we want to be able to recognize.
      validation_percentage: How much of the data set to use for validation.
      testing_percentage: How much of the data set to use for testing.

    Returns:
      Tuple ``(data_index, word_to_index)``. ``data_index`` maps 'training',
      'validation' and 'testing' to shuffled lists of
      ``{'label': str, 'file': str}`` entries. ``word_to_index`` maps every
      label that was found (plus the silence label) to its numeric class index.

    Raises:
      Exception: If no .wav files are found, if a wanted word has no folder,
        or if no wanted-word sample falls into the training partition.
    """
    # Make sure the shuffling and picking of unknowns is deterministic.
    random.seed(RANDOM_SEED)
    wanted_words_index = {word: index + 2
                          for index, word in enumerate(wanted_words)}

    data_index = {'validation': [], 'testing': [], 'training': []}
    unknown_index = {'validation': [], 'testing': [], 'training': []}
    all_words = {}

    # Look through all the subfolders to find audio samples. glob returns
    # files in filesystem order, so sort them: otherwise the seeded shuffles
    # below would still produce different results on different machines.
    search_pattern = os.path.join(data_dir, '*', '*.wav')
    for wav_path in sorted(glob.glob(search_pattern)):
        word = os.path.basename(os.path.dirname(wav_path)).lower()
        # Treat the '_background_noise_' folder as a special case, since we expect
        # it to contain long audio samples we mix in to improve training.
        if word == BACKGROUND_NOISE_DIR_NAME:
            continue
        all_words[word] = True
        set_index = which_set(wav_path, validation_percentage, testing_percentage)
        entry = {'label': word, 'file': wav_path}
        # If it's a known class, store its detail, otherwise add it to the list
        # we'll use to train the unknown label.
        if word in wanted_words_index:
            data_index[set_index].append(entry)
        else:
            unknown_index[set_index].append(entry)

    if not all_words:
        # Report the pattern (a string); the glob result is a list and cannot
        # be concatenated into the message.
        raise Exception('No .wavs found at ' + search_pattern)

    for wanted_word in wanted_words:
        if wanted_word not in all_words:
            raise Exception('Expected to find ' + wanted_word +
                            ' in labels but only found ' +
                            ', '.join(all_words.keys()))

    if not data_index['training']:
        raise Exception('No training samples found for wanted words: ' +
                        ', '.join(wanted_words))

    # We need an arbitrary file to load as the input for the silence samples.
    # It's multiplied by zero later, so the content doesn't matter.
    silence_wav_path = data_index['training'][0]['file']
    for set_index in ['validation', 'testing', 'training']:
        # Both quotas are relative to the number of wanted-word samples.
        set_size = len(data_index[set_index])
        silence_size = int(math.ceil(set_size * silence_percentage / 100))
        data_index[set_index].extend(
            {'label': SILENCE_LABEL, 'file': silence_wav_path}
            for _ in range(silence_size))

        # Pick some unknowns to add to each partition of the data set.
        random.shuffle(unknown_index[set_index])
        unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
        data_index[set_index].extend(unknown_index[set_index][:unknown_size])

    # Make sure the ordering is random.
    for set_index in ['validation', 'testing', 'training']:
        random.shuffle(data_index[set_index])

    # Every label outside the wanted list collapses onto the unknown index.
    word_to_index = {word: wanted_words_index.get(word, UNKNOWN_WORD_INDEX)
                     for word in all_words}
    word_to_index[SILENCE_LABEL] = SILENCE_INDEX

    return data_index, word_to_index

def prepare_official_test(data_dir: str,
                          wanted_words: list) -> Tuple[list, dict]:
    """Index the companion test set with the same label numbering as training.

    Args:
        data_dir: Directory whose sub-folders (one per label) hold .wav files.
        wanted_words: Labels of the classes to recognise; they are numbered
            from 2 in the order given.

    Returns:
        Tuple ``(test_data, wanted_words_index)``. ``test_data`` is a list of
        ``{'label': str, 'file': str}`` entries, one per .wav file, where the
        label is the lower-cased name of the containing folder.
        ``wanted_words_index`` maps each wanted word, the silence label and the
        unknown label to its numeric index.
    """
    label_to_index = {word: position + 2
                      for position, word in enumerate(wanted_words)}
    label_to_index[SILENCE_LABEL] = SILENCE_INDEX
    label_to_index[UNKNOWN_WORD_LABEL] = UNKNOWN_WORD_INDEX

    wav_paths = glob.glob(os.path.join(data_dir, '*', '*.wav'))
    samples = [
        {'label': os.path.basename(os.path.dirname(wav_path)).lower(),
         'file': wav_path}
        for wav_path in wav_paths
    ]
    return samples, label_to_index

In [23]:
# Fetch the selected Speech Commands release into `root` via the Drive helper
# `_download` (presumably it also unpacks the archive — verify in utils).
# NOTE(review): `root` is relative while later cells use '/content/GSC_8'; the two
# only coincide when the working directory is /content — confirm.
root = './GSC_8'
version = 2  # 1-based release selector: 1 -> v0.01, 2 -> v0.02
url = DATA_URL[version-1]
filename = os.path.split(url)[-1]
print('>> Downloading %s' % filename)
_download(url, root)

>> Downloading speech_commands_v0.02.tar.gz


100%|██████████| 2.26G/2.26G [00:27<00:00, 87.2MB/s]


In [24]:
# Add the Alexa recordings to the Speech Commands tree as their own 'alexa' class
# folder. The source folder name goes in front of '_nohash_': which_set() ignores
# everything from that marker on, so all files from one source folder end up in
# the same train/validation/test partition.
alexa_class_dir = '/content/GSC_8/alexa'
os.makedirs(alexa_class_dir, exist_ok=True)  # unlike os.mkdir, safe when the cell is re-run
for wav_root, _, wav_files in os.walk('/content/Alexa/alexa/alexa'):
    idx_name = os.path.basename(wav_root)
    for wav_file in wav_files:
        shutil.copy2(os.path.join(wav_root, wav_file),
                     os.path.join(alexa_class_dir, idx_name + '_nohash_' + wav_file))

In [33]:
# Build the partitioned sample index with 'alexa' as the only wanted word; every
# other class folder under `root` is treated as '_unknown_'. Silence and unknown
# entries are each added at 20% of the wanted-word count per partition.
data_index, word_to_index = prepare_data_index(root,
                                               silence_percentage = 20,
                                               unknown_percentage = 20,
                                               wanted_words = ['alexa'],
                                               validation_percentage = 10,
                                               testing_percentage = 10)

In [37]:
# Size of the training partition (wanted-word, silence and unknown entries combined).
len(data_index['training'])

403

In [35]:
def wav_pad(waveform, sr):
    """Force a 1-D waveform to exactly ``sr`` samples.

    Args:
        waveform: Sequence of samples supporting ``len()`` and slicing.
        sr: Target number of samples.

    Returns:
        ``waveform`` itself when it already has ``sr`` samples, its first
        ``sr`` samples when it is longer, or a new zero-filled NumPy array
        (default float dtype) with the samples copied to the front when it is
        shorter.
    """
    n_samples = len(waveform)
    if n_samples == sr:
        return waveform
    if n_samples > sr:
        return waveform[:sr]

    # Too short: right-pad with zeros.
    padded = np.zeros(sr)
    padded[:n_samples] = waveform
    return padded

def normalizeNoise(wav,
                   noise,
                   max_length: int = 16000):
    """Fit a noise clip to the length of ``wav`` at a random offset.

    Args:
        wav: Reference waveform; only its length (and array type) is used.
        noise: Noise samples to fit.
        max_length: Not used by this function.

    Returns:
        ``noise`` unchanged when the lengths match. When the noise is shorter
        it is placed at a random position inside a zero array shaped like
        ``wav``. When it is longer, a random window of ``len(wav)`` samples is
        returned.
    """
    n_wav, n_noise = len(wav), len(noise)
    surplus = n_wav - n_noise
    if surplus == 0:
        return noise

    if surplus > 0:
        # Noise is shorter: drop it somewhere inside a silent canvas.
        canvas = np.zeros_like(wav)
        offset = int(surplus * random.uniform(0, 1))
        canvas[offset: offset + n_noise] = noise
        return canvas

    # Noise is longer: crop a window of the waveform's length.
    offset = int((n_noise - n_wav) * random.uniform(0, 1))
    return noise[offset: offset + n_wav]

def add_noise(waveform, noise):
    """Mix length-matched noise into a waveform at a random gain.

    The noise is first fitted to the waveform's length with normalizeNoise(),
    then scaled by a gain drawn uniformly from [0, 0.5] and added.

    Returns:
        The mixture clipped to the range [-1, 1].
    """
    fitted_noise = normalizeNoise(waveform, noise)
    gain = random.uniform(0, 0.5)
    mixture = waveform + gain * fitted_noise
    return np.clip(mixture, -1, 1)

def time_shift(wav,
               shift: list,
               sr: int = 16000):
    """Shift a waveform in time by a random amount, keeping its length.

    Args:
        wav: 1-D waveform.
        shift: ``[low, high]`` bounds, as a fraction of ``sr`` samples, from
            which the shift is drawn uniformly.
        sr: Number of samples corresponding to a shift of 1.0.

    Returns:
        NumPy array of the same length as ``wav``. A negative shift prepends
        zeros and drops samples from the end; otherwise samples are dropped
        from the start and zeros are appended.
    """
    offset = int(random.uniform(*shift) * sr)
    silence = np.zeros(np.abs(offset))
    if offset < 0:
        pieces = [silence, wav[:offset]]
    else:
        pieces = [wav[offset:], silence]
    return np.concatenate(pieces, axis = 0)

class Preprocessing:
    """Callable waveform pre-processor: pad/trim, optional time shift and noise.

    Calling an instance with ``(wav, label)`` performs, in order:
      1. pad or truncate ``wav`` to ``SR`` samples;
      2. when ``augment`` is on: a random time shift (training only, and only
         if ``shift`` was given), then either replacement by scaled background
         noise (silence samples) or noise mixing (training samples, with
         probability ``noise_prob``);
      3. the optional ``transform``.
    """

    def __init__(self,
                 noise_dir: str,
                 noise_prob: float,
                 shift: list = None,
                 is_train: bool = False,
                 augment: bool = True,
                 transform = None) -> None:
        """
        Args:
            noise_dir: Folder whose ``*.wav`` files serve as background noise.
            noise_prob: Probability of mixing noise into a training sample.
            shift: Optional ``[low, high]`` bounds forwarded to time_shift().
            is_train: Enables time shifting and random noise mixing.
            augment: Master switch for every augmentation step.
            transform: Optional callable applied to the waveform last.
        """
        self.noise_paths = glob.glob(os.path.join(noise_dir, '*.wav'))
        self.is_train = is_train
        self.noise_prob = noise_prob
        self.augment = augment
        self.transform = transform
        self.add_noise = lambda x, noise: add_noise(x, noise)
        self.pad_trunc = lambda x: wav_pad(x, SR)
        self.shift = shift
        if shift:
            self.time_shift = lambda x: time_shift(x, shift)

    def __call__(self,
                 wav,
                 label):
        """Pre-process one waveform.

        Args:
            wav: 1-D waveform.
            label: Class of the sample, given either as its label string or
                as its numeric index. Callers in this notebook pass the numeric
                index, so both ``SILENCE_LABEL`` and ``SILENCE_INDEX`` are
                recognised as silence.

        Returns:
            The processed waveform (after ``transform`` when one is set).

        Raises:
            FileNotFoundError: If noise is required but ``noise_dir`` contained
                no ``.wav`` files.
        """
        # padding to SR
        wav = self.pad_trunc(wav)

        if self.augment:
            # time shifting for training
            if self.is_train and self.shift:
                wav = self.time_shift(wav)

            is_silence = label == SILENCE_LABEL or label == SILENCE_INDEX
            p = random.random()
            if is_silence or (self.is_train and p <= self.noise_prob):
                if not self.noise_paths:
                    raise FileNotFoundError(
                        'Background noise is required but no .wav files were '
                        'found in the noise directory')
                noise_contents = tf.io.read_file(random.choice(self.noise_paths))
                noise, _ = tf.audio.decode_wav(noise_contents,
                                               desired_channels = -1)
                # The squeeze assumes single-channel noise files — TODO confirm.
                noise = tf.squeeze(noise, axis = -1)
                if is_silence:
                    # Silence samples become background noise at a random volume.
                    p = random.random()
                    wav = normalizeNoise(wav, noise*p)
                else:
                    wav = self.add_noise(wav, noise)

        if self.transform:
            wav = self.transform(wav)

        return wav

In [36]:
def f_pre(is_train, augment):
    """Build a Preprocessing pipeline; only the train/augment flags vary."""
    return Preprocessing(noise_dir = '/content/GSC_8/_background_noise_',
                         noise_prob = 0.8,
                         shift  = [-0.1, 0.1],
                         is_train = is_train,
                         augment = augment,
                         )

# One pipeline per usage: training (all augmentation), validation (augmentation
# switch on, not in training mode) and testing (augmentation off).
train_pre = f_pre(is_train = True, augment = True)
val_pre = f_pre(is_train = False, augment = True)
test_pre = f_pre(is_train = False, augment = False)

In [41]:
from tqdm import tqdm

def GSC_preprocessing(data_index, word_to_index, root, output_directory, num_classes = 12, transform = None,
                       mul_factor = 1, set = 'train', csv_file_name = 'analysised_spec.csv'):
    """Render every indexed sample to a compressed ``.npz`` file and write a CSV manifest.

    Args:
        data_index: List of ``{'label': str, 'file': str}`` entries for one split.
        word_to_index: Mapping from label string to numeric class index.
        root: Unused; kept so existing calls keep working.
        output_directory: Folder receiving the ``.npz`` files (created if missing).
        num_classes: Unused; kept so existing calls keep working.
        transform: Optional callable ``transform(wav, label)`` that receives the
            sample's label *string*. When given, its output is scaled to int16
            and converted by the micro-frontend; otherwise the raw waveform is
            stored.
        mul_factor: Number of passes over ``data_index``; each pass writes its
            own set of files, multiplying the number of samples.
        set: Split name, used as file-name prefix and in the CSV 'link' column.
        csv_file_name: Path of the CSV manifest (columns 'link' and 'label').

    Raises:
        ImportError: If a transform is given but the micro-frontend op could
            not be imported.
    """
    records = {
        'link': [],
        'label': [],
    }
    # np.savez_compressed does not create missing folders.
    os.makedirs(output_directory, exist_ok = True)

    for idx in range(mul_factor):
        for ix, row in enumerate(tqdm(data_index)):
            label_name = row['label']
            label = word_to_index[label_name]

            fname = f'{set}_{label}_{ix}_{idx}.npz'
            records['link'].append(os.path.join(set, fname))
            records['label'].append(label)

            # Files from an earlier run are reused but still listed in the manifest.
            out_path = os.path.join(output_directory, fname)
            if os.path.exists(out_path):
                continue

            if label == SILENCE_INDEX:
                wav = tf.zeros((SR))
            else:
                file_contents = tf.io.read_file(row['file'])
                wav, _ = tf.audio.decode_wav(
                    file_contents,
                    desired_channels = 1
                )
                wav = tf.squeeze(wav, axis = -1)
            if transform:
                if frontend_op is None:
                    raise ImportError('audio_microfrontend_op is not available; '
                                      'cannot compute features')
                # Pass the label string: the transform compares it with the
                # silence label, which a numeric index would never match.
                wav = transform(wav, label_name)
                wav = tf.cast(tf.multiply(wav, 32768), tf.int16)
                wav = frontend_op.audio_microfrontend(wav,                # 49, 40
                                                      sample_rate = 16000,
                                                      window_size = 30,
                                                      window_step = 20,
                                                      num_channels = 40,
                                                      out_type = tf.float32)
                wav = tf.multiply(wav, (10.0 / 256.0))
                wav = tf.expand_dims(wav, axis = 2)

            np.savez_compressed(out_path, wav.numpy())

    pd.DataFrame(records).to_csv(csv_file_name, index = False)

In [40]:
# Render the training split with the training pipeline and write its manifest.
# NOTE(review): the recorded output shows many passes over the 403 samples, yet
# this call leaves mul_factor at its default of 1 — confirm the intended value.
GSC_preprocessing(data_index['training'], word_to_index, '/content', '/content/train_2', transform = train_pre, set = 'train', num_classes = 3, csv_file_name = '/content/train_3_alexa.csv')

100%|██████████| 403/403 [00:00<00:00, 79037.90it/s]
100%|██████████| 403/403 [00:15<00:00, 25.35it/s]
100%|██████████| 403/403 [00:12<00:00, 32.05it/s]
100%|██████████| 403/403 [00:11<00:00, 34.99it/s]
100%|██████████| 403/403 [00:11<00:00, 35.70it/s]
100%|██████████| 403/403 [00:13<00:00, 30.90it/s]
100%|██████████| 403/403 [00:12<00:00, 32.05it/s]
100%|██████████| 403/403 [00:11<00:00, 34.50it/s]
100%|██████████| 403/403 [00:11<00:00, 35.80it/s]
100%|██████████| 403/403 [00:12<00:00, 31.27it/s]
100%|██████████| 403/403 [00:13<00:00, 29.95it/s]
100%|██████████| 403/403 [00:12<00:00, 31.06it/s]
100%|██████████| 403/403 [00:11<00:00, 35.09it/s]
100%|██████████| 403/403 [00:12<00:00, 31.77it/s]
100%|██████████| 403/403 [00:12<00:00, 32.18it/s]
100%|██████████| 403/403 [00:12<00:00, 32.20it/s]
100%|██████████| 403/403 [00:11<00:00, 36.36it/s]
100%|██████████| 403/403 [00:12<00:00, 31.89it/s]
100%|██████████| 403/403 [00:11<00:00, 34.29it/s]
100%|██████████| 403/403 [00:10<00:00, 37.26it/

In [42]:
# Render the validation split; val_pre is built with is_train=False.
GSC_preprocessing(data_index['validation'], word_to_index, '/content', '/content/val_2', transform = val_pre, set = 'val', num_classes = 3, csv_file_name = '/content/val_3_alexa.csv')

100%|██████████| 60/60 [00:00<00:00, 284.32it/s]


In [43]:
# Render the testing split. The manifest name follows the train/val naming and is
# the file the final cell moves to Drive.
# NOTE(review): this uses val_pre although test_pre is defined above — confirm.
GSC_preprocessing(data_index['testing'], word_to_index, '/content', '/content/test_2', transform = val_pre, set = 'test', num_classes = 3, csv_file_name = '/content/test_3_alexa.csv')

100%|██████████| 56/56 [00:00<00:00, 148.36it/s]


In [44]:
from GSC_zip import unzipzip, zipzip

In [45]:
# Archive each output folder to the Drive dataset folder with the helper
# `zipzip` (first argument: source folder, second: zip file to create, judging by
# the recorded output).
zipzip('/content/train_2', '/content/drive/MyDrive/Dataset/train_3_alexa.zip')
zipzip('/content/val_2', '/content/drive/MyDrive/Dataset/val_3_alexa.zip')
zipzip('/content/test_2', '/content/drive/MyDrive/Dataset/test_3_alexa.zip')

zipping...: 100%|██████████| 12090/12090 [00:08<00:00, 1501.62it/s]


/content/drive/MyDrive/Dataset/train_3_alexa.zip created


zipping...: 100%|██████████| 60/60 [00:00<00:00, 1795.11it/s]


/content/drive/MyDrive/Dataset/val_3_alexa.zip created


zipping...: 100%|██████████| 56/56 [00:00<00:00, 1523.35it/s]

/content/drive/MyDrive/Dataset/test_3_alexa.zip created





In [46]:
import shutil

In [48]:
# Move the three CSV manifests into the Drive dataset folder, next to the archives.
shutil.move('/content/test_3_alexa.csv', '/content/drive/MyDrive/Dataset')
shutil.move('/content/train_3_alexa.csv', '/content/drive/MyDrive/Dataset')
shutil.move('/content/val_3_alexa.csv', '/content/drive/MyDrive/Dataset')

'/content/drive/MyDrive/Dataset/val_3_alexa.csv'