In [1]:
import hashlib
import math
import os.path
import random
import re
import sys
import tarfile

import numpy as np
import urllib
import tensorflow as tf

from tensorflow.python.ops import gen_audio_ops as audio_ops
from tensorflow.python.ops import io_ops
from tensorflow.python.platform import gfile
from tensorflow.python.util import compat

In [2]:
DATA_URL = 'http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz'

In [9]:
def download(data_url, dest_directory):
    filename = os.path.split(data_url)[-1]
    def _progress(count, block_size, total_size):
        sys.stdout.write(
            '\r>> Downloading %s %.1f%%' %
            (filename, float(count * block_size) / float(total_size) * 100.0))
        sys.stdout.flush()

    if not os.path.exists(dest_directory):
        os.mkdir(dest_directory)
    filepath = os.path.join(dest_directory, filename)

    filepath, _ = urllib.request.urlretrieve(data_url, filepath, _progress)
    tarfile.open(filepath, 'r:gz').extractall(dest_directory)

In [10]:
download(DATA_URL, 'GSC_12')

>> Downloading speech_commands_v0.02.tar.gz 100.0%

In [3]:
WORDS = ['down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'up', 'yes']

In [4]:
MAX_NUM_WAVS_PER_CLASS = 2**27 - 1  # ~134M
SILENCE_LABEL = '_silence_'
SILENCE_INDEX = 0
UNKNOWN_WORD_LABEL = '_unknown_'
UNKNOWN_WORD_INDEX = 1
BACKGROUND_NOISE_DIR_NAME = '_background_noise_'
RANDOM_SEED = 59185

In [5]:
def prepare_words_list(wanted_words):
  """Prepends common tokens to the custom word list.

  Args:
    wanted_words: List of strings containing the custom words.

  Returns:
    List with the standard silence and unknown tokens added.
  """
  return [SILENCE_LABEL, UNKNOWN_WORD_LABEL] + wanted_words

In [6]:
def which_set(filename, validation_percentage, testing_percentage):
  """Determines which data partition the file should belong to.

  We want to keep files in the same training, validation, or testing sets even
  if new ones are added over time. This makes it less likely that testing
  samples will accidentally be reused in training when long runs are restarted
  for example. To keep this stability, a hash of the filename is taken and used
  to determine which set it should belong to. This determination only depends on
  the name and the set proportions, so it won't change as other files are added.

  It's also useful to associate particular files as related (for example words
  spoken by the same person), so anything after '_nohash_' in a filename is
  ignored for set determination. This ensures that 'bobby_nohash_0.wav' and
  'bobby_nohash_1.wav' are always in the same set, for example.

  Args:
    filename: File path of the data sample.
    validation_percentage: How much of the data set to use for validation.
    testing_percentage: How much of the data set to use for testing.

  Returns:
    String, one of 'training', 'validation', or 'testing'.
  """
  base_name = os.path.basename(filename)
  # We want to ignore anything after '_nohash_' in the file name when
  # deciding which set to put a wav in, so the data set creator has a way of
  # grouping wavs that are close variations of each other.
  hash_name = re.sub(r'_nohash_.*$', '', base_name)
  # This looks a bit magical, but we need to decide whether this file should
  # go into the training, testing, or validation sets, and we want to keep
  # existing files in the same set even if more files are subsequently
  # added.
  # To do that, we need a stable way of deciding based on just the file name
  # itself, so we do a hash of that and then use that to generate a
  # probability value that we use to assign it.
  hash_name_hashed = hashlib.sha1(compat.as_bytes(hash_name)).hexdigest()
  percentage_hash = ((int(hash_name_hashed, 16) %
                      (MAX_NUM_WAVS_PER_CLASS + 1)) *
                     (100.0 / MAX_NUM_WAVS_PER_CLASS))
  if percentage_hash < validation_percentage:
    result = 'validation'
  elif percentage_hash < (testing_percentage + validation_percentage):
    result = 'testing'
  else:
    result = 'training'
  return result

In [7]:
def prepare_data_index(data_dir, silence_percentage, unknown_percentage,
                         wanted_words, validation_percentage,
                         testing_percentage):
    """Prepares a list of the samples organized by set and label.

    The training loop needs a list of all the available data, organized by
    which partition it should belong to, and with ground truth labels attached.
    This function analyzes the folders below the `data_dir`, figures out the
    right
    labels for each file based on the name of the subdirectory it belongs to,
    and uses a stable hash to assign it to a data set partition.

    Args:
      silence_percentage: How much of the resulting data should be background.
      unknown_percentage: How much should be audio outside the wanted classes.
      wanted_words: Labels of the classes we want to be able to recognize.
      validation_percentage: How much of the data set to use for validation.
      testing_percentage: How much of the data set to use for testing.

    Returns:
      Dictionary containing a list of file information for each set partition,
      and a lookup map for each class to determine its numeric index.

    Raises:
      Exception: If expected files are not found.
    """
    # Make sure the shuffling and picking of unknowns is deterministic.
    random.seed(RANDOM_SEED)
    wanted_words_index = {}
    for index, wanted_word in enumerate(wanted_words):
      wanted_words_index[wanted_word] = index + 2
    data_index = {'validation': [], 'testing': [], 'training': []}
    unknown_index = {'validation': [], 'testing': [], 'training': []}
    all_words = {}
    # Look through all the subfolders to find audio samples
    search_path = os.path.join(data_dir, '*', '*.wav')
    for wav_path in gfile.Glob(search_path):
      _, word = os.path.split(os.path.dirname(wav_path))
      word = word.lower()
      # Treat the '_background_noise_' folder as a special case, since we expect
      # it to contain long audio samples we mix in to improve training.
      if word == BACKGROUND_NOISE_DIR_NAME:
        continue
      all_words[word] = True
      set_index = which_set(wav_path, validation_percentage, testing_percentage)
      # If it's a known class, store its detail, otherwise add it to the list
      # we'll use to train the unknown label.
      if word in wanted_words_index:
        data_index[set_index].append({'label': word, 'file': wav_path})
      else:
        unknown_index[set_index].append({'label': word, 'file': wav_path})
    if not all_words:
      raise Exception('No .wavs found at ' + search_path)
    for index, wanted_word in enumerate(wanted_words):
      if wanted_word not in all_words:
        raise Exception('Expected to find ' + wanted_word +
                        ' in labels but only found ' +
                        ', '.join(all_words.keys()))
    # We need an arbitrary file to load as the input for the silence samples.
    # It's multiplied by zero later, so the content doesn't matter.
    silence_wav_path = data_index['training'][0]['file']
    for set_index in ['validation', 'testing', 'training']:
      set_size = len(data_index[set_index])
      silence_size = int(math.ceil(set_size * silence_percentage / 100))
      for _ in range(silence_size):
        data_index[set_index].append({
            'label': SILENCE_LABEL,
            'file': silence_wav_path
        })
      # Pick some unknowns to add to each partition of the data set.
      random.shuffle(unknown_index[set_index])
      unknown_size = int(math.ceil(set_size * unknown_percentage / 100))
      data_index[set_index].extend(unknown_index[set_index][:unknown_size])
    # Make sure the ordering is random.
    for set_index in ['validation', 'testing', 'training']:
      random.shuffle(data_index[set_index])
    # Prepare the rest of the result data structure.
    words_list = prepare_words_list(wanted_words)
    word_to_index = {}
    for word in all_words:
      if word in wanted_words_index:
        word_to_index[word] = wanted_words_index[word]
      else:
        word_to_index[word] = UNKNOWN_WORD_INDEX
    word_to_index[SILENCE_LABEL] = SILENCE_INDEX
    return wanted_words_index, data_index, unknown_index, all_words, words_list, word_to_index

In [19]:
wanted_words_index, data_index, unknown_index, all_words, words_list, word_to_index = prepare_data_index('/content/GSC_12',
                                                                                                         10,
                                                                                                         10,
                                                                                                         WORDS,
                                                                                                         10,
                                                                                                         10)

In [None]:
wanted_words_index

{'down': 2,
 'go': 3,
 'left': 4,
 'no': 5,
 'off': 6,
 'on': 7,
 'right': 8,
 'stop': 9,
 'up': 10,
 'yes': 11}

In [None]:
train = data_index['training']
len(train)

36923

In [None]:
train

[{'label': 'up', 'file': '/content/GSC_12/up/e9287461_nohash_1.wav'},
 {'label': 'yes', 'file': '/content/GSC_12/yes/2aec99ec_nohash_0.wav'},
 {'label': 'zero', 'file': '/content/GSC_12/zero/5ebc1cda_nohash_3.wav'},
 {'label': '_silence_', 'file': '/content/GSC_12/on/c79159aa_nohash_4.wav'},
 {'label': 'no', 'file': '/content/GSC_12/no/845f8553_nohash_2.wav'},
 {'label': 'right', 'file': '/content/GSC_12/right/f8ad3941_nohash_0.wav'},
 {'label': 'no', 'file': '/content/GSC_12/no/a7200079_nohash_4.wav'},
 {'label': 'no', 'file': '/content/GSC_12/no/14c7b073_nohash_0.wav'},
 {'label': 'right', 'file': '/content/GSC_12/right/5aac2efa_nohash_1.wav'},
 {'label': 'up', 'file': '/content/GSC_12/up/c578beb0_nohash_0.wav'},
 {'label': 'down', 'file': '/content/GSC_12/down/fda46b78_nohash_1.wav'},
 {'label': 'off', 'file': '/content/GSC_12/off/7d149b38_nohash_2.wav'},
 {'label': 'on', 'file': '/content/GSC_12/on/b31ad508_nohash_0.wav'},
 {'label': 'zero', 'file': '/content/GSC_12/zero/51055bda_n

In [None]:
train[1]

{'label': 'yes', 'file': '/content/GSC_12/yes/2aec99ec_nohash_0.wav'}

In [None]:
validation = data_index['validation']
len(validation)

4445

In [21]:
test = data_index['testing']
len(test)

4890

In [32]:
test

[{'label': 'go', 'file': '/content/GSC_12/go/aa80f517_nohash_3.wav'},
 {'label': 'stop', 'file': '/content/GSC_12/stop/db24628d_nohash_2.wav'},
 {'label': 'right', 'file': '/content/GSC_12/right/422d3197_nohash_0.wav'},
 {'label': 'go', 'file': '/content/GSC_12/go/5c8af87a_nohash_2.wav'},
 {'label': '_silence_', 'file': '/content/GSC_12/on/c79159aa_nohash_4.wav'},
 {'label': 'up', 'file': '/content/GSC_12/up/aa80f517_nohash_0.wav'},
 {'label': 'down', 'file': '/content/GSC_12/down/db24628d_nohash_0.wav'},
 {'label': 'off', 'file': '/content/GSC_12/off/8c7f81df_nohash_1.wav'},
 {'label': 'zero', 'file': '/content/GSC_12/zero/e49428d9_nohash_3.wav'},
 {'label': 'off', 'file': '/content/GSC_12/off/fdb5155e_nohash_0.wav'},
 {'label': '_silence_', 'file': '/content/GSC_12/on/c79159aa_nohash_4.wav'},
 {'label': 'up', 'file': '/content/GSC_12/up/d962e5ac_nohash_0.wav'},
 {'label': 'five', 'file': '/content/GSC_12/five/1b4c9b89_nohash_3.wav'},
 {'label': 'stop', 'file': '/content/GSC_12/stop/5

In [22]:
i = 0
for data in test:
    if data['label'] == '_silence_':
        i+=1
        sys.stdout.write(f'\r {i}')

 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 2

In [28]:
for data in test:
    if data['file'] == '/content/GSC_12/backward/0cb74144_nohash_2.wav':
        print('i lov3 u')
        break

In [29]:
import IPython.display as ipd

In [31]:
ipd.Audio('/content/GSC_12/backward/0cb74144_nohash_2.wav', rate = 16000)

In [None]:
train_unknown = unknown_index['training']
len(train_unknown)

54074

In [None]:
all_words

{'house': True,
 'bed': True,
 'four': True,
 'three': True,
 'two': True,
 'wow': True,
 'learn': True,
 'on': True,
 'off': True,
 'right': True,
 'yes': True,
 'sheila': True,
 'visual': True,
 'no': True,
 'one': True,
 'down': True,
 'cat': True,
 'dog': True,
 'nine': True,
 'marvin': True,
 'up': True,
 'zero': True,
 'five': True,
 'bird': True,
 'backward': True,
 'go': True,
 'happy': True,
 'stop': True,
 'seven': True,
 'forward': True,
 'six': True,
 'eight': True,
 'follow': True,
 'tree': True,
 'left': True}

In [None]:
words_list

['_silence_',
 '_unknown_',
 'down',
 'go',
 'left',
 'no',
 'off',
 'on',
 'right',
 'stop',
 'up',
 'yes']

In [None]:
len(words_list)

12

In [None]:
word_to_index

{'house': 1,
 'bed': 1,
 'four': 1,
 'three': 1,
 'two': 1,
 'wow': 1,
 'learn': 1,
 'on': 7,
 'off': 6,
 'right': 8,
 'yes': 11,
 'sheila': 1,
 'visual': 1,
 'no': 5,
 'one': 1,
 'down': 2,
 'cat': 1,
 'dog': 1,
 'nine': 1,
 'marvin': 1,
 'up': 10,
 'zero': 1,
 'five': 1,
 'bird': 1,
 'backward': 1,
 'go': 3,
 'happy': 1,
 'stop': 9,
 'seven': 1,
 'forward': 1,
 'six': 1,
 'eight': 1,
 'follow': 1,
 'tree': 1,
 'left': 4,
 '_silence_': 0}