##### Adapted from https://github.com/ShawnHymel/tflite-speech-recognition


## Environment setup


In [29]:
!pip install python_speech_features
!git clone https://github.com/AllenDowney/ThinkDSP.git 
!pip install playsound

fatal: destination path 'ThinkDSP' already exists and is not an empty directory.


## Mounting access to Google Drive

In [30]:
# Colab specific: the dataset is stored on Google Drive.
# Root folder of the dataset; the cells below treat each sub-folder as one class.
data_path = '/content/drive/MyDrive/Deep Learning/Datasets/speech commands'

from google.colab import drive

# Make the Drive contents reachable under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The audio data is the Google Speech Commands dataset, described at https://ai.googleblog.com/2017/08/launching-speech-commands-dataset.html

## Imports

In [31]:
# necessary imports
from os import listdir
from os.path import isdir, join
import librosa
import random
import numpy as np
import matplotlib.pyplot as plt
import python_speech_features
import sys
sys.path.insert(0, 'ThinkDSP/code/') 
import thinkdsp
import IPython

## Discover classes and samples per class

In [32]:
# validate data path: print every sub-folder found directly under the dataset root
for entry in listdir(data_path):
  if not isdir(join(data_path, entry)):
    continue  # plain files at the top level are not class folders
  print(entry)

go
_background_noise_
stop
four
five
down
left
eight
zero
no
three


In [33]:
# create target list: one entry per sub-folder of the dataset root
targets = list(filter(lambda entry: isdir(join(data_path, entry)),
                      listdir(data_path)))
print(targets)

['go', '_background_noise_', 'stop', 'four', 'five', 'down', 'left', 'eight', 'zero', 'no', 'three']


In [34]:
# number of samples per category
# (every directory entry is counted, whatever its file type)
num_samples = 0
for category in targets:
  count = len(listdir(join(data_path, category)))
  num_samples += count
  print(category, count)
print("total samples: ", num_samples)

go 2372
_background_noise_ 7
stop 2390
four 2372
five 2357
down 2359
left 2353
eight 2352
zero 1884
no 0
three 0
total samples:  18446


## Exclude classes

In [35]:
# removing any category that should not become a class
# NOTE(review): '_background_noise_' is not listed here, so it stays in `targets`
# and receives a class index of its own - confirm that this is intended.
excluded = ['tree','three','no']
for unwanted in excluded:
  try:
    targets.remove(unwanted)  # in-place, so other references to this list see it too
  except ValueError:
    pass  # this category is not present in the dataset folder

print(targets)

['go', '_background_noise_', 'stop', 'four', 'five', 'down', 'left', 'eight', 'zero']


In [36]:
# settings: values read by the feature-extraction cells below
target_list = targets  # same list object as `targets`; extract() indexes it with the integer label
features_file = '/content/drive/MyDrive/Deep Learning/all_targets_mfcc.npz'  # archive written by np.savez at the end
percent_keep = 0.1 # fraction of the shuffled samples to keep; 1.0 keeps everything (lower is faster for prototyping)
val_ratio = 0.1  # fraction of the kept samples used as validation set
test_ratio = 0.1  # fraction of the kept samples used as test set
sample_rate = 8000  # passed as `sr` to librosa.load in calc_mfcc
num_mfcc = 16  # passed as `numcep` to the MFCC computation in calc_mfcc
len_mfcc = 16  # required size of the second MFCC axis; extract() drops samples that differ

In [37]:
# make list of file names and labels: one sub-list / label array per class,
# where the label is the position of the class in `targets`
filenames = []
labels = []
for class_index, class_name in enumerate(targets):
  class_dir = join(data_path, class_name)
  print(class_dir)
  class_files = listdir(class_dir)
  filenames.append(class_files)
  # float array holding the class index once per file of this class
  labels.append(np.full(len(class_files), float(class_index)))

/content/drive/MyDrive/Deep Learning/Datasets/speech commands/go
/content/drive/MyDrive/Deep Learning/Datasets/speech commands/_background_noise_
/content/drive/MyDrive/Deep Learning/Datasets/speech commands/stop
/content/drive/MyDrive/Deep Learning/Datasets/speech commands/four
/content/drive/MyDrive/Deep Learning/Datasets/speech commands/five
/content/drive/MyDrive/Deep Learning/Datasets/speech commands/down
/content/drive/MyDrive/Deep Learning/Datasets/speech commands/left
/content/drive/MyDrive/Deep Learning/Datasets/speech commands/eight
/content/drive/MyDrive/Deep Learning/Datasets/speech commands/zero


In [38]:
# check labels: one length per class, then the label arrays themselves
for class_labels in labels:
  print(len(class_labels))
print(labels)

2372
7
2390
2372
2357
2359
2353
2352
1884
[array([0., 0., 0., ..., 0., 0., 0.]), array([1., 1., 1., 1., 1., 1., 1.]), array([2., 2., 2., ..., 2., 2., 2.]), array([3., 3., 3., ..., 3., 3., 3.]), array([4., 4., 4., ..., 4., 4., 4.]), array([5., 5., 5., ..., 5., 5., 5.]), array([6., 6., 6., ..., 6., 6., 6.]), array([7., 7., 7., ..., 7., 7., 7.]), array([8., 8., 8., ..., 8., 8., 8.])]


In [39]:
# flatten an arbitrarily nested plain Python list
def flatten(listin = []):
  """Return the non-list items of `listin` as one flat list, in depth-first order.

  Only `list` instances are descended into; any other value (tuple, array,
  string, ...) is kept as a single item. A non-list argument is returned
  wrapped in a one-element list, and an empty list gives an empty list.

  The traversal uses an explicit stack of iterators instead of recursing on
  `listin[1:]`, so long lists no longer hit Python's recursion limit and no
  intermediate slices are copied. The default argument is never mutated.
  """
  if not isinstance(listin, list):
    return [listin]
  flat = []
  pending = [iter(listin)]  # iterators of the lists that are currently open
  while pending:
    for item in pending[-1]:
      if isinstance(item, list):
        # descend into the nested list; the outer iterator keeps its position
        pending.append(iter(item))
        break
      flat.append(item)
    else:
      # innermost list exhausted: resume its parent
      pending.pop()
  return flat

In [40]:
# flatten numpy lists
def flattennp (listin=[]):
  """Concatenate the items of every sub-sequence of `listin` into one list.

  Exactly one nesting level is removed; each element of `listin` must be
  iterable (for example a list or a 1-D numpy array).
  """
  merged = []
  for chunk in listin:
    merged.extend(chunk)
  return merged

In [41]:
# flatten lists of names and labels: one level of nesting (the per-class grouping) is removed
# NOTE: the inputs are overwritten, so this cell must run exactly once after the
# cell that builds the per-class lists.
filenames = [file_name for class_files in filenames for file_name in class_files]
labels = [label for class_labels in labels for label in class_labels]

In [42]:
# files and labels associated together, so that shuffling keeps every file
# name next to its own label
data_labels = list(zip(filenames, labels))
# Shuffle with a dedicated, fixed-seed generator: for a given input order the
# kept subset and the train/val/test split are then the same on every run,
# and the global `random` state is left untouched.
random.Random(42).shuffle(data_labels)
# back to two parallel tuples
filenames,labels = zip(*data_labels)

In [43]:
# keep percentage of samples: only the first `percent_keep` share of the shuffled data is used
# NOTE: the inputs are overwritten, so re-running this cell shrinks the data again.
n = len(filenames)
print(n)
n_keep = int(n * percent_keep)
filenames, labels = filenames[:n_keep], labels[:n_keep]
print(len(filenames))

18446
1844


In [44]:
# validation and test set sizes (fractions of the kept samples, rounded down)
val_size, test_size = (int(len(filenames) * ratio)
                       for ratio in (val_ratio, test_ratio))

In [45]:
# break the dataset down into validation, test and training parts, in that
# order: [0, val_size) -> validation, [val_size, held_out) -> test, rest -> train
held_out = val_size + test_size

filenames_val, y_val = filenames[:val_size], labels[:val_size]
filenames_test, y_test = filenames[val_size:held_out], labels[val_size:held_out]
filenames_train, y_train = filenames[held_out:], labels[held_out:]

# sanity check: file names and labels of each part must have equal length
for part_files, part_labels in ((filenames_val, y_val),
                                (filenames_test, y_test),
                                (filenames_train, y_train)):
  print(len(part_files), len(part_labels))

184 184
184 184
1476 1476


In [46]:
# calculate MFCC
def calc_mfcc (path):
  """Load the audio file at `path` and return its MFCC matrix.

  The file is read with librosa using the notebook-level `sample_rate`, and
  `num_mfcc` cepstral coefficients are computed per analysis window. The
  result of python_speech_features is transposed before it is returned, so
  the first axis has `num_mfcc` entries and the second axis has one entry per
  window.
  """
  signal, fs = librosa.load(path, sr = sample_rate)

  mfcc_options = dict(samplerate=fs,
                      winlen=0.256,        # analysis window length
                      winstep=0.05,        # step between successive windows
                      numcep=num_mfcc,
                      nfilt=26,
                      nfft=2048,
                      preemph=0.0,         # 0 disables the pre-emphasis filter
                      ceplifter=0,         # 0 disables liftering
                      appendEnergy=False,
                      winfunc=np.hanning)
  frames_by_coeff = python_speech_features.base.mfcc(signal, **mfcc_options)
  return frames_by_coeff.T

In [47]:
# remove faulty samples
def extract(files_in,y_in):
  """Compute MFCC features for `files_in` and keep only well-formed samples.

  `y_in[k]` is the label of `files_in[k]`; its integer value selects the class
  folder name from `target_list`. Files whose path does not end in '.wav' are
  skipped without being counted. A sample whose MFCC matrix does not have
  `len_mfcc` columns is reported on stdout and counted as dropped.

  Returns a tuple (features, labels, dropped_count), where `features` and
  `labels` are parallel lists of the kept samples.
  """
  kept_features, kept_labels = [], []
  dropped = 0
  for position, file_name in enumerate(files_in):
    label = y_in[position]
    # <data_path>/<class folder>/<file name>
    path = join(data_path, target_list[int(label)], file_name)

    if path.endswith('.wav'):
      mfcc = calc_mfcc(path)
      if mfcc.shape[1] != len_mfcc:
        print('Dropped:', position, mfcc.shape)
        dropped += 1
      else:
        kept_features.append(mfcc)
        kept_labels.append(label)
  return kept_features, kept_labels, dropped

In [48]:
# create valid feature sets for train, validation and test, and report the
# percentage of samples of each split that extract() dropped
# (files skipped for not ending in '.wav' are not part of that count).
# max(..., 1) keeps the percentage defined (0.0) for an empty split instead of
# raising ZeroDivisionError after the extraction has already run.
x_tr, y_tr, p = extract(filenames_train, y_train)
print('Removed', 100*(p/max(len(y_train), 1)),'%')
x_va, y_va, p = extract(filenames_val, y_val)
print('Removed', 100*(p/max(len(y_val), 1)),'%')
x_te, y_te, p = extract(filenames_test, y_test)
print('Removed', 100*(p/max(len(y_test), 1)),'%')

Dropped: 4 (16, 13)
Dropped: 8 (16, 14)
Dropped: 18 (16, 13)
Dropped: 34 (16, 14)
Dropped: 37 (16, 13)
Dropped: 82 (16, 13)
Dropped: 88 (16, 13)
Dropped: 135 (16, 10)
Dropped: 148 (16, 11)
Dropped: 181 (16, 15)
Dropped: 241 (16, 12)
Dropped: 248 (16, 14)
Dropped: 256 (16, 13)
Dropped: 273 (16, 13)
Dropped: 306 (16, 13)
Dropped: 313 (16, 15)
Dropped: 317 (16, 10)
Dropped: 342 (16, 15)
Dropped: 347 (16, 14)
Dropped: 402 (16, 11)
Dropped: 436 (16, 15)
Dropped: 458 (16, 14)
Dropped: 475 (16, 15)
Dropped: 483 (16, 11)
Dropped: 486 (16, 14)
Dropped: 487 (16, 14)
Dropped: 569 (16, 8)
Dropped: 576 (16, 14)
Dropped: 588 (16, 13)
Dropped: 591 (16, 10)
Dropped: 598 (16, 14)
Dropped: 600 (16, 14)
Dropped: 619 (16, 15)
Dropped: 622 (16, 13)
Dropped: 647 (16, 13)
Dropped: 666 (16, 14)
Dropped: 669 (16, 12)
Dropped: 683 (16, 13)
Dropped: 695 (16, 9)
Dropped: 764 (16, 13)
Dropped: 768 (16, 8)
Dropped: 771 (16, 13)
Dropped: 772 (16, 14)
Dropped: 777 (16, 12)
Dropped: 781 (16, 14)
Dropped: 784 (16, 12)


In [49]:
# Save features for later use: all six arrays go into one .npz archive,
# stored under the keyword names given here
np.savez(features_file, **dict(x_tr=x_tr, y_tr=y_tr,
                               x_va=x_va, y_va=y_va,
                               x_te=x_te, y_te=y_te))