# Keyword speech recognition

The idea is to use a Convolutional Neural Network to extract features from the framed spectrum and classify the audio data.


## 1. Preprocessing

In [7]:
%matplotlib inline
import os
import re
from glob import glob

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np

from scipy.io import wavfile
import scipy
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
import IPython.display as ipd

import tensorflow as tf


from sklearn.model_selection import train_test_split
# from tensorflow.contrib.learn.python.learn.learn_io.generator_io import generator_input_fn

import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization

from keras.optimizers import Adam
from scipy import signal

sns.set(style="whitegrid", color_codes=True)
init_notebook_mode(connected=True)

Using TensorFlow backend.


In [11]:
# Root of the training data; it is expected to contain `audio/`, `validation_list.txt` and `testing_list.txt`.
DATA_PATH = '/audio_sample' # path to training data
# Scratch directory used below for generated clips, checkpoints and saved models.
TEMP_DATA_PATH = '/output/temp'

In [12]:
os.listdir(DATA_PATH)

['audio',
 'README.md',
 '.DS_Store',
 'testing_list.txt',
 'LICENSE',
 'validation_list.txt']

In [10]:
TRAIN_DATA_PATH = DATA_PATH+ '/audio/'
# Every sub-directory of the audio folder is treated as one label.
folders = [
    entry for entry in os.listdir(TRAIN_DATA_PATH)
    if os.path.isdir(TRAIN_DATA_PATH + entry)
]
print(folders)
print('There are totally ' + str(len(folders)) + ' labels in the training dataset.')

['no', 'one', 'on', 'marvin', 'nine', 'eight', '_background_noise_', 'bed', 'sheila', 'up', 'right', 'wow', 'four', 'cat', 'stop', 'zero', 'six', 'dog', 'three', 'off', 'five', 'down', 'go', 'happy', 'two', 'tree', 'seven', 'house', 'bird', 'yes', 'left']
There are totally 31 labels in the training dataset.


However, from the competition note, there are only 12 labels we have to pay attention to.
> **Note**: There are only 12 possible labels for the Test set: yes, no, up, down, left, right, on, off, stop, go, silence, unknown.

> The unknown label should be used for a command that is not one of the first 10 labels or that is not silence.

In [43]:
# The 12 labels accepted by the competition, in the order given by the competition note.
_LABEL_NOTE = 'yes, no, up, down, left, right, on, off, stop, go, silence, unknown'
POSSIBLE_LABELS = [name.replace(' ', '') for name in _LABEL_NOTE.split(',')]
print(POSSIBLE_LABELS)

['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'unknown']


 The file *validation_list.txt* specifies the data for validation.

In [44]:
def _read_name_list(list_path):
    """Return the lines of a split-list file with trailing whitespace removed."""
    with open(list_path, 'r') as handle:
        return [entry.rstrip() for entry in handle]


VALIDATION_LIST_FILE_PATH = DATA_PATH + '/validation_list.txt'
VALIDATION_FILE_NAMES = _read_name_list(VALIDATION_LIST_FILE_PATH)

TEST_LIST_FILE_PATH = DATA_PATH + '/testing_list.txt'
TEST_FILE_NAMES = _read_name_list(TEST_LIST_FILE_PATH)


In [45]:
# Raw string: `\/` and `\w` are regex escapes, not valid Python string escapes.
# Group 1 is the parent directory name, group 2 the file-name prefix before the first underscore.
sample_matcher = re.compile(r"(?:.+\/)?(\w+)\/([^_]+)_.+wav")
def get_info_from_path(path):
    """
    Derive the training label and the clip id from a wav file path.

    Args:
        path: path shaped like ``[prefix/]<folder>/<id>_<rest>wav``.

    Returns:
        (label, id): ``label`` is the folder name when it is one of
        POSSIBLE_LABELS, 'silence' for the '_background_noise_' folder and
        'unknown' for every other folder; ``id`` is the file-name prefix
        before the first underscore.

    Raises:
        ValueError: if ``path`` does not match the expected layout.
    """
    r = sample_matcher.match(path)
    if not r:
        raise ValueError(path + ' is not valid file path')
    label = r.group(1)
    uid = r.group(2)  # renamed from `id` so the builtin is not shadowed

    if label not in POSSIBLE_LABELS:
        if label == '_background_noise_':
            label = 'silence'
        else:
            label = 'unknown'
    return label, uid

In [46]:
def _index_split_files(file_names):
    """Parse each listed path into (label, id, path) and collect the ids seen.

    Paths that get_info_from_path rejects are printed and skipped.
    """
    files, id_set = [], {}
    for name in file_names:
        try:
            label, uid = get_info_from_path(name)
        except ValueError as e:
            print(e)
            continue
        files.append((label, uid, name))
        id_set[uid] = True
    return files, id_set


VALIDATION_FILES, VALIDATIN_ID_SET = _index_split_files(VALIDATION_FILE_NAMES)
TEST_FILES, TEST_ID_SET = _index_split_files(TEST_FILE_NAMES)

In [53]:
# A file belongs to the training set when its id appears in neither the
# validation list nor the test list.
# Raw string: `\/` and `\w` are regex escapes, not valid Python string escapes.
# Group 2 is the path relative to the audio folder (`<folder>/<file>`).
file_path_matcher = re.compile(r"(.+\/)?(\w+\/[^_]+_.+wav)")
TRAIN_FILES = []
SAMPLE_FILES = glob(os.path.join(DATA_PATH, 'audio/*/*wav'))
for file in SAMPLE_FILES:
    try:
        label, id = get_info_from_path(file)
    except ValueError as e:
        print(e)
        continue
    if (id not in VALIDATIN_ID_SET) and (id not in TEST_ID_SET):
        TRAIN_FILES.append((label, id, file_path_matcher.match(file).group(2)))

In [48]:
# Sanity check: the three splits should add up to the number of files on disk.
n_train, n_valid, n_test = len(TRAIN_FILES), len(VALIDATION_FILES), len(TEST_FILES)
print('Number of sample')
print('Total: %d' % len(SAMPLE_FILES))
print('Train: %d' % n_train)
print('Validation: %d' % n_valid)
print('Test: %d' % n_test)
number_sum = n_train + n_valid + n_test
print('{} + {} + {} = {}'.format(n_train, n_valid, n_test, number_sum))

Number of sample
Total: 64727
Train: 51094
Validation: 6798
Test: 6835
51094 + 6798 + 6835 = 64727


### look at the data

#### a. distribution

In [13]:
# Count how many train + validation files carry each label.
distribution = {}
for label, _, _ in TRAIN_FILES + VALIDATION_FILES:
    distribution[label] = distribution.get(label, 0) + 1

data = [go.Bar(x=list(distribution.keys()), y=list(distribution.values()))]

iplot(data, filename='distribution')

When we look at the distribution of the data, **unknown** and **silence** are two special labels: **unknown** has far more data than the others, while **silence** has only 6 samples.


Ideas to handle the two special cases:
- train a model to separate **silence** from the rest. Then train another one to separate **unknown** from the keywords that we are interested in.

- bootstrap the other types

- throw away some of unknown words

In [14]:
# Same count as above, restricted to the validation files.
distribution = {}
for label, _, _ in VALIDATION_FILES:
    distribution[label] = distribution.get(label, 0) + 1

data = [go.Bar(x=list(distribution.keys()), y=list(distribution.values()))]

iplot(data, filename='distribution')

The weird thing is that the validation set does not contain **silence**. So, in order to train the silence detection model, I have to use the training set for both training and validation.

#### b. audio length

In [13]:
# Read every train/validation clip once and record its length in samples.
sample_length_list = [
    len(wavfile.read(TRAIN_DATA_PATH + file)[1])
    for _, _, file in TRAIN_FILES + VALIDATION_FILES
]


Chunk (non-data) not understood, skipping it.



In [14]:
sample_rate = 16000
sample_length_list = np.array(sample_length_list)
# Clips holding more than `sample_rate` samples.
longer_than_rate = sample_length_list > sample_rate
sample_length_list[longer_than_rate]

array([ 960000,  960000,  988891, 1522930,  980062,  978488])

In [15]:
# Look up which files the over-long clips are.
train_valid_files = np.array(TRAIN_FILES + VALIDATION_FILES)
is_large = sample_length_list > sample_rate
large_audio_files = train_valid_files[is_large]
print(large_audio_files)

[['silence' 'pink' '_background_noise_/pink_noise.wav']
 ['silence' 'white' '_background_noise_/white_noise.wav']
 ['silence' 'dude' '_background_noise_/dude_miaowing.wav']
 ['silence' 'doing' '_background_noise_/doing_the_dishes.wav']
 ['silence' 'exercise' '_background_noise_/exercise_bike.wav']
 ['silence' 'running' '_background_noise_/running_tap.wav']]


There are some really long audio files. Fortunately, there are not too many of them, and they are all background noise. Let's listen to one of them.

In [16]:
ipd.Audio(filename=TRAIN_DATA_PATH + large_audio_files[4][2])

**Some ideas to handle it:**
- chop the long file into 1 second length
- mix different types of noise
- mix noise with audio file

In [None]:
sample_rate = 16000
sample_length_list = np.array(sample_length_list)
# Clips holding fewer than half of `sample_rate` samples.
shorter_than_half_rate = sample_length_list < 0.5 * sample_rate
sample_length_list[shorter_than_half_rate]

There are some very short audio files, but also not too many.

In [None]:
# Histogram of clip lengths, leaving out clips above 3 * sample_rate samples.
within_three_rates = sample_length_list <= 3 * sample_rate
truncated_sample_length = sample_length_list[within_three_rates]
data = [go.Histogram(x=truncated_sample_length)]
iplot(data)

In [None]:
# Number of clips with fewer than `sample_rate` samples.
small_sample_num = int((sample_length_list < sample_rate).sum())
print('There are about %d samples smaller than the sample rate' % small_sample_num)

**Some ideas to handle short audio samples:**
- pad them with zeros
- stretch the audio to 1 second
- decompose it into phonemes

In [35]:
def pad_to_middle(sample, sample_rate=16000):
    """
    Zero-pad a 1-D clip on both sides so it is `sample_rate` samples long.

    The clip is centred; when the padding is odd, the extra zero goes on the
    right.

    Args:
        sample: 1-D array of audio samples.
        sample_rate: target length in samples.

    Returns:
        A 1-D array of length `sample_rate`. A clip that is already longer
        than `sample_rate` is returned unchanged, because np.pad rejects
        negative pad widths.
    """
    pad_num = sample_rate - len(sample)
    if pad_num < 0:
        # Nothing to pad; the caller decides how to shorten over-long clips.
        return np.asarray(sample)
    left = pad_num // 2
    right = pad_num - left
    return np.pad(sample, (left, right), 'constant')

In [36]:
def chop_data(sample, sample_rate=16000):
    """
    Split a 1-D clip into consecutive chunks of `sample_rate` samples.

    The last chunk is zero-padded on the right up to `sample_rate`.

    Args:
        sample: 1-D array of audio samples.
        sample_rate: chunk length in samples.

    Returns:
        A list of 1-D arrays, each `sample_rate` long; an empty list for an
        empty clip.
    """
    # Integer ceiling division; avoids the `np.int` alias removed from NumPy.
    num = -(-len(sample) // sample_rate)
    if num == 0:
        # np.split cannot split into zero sections.
        return []
    pad_num = num * sample_rate - len(sample)
    return np.split(np.pad(sample, (0, pad_num), 'constant'), num)

In [37]:
def random_select(sample, sample_rate=16000):
    """
    Return a random contiguous window of `sample_rate` samples from a clip.

    Args:
        sample: 1-D sequence of audio samples.
        sample_rate: window length in samples.

    Returns:
        `sample[beg:beg + sample_rate]` for a uniformly drawn start `beg`.

    Raises:
        ValueError: if the clip is shorter than `sample_rate`.
    """
    last_start = len(sample) - sample_rate
    if last_start < 0:
        raise ValueError(
            'sample has %d values, fewer than the requested %d' % (len(sample), sample_rate))
    # randint's upper bound is exclusive, so add 1: a clip of exactly
    # `sample_rate` values is valid, and the final window can be drawn too.
    beg = np.random.randint(0, last_start + 1)
    return sample[beg: beg + sample_rate]

In [38]:
def get_label(label):
    """Map a label name to the silence detector's target: 1 for 'silence', 0 otherwise."""
    return 1 if label == 'silence' else 0

In [19]:
# Read every training clip into memory as (label, id, samples).
# The list is rebuilt from scratch each time this cell runs.
train_data_list = []
for label, id, fname in TRAIN_FILES:
    sample_rate, sample = wavfile.read(TRAIN_DATA_PATH + fname)
    train_data_list.append((label, id, sample))


Chunk (non-data) not understood, skipping it.



In [20]:
# extract the long silence samples
silence_samples = list(filter(lambda t: t[0] == 'silence', train_data_list))
non_silence_samples = list(filter(lambda t: t[0] != 'silence', train_data_list))

In [21]:
# Output folders for the 1-second silence clips cut from the noise recordings.
SLICED_SILENCE_TRAIN_FILES_PATH = os.path.join(TEMP_DATA_PATH, 'augumented_data/short_silence_train')
SLICED_SILENCE_VALID_FILES_PATH = os.path.join(TEMP_DATA_PATH, 'augumented_data/short_silence_valid')

for sliced_dir in (SLICED_SILENCE_TRAIN_FILES_PATH, SLICED_SILENCE_VALID_FILES_PATH):
    os.makedirs(sliced_dir, exist_ok=True)

In [22]:
# Write one random 1-second clip per non-silence training sample, drawn from
# the first 80% of a randomly chosen noise recording (the training share).
for i in range(len(non_silence_samples)):
    index = np.random.randint(0, len(silence_samples))
    title, noise_id, noise = silence_samples[index]
    if len(noise) < 16000:
        print(noise_id)

    end = int(len(noise) * 0.8)
    random_sample = random_select(noise[:end], 16000)
    file_name = title + '_' + str(index) + '_' + str(i) + '.wav'
    wavfile.write(os.path.join(SLICED_SILENCE_TRAIN_FILES_PATH, file_name), 16000, random_sample)

In [23]:
# Write one random 1-second clip per non-silence training sample, drawn from
# the last 20% of a randomly chosen noise recording (the validation share),
# so validation silence never overlaps the audio used for training silence.
if True:
    for i in range(len(non_silence_samples)):
        index = np.random.randint(0, len(silence_samples))
        if len(silence_samples[index][2]) < 16000:
            # Report the recording that was actually picked. `i` counts
            # non-silence samples and is not a valid index into silence_samples.
            print(silence_samples[index][1])
        beg = int(len(silence_samples[index][2]) * 0.8)
        random_sample = random_select(silence_samples[index][2][beg:], 16000)
        title = silence_samples[index][0]
        file_name = title + '_' + str(index) + '_' + str(i) + '.wav'
        wavfile.write(os.path.join(SLICED_SILENCE_VALID_FILES_PATH, file_name), 16000, random_sample)

In [24]:
# Load the generated 1-second silence clips back into memory.
def _load_sliced_silence(directory):
    """Read every wav in `directory` as ('silence', id, samples)."""
    loaded = []
    for file in glob(os.path.join(directory, '*wav')):
        _, sample = wavfile.read(file)
        r = sample_matcher.match(file)
        loaded.append(('silence', r.group(2), sample))
    return loaded


silence_data_train = _load_sliced_silence(SLICED_SILENCE_TRAIN_FILES_PATH)
silence_data_valid = _load_sliced_silence(SLICED_SILENCE_VALID_FILES_PATH)

In [25]:
# Read every validation clip into memory as (label, id, samples).
# The list is rebuilt from scratch each time this cell runs.
valid_data_list = []
for label, id, fname in VALIDATION_FILES:
    sample_rate, sample = wavfile.read(TRAIN_DATA_PATH + fname)
    valid_data_list.append((label, id, sample))

In [26]:
# Silence-detector datasets: spoken clips followed by the generated silence clips.
# there could be some more advanced way to mixup
silence_model_train_data = [*non_silence_samples, *silence_data_train]

# mixup can be used here
silence_model_valid_data = [*valid_data_list, *silence_data_valid]

In [27]:
def silence_train_input_genetator(data_list=silence_model_train_data, sample_rate=16000):
    """
    Build the training generator for the silence detector.

    `data_list` is shuffled in place once, when this factory is called. The
    returned generator then walks it and yields one dict per one-second clip:
    short clips are zero-padded, long clips are chopped into consecutive
    one-second chunks.

    Args:
        data_list: list of (label, id, samples) tuples; samples are divided by
            the int16 maximum before use. NOTE: the default is bound when this
            cell is executed.
        sample_rate: number of samples in one yielded clip.

    Returns:
        A zero-argument generator function yielding
        ``{'target': np.int32 (1 = silence, 0 = other), 'wav': float32 array}``.
    """
    # Shuffle the list that is actually iterated, not the module-level default.
    np.random.shuffle(data_list)

    def generator():
        for label, uid, sample in data_list:
            try:
                sample = sample.astype(np.float32) / np.iinfo(np.int16).max
                if len(sample) < sample_rate:
                    samples = [pad_to_middle(sample, sample_rate)]
                elif len(sample) > sample_rate:
                    samples = chop_data(sample, sample_rate)
                else:
                    samples = [sample]
                for one_second_sample in samples:
                    yield dict(
                        target=np.int32(get_label(label)),
                        wav=one_second_sample
                    )
            except Exception as err:
                # Best effort: report the offending clip and keep going.
                print(err, label, uid)

    return generator

In [28]:
def silence_eval_input_genetator(data_list=silence_model_valid_data, sample_rate=16000):
    """
    Build the evaluation generator for the silence detector.

    Unlike the training generator, nothing is shuffled, and a clip longer
    than one second contributes only its first one-second chunk.

    Args:
        data_list: list of (label, id, samples) tuples; samples are divided by
            the int16 maximum before use. NOTE: the default is bound when this
            cell is executed.
        sample_rate: number of samples in one yielded clip.

    Returns:
        A zero-argument generator function yielding
        ``{'target': np.int32 (1 = silence, 0 = other), 'wav': float32 array}``.
    """
    def generator():
        for label, uid, sample in data_list:
            try:
                sample = sample.astype(np.float32) / np.iinfo(np.int16).max
                if len(sample) < sample_rate:
                    samples = [pad_to_middle(sample, sample_rate)]
                elif len(sample) > sample_rate:
                    samples = [chop_data(sample, sample_rate)[0]]
                else:
                    samples = [sample]
                for one_second_sample in samples:
                    yield dict(
                        target=np.int32(get_label(label)),
                        wav=one_second_sample
                    )
            except Exception as err:
                # Best effort: report the offending clip and keep going.
                print(err, label, uid)

    return generator


## 2. CNN with spectrum
The idea is to chop the audio data into frames, take the frequency information from each frame, and feed it into a CNN. As mentioned before, the plan is to build 3 models: one to detect silence, one to detect unknown words, and one to classify the keywords, and finally to combine them.

**a. the first model is to detect the silence**

In [None]:
# NOTE(review): this rebinds the module-level name `signal`, which the first
# cell imported from scipy; from here on `signal` is tf.contrib.signal.
from tensorflow.contrib import layers, signal
def silence_model_handler(features, labels, mode, params, config):
    """
    Estimator model_fn for the binary silence detector.

    The waveform in features['wav'] is turned into an STFT magnitude and
    passed through three conv + max-pool stages (8, 16 and 32 filters),
    two dense layers (128 and 64 units) and a 2-way logits layer.

    Args:
        features: dict with the key 'wav'.
        labels: integer class ids; used in TRAIN and EVAL mode only.
        mode: a tf.estimator.ModeKeys value.
        params: dict with the key 'keep_prob', the dropout keep probability
            applied in TRAIN mode.
        config: not used inside this function.

    Returns:
        tf.estimator.EstimatorSpec for the requested mode.

    NOTE(review): `specs` is only assigned in the TRAIN, EVAL and PREDICT
    branches; any other mode would fail at the return statement.
    """
    
    ### ================ Define the CNN ==========================
    sample = features['wav']
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    # STFT magnitude; 400 and 160 are presumably frame length and frame step in samples - TODO confirm
    x = tf.abs(signal.stft(sample, 400, 160))
    # Add a trailing axis of size 1 (used as the channel axis by conv2d).
    x = tf.stack([x], axis=3)
    x = tf.to_float(x)
    x = layers.batch_norm(x, is_training=is_training)
    for i in range(3):
        # 8, 16, 32 filters; kernel size 3, stride 1; batch norm after each conv.
        x = layers.conv2d(
            x, 8 * (2 ** i), 3, 1,
            normalizer_fn=layers.batch_norm,
            normalizer_params={'is_training': is_training}
        )
        x = layers.max_pool2d(x, 2, 2)
    x = layers.flatten(x)
    x = tf.layers.dense(x, 128, activation=tf.nn.relu)
    # Dropout is only active in TRAIN mode; otherwise keep_prob is forced to 1.0.
    x = layers.dropout(x, keep_prob=params['keep_prob'] if is_training else 1.0)
    x = tf.layers.dense(x, 64, activation=tf.nn.relu)
    logits = tf.layers.dense(x, 2, activation=None)

    ### ===========================================================
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
        train_op=layers.optimize_loss(
            loss=loss,
            global_step=tf.train.get_global_step(),
            learning_rate=0.001,
            optimizer=tf.train.AdamOptimizer
        )
        
        specs = {'mode': mode, 'loss': loss, 'train_op': train_op}
    
    if mode == tf.estimator.ModeKeys.EVAL:
        prediction = tf.argmax(logits, axis=-1)
        # Mean per-class accuracy over the 2 classes.
        acc, acc_op = tf.metrics.mean_per_class_accuracy(
            labels, prediction, 2)
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
        specs = dict(
                mode=mode,
                loss=loss,
                eval_metric_ops=dict(
                    acc=(acc, acc_op),
                )
        )
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        # Return the predicted class id together with the input waveform.
        predictions = {
            'label': tf.argmax(logits, axis=-1),  
            'sample': features['wav'], 
        }
        specs = {
            'mode': mode,
            'predictions': predictions,
        }
    return tf.estimator.EstimatorSpec(**specs)

In [None]:
# Estimator set-up for the silence detector; checkpoints and summaries go to OUT_PATH.
OUT_PATH = os.path.join('./', 'out')
os.makedirs(OUT_PATH, exist_ok=True)
params = {
    # Dropout KEEP probability used in TRAIN mode. It must be a float;
    # the previous value `True` is not a probability and disabled dropout.
    # 0.5 mirrors the Dropout(0.5) used by the Keras models in this notebook.
    'keep_prob': 0.5
}
run_config = tf.contrib.learn.RunConfig(
    model_dir=OUT_PATH, 
    log_step_count_steps=10,
    save_summary_steps=10
)
silence_model = tf.estimator.Estimator(
    model_fn=silence_model_handler, params=params, config=run_config)

In [None]:
# generator_input_fn is never imported elsewhere (the import in the first cell
# is commented out), so this cell raised NameError. It is imported here so a
# missing tf.contrib only breaks this Estimator experiment, not the whole notebook.
from tensorflow.contrib.learn.python.learn.learn_io.generator_io import generator_input_fn

# silence_model_input_fn = silence_train_input_from_memory(silence_model_train_data)
# Endless (num_epochs=None), shuffled stream of 50-clip batches for training.
silence_model_input_fn = generator_input_fn(
    x=silence_train_input_genetator(silence_model_train_data),
    target_key='target',
    shuffle=True,
    num_epochs=None,
    batch_size=50,
    queue_capacity=3 * 50 + 10, num_threads=1,
)


In [None]:
# train_input = lambda: silence_train_input_from_memory(data_list)
silence_model.train(input_fn=silence_model_input_fn, steps=300)

In [None]:
# generator_input_fn is never imported elsewhere (the import in the first cell
# is commented out); import it here so this cell runs on its own.
from tensorflow.contrib.learn.python.learn.learn_io.generator_io import generator_input_fn

# Evaluation input for the silence detector, built with the library defaults.
silence_model_eval_input_fn = generator_input_fn(
    x=silence_eval_input_genetator(silence_model_valid_data),
    target_key='target'
)

In [None]:
silence_model.evaluate(input_fn=silence_model_eval_input_fn)

The TensorFlow Estimator API is not very developer-friendly: it is hard to learn and needs a lot of boilerplate code. So I switched to Keras, which is much friendlier.

In [None]:
# Keras silence detector: four [conv, conv, max-pool, batch-norm] stages,
# then two dense layers and a 2-way softmax.
silence_model = Sequential()
silence_model.add(BatchNormalization(input_shape=(99, 161,1)))

for n_filters in (32, 64, 256, 512):
    silence_model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    silence_model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    silence_model.add(MaxPooling2D(pool_size=(2, 2)))
    silence_model.add(BatchNormalization())
silence_model.add(Dropout(0.25))

silence_model.add(Flatten())
for n_units in (256, 128):
    silence_model.add(Dense(n_units, activation='relu'))
    silence_model.add(Dropout(0.5))
silence_model.add(Dense(2, activation='softmax'))

adam = Adam(lr=1e-5)
silence_model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])

In [40]:
def log_specgram(audio, sample_rate=16000, window_size=20,
                 step_size=10, eps=1e-10):
    """
    Compute the log spectrogram of a 1-D clip.

    Args:
        audio: 1-D array of audio samples.
        sample_rate: sampling rate in Hz.
        window_size: analysis window length in milliseconds.
        step_size: hop between consecutive windows in milliseconds.
        eps: added before the logarithm so silent bins stay finite.

    Returns:
        float array of shape (frames, frequency bins); (99, 161) for a
        16000-sample clip with the default settings.
    """
    nperseg = int(round(window_size * sample_rate / 1e3))
    hop = int(round(step_size * sample_rate / 1e3))
    # scipy wants the OVERLAP between windows, i.e. window minus hop. The old
    # code passed the hop itself, which only matches when step_size is half
    # of window_size (true for the defaults, so their output is unchanged).
    noverlap = max(nperseg - hop, 0)
    freqs, times, spec = scipy.signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    # Transpose to (time, frequency) before taking the log.
    return np.log(spec.T.astype(np.float32) + eps)

In [30]:
def shuffle_onehot_spectrum_transform_silence(samples):
    """
    Turn (label, id, audio) tuples into model inputs for the silence detector.

    Args:
        samples: list of (label, id, audio) tuples. It is shuffled IN PLACE.

    Returns:
        (spectrum, y): `spectrum` has shape (n, 99, 161, 1); `y` is the
        one-hot target with column 1 meaning 'silence'.
    """
    np.random.shuffle(samples)

    # Labels come from the tuples. The old code mapped over the audio arrays,
    # comparing their first value to 'silence', so every target was 0.
    y = [1 if t[0] == 'silence' else 0 for t in samples]
    y = keras.utils.to_categorical(y, num_classes=2)

    x = [pad_to_middle(t[2]) for t in samples]
    # np.stack needs a real sequence, not a `map` iterator.
    spectrum = np.stack([log_specgram(clip) for clip in x])
    spectrum = spectrum.reshape(-1, 99, 161, 1)

    return spectrum, y

In [None]:
silence_model_train_data_x_spectrum, silence_model_train_data_y = shuffle_onehot_spectrum_transform_silence(silence_model_train_data)


In [None]:
silence_model_valid_data_x_spectrum, silence_model_valid_data_y = shuffle_onehot_spectrum_transform_silence(silence_model_valid_data)


In [None]:
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join('./', 'silence-checkpoint-{epoch:02d}-{acc:.2f}.hdf5'))

In [None]:
silence_model.fit(x=silence_model_train_data_x_spectrum, y=silence_model_train_data_y, 
                  batch_size=32, epochs=5, callbacks=[checkpoint],
                  validation_data=(silence_model_valid_data_x_spectrum, silence_model_valid_data_y)
                 )

The accuracy is very high. However, the validation and training sets come from the same source, so the model may be overfitting.

Mix up the silence and audio
(label, 0.8 * audio + 0.1 * noise1 + 0.1 * noise2) vs (silence, noise)

In [None]:
def mixup(audio, noises):
    """
    Blend a clip with two randomly chosen noise recordings.

    The result is 0.8 * audio + 0.1 * noise window + 0.1 * noise window, each
    window being a random one-second cut. Clips shorter than 16000 samples
    are zero-padded first.
    """
    first_noise = noises[np.random.randint(len(noises))]
    second_noise = noises[np.random.randint(len(noises))]
    if len(audio) < 16000:
        audio = pad_to_middle(audio)
    return 0.8 * audio + 0.1 * random_select(first_noise) + 0.1 * random_select(second_noise)

In [None]:
# Output folders for spoken clips mixed with background noise.
MIXUP_SILENCE_TRAIN_PATH = os.path.join(TEMP_DATA_PATH ,'augumented_data/mixup_silence_train')
MIXUP_SILENCE_VALID_PATH = os.path.join(TEMP_DATA_PATH, 'augumented_data/mixup_silence_valid')

os.makedirs(MIXUP_SILENCE_TRAIN_PATH, exist_ok=True)
os.makedirs(MIXUP_SILENCE_VALID_PATH, exist_ok=True)

# First 80% of every noise recording is for training mixes, last 20% for validation mixes.
silence_train_samples = [t[2][:int(0.8 * len(t[2]))] for t in silence_samples]
silence_valid_samples = [t[2][int(0.8 * len(t[2])):] for t in silence_samples]


# Generation is switched off; flip the guards to (re)create the mixed files.
# `no_silence_train_data_list` was never defined in this notebook; the spoken
# training clips live in `non_silence_samples`.
# NOTE(review): file names start with the literal 'label', not the clip's label - confirm intent.
if False:
    for _ in range(int(0.3 * len(non_silence_samples))):
        index = np.random.randint(len(non_silence_samples))
        label, uid, sample = non_silence_samples[index]
        mixed_sample = mixup(sample, silence_train_samples)
        file_name = 'label' + '_mixed_' + str(index) + '_' + str(uid) + '.wav'
        wavfile.write(os.path.join(MIXUP_SILENCE_TRAIN_PATH, file_name), 16000, mixed_sample)

if False:
    for _ in range(int(0.3 * len(valid_data_list))):
        index = np.random.randint(len(valid_data_list))
        label, uid, sample = valid_data_list[index]
        # Use the held-out 20% of the noise so validation mixes share no noise audio with training.
        mixed_sample = mixup(sample, silence_valid_samples)
        file_name = 'label' + '_mixed_' + str(index) + '_' + str(uid) + '.wav'
        wavfile.write(os.path.join(MIXUP_SILENCE_VALID_PATH, file_name), 16000, mixed_sample)

In [None]:
def load_path(path):
    """Read every `*.wav` directly inside `path` as a (label, id, samples) tuple.

    Label and id come from get_info_from_path, which raises ValueError for
    file names it cannot parse.
    """
    samples = []
    for wav_path in glob(os.path.join(path, '*.wav')):
        _, audio = wavfile.read(wav_path)
        samples.append((*get_info_from_path(wav_path), audio))
    return samples

In [None]:
# Load the noise-mixed clips and shuffle each set in place.
mixed_train_samples_no_silence = load_path(MIXUP_SILENCE_TRAIN_PATH)
mixed_valid_samples_no_silence = load_path(MIXUP_SILENCE_VALID_PATH)
for loaded_set in (mixed_train_samples_no_silence, mixed_valid_samples_no_silence):
    np.random.shuffle(loaded_set)

In [None]:
np.random.shuffle(silence_data_train)
np.random.shuffle(silence_data_valid)


# Each set = noise-mixed speech + as many pure-silence clips + the clean speech.
# `no_silence_train_data_list` was never defined in this notebook; the clean
# spoken training clips live in `non_silence_samples`.
mixed_train_samples = mixed_train_samples_no_silence + \
                        silence_data_train[:len(mixed_train_samples_no_silence)] + \
                        non_silence_samples
mixed_valid_samples = mixed_valid_samples_no_silence + \
                        silence_data_valid[:len(mixed_valid_samples_no_silence)] + \
                        valid_data_list

In [None]:
mixed_train_x_spectrum, mixed_train_data_y = shuffle_onehot_spectrum_transform_silence(mixed_train_samples)

In [None]:
mixed_valid_data_x_spectrum, mixed_valid_data_y = shuffle_onehot_spectrum_transform_silence(mixed_valid_samples)

In [None]:
# Same architecture as the first silence detector, trained on the noise-mixed data.
mixed_silence_model = Sequential()
mixed_silence_model.add(BatchNormalization(input_shape=(99, 161,1)))

for n_filters in (32, 64, 256, 512):
    mixed_silence_model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    mixed_silence_model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    mixed_silence_model.add(MaxPooling2D(pool_size=(2, 2)))
    mixed_silence_model.add(BatchNormalization())
mixed_silence_model.add(Dropout(0.25))

mixed_silence_model.add(Flatten())
for n_units in (256, 128):
    mixed_silence_model.add(Dense(n_units, activation='relu'))
    mixed_silence_model.add(Dropout(0.5))
mixed_silence_model.add(Dense(2, activation='softmax'))

adam = Adam(lr=1e-5)
mixed_silence_model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])

In [None]:
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join('./', 'mixed-silence-checkpoint-{epoch:02d}-{acc:.2f}.hdf5'))

In [None]:
mixed_silence_model.fit(x=mixed_train_x_spectrum, y=mixed_train_data_y, 
                  batch_size=32, epochs=5, callbacks=[checkpoint],
                  validation_data=(mixed_valid_data_x_spectrum, mixed_valid_data_y)
)


**b. the second model is to detect unknown words**

For this model, the validation set has enough classes to use.

In [None]:
# Read every validation clip into memory as (label, id, samples).
# The list is rebuilt from scratch each time this cell runs.
valid_data_list = []
for label, id, fname in VALIDATION_FILES:
    sample_rate, sample = wavfile.read(TRAIN_DATA_PATH + fname)
    valid_data_list.append((label, id, sample))

In [39]:
# Binary target for the unknown-word detector: 1 for 'unknown', 0 for every keyword.
class_dict = {
    label: (1 if label == 'unknown' else 0)
    for label, _, _ in non_silence_samples
}
print(class_dict)

{'right': 0, 'unknown': 1, 'go': 0, 'no': 0, 'left': 0, 'stop': 0, 'up': 0, 'down': 0, 'yes': 0, 'on': 0, 'off': 0}


In [33]:
def shuffle_onehot_spectrum_transform(samples, class_dict=class_dict):
    """
    Turn (label, id, audio) tuples into spectrogram inputs and one-hot targets.

    Args:
        samples: list of (label, id, audio) tuples. It is shuffled IN PLACE.
        class_dict: maps a label name to its integer class index. NOTE: the
            default is bound when this cell is executed, so later
            reassignments of the global `class_dict` do not affect it; pass
            the mapping explicitly.

    Returns:
        (spectrum, y): `spectrum` has shape (n, 99, 161, 1); `y` is one-hot
        with one column per distinct class index.
    """
    np.random.shuffle(samples)

    y = [class_dict[t[0]] for t in samples]
    # Count class INDICES, not label names: the binary mapping has 11 keys but
    # only 2 classes, and len(class_dict) would give an 11-column target.
    num_classes = max(class_dict.values()) + 1
    y = keras.utils.to_categorical(y, num_classes=num_classes)

    x = [pad_to_middle(t[2]) for t in samples]
    # np.stack needs a real sequence, not a `map` iterator.
    spectrum = np.stack([log_specgram(clip) for clip in x])
    spectrum = spectrum.reshape(-1, 99, 161, 1)

    return spectrum, y

In [51]:
train_x_spectrum, train_y_one_hot = shuffle_onehot_spectrum_transform(non_silence_samples)

In [53]:
valid_x_spectrum, valid_y_one_hot = shuffle_onehot_spectrum_transform(valid_data_list)

In [56]:
# Unknown-word detector: four [conv, conv, max-pool, batch-norm] stages,
# then two dense layers and a 2-way softmax.
model = Sequential()
model.add(BatchNormalization(input_shape=(99, 161,1)))

for n_filters in (32, 64, 256, 512):
    model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(BatchNormalization())

model.add(Flatten())
for n_units in (256, 128):
    model.add(Dense(n_units, activation='relu'))
    model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

adam = Adam(lr=1e-5)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])


In [57]:
checkpoint = keras.callbacks.ModelCheckpoint(filepath=os.path.join('out', 'checkpoint-unknown-{epoch:02d}-{val_loss:.2f}.hdf5'))
model.fit(train_x_spectrum, train_y_one_hot, validation_data=(valid_x_spectrum, valid_y_one_hot), 
          batch_size=32, epochs=30, callbacks=[checkpoint])

Train on 51088 samples, validate on 6798 samples
Epoch 1/30
 2752/51088 [>.............................] - ETA: 1:04:17 - loss: 1.2361 - acc: 0.5352

KeyboardInterrupt: 

In [None]:
if False:
    model.save('./unknown_model')

**b-2. classify all non-silence data together**

In [31]:
# Give every non-silence label its own class index, in order of first appearance.
class_dict = {}
for label, _, _ in non_silence_samples:
    class_dict.setdefault(label, len(class_dict))
print(class_dict)

{'right': 0, 'unknown': 1, 'left': 2, 'go': 3, 'down': 4, 'yes': 5, 'stop': 6, 'off': 7, 'on': 8, 'up': 9, 'no': 10}


In [34]:
train_x_spectrum, train_y_one_hot = shuffle_onehot_spectrum_transform(non_silence_samples, class_dict)

In [35]:
valid_x_spectrum, valid_y_one_hot = shuffle_onehot_spectrum_transform(valid_data_list, class_dict)

In [36]:
# 11-class keyword classifier: same convolutional stack, 11-way softmax output.
model = Sequential()
model.add(BatchNormalization(input_shape=(99, 161,1)))

for n_filters in (32, 64, 256, 512):
    model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(BatchNormalization())

model.add(Flatten())
for n_units in (256, 128):
    model.add(Dense(n_units, activation='relu'))
    model.add(Dropout(0.5))
model.add(Dense(11, activation='softmax'))

adam = Adam(lr=1e-5)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])


In [55]:
checkpoint = keras.callbacks.ModelCheckpoint(filepath=os.path.join(TEMP_DATA_PATH, 'checkpoint-unknown-{epoch:02d}-{val_loss:.2f}.hdf5'))
model.fit(train_x_spectrum, train_y_one_hot, validation_data=(valid_x_spectrum, valid_y_one_hot), 
          batch_size=32, epochs=3, callbacks=[checkpoint])

Train on 51088 samples, validate on 6798 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7ff3e844ba90>

In [56]:
TEST_FILES[:10]

[('unknown', '0c40e715', 'bed/0c40e715_nohash_0.wav'),
 ('unknown', '0ea0e2f4', 'bed/0ea0e2f4_nohash_0.wav'),
 ('unknown', '0ea0e2f4', 'bed/0ea0e2f4_nohash_1.wav'),
 ('unknown', '105a0eea', 'bed/105a0eea_nohash_0.wav'),
 ('unknown', '1528225c', 'bed/1528225c_nohash_0.wav'),
 ('unknown', '1528225c', 'bed/1528225c_nohash_1.wav'),
 ('unknown', '1528225c', 'bed/1528225c_nohash_2.wav'),
 ('unknown', '1528225c', 'bed/1528225c_nohash_3.wav'),
 ('unknown', '1b4c9b89', 'bed/1b4c9b89_nohash_0.wav'),
 ('unknown', '1cb788bc', 'bed/1cb788bc_nohash_0.wav')]

In [63]:
# Read the held-out test clips into memory as (label, id, samples).
test_data = [
    (label, uid, wavfile.read(os.path.join(DATA_PATH, 'audio', path))[1])
    for label, uid, path in TEST_FILES
]

In [64]:
test_x_spectrum, test_y = shuffle_onehot_spectrum_transform(test_data, class_dict)

In [65]:
model.evaluate(x=test_x_spectrum, y=test_y)



[0.29035970008800566, 0.92977322612091318]

# Save under the same file name that the next cell passes to load_model;
# the old name 'class_11_model.' had no extension and did not match it.
model.save(os.path.join(TEMP_DATA_PATH, 'class_11_model.model'))

In [15]:
model = load_model(os.path.join(TEMP_DATA_PATH, 'class_11_model.model'))


Error in loading the saved optimizer state. As a result, your model is starting with a freshly initialized optimizer.



In [None]:
# NOTE(review): called without input data; this looks like a leftover scratch cell - confirm and remove.
model.predict()

In [19]:
FINAL_TEST_FILE_PATH = '/output/test/audio'
final_test_files = glob(os.path.join(FINAL_TEST_FILE_PATH, '*wav'))

In [20]:
final_test_files[0]

'/output/test/audio/clip_14a11cb33.wav'

In [28]:
# Captures the clip id between the underscore and the `.wav` extension.
# Raw string: `\.` is a regex escape, not a valid Python string escape.
test_file_id_matcher = re.compile(r'_([^\.]+)\.wav')

In [29]:
# Read every competition test clip as (id, samples).
test_samples_raw = [
    (test_file_id_matcher.search(file_path).group(1), wavfile.read(file_path)[1])
    for file_path in final_test_files
]

In [41]:
def test_spectrum_transform(samples):
    """
    Convert (id, audio) tuples into the spectrogram tensor the models expect.

    Args:
        samples: sequence of (id, audio) tuples; order is preserved.

    Returns:
        Array of shape (n, 99, 161, 1).
    """
    x = [pad_to_middle(t[1]) for t in samples]
    # np.stack needs a real sequence, not a `map` iterator.
    spectrum = np.stack([log_specgram(clip) for clip in x])
    spectrum = spectrum.reshape(-1, 99, 161, 1)

    return spectrum

In [None]:
test_spectrum = test_spectrum_transform(test_samples_raw)

In [None]:
# Cache the test spectrograms on disk.
# NOTE(review): `h5py` is not imported anywhere in this notebook, and the
# `temp/` directory must already exist - confirm both before running.
# The context manager closes the file even if writing fails.
with h5py.File('temp/data.h5', 'w') as h5f:
    h5f.create_dataset('test_spectrum', data=test_spectrum)

In [None]:
result = model.predict(test_spectrum, verbose=1)



In [56]:
result[330]

array([  9.94288027e-01,   4.20869933e-03,   4.38242103e-04,
         4.57089345e-06,   1.31590596e-05,   7.55865221e-06,
         1.28409738e-05,   2.17010002e-05,   8.98147351e-04,
         9.82519414e-05,   8.67133622e-06], dtype=float32)

In [62]:
# Index of the highest-scoring class for every test clip.
result_index = [np.argmax(scores) for scores in result]

In [65]:
# Clip ids, in the same order as the predictions.
uids = [entry[0] for entry in test_samples_raw]

In [67]:
class_dict = {'right': 0, 'unknown': 1, 'left': 2, 'go': 3, 'down': 4, 'yes': 5, 'stop': 6, 'off': 7, 'on': 8, 'up': 9, 'no': 10}

In [73]:
# Invert label -> index into index -> label to decode the predictions.
class_reverse_dict = {index: name for name, index in class_dict.items()}

In [75]:
# Pair every clip id with its predicted label name.
final_result = [
    (uid, class_reverse_dict[index])
    for uid, index in zip(uids, result_index)
]

In [78]:
import csv

In [79]:
# Write the submission file: one `fname,label` row per test clip.
with open('results.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['fname', 'label'])
    writer.writeheader()
    writer.writerows(
        {'fname': 'clip_' + uid + '.wav', 'label': label}
        for uid, label in final_result
    )