# Keyword speech recongnition

The idea is to use Convolution Neural Network to extract features of the framed specturm and classify the audio data.


## 1. Preprocessing

In [2]:
%matplotlib inline
import os
import re
from glob import glob

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np

from scipy.io import wavfile
import scipy
import numpy as np
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
import IPython.display as ipd

import tensorflow as tf


from sklearn.model_selection import train_test_split
# from tensorflow.contrib.learn.python.learn.learn_io.generator_io import generator_input_fn

import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization

from keras.optimizers import Adam
from scipy import signal

import h5py

sns.set(style="whitegrid", color_codes=True)
init_notebook_mode(connected=True)

Using TensorFlow backend.


In [35]:
DATA_PATH = '/audio_sample' # path to training data
TEMP_DATA_PATH = '/tmp/temp'
TRAIN_DATA_PATH = DATA_PATH+ '/audio/'

In [4]:
os.listdir(DATA_PATH)

['LICENSE',
 'validation_list.txt',
 'audio',
 'README.md',
 'testing_list.txt',
 '.DS_Store']

In [5]:
folders = []
for file_folder in os.listdir(TRAIN_DATA_PATH):
    if os.path.isdir(TRAIN_DATA_PATH + file_folder):
        folders.append(file_folder)
print(folders)
print('There are totally ' + str(len(folders)) + ' labels in the training dataset.')

['zero', 'right', 'off', 'seven', 'nine', 'dog', 'left', 'five', 'bird', 'two', 'three', 'one', 'happy', 'bed', '_background_noise_', 'on', 'house', 'six', 'yes', 'up', 'marvin', 'cat', 'wow', 'go', 'no', 'tree', 'sheila', 'down', 'eight', 'four', 'stop']
There are totally 31 labels in the training dataset.


However, from the competition note, there are only 12 labels we have to pay attention to.
> **Note**: There are only 12 possible labels for the Test set: yes, no, up, down, left, right, on, off, stop, go, silence, unknown.

> The unknown label should be used for a command that is not one one of the first 10 labels or that is not silence.

In [6]:
POSSIBLE_LABELS = 'yes, no, up, down, left, right, on, off, stop, go, silence, unknown'.replace(' ', '').split(',')
print(POSSIBLE_LABELS)

['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence', 'unknown']


 The file *validation_list.txt* specifies the data for validation.

In [7]:
# read in validation files list and test files list
VALIDATION_LIST_FILE_PATH = DATA_PATH + '/validation_list.txt'
with open(VALIDATION_LIST_FILE_PATH, 'r') as file:
    VALIDATION_FILE_NAMES = [line.rstrip() for line in file]

TEST_LIST_FILE_PATH = DATA_PATH + '/testing_list.txt'
with open(TEST_LIST_FILE_PATH, 'r') as file:
    TEST_FILE_NAMES = [line.rstrip() for line in file]


In [8]:
sample_matcher = re.compile("(?:.+\/)?(\w+)\/([^_]+)_.+wav")
def get_info_from_path(path):
    """
        return (label, id)
    """
    r = sample_matcher.match(path)
    if not r:
        raise ValueError(path + ' is not valid file path')
    label = r.group(1)
    id = r.group(2)
    
    if label not in POSSIBLE_LABELS:
        if label == '_background_noise_':
            label = 'silence'
        else:
            label = 'unknown'
    return label, id

In [9]:
# extract info for validation and test files
VALIDATION_FILES = []
VALIDATIN_ID_SET = {}
for file in VALIDATION_FILE_NAMES:
    try: 
        label, id = get_info_from_path(file)
    except ValueError as e:
        print(e)
        continue
    VALIDATION_FILES.append((label, id, file))
    VALIDATIN_ID_SET[id] = True

TEST_FILES = []
TEST_ID_SET = {}
for file in TEST_FILE_NAMES:
    try: 
        label, id = get_info_from_path(file)
    except ValueError as e:
        print(e)
        continue
    TEST_FILES.append((label, id, file))
    TEST_ID_SET[id] = True

In [10]:
# train file is the file that neither in test or validation
file_path_matcher = re.compile("(.+\/)?(\w+\/[^_]+_.+wav)")
TRAIN_FILES = []
SAMPLE_FILES = glob(os.path.join(DATA_PATH, 'audio/*/*wav'))
for file in SAMPLE_FILES:
    try:
        label, id = get_info_from_path(file)
    except ValueError as e:
        print(e)
        continue
    if (id not in VALIDATIN_ID_SET) and (id not in TEST_ID_SET):
        TRAIN_FILES.append((label, id, file_path_matcher.match(file).group(2)))

In [11]:
print('Number of sample')
print('Total: %d' % len(SAMPLE_FILES))
print('Train: %d' % len(TRAIN_FILES))
print('Validation: %d' % len(VALIDATION_FILES))
print('Test: %d' % len(TEST_FILES))
number_sum = len(TRAIN_FILES) + len(VALIDATION_FILES) + len(TEST_FILES)
print('{} + {} + {} = {}'.format(len(TRAIN_FILES), len(VALIDATION_FILES), len(TEST_FILES), number_sum))

Number of sample
Total: 64727
Train: 51094
Validation: 6798
Test: 6835
51094 + 6798 + 6835 = 64727


### look at the data

#### a. distribution

In [None]:
# calculate the distribution of training and validation data
distribution = {}
for label, _, _ in TRAIN_FILES + VALIDATION_FILES:
    if label not in distribution:
        distribution[label] = 0
    distribution[label] = distribution[label] + 1

data = [go.Bar(
            x=[*distribution.keys()],
            y=[*distribution.values()]
    )]

iplot(data, filename='distribution')

When we look at the distribution of the data, **unknwon**, **silence** are two special labels. **unknown** has much more data than others, **silence** only has 6.


Ideas to handle the two special cases:
- train a model to seperate **silence** from the rest. Then we train another one to seperate **unknown** from the key words that we are interesting.

- bootstrap the other types

- throw away some of unknown words

In [None]:
distribution = {}
for label, _, _ in VALIDATION_FILES:
    if label not in distribution:
        distribution[label] = 0
    distribution[label] = distribution[label] + 1

data = [go.Bar(
            x=[*distribution.keys()],
            y=[*distribution.values()]
    )]

iplot(data, filename='distribution')

The wierd thing is that validation set does not have **silence**. So in order to train the silence detect model, I have to use the train set for both training and validation

#### b. audio length

In [None]:
sample_length_list = []

# read in every audio and count each length
for label, id, file in TRAIN_FILES + VALIDATION_FILES:
    sample_rate, samples = wavfile.read(TRAIN_DATA_PATH + file)
    sample_length_list.append(len(samples))

In [None]:
sample_rate = 16000
sample_length_list = np.array(sample_length_list)
sample_length_list[sample_length_list > sample_rate]

In [None]:
train_valid_files = np.array(TRAIN_FILES + VALIDATION_FILES)
large_audio_files = train_valid_files[sample_length_list > sample_rate]
print(large_audio_files)

There are some really long audio. Hopefully, there are not too much and they are those background noise. Let's hear some of them.

In [None]:
ipd.Audio(filename=TRAIN_DATA_PATH + large_audio_files[4][2])

**Some ideas to handle it:**
- chop the long file into 1 second length
- mix different types of noise
- mix noise with audio file

In [None]:
sample_rate = 16000
sample_length_list = np.array(sample_length_list)
sample_length_list[sample_length_list < 0.5 * sample_rate]

There are some very short audio. Also not too much.

In [None]:
truncated_sample_length = sample_length_list[sample_length_list <= 3 * sample_rate]
data = [go.Histogram(x=truncated_sample_length)]
iplot(data)

In [None]:
small_sample_num = len(sample_length_list[sample_length_list < sample_rate])
print('There are about %d samples smaller than the sample rate' % small_sample_num)

**Some idea to handle short audio sample:**
- padding 0 to it
- stretch the audio to 1 seconds
- decompose it into phonome

**Load the data**

In [12]:
def pad_to_middle(sample, sample_rate=16000):
    pad_num = sample_rate - len(sample)
    left = int(pad_num / 2)
    right = pad_num - left
    return np.pad(sample, (left, right), 'constant')

In [13]:
def chop_data(sample, sample_rate=16000):
    """
        chop the data by cutting it into small parts first
    """
    num = np.ceil(len(sample) / sample_rate).astype(np.int)
    pad_num = num * sample_rate - len(sample)
    return np.split(np.pad(sample, (0, pad_num), 'constant'), num)

In [14]:
def random_select(sample, sample_rate=16000):
    beg = np.random.randint(0, len(sample) - sample_rate)
    return sample[beg: beg + sample_rate]

In [15]:
def get_label(label):
    if label == 'silence':
        return 1
    return 0

**Load the training data**

In [16]:
train_data_list = []

# read file into memory
if len(train_data_list) < 1:
    for label, id, fname in TRAIN_FILES:
        sample_rate, sample = wavfile.read(TRAIN_DATA_PATH + fname)
        train_data_list.append((label, id, sample))


Chunk (non-data) not understood, skipping it.



In [17]:
# extract the silence samples
silence_samples = list(filter(lambda t: t[0] == 'silence', train_data_list))
non_silence_samples = list(filter(lambda t: t[0] != 'silence', train_data_list))

In [18]:
# mkdir for sliced silence samples
SLICED_SILENCE_TRAIN_FILES_PATH = os.path.join(TEMP_DATA_PATH, 'augumented_data/short_silence_train')
SLICED_SILENCE_VALID_FILES_PATH = os.path.join(TEMP_DATA_PATH, 'augumented_data/short_silence_valid')

os.makedirs(SLICED_SILENCE_TRAIN_FILES_PATH, exist_ok=True)
os.makedirs(SLICED_SILENCE_VALID_FILES_PATH, exist_ok=True)

In [19]:
# output silence sample from 80% of the noise sample for trainning set
if True:
    for i in range(len(non_silence_samples)):
        index = np.random.randint(0, len(silence_samples))
        if len(silence_samples[index][2]) < 16000:
            print(silence_samples[index][1])
        
        end = int(len(silence_samples[index][2]) * 0.8)
        random_sample = random_select(silence_samples[index][2][:end], 16000)
        title = silence_samples[index][0]
        file_name = title + '_' + str(index) + '_' + str(i) + '.wav'
        wavfile.write(os.path.join(SLICED_SILENCE_TRAIN_FILES_PATH, file_name), 16000, random_sample)

In [20]:
# output silence sample from 20% of the noise sample for validation set
if True:
    for i in range(len(non_silence_samples)):
        index = np.random.randint(0, len(silence_samples))
        if len(silence_samples[index][2]) < 16000:
            print(silence_samples[i])
        beg = int(len(silence_samples[index][2]) * 0.8)
        random_sample = random_select(silence_samples[index][2][beg:], 16000)
        title = silence_samples[index][0]
        file_name = title + '_' + str(index) + '_' + str(i) + '.wav'
        wavfile.write(os.path.join(SLICED_SILENCE_VALID_FILES_PATH, file_name), 16000, random_sample)

In [21]:
# load the silence data for training 
silence_data_train =[]

for file in glob(os.path.join(SLICED_SILENCE_TRAIN_FILES_PATH, '*wav')):
    _, sample = wavfile.read(file)
    r = sample_matcher.match(file)
    silence_data_train.append(('silence', r.group(2), sample))
    
# load the silence data for validation

silence_data_valid =[]

for file in glob(os.path.join(SLICED_SILENCE_VALID_FILES_PATH, '*wav')):
    _, sample = wavfile.read(file)
    r = sample_matcher.match(file)
    silence_data_valid.append(('silence', r.group(2), sample))
    
# print count

print('Totally we have %d silence training data' % len(silence_data_train))
print('Totally we have %d silence valid data' % len(silence_data_valid))

Totally we have 51088 silence training data
Totally we have 51088 silence valid data


**Load validation data**

In [22]:
valid_data_list = []

if len(valid_data_list) < 1:
    for label, id, fname in VALIDATION_FILES:
        sample_rate, sample = wavfile.read(TRAIN_DATA_PATH + fname)
        valid_data_list.append((label, id, sample))

**Load the test data**

In [98]:
FINAL_TEST_FILE_PATH = '/audio_test/test/audio'
FINAL_TEST_FILES = glob(os.path.join(FINAL_TEST_FILE_PATH, '*wav'))

In [99]:
test_samples_raw = []
for file_path in FINAL_TEST_FILES:
    uid = test_file_id_matcher.search(file_path).group(1)
    sample_rate, sample = wavfile.read(file_path)
    test_samples_raw.append((uid, sample))


## 2. CNN with spectrum
The idea is to chop the audio data into different frame. Take the frequency info from each frame and put those into CNN. As mentioned before, the plan is to build 3 models: detect silence, detect unkown, classify the data. And combine them finally.

**a. the first model is to detect the silence**

**TF Estimator**

In [25]:
# there could be some more advanced way to mixup
silence_model_train_data = non_silence_samples + silence_data_train

# mixup can be used here
silence_model_valid_data = valid_data_list + silence_data_valid

In [None]:
def silence_train_input_genetator(data_list=silence_model_train_data, sample_rate=16000):
    """
    To train the model to detect silence.
    For silence data we have to find some
    way to geneate some in order to handle
    the misbalance problem.
    """

    np.random.shuffle(silence_model_train_data)
    def generator():
        for label, id, sample in data_list:
            try:
                sample = sample.astype(np.float32) / np.iinfo(np.int16).max
                if len(sample) < sample_rate:
                    samples = [pad_to_middle(sample)]
                elif len(sample) > sample_rate:
                    samples = chop_data(sample)
                else:
                    samples = [sample]
                for one_second_sample in samples:
                    yield dict(
                        target=np.int32(get_label(label)),
                        wav=one_second_sample
                    )
            except Exception as err:
                print(err, label, id, fname)

    
    return generator

In [None]:
def silence_eval_input_genetator(data_list=silence_model_valid_data, sample_rate=16000):
    """
    To train the model to detect silence.
    For silence data we have to find some
    way to geneate some in order to handle
    the misbalance problem.
    """
    def generator():
        for label, id, sample in data_list:
            try:
                sample = sample.astype(np.float32) / np.iinfo(np.int16).max
                if len(sample) < sample_rate:
                    samples = [pad_to_middle(sample)]
                elif len(sample) > sample_rate:
                    samples = [chop_data(sample)[0]]
                else:
                    samples = [sample]
                for one_second_sample in samples:
                    yield dict(
                        target=np.int32(get_label(label)),
                        wav=one_second_sample
                    )
            except Exception as err:
                print(err, label, id, fname)

    
    return generator

In [None]:
from tensorflow.contrib import layers, signal
def silence_model_handler(features, labels, mode, params, config):
    
    ### ================ Define the CNN ==========================
    sample = features['wav']
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    x = tf.abs(signal.stft(sample, 400, 160))
    x = tf.stack([x], axis=3)
    x = tf.to_float(x)
    x = layers.batch_norm(x, is_training=is_training)
    for i in range(3):
        x = layers.conv2d(
            x, 8 * (2 ** i), 3, 1,
            normalizer_fn=layers.batch_norm,
            normalizer_params={'is_training': is_training}
        )
        x = layers.max_pool2d(x, 2, 2)
    x = layers.flatten(x)
    x = tf.layers.dense(x, 128, activation=tf.nn.relu)
    x = layers.dropout(x, keep_prob=params['keep_prob'] if is_training else 1.0)
    x = tf.layers.dense(x, 64, activation=tf.nn.relu)
    logits = tf.layers.dense(x, 2, activation=None)

    ### ===========================================================
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
        train_op=layers.optimize_loss(
            loss=loss,
            global_step=tf.train.get_global_step(),
            learning_rate=0.001,
            optimizer=tf.train.AdamOptimizer
        )
        
        specs = {'mode': mode, 'loss': loss, 'train_op': train_op}
    
    if mode == tf.estimator.ModeKeys.EVAL:
        prediction = tf.argmax(logits, axis=-1)
        acc, acc_op = tf.metrics.mean_per_class_accuracy(
            labels, prediction, 2)
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
        specs = dict(
                mode=mode,
                loss=loss,
                eval_metric_ops=dict(
                    acc=(acc, acc_op),
                )
        )
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'label': tf.argmax(logits, axis=-1),  
            'sample': features['wav'], 
        }
        specs = {
            'mode': mode,
            'predictions': predictions,
        }
    return tf.estimator.EstimatorSpec(**specs)

In [None]:
OUT_PATH = os.path.join('./', 'out')
os.makedirs(OUT_PATH, exist_ok=True)
params = {
    'keep_prob': True
}
run_config = tf.contrib.learn.RunConfig(
    model_dir=OUT_PATH, 
    log_step_count_steps=10,
    save_summary_steps=10
)
silence_model = tf.estimator.Estimator(
    model_fn=silence_model_handler, params=params, config=run_config)

In [None]:
# silence_model_input_fn = silence_train_input_from_memory(silence_model_train_data)
silence_model_input_fn = generator_input_fn(
    x=silence_train_input_genetator(silence_model_train_data),
    target_key='target',
    shuffle=True,
    num_epochs=None,
    batch_size=50,
    queue_capacity=3 * 50 + 10, num_threads=1,
)


In [None]:
# train_input = lambda: silence_train_input_from_memory(data_list)
silence_model.train(input_fn=silence_model_input_fn, steps=300)

In [None]:
silence_model_eval_input_fn = generator_input_fn(
    x=silence_eval_input_genetator(silence_model_valid_data),
    target_key='target'
)

In [None]:
silence_model.evaluate(input_fn=silence_model_eval_input_fn)

The Estimator from tensorflow is not very developer friendly, hard to learn and also need a lot of redundancy codes. So I switch to keras which is very friendly

**Keras**

In [26]:
def log_specgram(audio, sample_rate=16000, window_size=20,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = scipy.signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return np.log(spec.T.astype(np.float32) + eps)

In [27]:
def shuffle_onehot_spectrum_transform_silence(samples):
    np.random.shuffle(samples)
    
    x = list(map(lambda t: t[2], samples))
    y = list(map(lambda t: 1 if t[0] == 'silence' else 0, samples))
    y = keras.utils.to_categorical(y, num_classes=2)
    
    x = list(map(pad_to_middle, x))
    spectrum = np.stack(map(log_specgram, x))
    spectrum = np.array(spectrum)
    spectrum = spectrum.reshape(-1, 99, 161, 1)
    
    return spectrum, y

In [28]:
silence_model_train_data_x_spectrum, silence_model_train_data_y = shuffle_onehot_spectrum_transform_silence(silence_model_train_data)


In [29]:
silence_model_valid_data_x_spectrum, silence_model_valid_data_y = shuffle_onehot_spectrum_transform_silence(silence_model_valid_data)


In [30]:
silence_model_h5py = h5py.File('silence_model_data')
silence_model_h5py.create_dataset('silence_model_train_data_x_spectrum', data=silence_model_train_data_x_spectrum)
silence_model_h5py.create_dataset('silence_model_train_data_y', data=silence_model_train_data_y)
silence_model_h5py.create_dataset('silence_model_valid_data_x_spectrum', data=silence_model_valid_data_x_spectrum)
silence_model_h5py.create_dataset('silence_model_valid_data_y', data=silence_model_valid_data_y)
silence_model_h5py.close()

**Vgg model 1**

In [31]:
def vgg_layers_1(model):
    model.add(BatchNormalization(input_shape=(99, 161,1)))
    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(BatchNormalization())

    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(BatchNormalization())

    model.add(Conv2D(256, (3, 3), activation='relu'))
    model.add(Conv2D(256, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(BatchNormalization())

    model.add(Conv2D(512, (3, 3), activation='relu'))
    model.add(Conv2D(512, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(BatchNormalization())
    model.add(Dropout(0.25))

    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))

silence_model = Sequential()
vgg_layers_1(silence_model)
silence_model.add(Dense(2, activation='softmax'))

adam = Adam(lr=1e-5)
silence_model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])

In [32]:
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join('./', 'silence-checkpoint-{epoch:02d}-{acc:.2f}.hdf5'))

In [33]:
silence_model.fit(x=silence_model_train_data_x_spectrum, y=silence_model_train_data_y, 
                  batch_size=32, epochs=5, callbacks=[checkpoint],
                  validation_data=(silence_model_valid_data_x_spectrum, silence_model_valid_data_y)
                 )

Train on 102176 samples, validate on 57886 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb726298668>

In [36]:
silence_model.save('silence_model')

The vgg model above is not suitable for audio recongnition (it is designed for computer vision). So I also tried the following conv net from [google tensorflow speech command](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/speech_commands/models.py).

**vgg model 2 from google speech command**

In [38]:
def vgg_google(model):
    model.add(BatchNormalization(input_shape=(99, 161,1)))
    model.add(Conv2D(64, (8, 20), activation='relu'))
    model.add(Dropout(0.25))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Conv2D(64, (4, 10), activation='relu', padding='same'))
    model.add(Dropout(0.25))

    model.add(Flatten())

    
silence_model_google = Sequential()
vgg_google(silence_model_google)
silence_model_google.add(Dense(2, activation='softmax'))

adam = Adam(lr=1e-5)
silence_model_google.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])

In [39]:
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join('./', 'silence-google-checkpoint-{epoch:02d}-{acc:.2f}.hdf5'))
silence_model_google.fit(x=silence_model_train_data_x_spectrum, y=silence_model_train_data_y, 
                  batch_size=32, epochs=5, callbacks=[checkpoint],
                  validation_data=(silence_model_valid_data_x_spectrum, silence_model_valid_data_y)
)

Train on 102176 samples, validate on 57886 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb7202a5908>

In [41]:
silence_model_google.save('silence_model_google')

The result is very high. But consider the validation, training set comes from the same source. The model may overfitting.

**mixed silence with samples model**

Mix up the silence and audio
(label, 0.8 * audio + 0.1 * noise1 + 0.1 * noise2) vs (silence, noise)

In [42]:
def mixup(audio, noises):
    noise_index_1 = np.random.randint(len(noises))
    noise_index_2 = np.random.randint(len(noises))
    if len(audio) < 16000:
        audio = pad_to_middle(audio)
    mixed_audio = 0.8 * audio + 0.1 * random_select(noises[noise_index_1]) + 0.1 * random_select(noises[noise_index_2])
    return mixed_audio

In [43]:
MIXUP_SILENCE_TRAIN_PATH = os.path.join(TEMP_DATA_PATH ,'augumented_data/mixup_silence_train')
MIXUP_SILENCE_VALID_PATH = os.path.join(TEMP_DATA_PATH, 'augumented_data/mixup_silence_valid')

os.makedirs(MIXUP_SILENCE_TRAIN_PATH, exist_ok=True)
os.makedirs(MIXUP_SILENCE_VALID_PATH, exist_ok=True)

silence_train_samples = list(map(lambda t: t[2][:int(0.8 * len(t[2]))], silence_samples))
silence_valid_samples = list(map(lambda t: t[2][int(0.8 * len(t[2])):], silence_samples))


if True:
    for _ in range(int(0.3 * len(non_silence_samples))):
        index = np.random.randint(len(non_silence_samples))
        label, uid, sample = non_silence_samples[index]
        mixed_sample = mixup(sample, silence_train_samples)
        file_name = 'label' + '_mixed_' + str(index) + '_' + str(uid) + '.wav'
        wavfile.write(os.path.join(MIXUP_SILENCE_TRAIN_PATH, file_name), 16000, mixed_sample)

if True:
    for _ in range(int(0.3 * len(valid_data_list))):
        index = np.random.randint(len(valid_data_list))
        label, uid, sample = valid_data_list[index]
        mixed_sample = mixup(sample, silence_train_samples)
        file_name = 'label' + '_mixed_' + str(index) + '_' + str(uid) + '.wav'
        wavfile.write(os.path.join(MIXUP_SILENCE_VALID_PATH, file_name), 16000, mixed_sample)

In [44]:
def load_path(path):
    files = glob(os.path.join(path, '*.wav'))
    samples = []
    for file in files:
        sample_rate, sample = wavfile.read(file)
        label, id = get_info_from_path(file)
        samples.append((label, id, sample))
    return samples

In [45]:
mixed_train_samples_no_silence = load_path(MIXUP_SILENCE_TRAIN_PATH)
mixed_valid_samples_no_silence = load_path(MIXUP_SILENCE_VALID_PATH)
np.random.shuffle(mixed_train_samples_no_silence)
np.random.shuffle(mixed_valid_samples_no_silence)
print('Mixed silence train number %d' % len(mixed_train_samples_no_silence))
print('Mixed silence valid number %d' % len(mixed_valid_samples_no_silence))

Mixed silence train number 13225
Mixed silence valid number 1763


In [46]:
np.random.shuffle(silence_data_train)
np.random.shuffle(silence_data_valid)


mixed_train_samples = mixed_train_samples_no_silence + \
                        silence_data_train[:len(mixed_train_samples_no_silence)] + \
                        non_silence_samples
mixed_valid_samples = mixed_valid_samples_no_silence + \
                        silence_data_valid[:len(mixed_valid_samples_no_silence)] + \
                        valid_data_list

In [47]:
mixed_train_x_spectrum, mixed_train_data_y = shuffle_onehot_spectrum_transform_silence(mixed_train_samples)

In [48]:
mixed_valid_data_x_spectrum, mixed_valid_data_y = shuffle_onehot_spectrum_transform_silence(mixed_valid_samples)

In [49]:
mixed_silence_data = h5py.File('mixed_silence_data')
mixed_silence_data.create_dataset('train_x_spectrum', data=mixed_train_x_spectrum)
mixed_silence_data.create_dataset('train_y', data=mixed_train_data_y)
mixed_silence_data.create_dataset('valid_x_spectrum', data=mixed_train_x_spectrum)
mixed_silence_data.create_dataset('valid_y', data=mixed_valid_data_y)
mixed_silence_data.close()

In [50]:
mixed_silence_model = Sequential()
vgg_layers_1(mixed_silence_model)
mixed_silence_model.add(Dense(2, activation='softmax'))

adam = Adam(lr=1e-5)
mixed_silence_model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])

In [51]:
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join('./', 'mixed-silence-checkpoint-{epoch:02d}-{acc:.2f}.hdf5'))

In [52]:
mixed_silence_model.fit(x=mixed_train_x_spectrum, y=mixed_train_data_y, 
                  batch_size=32, epochs=5, callbacks=[checkpoint],
                  validation_data=(mixed_valid_data_x_spectrum, mixed_valid_data_y)
)

Train on 77538 samples, validate on 10324 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb711524d68>

In [53]:
mixed_silence_model.save('mixed_silence_model')

**google speech**

In [54]:
mixed_silence_google_model = Sequential()
vgg_google(mixed_silence_google_model)
mixed_silence_google_model.add(Dense(2, activation='softmax'))

adam = Adam(lr=1e-5)
mixed_silence_google_model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])

In [55]:
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join('./', 'mixed-silence-google-checkpoint-{epoch:02d}-{acc:.2f}.hdf5'))

In [56]:
mixed_silence_google_model.fit(x=mixed_train_x_spectrum, y=mixed_train_data_y, 
                  batch_size=32, epochs=5, callbacks=[checkpoint],
                  validation_data=(mixed_valid_data_x_spectrum, mixed_valid_data_y)
)

Train on 77538 samples, validate on 10324 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb19b59ce10>

In [58]:
mixed_silence_google_model.save('mixed_silence_google_model')


** b. the second model is to detect unknown words**

For this model, validation set has enough classess for use.

In [59]:
# set a class label for the data, 1: unknown, 0: others 
class_dict = {}
for label, uid, sample in non_silence_samples:
    if label not in class_dict:
        class_dict[label] = 1 if label == 'unknown' else 0
print(class_dict)

{'unknown': 1, 'right': 0, 'off': 0, 'left': 0, 'on': 0, 'yes': 0, 'up': 0, 'go': 0, 'no': 0, 'down': 0, 'stop': 0}


In [60]:
def shuffle_onehot_spectrum_transform(samples, class_dict=class_dict):
    np.random.shuffle(samples)
    
    x = list(map(lambda t: t[2], samples))
    y = list(map(lambda t: class_dict[t[0]], samples))
    y = keras.utils.to_categorical(y, num_classes=len(class_dict))
    
    x = list(map(pad_to_middle, x))
    spectrum = np.stack(map(log_specgram, x))
    spectrum = np.array(spectrum)
    spectrum = spectrum.reshape(-1, 99, 161, 1)
    
    return spectrum, y

In [None]:
train_x_spectrum, train_y_one_hot = shuffle_onehot_spectrum_transform(non_silence_samples)

In [None]:
valid_x_spectrum, valid_y_one_hot = shuffle_onehot_spectrum_transform(valid_data_list)

In [None]:
unknown_model = Sequential()
vgg_layers_1(unknown_model)
unknown_model.add(Dense(2, activation='softmax'))

adam = Adam(lr=1e-5)
unknown_model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])


In [None]:
checkpoint = keras.callbacks.ModelCheckpoint(filepath=os.path.join('out', 'checkpoint-unknown-{epoch:02d}-{val_loss:.2f}.hdf5'))
unknown_model.fit(train_x_spectrum, train_y_one_hot, validation_data=(valid_x_spectrum, valid_y_one_hot), 
          batch_size=32, epochs=30, callbacks=[checkpoint])

In [None]:
if False:
    unknown_model.save('./unknown_model')

**google vgg**

In [None]:
unknown_model_google = Sequential()
vgg_layers_1(unknown_model_google)
unknown_model_google.add(Dense(2, activation='softmax'))

adam = Adam(lr=1e-5)
unknown_model_google.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])


In [None]:
checkpoint = keras.callbacks.ModelCheckpoint(filepath=os.path.join('out', 'checkpoint-unknown_model_google-{epoch:02d}-{val_loss:.2f}.hdf5'))
unknown_model_google.fit(train_x_spectrum, train_y_one_hot, validation_data=(valid_x_spectrum, valid_y_one_hot), 
          batch_size=32, epochs=30, callbacks=[checkpoint])

In [None]:
if False:
    unknown_model_google.save('./unknown_model_google')

**b-2. classify all non-silence data together**

In [61]:
class_dict = {}
for label, uid, sample in non_silence_samples:
    if label not in class_dict:
        class_dict[label] = len(class_dict)
print(class_dict)

{'unknown': 0, 'right': 1, 'off': 2, 'left': 3, 'on': 4, 'yes': 5, 'up': 6, 'go': 7, 'no': 8, 'down': 9, 'stop': 10}


In [62]:
train_x_spectrum, train_y_one_hot = shuffle_onehot_spectrum_transform(non_silence_samples, class_dict)

In [63]:
valid_x_spectrum, valid_y_one_hot = shuffle_onehot_spectrum_transform(valid_data_list, class_dict)

**vgg**

In [64]:
classes_model = Sequential()
vgg_layers_1(classes_model)
classes_model.add(Dense(11, activation='softmax'))

adam = Adam(lr=1e-4)
classes_model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])


In [69]:
checkpoint = keras.callbacks.ModelCheckpoint(filepath=os.path.join(TEMP_DATA_PATH, 'classes-model-checkpoint-unknown-{epoch:02d}-{val_loss:.2f}.hdf5'))
classes_model.fit(train_x_spectrum, train_y_one_hot, validation_data=(valid_x_spectrum, valid_y_one_hot), 
          batch_size=32, epochs=5, callbacks=[checkpoint])

Train on 51088 samples, validate on 6798 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb196f0fc50>

In [70]:
classes_model.save('classes_model')

**google**

In [71]:
classes_silence_model_google = Sequential()
vgg_google(classes_silence_model_google)
classes_silence_model_google.add(Dense(11, activation='softmax'))

adam = Adam(lr=1e-5)
classes_silence_model_google.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])

In [73]:
checkpoint = keras.callbacks.ModelCheckpoint(filepath=os.path.join(TEMP_DATA_PATH, 'classes_silence_model_google-checkpoint-unknown-{epoch:02d}-{val_loss:.2f}.hdf5'))
classes_silence_model_google.fit(train_x_spectrum, train_y_one_hot, validation_data=(valid_x_spectrum, valid_y_one_hot), 
          batch_size=32, epochs=3, callbacks=[checkpoint])

Train on 51088 samples, validate on 6798 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb1963b9240>

In [76]:
classes_silence_model_google.save('classes_silence_model_google_model')

**rnn**

In [77]:
from keras.layers import Merge, Input, merge
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape, Permute,Lambda, RepeatVector
from keras.layers.convolutional import ZeroPadding2D, AveragePooling2D, Conv2D,MaxPooling2D, Convolution1D,MaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import Multiply
from keras.layers import LSTM, SimpleRNN, GRU, TimeDistributed, Bidirectional
from keras import backend as K
from keras.models import Sequential,Model, load_model

In [88]:
n_time = 99
n_freq = 161
num_classes = 11

def block(input):
    cnn = Conv2D(128, (3, 3), padding="same", activation="linear", use_bias=False)(input)
    cnn = BatchNormalization(axis=-1)(cnn)

    cnn1 = Lambda(slice1, output_shape=slice1_output_shape)(cnn)
    cnn2 = Lambda(slice2, output_shape=slice2_output_shape)(cnn)

    cnn1 = Activation('linear')(cnn1)
    cnn2 = Activation('sigmoid')(cnn2)

    out = Multiply()([cnn1, cnn2])
    return out

def slice1(x):
    return x[:, :, :, 0:64]

def slice2(x):
    return x[:, :, :, 64:128]

def slice1_output_shape(input_shape):
    return tuple([input_shape[0],input_shape[1],input_shape[2],64])

def slice2_output_shape(input_shape):
    return tuple([input_shape[0],input_shape[1],input_shape[2],64])

# Attention weighted sum
def outfunc(vects):
    cla, att = vects    # (N, n_time, n_out), (N, n_time, n_out)
    att = K.clip(att, 1e-7, 1.)
    out = K.sum(cla * att, axis=1) / K.sum(att, axis=1)     # (N, n_out)
    return out

def rnn(model,num_classes):
    a1 = Reshape((n_time, n_freq, 1))(model) # (N, 99, 161, 1)
    
    a1 = block(a1)
    a1 = block(a1)
    a1 = MaxPooling2D(pool_size=(1, 2))(a1) # (N, 99, 81, 128)
    
    a1 = block(a1)
    a1 = block(a1)
    a1 = MaxPooling2D(pool_size=(1, 2))(a1) # (N, 99, 41, 128)
    
    a1 = block(a1)
    a1 = block(a1)
    a1 = MaxPooling2D(pool_size=(1, 2))(a1) # (N, 99, 21, 128)
    
    a1 = block(a1)
    a1 = block(a1)
    a1 = MaxPooling2D(pool_size=(1, 2))(a1) # (N, 99, 11, 128)
    
    a1 = block(a1)
    a1 = block(a1)
    a1 = MaxPooling2D(pool_size=(1, 2))(a1) # (N, 99, 5, 128)
    
    a1 = Conv2D(256, (3, 3), padding="same", activation="relu", use_bias=True)(a1)
    a1 = MaxPooling2D(pool_size=(1, 5))(a1) # (N, 99, 1, 256)
    
    a1 = Reshape((99, 256))(a1) # (N, 99, 256)
    
    # Gated BGRU
    rnnout = Bidirectional(GRU(128, activation='linear', return_sequences=True))(a1)
    rnnout_gate = Bidirectional(GRU(128, activation='sigmoid', return_sequences=True))(a1)
    a2 = Multiply()([rnnout, rnnout_gate])
    
    # Attention
    cla = TimeDistributed(Dense(num_classes, activation='sigmoid'), name='localization_layer')(a2)
    att = TimeDistributed(Dense(num_classes, activation='softmax'))(a2)
    out = Lambda(outfunc, output_shape=(num_classes,))([cla, att])
    return Model(input_logmel, out)

input_logmel = Input(shape=(n_time, n_freq, 1), name='in_layer')   # (N, 99, 161)
rnn_model = rnn(input_logmel, num_classes)
rnn_model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

In [None]:
checkpoint = keras.callbacks.ModelCheckpoint(filepath=os.path.join(TEMP_DATA_PATH, 'rnn_model-checkpoint-unknown-{epoch:02d}-{val_loss:.2f}.hdf5'))
rnn_model.fit(train_x_spectrum, train_y_one_hot, validation_data=(valid_x_spectrum, valid_y_one_hot), 
          batch_size=32, epochs=3, callbacks=[checkpoint])

Train on 51088 samples, validate on 6798 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3

In [90]:
rnn_model.save('rnn_model')

In [91]:
test_data = []

for label, uid, path in TEST_FILES:
    sample_rate, sample = wavfile.read(os.path.join(DATA_PATH, 'audio', path))
    test_data.append((label, uid, sample))

In [92]:
test_x_spectrum, test_y = shuffle_onehot_spectrum_transform(test_data, class_dict)

In [93]:
rnn_model.evaluate(x=test_x_spectrum, y=test_y)



[0.032526497917976051, 0.9889605747493585]

** get Kaggle test score **

In [100]:
test_file_id_matcher = re.compile('_([^\.]+)\.wav')

In [101]:
def test_spectrum_transform(samples):
    x = list(map(lambda t: t[1], samples))
    x = list(map(pad_to_middle, x))
    spectrum = np.stack(map(log_specgram, x))
    spectrum = np.array(spectrum)
    spectrum = spectrum.reshape(-1, 99, 161, 1)
    
    return spectrum

In [102]:
test_spectrum = test_spectrum_transform(test_samples_raw)

MemoryError: 

In [None]:
test_spectrum_h5f = h5py.File('temp/test_spectrum.h5', 'w')
test_spectrum_h5f.create_dataset('test_spectrum', data=test_spectrum)
test_spectrum_h5f.close()

In [None]:
test_samples_raw[:10]

In [None]:
uids = np.array([item[0] for item in test_samples_raw]).astype('ascii')

In [None]:
uids_h5f = h5py.File('temp/uids.h5', 'w')
uids_h5f.create_dataset('uids', data=uids)
uids_h5f.close()

In [None]:
test_spectrum = h5py.File('temp/test_spectrum.h5', 'r').get('test_spectrum')
test_samples_raw = h5py.File('temp/test_samples_raw.h5', 'r').get('test_samples_raw')

In [None]:
silence_model = load_model('silence_model')

In [None]:
silence_result = silence_model.predict(test_spectrum, verbose=1)

In [None]:
len(silence_result)

In [None]:
silence_result_index = [np.argmax(t) for t in silence_result]

In [None]:
sum(silence_result_index)

In [None]:
silence_model_google= load_model('silence_model_google')

In [None]:
silence_model_google_result = silence_model_google.predict(test_spectrum, verbose=1)

In [None]:
silence_model_google_result_index = [np.argmax(t) for t in silence_model_google_result]
len(silence_model_google_result_index)

In [None]:
sum(silence_model_google_result_index)

In [None]:
mixed_silence_model = load_model('mixed_silence_model')

In [None]:
mixed_silence_model_result = mixed_silence_model.predict(test_spectrum)

In [None]:
mixed_silence_model_result_index = list(map(np.argmax, mixed_silence_model_result))

In [None]:
sum(mixed_silence_model_result_index)

In [None]:
classes_model = load_model('classes_model')

In [None]:
result = classes_model.predict(test_spectrum, verbose=1)

In [None]:
result_index = [ np.argmax(t) for t in result ]

In [None]:
uids = [t[0] for t in test_samples_raw]

In [None]:
class_dict = {'unknown': 0, 'yes': 1, 'right': 2, 'stop': 3, 'up': 4, 'left': 5, 'down': 6, 'off': 7, 'go': 8, 'on': 9, 'no': 10}

In [None]:
class_reverse_dict = {}
for key, value in class_dict.items():
    class_reverse_dict[value] = key

In [None]:
final_result = []

for uid, index, silence_index in zip(uids, result_index, silence_result_index):
    if silence_index == 1:
        
    
    final_result.append((uid, class_reverse_dict[index]))

In [None]:
import csv

In [None]:
with open('results.csv', 'w', newline='') as csvfile:
    fieldnames = ['fname', 'label']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for uid, label in final_result:
         writer.writerow({'fname': 'clip_' + uid + '.wav', 'label': label})