In [1]:
import os
import re
from glob import glob
# import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import librosa
import cv2

from scipy import signal
import random
import tensorflow as tf
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Conv2D, MaxPooling2D, Activation, BatchNormalization, GlobalAveragePooling2D, GlobalMaxPool2D, concatenate, Dense, Dropout
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.utils import to_categorical

from keras_tqdm import TQDMNotebookCallback
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

from keras.regularizers import l2
from keras.layers.merge import Concatenate
from keras.layers import Input, Dense, Conv2D, AveragePooling2D, Activation, GlobalAveragePooling2D, Lambda
from keras.layers.core import Dropout
from keras.layers.merge import Concatenate
from keras.layers.normalization import BatchNormalization
from keras.initializers import Initializer
from keras.regularizers import l2
from keras.models import Model
from keras.optimizers import Adam
from PIL import Image
import scipy.misc
import matplotlib.pyplot as plt
from scipy.misc import imsave, imresize
import pydot
import graphviz
from keras.utils import plot_model

%matplotlib inline

Using TensorFlow backend.


In [2]:
# The twelve target classes, in class-id order.
POSSIBLE_LABELS = 'yes no up down left right on off stop go silence unknown'.split()

# Lookup tables between integer class ids and label names (both directions).
id2name = dict(enumerate(POSSIBLE_LABELS))
name2id = dict((label, idx) for idx, label in id2name.items())

# Cell output: the number of classes.
len(id2name)

12

In [3]:
# Filesystem locations used by the notebook.
DATADIR = './data'    # root of the unzipped train and test data
OUTDIR = './model-k'  # arbitrary output directory name

# Class vocabulary (class id == position in this list) and the two lookup
# tables derived from it. These repeat the definitions of the previous cell.
POSSIBLE_LABELS = 'yes no up down left right on off stop go silence unknown'.split()
id2name = dict(zip(range(len(POSSIBLE_LABELS)), POSSIBLE_LABELS))
name2id = {label: idx for idx, label in enumerate(POSSIBLE_LABELS)}


def load_data(data_dir):
    """Index the training audio and split it into train / validation samples.

    Every file matching ``train/audio/<label>/<uid>_*wav`` under ``data_dir``
    becomes a ``(class_id, user_id, path)`` tuple. A file is placed in the
    validation list when its user id appears in any entry of
    ``train/validation_list.txt``; otherwise it goes to the training list.

    Files in the ``_background_noise_`` folder are labelled 'silence' and
    their paths are additionally collected in a separate list. Any other
    folder name that is not in POSSIBLE_LABELS is labelled 'unknown'.

    Args:
        data_dir: dataset root containing ``train/audio`` and
            ``train/validation_list.txt``.

    Returns:
        A 3-tuple ``(train, val, bg_noise)``: two lists of
        ``(class_id, user_id, path)`` tuples and a list of background-noise
        file paths.
    """
    # Raw string so the regex escapes reach `re` unchanged (the plain literal
    # relied on Python passing unknown escapes through, which newer versions
    # warn about). Three groups: prefix, label, user_id.
    pattern = re.compile(r"(.+\/)?(\w+)\/([^_]+)_.+wav")
    all_files = glob(os.path.join(data_dir, 'train/audio/*/*wav'))

    def _match(path):
        # The pattern is written with '/' separators; normalise the platform
        # separator so the same regex works where glob returns '\\'.
        return pattern.match(path.replace(os.sep, '/'))

    # Collect the user ids that belong to the validation split.
    valset = set()
    with open(os.path.join(data_dir, 'train/validation_list.txt'), 'r') as fin:
        for entry in fin:
            r = _match(entry)
            if r:
                valset.add(r.group(3))

    possible = set(POSSIBLE_LABELS)
    train, val, bg_noise = [], [], []
    for entry in all_files:
        r = _match(entry)
        if not r:
            # Paths that do not follow <label>/<uid>_... are ignored.
            continue

        label, uid = r.group(2), r.group(3)
        if label == '_background_noise_':
            bg_noise.append(entry)
            label = 'silence'
        if label not in possible:
            label = 'unknown'

        sample = (name2id[label], uid, entry)
        if uid in valset:
            val.append(sample)
        else:
            train.append(sample)

    print('There are {} train, {} val, and {} bg noise samples'.format(len(train), len(val), len(bg_noise)))
    return train, val, bg_noise

# Build the sample indexes once: train_df / valid_df hold
# (class_id, user_id, path) tuples, noise_df holds background-noise paths.
train_df, valid_df, noise_df = load_data(DATADIR)

There are 57929 train, 6798 val, and 6 bg noise samples


In [14]:
def process_wav_file(origWav):
    """Convert a waveform into a two-channel (phase, log-amplitude) STFT array.

    The input is copied, then forced to exactly 16000 samples: longer inputs
    are cropped at a random offset, shorter ones are padded on both sides with
    low-level uniform noise in [-0.001, 0.001). The result is resampled to
    half as many samples and passed through ``scipy.signal.stft``.

    Args:
        origWav: 1-D sequence of audio samples; it is not modified.

    Returns:
        Array of shape ``(freq_bins, frames, 2)``; channel 0 is the STFT phase
        divided by pi, channel 1 is ``log1p`` of the STFT magnitude.
    """
    target_len = 16000  # 1 sec
    samples = np.array(origWav)  # private copy of the caller's data

    n_samples = len(samples)
    if n_samples > target_len:
        # Too long: keep a randomly positioned window of target_len samples.
        start = np.random.randint(0, n_samples - target_len)
        samples = samples[start:start + target_len]
    elif n_samples < target_len:
        # Too short: pad both ends with near-silent noise (extra sample, if
        # any, goes on the right).
        missing = target_len - n_samples
        left_pad = np.random.uniform(-0.001, 0.001, int(missing / 2.))
        right_pad = np.random.uniform(-0.001, 0.001, int(np.ceil(missing / 2.)))
        samples = np.concatenate([left_pad, samples, right_pad])

    # Halve the number of samples before the transform.
    samples = signal.resample(samples, int(0.5 * samples.shape[0]))

    # Only the complex STFT matrix is needed; the frequency and time axes
    # returned alongside it are discarded.
    _freqs, _times, zxx = signal.stft(samples, 16000, nperseg=400, noverlap=240,
                                      nfft=512, padded=False, boundary=None)

    phase_channel = np.angle(zxx) / np.pi
    amp_channel = np.log1p(np.abs(zxx))
    return np.stack([phase_channel, amp_channel], axis=2)

# Visual sanity check of the preprocessing on one randomly chosen training clip.
wav = wav_read(train_df[np.random.randint(len(train_df))][2])
p = process_wav_file(wav)
# Split the stacked array into its two channels: a = phase, b = log-amplitude.
a,b = cv2.split(p)
shape = (96,32)
a = a.astype(np.float32)
b = b.astype(np.float32)
# Call-style print so the cell parses under both Python 2 and Python 3.
print(b.dtype)

# NOTE(review): `a` is the phase channel, which can hold negative values, so
# np.log(a) may produce NaNs in the two plots below - confirm this is intended.
log_spect = np.log(a)
print('spectrogram shape:', log_spect.shape)
plt.imshow(log_spect, aspect='auto', origin='lower',)
plt.show()

# Same channel after resizing with imresize.
a = imresize(a, shape, mode='F')

log_spect = np.log(a)
print('spectrogram shape:', log_spect.shape)
plt.imshow(log_spect, aspect='auto', origin='lower',)
plt.show()

# Log-amplitude channel, before and after resizing.
log_spect = np.log(b)
print('spectrogram shape:', log_spect.shape)
plt.imshow(log_spect, aspect='auto', origin='lower',)
plt.show()

b = imresize(b, shape, mode='F')

log_spect = np.log(b)
print('spectrogram shape:', log_spect.shape)
plt.imshow(log_spect, aspect='auto', origin='lower',)
plt.show()

In [5]:
def wav_read(fname):
    """Load the audio file ``fname`` with librosa and return only its samples.

    ``sr=None`` is passed to ``librosa.load``; the sampling rate that librosa
    returns alongside the samples is discarded.
    """
    return librosa.load(fname, sr=None)[0]

def normalize_audio(wav):
    """Peak-normalise a waveform so its largest absolute sample equals 1.

    The peak is the maximum *absolute* value, so the result always lies in
    [-1, 1] and keeps its sign even when the loudest sample is negative.
    Dividing by the signed maximum instead would leave values outside that
    range, flip the sign of all-negative input, and divide by zero on silent
    input.

    Args:
        wav: 1-D array-like of audio samples.

    Returns:
        A new array scaled by ``1 / max(|wav|)``. Empty or all-zero input is
        returned as an unscaled copy, since there is no peak to scale by.
    """
    wav = np.asarray(wav)
    if wav.size == 0:
        return wav.copy()

    peak = np.max(np.abs(wav))
    if peak == 0:
        # Pure digital silence: nothing to scale, and dividing would give NaNs.
        return wav.copy()

    # float() keeps this a true division even for integer sample arrays.
    return wav / float(peak)

def time_shift(wav, shift):
    """Shift a waveform in time, filling the vacated end with faint noise.

    A non-negative ``shift`` drops that many samples from the start and
    appends the same number of uniform-noise samples in [-0.001, 0.001) at the
    end. A negative ``shift`` prepends the noise and drops samples from the
    end instead. The result is passed through ``normalize_audio``.

    Args:
        wav: 1-D array of samples.
        shift: shift in samples; truncated to an int.
    """
    offset = int(shift)
    if offset < 0:
        filler = np.random.uniform(-0.001, 0.001, -offset)
        shifted = np.concatenate([filler, wav[:offset]])
    else:
        filler = np.random.uniform(-0.001, 0.001, offset)
        shifted = np.concatenate([wav[offset:], filler])
    return normalize_audio(shifted)

def speed_change(wav, speed_rate):
    """Stretch or squeeze a waveform in time, then force it to 16000 samples.

    The waveform is resized with ``cv2.resize`` to ``int(len(wav) * speed_rate)``
    samples, so a rate below 1 gives a shorter (faster) clip. A result shorter
    than 16000 samples is padded on both sides with uniform noise in
    [-0.001, 0.001); otherwise the central 16000 samples are kept. The output
    is passed through ``normalize_audio``.

    Args:
        wav: 1-D array of samples.
        speed_rate: length multiplier; lower is faster.
    """
    target_len = 16000
    stretched = cv2.resize(wav, (1, int(len(wav) * speed_rate))).squeeze()

    n_stretched = len(stretched)
    if n_stretched < target_len:
        missing = target_len - n_stretched
        left_pad = np.random.uniform(-0.001, 0.001, int(missing / 2.))
        right_pad = np.random.uniform(-0.001, 0.001, int(np.ceil(missing / 2.)))
        stretched = np.concatenate([left_pad, stretched, right_pad])
    else:
        # Centre crop (a no-op slice when the length is already 16000).
        first = int((n_stretched - target_len) / 2.)
        stretched = stretched[first:first + target_len]
    return normalize_audio(stretched)

def noise_add(wav, percent, ind):
    """Blend a waveform with a random 16000-sample slice of background noise.

    The noise file ``noise_df[ind]`` is loaded and normalised, a 16000-sample
    window is taken at a random offset, and the output is
    ``wav * percent + noise * (1 - percent)`` passed through
    ``normalize_audio``.

    Args:
        wav: samples to mix; must broadcast against a 16000-sample array.
        percent: weight of ``wav`` in the mix; the noise gets ``1 - percent``.
        ind: index into ``noise_df`` selecting the noise file.
    """
    noise = normalize_audio(wav_read(noise_df[ind]))
    offset = np.random.randint(noise.shape[0] - 16000)
    noise_window = noise[offset:offset + 16000]
    mixed = wav * percent + noise_window * (1 - percent)
    return normalize_audio(mixed)

def get_spectrogram(wav):
    """Return a downscaled magnitude spectrogram with a trailing channel axis.

    Computes a Hamming-window STFT (600-sample FFT and window, hop of 50),
    keeps the magnitude, shrinks both axes by a factor of 7 with linear
    interpolation, and appends a size-1 third axis.

    Args:
        wav: 1-D array of samples.

    Returns:
        Array of shape ``(rows, cols, 1)``.
    """
    # Imported here because the notebook only imports ``scipy.misc``;
    # ``scipy.ndimage`` is not guaranteed to be loaded as a side effect.
    from scipy import ndimage

    n_fft = 600
    D = librosa.stft(wav, n_fft=n_fft, hop_length=50,
                     win_length=n_fft, window='hamming')
    spect, _phase = librosa.magphase(D)
    spect = ndimage.zoom(spect, 1./7, order=1)
    return np.expand_dims(spect, axis=2)

def frange(x, y, jump):
    """Generate ``x, x + jump, x + 2*jump, ...`` while the value is below ``y``.

    The next value is produced by repeated addition, so floating-point
    rounding accumulates from step to step. Nothing is yielded when ``x`` is
    not less than ``y``.
    """
    current = x
    while True:
        if not current < y:
            return
        yield current
        current += jump
        
def all_aug(wav, params):
    """Apply time shift, speed change and noise mixing to a waveform, in order.

    Each stage consumes the output of the previous one, so the speed change is
    applied to the time-shifted signal rather than to the original ``wav``.

    Args:
        wav: 1-D array of samples.
        params: ``(time, speed, noise_percent, noise_ind)``. ``time`` is
            forwarded to ``time_shift``, ``speed`` to ``speed_change``, and
            the last two to ``noise_add``. ``noise_ind == -1`` means the
            signal is mixed with weight 1 (the noise contributes nothing).

    Returns:
        The augmented waveform returned by ``noise_add``.
    """
    time, speed, noise_percent, noise_ind = params
    aug = time_shift(wav, time)
    # Chain on `aug`: passing `wav` here would throw the time shift away.
    aug = speed_change(aug, speed)
    if noise_ind == -1:
        aug = noise_add(aug, 1, noise_ind)
    else:
        aug = noise_add(aug, noise_percent, noise_ind)
    return aug

In [25]:
# Model hyper-parameters.
# Nominal network depth; only used to derive num_layers_per_block below.
model_depth = 16
# Number of dense blocks stacked in the network.
num_dense_blocks = 3
# Passed to dense_block(), whose body does not currently read it.
growth_rate = 12
# Filter count of the first 1x1 convolution and of each 3x3 convolution
# inside dense_block().
number_filters = 16
# Factor applied to the channel count in transition_layers().
compression = 0.5
# Layers per dense block; dense_block() loops over half of this number.
num_layers_per_block = (model_depth - 4) // num_dense_blocks

def dense_block(x,num_layers_per_block,growth_rate):
    """Append ``num_layers_per_block // 2`` densely connected units to ``x``.

    Each unit runs BatchNorm -> ReLU -> 3x3 Conv (``number_filters`` filters,
    L2-regularised) -> Dropout(0.2) -> BatchNorm -> ReLU on the running
    tensor and concatenates the result onto it.

    Args:
        x: input tensor.
        num_layers_per_block: twice the number of units to add.
        growth_rate: accepted but not used; the convolution width comes from
            the module-level ``number_filters``.

    Returns:
        The tensor after the last concatenation.
    """
    n_units = num_layers_per_block // 2
    for _ in range(n_units):
        branch = BatchNormalization()(x)
        branch = Activation('relu')(branch)
        branch = Conv2D(number_filters,
                        (3, 3),
                        padding='same',
                        kernel_initializer='he_normal',
                        kernel_regularizer=l2(0.0001))(branch)
        branch = Dropout(0.2)(branch)
        branch = BatchNormalization()(branch)
        branch = Activation('relu')(branch)
        x = Concatenate()([x, branch])
    return x

def transition_layers(x,compression):
    """Reduce channels by ``compression`` and downsample with average pooling.

    Applies BatchNorm -> ReLU -> 1x1 Conv -> Dropout(0.2) -> AveragePooling2D
    (default arguments). The convolution has
    ``int(input_channels * compression)`` filters.

    Args:
        x: input tensor; its last dimension is read as the channel count.
        compression: multiplier applied to the channel count.

    Returns:
        The pooled tensor.
    """
    in_channels = x.get_shape().as_list()[-1]
    out_channels = int(in_channels * compression)

    y = BatchNormalization()(x)
    y = Activation('relu')(y)
    y = Conv2D(out_channels,
               (1, 1),
               padding='same',
               kernel_initializer='he_normal',
               kernel_regularizer=l2(0.0001))(y)
    y = Dropout(0.2)(y)
    return AveragePooling2D()(y)

#Let's define the model
# The input shape is taken from preprocessing the first training file.
inp = Input(shape = process_wav_file(wav_read(train_df[0][2])).shape)
# Stem: 1x1 convolution that maps the input channels to number_filters.
x = Conv2D(number_filters,(1,1),padding='same',kernel_initializer='he_normal',kernel_regularizer=l2(0.0001))(inp)
# Dense blocks, with a transition layer after every block except the last.
for i in range(num_dense_blocks):
    x = dense_block(x,num_layers_per_block,growth_rate)
    if (i != num_dense_blocks-1):
        x = transition_layers(x,compression)
# Classification head: BN -> ReLU -> global average pool -> Dense(32) ->
# Dropout -> softmax over the label set.
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = GlobalAveragePooling2D()(x)
x = Dense(32, activation = 'relu')(x)
x = Dropout(0.5)(x)
x = Dense(len(POSSIBLE_LABELS), activation='softmax',kernel_initializer='he_normal',kernel_regularizer=l2(0.0001))(x)

model = Model(inp, x)
# model.compile(Adam(), loss='sparse_categorical_crossentropy',metrics=['accuracy'])
# categorical_crossentropy matches the one-hot targets that the generators
# build with to_categorical.
model.compile(Adam(), loss='categorical_crossentropy',metrics=['accuracy'])
model.summary()
# Writes a diagram of the architecture to model.png.
plot_model(model, to_file='model.png')

# x_in = Input(shape = process_wav_file(wav_read(train_df[0][2])).shape)
# x = BatchNormalization()(x_in)
# for i in range(4):
#     x = Conv2D(16*(2 ** i),(3,3),padding='same',kernel_initializer='he_normal',kernel_regularizer=l2(0.0001))(x)
# #     x = Conv2D(16*(2 ** i), (3,3))(x)
#     x = Activation('elu')(x)
#     x = BatchNormalization()(x)
#     x = MaxPooling2D((2,2))(x)
# x = Conv2D(128, (1,1))(x)
# x_branch_1 = GlobalAveragePooling2D()(x)
# x_branch_2 = GlobalMaxPool2D()(x)
# x = concatenate([x_branch_1, x_branch_2])
# x = Dense(256, activation = 'relu')(x)
# x = Dropout(0.5)(x)
# x = Dense(len(POSSIBLE_LABELS), activation = 'softmax')(x)
# model = Model(inputs = x_in, outputs = x)
# model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
# model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 257, 48, 2)   0                                            
__________________________________________________________________________________________________
conv2d_57 (Conv2D)              (None, 257, 48, 16)  48          input_8[0][0]                    
__________________________________________________________________________________________________
batch_normalization_93 (BatchNo (None, 257, 48, 16)  64          conv2d_57[0][0]                  
__________________________________________________________________________________________________
activation_92 (Activation)      (None, 257, 48, 16)  0           batch_normalization_93[0][0]     
__________________________________________________________________________________________________
conv2d_58 

In [26]:
# Divisor for the variation count: the generators draw
# ceil(len(variations) / DIV) variations per audio file on each pass.
DIV = 100.0

def get_variations():
    """Return the augmentation parameter sets used for training.

    Each entry is ``[time, speed, noise_percentage, noise_ind]``. Only the
    single entry ``[0, 1, 1, -1]`` is returned: zero time shift, speed 1,
    signal weight 1 and noise index -1. The full parameter grid is built by
    ``get_variations_valid``.

    Returns:
        A new list containing that one parameter list.
    """
    return [[0, 1, 1, -1]]

def get_variations_valid():
    """Return the full grid of augmentation parameter sets.

    Starts with the identity entry ``[0, 1, 1, -1]`` and then appends every
    combination of:

    * time shift from -4000 to 4000 samples in steps of 1000,
    * speed from ``frange(0.4, 1.7, 0.3)``,
    * noise percentage from ``frange(0.2, 1, 0.1)``,
    * noise index from -1 up to ``len(noise_df) - 1``.

    Returns:
        List of ``[time, speed, noise_percentage, noise_ind]`` lists.
    """
    variations = [[0,1,1,-1]]

    time_shift_range = 4000
    # `range` rather than `xrange` so the function runs on Python 2 and 3.
    for time in range(-time_shift_range, time_shift_range+1, 1000):
        for speed in frange(0.4,1.7,0.3):
            for noise_percentage in frange(0.2,1,0.1):
                for noise_ind in range(-1,len(noise_df)):
                    variations.append([time, speed, noise_percentage, noise_ind])
    return variations

# Number of variations drawn per file, derived from the training variation
# list. The training cell uses it to size both steps_per_epoch and
# validation_steps.
var = get_variations()
multiplier = np.ceil(len(var)/DIV)

def _make_batch(chunk, num_classes):
    """Turn ``(label_id, waveform)`` pairs into an ``(x_batch, y_batch)`` pair."""
    x_batch = np.array([process_wav_file(wav) for _, wav in chunk])
    y_batch = to_categorical([label_id for label_id, _ in chunk],
                             num_classes=num_classes)
    return x_batch, y_batch


def _augmented_batches(samples, variation_source, batch_size):
    """Endlessly yield ``(x_batch, y_batch)`` pairs of augmented clips.

    Shared implementation of train_generator and valid_generator. On each pass
    over ``samples`` it re-reads the variation list, draws
    ``ceil(len(variations) / DIV)`` variation indices at random (with
    replacement), visits the files in a fresh random order and applies every
    drawn variation to every file.

    Full batches are emitted as soon as enough clips are queued. Whatever is
    left when the files run out is emitted as a final, possibly smaller,
    batch, so one pass yields ``ceil(n_files * n_drawn / batch_size)`` batches
    and never reads past the end of the file list.

    Args:
        samples: sequence of ``(label_id, user_id, path)`` tuples.
        variation_source: zero-argument callable returning the list of
            augmentation parameter sets.
        batch_size: number of clips per batch; must be at least 1.

    Raises:
        ValueError: if ``samples`` is empty or ``batch_size`` is below 1
            (either would make the generator loop forever without yielding).
    """
    if len(samples) == 0:
        raise ValueError('cannot generate batches from an empty sample list')
    if batch_size < 1:
        raise ValueError('batch size must be at least 1')

    num_classes = len(POSSIBLE_LABELS)
    while True:
        variations = variation_source()
        len_var = len(variations)
        selected_indices = np.random.choice(len_var, int(np.ceil(len_var/DIV)))

        # Queue of (label_id, augmented waveform) pairs awaiting emission.
        pending = []
        # Index permutation instead of shuffling an array of the tuples:
        # labels stay integers and no O(n) list.pop(0) is needed.
        for sample_idx in np.random.permutation(len(samples)):
            label_id, _uid, fname = samples[sample_idx]
            wav = wav_read(fname)
            for i in selected_indices:
                pending.append((label_id, all_aug(wav, variations[i])))

            while len(pending) >= batch_size:
                yield _make_batch(pending[:batch_size], num_classes)
                del pending[:batch_size]

        if pending:
            yield _make_batch(pending, num_classes)


def train_generator(train_batch_size):
    """Infinite generator of augmented training batches.

    Draws files from ``train_df`` and variations from ``get_variations()``.

    Args:
        train_batch_size: number of clips per batch (the last batch of a pass
            over the data may be smaller).

    Yields:
        ``(x_batch, y_batch)``: stacked ``process_wav_file`` outputs and
        one-hot labels over POSSIBLE_LABELS.
    """
    return _augmented_batches(train_df, get_variations, train_batch_size)


def valid_generator(val_batch_size):
    """Infinite generator of augmented validation batches.

    Draws files from ``valid_df`` and variations from
    ``get_variations_valid()``.

    Args:
        val_batch_size: number of clips per batch (the last batch of a pass
            over the data may be smaller).

    Yields:
        ``(x_batch, y_batch)``: stacked ``process_wav_file`` outputs and
        one-hot labels over POSSIBLE_LABELS.
    """
    return _augmented_batches(valid_df, get_variations_valid, val_batch_size)
            
def test_generator(test_batch_size):
    """Infinite generator of preprocessed test batches, in ``test_paths`` order.

    Each file is loaded with ``wav_read`` before being handed to
    ``process_wav_file``, which expects samples rather than a path. The last
    batch of each pass holds only the remaining files.

    Args:
        test_batch_size: maximum number of files per batch.

    Yields:
        Array of stacked ``process_wav_file`` outputs.

    Raises:
        ValueError: if ``test_paths`` is empty (the generator would otherwise
            loop forever without yielding).
    """
    if len(test_paths) == 0:
        raise ValueError('test_paths is empty; nothing to predict on')

    # NOTE(review): training clips pass through all_aug (which normalises
    # them) before process_wav_file, while test clips do not - confirm
    # whether the same normalisation should be applied here.
    while True:
        for start in range(0, len(test_paths), test_batch_size):
            batch_paths = test_paths[start:start + test_batch_size]
            yield np.array([process_wav_file(wav_read(path)) for path in batch_paths])

In [27]:
# Training callbacks, all monitoring val_loss: stop after 5 epochs without a
# 0.01 improvement, cut the learning rate by 10x after 3 such epochs, and keep
# the best weights in starter.hdf5.
# NOTE(review): `epsilon` on ReduceLROnPlateau was renamed `min_delta` in
# later Keras releases - confirm against the installed version.
callbacks = [EarlyStopping(monitor='val_loss',
                           patience=5,
                           verbose=1,
                           min_delta=0.01,
                           mode='min'),
             ReduceLROnPlateau(monitor='val_loss',
                               factor=0.1,
                               patience=3,
                               verbose=1,
                               epsilon=0.01,
                               mode='min'),
             ModelCheckpoint(monitor='val_loss',
                             filepath='starter.hdf5',
                             save_best_only=True,
                             save_weights_only=True,
                             mode='min',
                            verbose=1),
             TQDMNotebookCallback()]

batch_size = 32

# Steps are sized as ceil(n_files * multiplier / batch_size).
# NOTE(review): `multiplier` is derived from get_variations(), but
# valid_generator draws from get_variations_valid() - confirm that
# validation_steps covers the intended amount of validation data.
history = model.fit_generator(generator=train_generator(batch_size),
                              steps_per_epoch=int(np.ceil(len(train_df)*multiplier/batch_size)),
                              epochs=120,
                              verbose=2,
                              callbacks=callbacks,
                              validation_data=valid_generator(batch_size),
                              validation_steps=int(np.ceil(len(valid_df)*multiplier/batch_size)))

A Jupyter Widget

A Jupyter Widget

Epoch 1/120


KeyboardInterrupt: 

In [None]:
# Restore saved weights into the model.
# NOTE(review): the ModelCheckpoint callback writes 'starter.hdf5' in the
# working directory, while this reads from './weights/' - confirm the file is
# moved there, or that this is the intended checkpoint.
model.load_weights('./weights/starter.hdf5')

In [None]:
# Test clips live under the same data root as the training set. glob returns
# files in an unspecified order, so sort to make prediction order (and hence
# the submission) reproducible.
test_paths = sorted(glob(os.path.join(DATADIR, 'test/audio/*wav')))

In [None]:
# Number of batches needed to see every test file exactly once. The division
# is forced to float so the final partial batch is not dropped under
# Python 2 integer division.
test_batch_size = 64
test_steps = int(np.ceil(len(test_paths) / float(test_batch_size)))
predictions = model.predict_generator(test_generator(test_batch_size), test_steps)

In [None]:
# Predicted class id per test clip: index of the largest score along axis 1.
classes = np.argmax(predictions, axis=1)

In [None]:
# Map each test file's basename to its predicted label name. Because the
# dict is keyed by basename, a repeated file name keeps only its last
# prediction.
submission = {}
for idx, path in enumerate(test_paths):
    submission[os.path.basename(path)] = id2name[classes[idx]]

In [None]:
# Write the submission file: a header row followed by one "fname,label" row
# per entry of `submission`.
with open('starter_submission.csv', 'w') as fout:
    rows = ['{},{}\n'.format(fname, label) for fname, label in submission.items()]
    fout.writelines(['fname,label\n'] + rows)