In [1]:
%%time
import os
import numpy as np
from scipy.fftpack import fft
from scipy.io import wavfile
from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile

from keras import optimizers, losses, activations, models
from keras.layers import Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization
from sklearn.model_selection import train_test_split
import keras

# from keras import backend as K
# K.set_session(K.tf.Session(config=K.tf.ConfigProto(intra_op_parallelism_threads=24, inter_op_parallelism_threads=24)))

# import matplotlib.pyplot as plt
# import IPython.display as ipd

from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.models import load_model
from time import time, ctime

from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.models import load_model
from time import time, ctime

L = 16000
legal_labels = 'yes no up down left right on off stop go silence unknown'.split()

#src folders
root_path = r'.'
out_path = r'.'
model_path = r'.'
train_data_path = os.path.join(root_path, 'data', 'train', 'audio')
test_data_path = os.path.join(root_path, 'data', 'test', 'audio')

Using TensorFlow backend.


CPU times: user 13.1 s, sys: 1.23 s, total: 14.3 s
Wall time: 26.3 s


In [9]:
def pad_audio(samples):
    if len(samples) >= L: return samples
    else: return np.pad(samples, pad_width=(L - len(samples), 0), mode='constant', constant_values=(0, 0))

def chop_audio(samples, L=16000, num=20):
    for i in range(num):
        beg = np.random.randint(0, len(samples) - L)
        yield samples[beg: beg + L]

def label_transform(labels):
    nlabels = []
    for label in labels:
        if label == '_background_noise_':
            nlabels.append('silence')
        elif label not in legal_labels:
            nlabels.append('unknown')
        else:
            nlabels.append(label)
    return pd.get_dummies(pd.Series(nlabels))

def custom_fft(y, fs):
    T = 1.0 / fs
    N = y.shape[0]
    yf = fft(y)
    xf = np.linspace(0.0, 1.0/(2.0*T), N//2)
    # FFT is simmetrical, so we take just the first half
    # FFT is also complex, to we take just the real part (abs)
    vals = 2.0/N * np.abs(yf[0:N//2])
    return xf, vals

def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

def list_wavs_fname(dirpath, ext='wav'):
    print(dirpath)
    fpaths = glob(os.path.join(dirpath, r'*/*' + ext))
    pat = r'.+/(\w+)/\w+\.' + ext + '$'
    labels = []
    for fpath in fpaths:
        r = re.match(pat, fpath)
        if r:
            labels.append(r.group(1))
    pat = r'.+/(\w+\.' + ext + ')$'
    fnames = []
    for fpath in fpaths:
        r = re.match(pat, fpath)
        if r:
            fnames.append(r.group(1))
    return labels, fnames

### skip here

In [15]:
%%time
labels, fnames = list_wavs_fname(train_data_path)

new_sample_rate = 16000
y_train = []
x_train = []

n=0
for label, fname in zip(labels, fnames):
    n+=1
    if n%1000==0:
        print(int(n/1000), end='/{} '.format(int(len(labels)/1000)))
    sample_rate, samples = wavfile.read(os.path.join(train_data_path, label, fname))
    
    samples = pad_audio(samples)
    
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else: n_samples = [samples]
        
    for samples in n_samples:
        resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        
        y_train.append(label)
        x_train.append(specgram)
        
x_train = np.array(x_train)
x_train = x_train.reshape(tuple(list(x_train.shape) + [1]))
y_train = label_transform(y_train)
label_index = y_train.columns.values
y_train = y_train.values
y_train = np.array(y_train)
del labels, fnames
gc.collect()

X = x_train.copy()
Y = y_train.copy()
print (X.shape, Y.shape, type(X), type(Y))

./data/train/audio
1/64 2/64 3/64 4/64 5/64 6/64 7/64 8/64 9/64 10/64 11/64 



12/64 13/64 14/64 15/64 16/64 17/64 18/64 19/64 20/64 21/64 22/64 23/64 24/64 25/64 26/64 27/64 28/64 29/64 30/64 31/64 32/64 33/64 34/64 35/64 36/64 37/64 38/64 39/64 40/64 41/64 42/64 43/64 44/64 45/64 46/64 47/64 48/64 49/64 50/64 51/64 52/64 53/64 54/64 55/64 56/64 57/64 58/64 59/64 60/64 61/64 62/64 63/64 64/64 (64841, 99, 161, 1) (64841, 12) <class 'numpy.ndarray'> <class 'numpy.ndarray'>
CPU times: user 1min 51s, sys: 9.46 s, total: 2min 1s
Wall time: 18min 4s


In [56]:
pd.DataFrame(Y).sum()

0      2359
1      2372
2      2353
3      2375
4      2357
5      2367
6      2367
7       120
8      2380
9     41039
10     2375
11     2377
dtype: int64

In [52]:
label_index

array(['down', 'go', 'left', 'no', 'off', 'on', 'right', 'silence', 'stop',
       'unknown', 'up', 'yes'], dtype=object)

In [24]:
b = label_index.copy()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True], dtype=bool)

In [57]:
%%time
np.save('X_16k.npy', X)
np.save('Y_16k.npy', Y)

CPU times: user 5 ms, sys: 912 ms, total: 917 ms
Wall time: 1.03 s


### mid start from here

In [36]:
%%time
X = np.load('X_16k.npy')
Y = np.load('Y_16k.npy')

CPU times: user 0 ns, sys: 1.47 s, total: 1.47 s
Wall time: 2.03 s


In [41]:
input_shape = (99, 161, 1)
nclass = 12
inp = Input(shape=input_shape)
norm_inp = BatchNormalization()(inp)
img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(norm_inp)
img_1 = Convolution2D(8, kernel_size=2, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(4, 4))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Convolution2D(24, kernel_size=3, activation=activations.relu)(img_1)
img_1 = Convolution2D(24, kernel_size=3, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(3, 3))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Convolution2D(48, kernel_size=3, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(3, 3))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Flatten()(img_1)

dense_1 = BatchNormalization()(Dense(256, activation=activations.relu)(img_1))
dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(dense_1))
dense_1 = Dense(nclass, activation=activations.softmax)(dense_1)

model = models.Model(inputs=inp, outputs=dense_1)
opt = optimizers.Adam()

model.compile(optimizer=opt, loss=losses.binary_crossentropy, metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 99, 161, 1)        0         
_________________________________________________________________
batch_normalization_13 (Batc (None, 99, 161, 1)        4         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 98, 160, 8)        40        
_________________________________________________________________
conv2d_22 (Conv2D)           (None, 97, 159, 8)        264       
_________________________________________________________________
max_pooling2d_13 (MaxPooling (None, 24, 39, 8)         0         
_________________________________________________________________
dropout_13 (Dropout)         (None, 24, 39, 8)         0         
_________________________________________________________________
conv2d_23 (Conv2D)           (None, 22, 37, 24)        1752      
__________

https://machinelearningmastery.com/check-point-deep-learning-models-keras/

In [43]:
early_stopping = EarlyStopping(monitor='val_loss', patience=10)
# checkpoint
filepath="models/F9W-{epoch:02d}-{val_acc:.5f}_.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint, early_stopping]

In [44]:
S = lambda *x: [print(i.shape, type(i)) for i in x]
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.1, random_state=2018)
S(x_train, y_train, x_valid, y_valid)

(58356, 99, 161, 1) <class 'numpy.ndarray'>
(58356, 12) <class 'numpy.ndarray'>
(6485, 99, 161, 1) <class 'numpy.ndarray'>
(6485, 12) <class 'numpy.ndarray'>


[None, None, None, None]

In [45]:
%%time
# Fit the model
model.fit(x_train, y_train, batch_size=64, validation_data=(x_valid, y_valid), 
          epochs=100, shuffle=True, verbose=1, callbacks=callbacks_list)

Train on 58356 samples, validate on 6485 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100


Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100


Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
CPU times: user 48min 36s, sys: 10min 26s, total: 59min 3s
Wall time: 1h 4min 47s


<keras.callbacks.History at 0x2b170893ff28>

In [29]:
# model.save(os.path.join(model_path, 'cnn.model'))

In [46]:
!ls models

cnn.model	      F9W-61-0.98840_.hdf5
F9W-00-0.94945_.hdf5  weights-improvement-00-0.96690_.hdf5
F9W-01-0.96777_.hdf5  weights-improvement-00-0.97.hdf5
F9W-02-0.97232_.hdf5  weights-improvement-00-0.99.hdf5
F9W-03-0.97750_.hdf5  weights-improvement-01-0.97809_.hdf5
F9W-04-0.97899_.hdf5  weights-improvement-01-0.99.hdf5
F9W-05-0.98033_.hdf5  weights-improvement-02-0.98144_.hdf5
F9W-07-0.98098_.hdf5  weights-improvement-02-0.99.hdf5
F9W-08-0.98219_.hdf5  weights-improvement-03-0.98531_.hdf5
F9W-09-0.98300_.hdf5  weights-improvement-04-0.98585_.hdf5
F9W-11-0.98310_.hdf5  weights-improvement-04-0.99.hdf5
F9W-12-0.98404_.hdf5  weights-improvement-05-0.98909_.hdf5
F9W-13-0.98493_.hdf5  weights-improvement-05-0.99.hdf5
F9W-16-0.98530_.hdf5  weights-improvement-06-0.99.hdf5
F9W-18-0.98648_.hdf5  weights-improvement-08-0.98913_.hdf5
F9W-22-0.98666_.hdf5  weights-improvement-08-0.99.hdf5
F9W-24-0.98680_.hdf5  weights-improvement-09-0.99013_.hdf5
F9W-25-0.98688_.hdf5  weights-impr

### predict

In [47]:
Mname = 'models/F9W-61-0.98840_.hdf5'

# returns a compiled model
# identical to the previous one
model = load_model(Mname)

label_index = np.array(['down', 'go', 'left', 'no', 'off', 'on', 'right',
                        'silence', 'stop', 'unknown', 'up', 'yes']).astype('object')

In [48]:
def test_data_generator(batch=16):
    fpaths = glob(os.path.join(test_data_path, '*wav'))
    i = 0
    for path in fpaths:
        if i == 0:
            imgs = []
            fnames = []
        i += 1
        rate, samples = wavfile.read(path)
        samples = pad_audio(samples)
        resampled = signal.resample(samples, int(new_sample_rate / rate * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        imgs.append(specgram)
        fnames.append(path.split('\\')[-1])
        if i == batch:
            i = 0
            imgs = np.array(imgs)
            imgs = imgs.reshape(tuple(list(imgs.shape) + [1]))
            yield fnames, imgs
    if i < batch:
        imgs = np.array(imgs)
        imgs = imgs.reshape(tuple(list(imgs.shape) + [1]))
        yield fnames, imgs
    raise StopIteration()

In [49]:
%%time
batch = 64
start = time()


#exit() #delete this
#del x_train, y_train
gc.collect()

index = []
results = []
N=0
for fnames, imgs in test_data_generator(batch=batch):
    N+=1
    if N%10==0:
        print ('used: {:.2f} s'.format(time()-start), end=' :{} '.format(N*batch))
        start = time()
    predicts = model.predict(imgs)
    predicts = np.argmax(predicts, axis=1)
    predicts = [label_index[p] for p in predicts]
    index.extend(fnames)
    results.extend(predicts)

df = pd.DataFrame(columns=['fname', 'label'])
df['fname'] = index
df['label'] = results


df['fname'] = df['fname'].apply(lambda x:x.split('audio/')[-1])
df.to_csv(os.path.join(out_path, Mname.split('/')[-1]+'_submission.csv'), index=False)

used: 13.73 s :640 used: 13.25 s :1280 used: 12.88 s :1920 used: 13.40 s :2560 used: 12.69 s :3200 used: 11.78 s :3840 used: 12.75 s :4480 used: 31.98 s :5120 used: 20.51 s :5760 used: 14.33 s :6400 used: 18.25 s :7040 used: 13.73 s :7680 used: 12.88 s :8320 used: 12.99 s :8960 used: 13.00 s :9600 used: 12.50 s :10240 used: 11.69 s :10880 used: 13.10 s :11520 used: 13.14 s :12160 used: 12.76 s :12800 used: 13.42 s :13440 used: 14.64 s :14080 used: 13.36 s :14720 used: 11.46 s :15360 used: 12.71 s :16000 used: 13.51 s :16640 used: 13.35 s :17280 used: 13.24 s :17920 used: 12.90 s :18560 used: 12.72 s :19200 used: 11.89 s :19840 used: 12.87 s :20480 used: 13.08 s :21120 used: 13.36 s :21760 used: 12.83 s :22400 used: 12.72 s :23040 used: 12.59 s :23680 used: 11.65 s :24320 used: 12.89 s :24960 used: 12.76 s :25600 used: 12.93 s :26240 used: 12.85 s :26880 used: 13.63 s :27520 used: 11.63 s :28160 used: 12.00 s :28800 used: 12.90 s :29440 used: 13.33 s :30080 used: 13.10 s :30720 used: 13

  if sys.path[0] == '':


CPU times: user 4min 51s, sys: 20.9 s, total: 5min 12s
Wall time: 56min 54s


In [50]:
# df.to_csv(os.path.join(out_path, Mname.split('/')[-1]+'_submission.csv'), index=False)