In [1]:
%%time
import os
import numpy as np
from scipy.fftpack import fft
from scipy.io import wavfile
from scipy import signal
from glob import glob
import re
import pandas as pd
import gc
from scipy.io import wavfile

from keras import optimizers, losses, activations, models
from keras.layers import Convolution2D, Dense, Input, Flatten, Dropout, MaxPooling2D, BatchNormalization
from sklearn.model_selection import train_test_split
import keras

# from keras import backend as K
# K.set_session(K.tf.Session(config=K.tf.ConfigProto(intra_op_parallelism_threads=24, inter_op_parallelism_threads=24)))

# import matplotlib.pyplot as plt
# import IPython.display as ipd

from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.models import load_model
from time import time, ctime

from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.models import load_model
from time import time, ctime

L = 16000
legal_labels = 'yes no up down left right on off stop go silence unknown'.split()

#src folders
root_path = r'.'
out_path = r'.'
model_path = r'.'
train_data_path = os.path.join(root_path, 'data', 'train', 'audio')
test_data_path = os.path.join(root_path, 'data', 'test', 'audio')

Using TensorFlow backend.


CPU times: user 14.3 s, sys: 1.23 s, total: 15.5 s
Wall time: 26.7 s


In [4]:
def pad_audio(samples):
    if len(samples) >= L: return samples
    else: return np.pad(samples, pad_width=(L - len(samples), 0), mode='constant', constant_values=(0, 0))

def chop_audio(samples, L=16000, num=20):
    for i in range(num):
        beg = np.random.randint(0, len(samples) - L)
        yield samples[beg: beg + L]

def label_transform(labels):
    nlabels = []
    for label in labels:
        if label == '_background_noise_':
            nlabels.append('silence')
#         elif label not in legal_labels:
#             nlabels.append('unknown')
        else:
            nlabels.append(label)
    return pd.get_dummies(pd.Series(nlabels))

def custom_fft(y, fs):
    T = 1.0 / fs
    N = y.shape[0]
    yf = fft(y)
    xf = np.linspace(0.0, 1.0/(2.0*T), N//2)
    # FFT is simmetrical, so we take just the first half
    # FFT is also complex, to we take just the real part (abs)
    vals = 2.0/N * np.abs(yf[0:N//2])
    return xf, vals

def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

def list_wavs_fname(dirpath, ext='wav'):
    print(dirpath)
    fpaths = glob(os.path.join(dirpath, r'*/*' + ext))
    pat = r'.+/(\w+)/\w+\.' + ext + '$'
    labels = []
    for fpath in fpaths:
        r = re.match(pat, fpath)
        if r:
            labels.append(r.group(1))
    pat = r'.+/(\w+\.' + ext + ')$'
    fnames = []
    for fpath in fpaths:
        r = re.match(pat, fpath)
        if r:
            fnames.append(r.group(1))
    return labels, fnames

### skip here

In [5]:
%%time
labels, fnames = list_wavs_fname(train_data_path)

new_sample_rate = 16000
y_train = []
x_train = []

n=0
for label, fname in zip(labels, fnames):
    n+=1
    if n%1000==0:
        print(int(n/1000), end='/{} '.format(int(len(labels)/1000)))
    sample_rate, samples = wavfile.read(os.path.join(train_data_path, label, fname))
    
    samples = pad_audio(samples)
    
    if len(samples) > 16000:
        n_samples = chop_audio(samples)
    else: n_samples = [samples]
        
    for samples in n_samples:
        resampled = signal.resample(samples, int(new_sample_rate / sample_rate * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        
        y_train.append(label)
        x_train.append(specgram)
        
x_train = np.array(x_train)
x_train = x_train.reshape(tuple(list(x_train.shape) + [1]))
y_train = label_transform(y_train)

label_index = y_train.columns.values

y_train = y_train.values
y_train = np.array(y_train)
del labels, fnames
gc.collect()

X = x_train.copy()
Y = y_train.copy()
print (X.shape, Y.shape, type(X), type(Y))

./data/train/audio
1/64 2/64 3/64 4/64 5/64 6/64 7/64 8/64 9/64 10/64 11/64 



12/64 13/64 14/64 15/64 16/64 17/64 18/64 19/64 20/64 21/64 22/64 23/64 24/64 25/64 26/64 27/64 28/64 29/64 30/64 31/64 32/64 33/64 34/64 35/64 36/64 37/64 38/64 39/64 40/64 41/64 42/64 43/64 44/64 45/64 46/64 47/64 48/64 49/64 50/64 51/64 52/64 53/64 54/64 55/64 56/64 57/64 58/64 59/64 60/64 61/64 62/64 63/64 64/64 (64841, 99, 161, 1) (64841, 31) <class 'numpy.ndarray'> <class 'numpy.ndarray'>
CPU times: user 1min 49s, sys: 7.31 s, total: 1min 56s
Wall time: 17min 19s


In [13]:
label_index = np.array(list(['bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go',
       'happy', 'house', 'left', 'marvin', 'nine', 'no', 'off', 'on',
       'one', 'right', 'seven', 'sheila', 'silence', 'six', 'stop',
       'three', 'tree', 'two', 'up', 'wow', 'yes', 'zero'])).astype('object')

In [16]:
%%time
np.save('X_31label_16k.npy', X)
np.save('Y_31label_16k.npy', Y)

CPU times: user 0 ns, sys: 877 ms, total: 877 ms
Wall time: 1.53 s


### mid start from here

In [17]:
%%time
X = np.load('X_31label_16k.npy')
Y = np.load('Y_31label_16k.npy')

CPU times: user 0 ns, sys: 1.32 s, total: 1.32 s
Wall time: 1.31 s


In [27]:
input_shape = (99, 161, 1)
nclass = 31
inp = Input(shape=input_shape)
norm_inp = BatchNormalization()(inp)
img_1 = Convolution2D(12, kernel_size=2, activation=activations.relu)(norm_inp)
img_1 = Convolution2D(12, kernel_size=2, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(3, 3))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Convolution2D(24, kernel_size=3, activation=activations.relu)(img_1)
img_1 = Convolution2D(24, kernel_size=3, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(3, 3))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Convolution2D(32, kernel_size=3, activation=activations.relu)(img_1)
img_1 = MaxPooling2D(pool_size=(3, 3))(img_1)
img_1 = Dropout(rate=0.2)(img_1)
img_1 = Flatten()(img_1)

dense_1 = BatchNormalization()(Dense(168, activation=activations.relu)(img_1))
dense_1 = BatchNormalization()(Dense(168, activation=activations.relu)(dense_1))
dense_1 = Dense(nclass, activation=activations.softmax)(dense_1)

model = models.Model(inputs=inp, outputs=dense_1)
opt = optimizers.Adam()

model.compile(optimizer=opt, loss=losses.binary_crossentropy, metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 99, 161, 1)        0         
_________________________________________________________________
batch_normalization_28 (Batc (None, 99, 161, 1)        4         
_________________________________________________________________
conv2d_46 (Conv2D)           (None, 98, 160, 12)       60        
_________________________________________________________________
conv2d_47 (Conv2D)           (None, 97, 159, 12)       588       
_________________________________________________________________
max_pooling2d_28 (MaxPooling (None, 32, 53, 12)        0         
_________________________________________________________________
dropout_28 (Dropout)         (None, 32, 53, 12)        0         
_________________________________________________________________
conv2d_48 (Conv2D)           (None, 30, 51, 24)        2616      
__________

https://machinelearningmastery.com/check-point-deep-learning-models-keras/

In [28]:
early_stopping = EarlyStopping(monitor='val_loss', patience=10)
# checkpoint
filepath="models/F9W-31L-{epoch:02d}-{val_acc:.5f}_.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint, early_stopping]

In [29]:
S = lambda *x: [print(i.shape, type(i)) for i in x]
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.1, random_state=2018)
S(x_train, y_train, x_valid, y_valid)

(58356, 99, 161, 1) <class 'numpy.ndarray'>
(58356, 31) <class 'numpy.ndarray'>
(6485, 99, 161, 1) <class 'numpy.ndarray'>
(6485, 31) <class 'numpy.ndarray'>


[None, None, None, None]

In [30]:
%%time
# Fit the model
model.fit(x_train, y_train, batch_size=64, validation_data=(x_valid, y_valid), 
          epochs=100, shuffle=True, verbose=1, callbacks=callbacks_list)

Train on 58356 samples, validate on 6485 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100


Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100


CPU times: user 53min 35s, sys: 10min 50s, total: 1h 4min 25s
Wall time: 1h 14min 25s


<keras.callbacks.History at 0x2abc775d5b00>

In [33]:
pd.DataFrame(y_valid).sum()

0     160
1     193
2     174
3     174
4     243
5     261
6     238
7     225
8     253
9     172
10    178
11    240
12    191
13    241
14    233
15    246
16    207
17    233
18    232
19    224
20    156
21     16
22    205
23    258
24    257
25    189
26    243
27    215
28    171
29    228
30    229
dtype: int64

In [29]:
# model.save(os.path.join(model_path, 'cnn.model'))

In [39]:
!ls models -t|grep 9W-31

F9W-31L-58-0.99562_.hdf5
F9W-31L-56-0.99559_.hdf5
F9W-31L-51-0.99557_.hdf5
F9W-31L-47-0.99557_.hdf5
F9W-31L-46-0.99543_.hdf5
F9W-31L-41-0.99542_.hdf5
F9W-31L-40-0.99538_.hdf5
F9W-31L-34-0.99535_.hdf5
F9W-31L-28-0.99515_.hdf5
F9W-31L-25-0.99495_.hdf5
F9W-31L-22-0.99493_.hdf5
F9W-31L-20-0.99488_.hdf5
F9W-31L-18-0.99461_.hdf5
F9W-31L-17-0.99449_.hdf5
F9W-31L-15-0.99442_.hdf5
F9W-31L-14-0.99432_.hdf5
F9W-31L-13-0.99411_.hdf5
F9W-31L-12-0.99409_.hdf5
F9W-31L-09-0.99381_.hdf5
F9W-31L-08-0.99336_.hdf5
F9W-31L-07-0.99313_.hdf5
F9W-31L-06-0.99301_.hdf5
F9W-31L-05-0.99265_.hdf5
F9W-31L-04-0.99230_.hdf5
F9W-31L-03-0.99156_.hdf5
F9W-31L-02-0.98928_.hdf5
F9W-31L-01-0.98678_.hdf5
F9W-31L-00-0.97945_.hdf5
F9W-31-0.98714_.hdf5


### predict

In [40]:
Mname = 'models/F9W-31L-58-0.99562_.hdf5'

# returns a compiled model
# identical to the previous one
model = load_model(Mname)


In [41]:
def test_data_generator(batch=16):
    fpaths = glob(os.path.join(test_data_path, '*wav'))
    i = 0
    for path in fpaths:
        if i == 0:
            imgs = []
            fnames = []
        i += 1
        rate, samples = wavfile.read(path)
        samples = pad_audio(samples)
        resampled = signal.resample(samples, int(new_sample_rate / rate * samples.shape[0]))
        _, _, specgram = log_specgram(resampled, sample_rate=new_sample_rate)
        imgs.append(specgram)
        fnames.append(path.split('\\')[-1])
        if i == batch:
            i = 0
            imgs = np.array(imgs)
            imgs = imgs.reshape(tuple(list(imgs.shape) + [1]))
            yield fnames, imgs
    if i < batch:
        imgs = np.array(imgs)
        imgs = imgs.reshape(tuple(list(imgs.shape) + [1]))
        yield fnames, imgs
    raise StopIteration()

In [42]:
%%time
batch = 64
start = time()


#exit() #delete this
#del x_train, y_train
gc.collect()

index = []
results = []
N=0
for fnames, imgs in test_data_generator(batch=batch):
    N+=1
    if N%10==0:
        print ('used: {:.2f} s'.format(time()-start), end=' :{} '.format(N*batch))
        start = time()
    predicts = model.predict(imgs)
    predicts = np.argmax(predicts, axis=1)
    predicts = [label_index[p] for p in predicts]
    index.extend(fnames)
    results.extend(predicts)

df = pd.DataFrame(columns=['fname', 'label'])
df['fname'] = index
df['label'] = results

used: 14.74 s :640 used: 12.26 s :1280 used: 13.66 s :1920 used: 11.52 s :2560 used: 11.85 s :3200 used: 12.34 s :3840 used: 12.26 s :4480 used: 12.07 s :5120 used: 12.32 s :5760 used: 13.20 s :6400 used: 13.33 s :7040 used: 13.78 s :7680 used: 12.97 s :8320 used: 12.85 s :8960 used: 13.40 s :9600 used: 12.96 s :10240 used: 13.51 s :10880 used: 13.03 s :11520 used: 12.84 s :12160 used: 12.30 s :12800 used: 12.81 s :13440 used: 12.93 s :14080 used: 12.59 s :14720 used: 12.91 s :15360 used: 13.33 s :16000 used: 12.91 s :16640 used: 14.03 s :17280 used: 12.84 s :17920 used: 13.74 s :18560 used: 12.99 s :19200 used: 13.05 s :19840 used: 13.08 s :20480 used: 12.69 s :21120 used: 13.00 s :21760 used: 13.77 s :22400 used: 12.74 s :23040 used: 12.87 s :23680 used: 14.10 s :24320 used: 13.48 s :24960 used: 13.50 s :25600 used: 12.80 s :26240 used: 12.98 s :26880 used: 13.16 s :27520 used: 12.85 s :28160 used: 13.38 s :28800 used: 11.92 s :29440 used: 10.41 s :30080 used: 10.32 s :30720 used: 11

  if sys.path[0] == '':


In [45]:
df['label'].value_counts()

on         18143
four       10379
up          9081
off         6945
no          6919
zero        6885
one         6824
five        6727
yes         6688
eight       6275
nine        6241
left        6142
two         6088
six         5969
stop        5645
seven       5414
right       4740
three       4691
go          4573
down        4522
bed         2717
silence     2714
sheila      2233
marvin      2063
wow         1984
dog         1659
cat         1531
bird        1477
happy       1191
house       1040
tree        1038
Name: label, dtype: int64

In [46]:
legal_label = np.array(['down', 'go', 'left', 'no', 'off', 'on', 'right',
                        'silence', 'stop', 'up', 'yes']).astype('object')

In [48]:
df['label'].apply(lambda x: 'unknow' if x not in legal_label else x).value_counts()

unknow     82426
on         18143
up          9081
off         6945
no          6919
yes         6688
left        6142
stop        5645
right       4740
go          4573
down        4522
silence     2714
Name: label, dtype: int64

In [49]:
df['label'] = df['label'].apply(lambda x: 'unknow' if x not in legal_label else x)

In [50]:
df.shape

(158538, 2)

In [52]:
df['fname'] = df['fname'].apply(lambda x:x.split('audio/')[-1])
df.to_csv(os.path.join(out_path, Mname.split('/')[-1]+'_submission.csv'), index=False)