In [1]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import librosa as ls
import IPython.display as ipd
import seaborn as sns
import matplotlib.pyplot as plt
from functools import partial
import warnings
from functools import partial

# Bring your packages onto the path
import sys, os
sys.path.append('../')
from file_operations import to_pickle, read_pickle
from preprocessing import get_MFCC, split_signal, sound_pipeline, normalize_array
from preprocessing import overlay_noise, add_padding_to_sound, encode_labels, invert_encode

# Ignoring warnings
# NOTE: with no category or module filter this silences every warning for the
# whole session, including deprecation and numerical warnings.
warnings.filterwarnings('ignore')

# Ensuring autoreload of changes in submodules
%load_ext autoreload
%autoreload 2

### Loading preprocessed MFCC data

In [2]:
# Load the pickled dataset; the cells below read its 'X', 'labels' and 'sounds' entries.
# NOTE(review): the file name says "sample_rate2205" while the audio is played back
# and featurised at 22050 Hz elsewhere in this notebook -- confirm the name is intended.
data = read_pickle('sample_rate2205_num_mfcc13_n_fft2048_hop_length512.pickle')

### One-hot encoding command labels

In [3]:
# The four spoken commands the classifier distinguishes.
commands = ['go', 'left', 'right', 'stop']
# LabelEncoder.fit returns the fitted encoder itself, so construction and
# fitting can be chained.
label_encoder = LabelEncoder().fit(commands)

# Stack the stored features and insert a singleton axis at position 3
# (the recorded X.shape below ends in 1).
X = np.expand_dims(np.array(data['X']), axis=3)
# Encode the string labels with the project helper and the fitted encoder.
y = encode_labels(data['labels'], label_encoder)

# Display the class order learned by the encoder.
label_encoder.classes_

array(['go', 'left', 'right', 'stop'], dtype='<U5')

In [4]:
X.shape

(104192, 13, 44, 1)

In [5]:
# Draw a random example index, print its encoded label next to the decoded
# command, and play the matching raw clip.
idd = np.random.randint(len(data['sounds']))
print(y[idd], invert_encode(y[idd], label_encoder))
ipd.Audio(
    data['sounds'][idd],
    rate=22050,
)


[0. 0. 1. 0.] ['right']


### Splitting the data into training and testing sets

In [6]:
from sklearn.model_selection import train_test_split

# Shuffle and hold out 33% of the examples; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=5,
    shuffle=True,
)

### Configuring CNN model training and fitting it to the data

In [10]:
from cnn_clf_model import get_model, train
import tensorflow as tf

# Training hyper-parameters.
epochs = 9
batch_size = 64
patience = 0

# Early stopping now watches the validation loss, the same signal the
# checkpoint below uses to pick the best weights. It previously watched the
# *training* "accuracy", which says nothing about generalisation and only
# exists in the logs if the model was compiled with that metric.
earlystop_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    mode="min",
    min_delta=0.001,
    patience=patience,
)
# Keep only the weights of the epoch with the lowest validation loss.
mcp_save = tf.keras.callbacks.ModelCheckpoint(
    'mdl_wts.hdf5',
    save_best_only=True,
    monitor='val_loss',
    mode='min',
)

# Build the network from the per-example input shape (batch axis dropped).
model = get_model(X_train.shape[1:], num_outputs=4, learning_rate=0.0001)

# train model
# NOTE(review): X_test/y_test double as the validation set here, so they drive
# early stopping and checkpoint selection and are no longer an untouched test
# set -- consider carving a separate validation split out of the training data.
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    callbacks=[earlystop_callback, mcp_save],
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 11, 42, 64)        640       
_________________________________________________________________
batch_normalization (BatchNo (None, 11, 42, 64)        256       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 6, 21, 64)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 4, 19, 32)         18464     
_________________________________________________________________
batch_normalization_1 (Batch (None, 4, 19, 32)         128       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 2, 10, 32)         0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 1, 9, 32)          4

In [94]:
# Pre-bind the MFCC extraction settings so a clip can be converted with a
# single positional argument.
partial_mffc = partial(
    get_MFCC,
    sample_rate=22050,
    num_mfcc=13,
    n_fft=2048,
    hop_length=512,
    scaled=False,
)

In [129]:
# sound = sound[:sr]
# Keep only the samples from index sr*83 onward.
# NOTE(review): this overwrites `sound` in place, so re-running the cell trims
# again; `sound` and `sr` are only assigned by ls.load cells that appear further
# down in the notebook, so this cell fails on a fresh kernel run top to bottom.
sound = sound[sr*83:]


In [148]:
# Scratch check: end index of the window of length sr that starts at sample
# 85000 (the same start offset used for the `right` clip slice in this notebook).
85000+sr

107050

In [151]:
# Play the window of sr samples starting at sample 127050 -- the same start
# offset used for the `go` clip slice in this notebook.
s = 127050
ipd.Audio(sound[s:s+sr],rate=sr)

In [16]:
# Load the test recording; `sr` is the sample rate returned by the loader.
sound, sr = ls.load('test.wav')

# Cut four windows of sr samples each, one per command, at hand-picked start
# offsets (in samples).
stop, left, right, go = (
    sound[begin:begin + sr]
    for begin in (6000, 2 * sr + 6000, 85000, 127050)
)

In [11]:
from SoundCommandClf import SoundCommandClf

In [24]:
# Instantiate the project classifier; the recorded output reports that
# "model.hdf5" was loaded.
# NOTE(review): the training cell in this notebook checkpoints to 'mdl_wts.hdf5',
# a different file name -- confirm which weights this classifier actually uses.
clf = SoundCommandClf()


Model model.hdf5 has been loaded successfully!


In [27]:
%%timeit
# Time a single classification of the `stop` clip.
clf.classify_rec(stop)

18.5 ms ± 179 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


NotFittedError: This LabelEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [112]:
# Load the 'right.wav' recording, keep its first sr samples, and play them.
# NOTE(review): this reassigns the `sound`/`sr` globals that other cells in this
# notebook also read, so results depend on which load cell ran last.
sound, sr = ls.load('right.wav')
sound = sound[:sr]
ipd.Audio(sound,rate=sr)

22050

In [154]:
# Classify the `go` clip with the trained network.
# Extract MFCC features with the pre-bound settings, then normalise them.
mfcc = partial_mffc(go)
mfcc = normalize_array(mfcc)

# Add a leading batch axis and a trailing singleton axis, matching the 4-D
# layout of the training tensor X.
# Decode with the *fitted* label_encoder, as the sample-inspection cell does;
# omitting it leaves the decoding to whatever default invert_encode falls back
# on (a NotFittedError from an unfitted LabelEncoder is recorded in this notebook).
invert_encode(model.predict(mfcc[np.newaxis, :, :, np.newaxis]), label_encoder)

array(['go'], dtype='<U5')

array([[1.9546379e-21, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00]],
      dtype=float32)

array(['go'], dtype='<U5')