In [29]:
import glob
import os
from collections import defaultdict

import librosa
import numpy as np
from hmmlearn import hmm
from python_speech_features import mfcc
from scipy.io import wavfile

In [30]:
# Maps each label (the sub-directory name under audio/) to the list of
# waveforms loaded for it. defaultdict(list) lets the loading loop append
# without creating the key first.
db = defaultdict(list)

In [31]:
# Load every recording found under audio/<label>/ into db[label] as a float
# array, and remember the sample rate in `sr` for the feature-extraction cell.
#
# - db is cleared first so re-running this cell does not append duplicates.
# - The label comes from os.path.basename, which works with both '\\' and '/'
#   separators (splitting on '\\' only yields the folder name on Windows).
# - glob results are sorted so the file order, and therefore the later
#   train/test split, is the same on every run and platform.
db.clear()
sr = None
for path in sorted(glob.glob('audio/*')):
    label = os.path.basename(path)
    for wav_path in sorted(glob.glob(os.path.join(path, '*'))):
        file_sr, signal = wavfile.read(wav_path)
        # A single `sr` is reused for all recordings downstream, so a file with
        # a different rate would silently get wrongly sized analysis frames.
        if sr is not None and file_sr != sr:
            raise ValueError(
                "sample rate mismatch: %s is %d Hz, expected %d Hz"
                % (wav_path, file_sr, sr)
            )
        sr = file_sr
        db[label].append(signal.astype('float'))

In [32]:
# Sample rate returned by the most recent wavfile.read call in the loading loop.
sr

8000

In [33]:
# Entries directly under audio/ (the same pattern the loading loop iterates over).
glob.glob('audio/*')

['audio\\apple',
 'audio\\banana',
 'audio\\kiwi',
 'audio\\lime',
 'audio\\orange',
 'audio\\peach',
 'audio\\pineapple']

In [34]:
# Feature extraction: convert each loaded waveform into an MFCC matrix.
# The intended pipeline is:
#   1. pre-emphasis (coefficient 0.97)
#   2. framing: 25 ms frames advanced in 10 ms steps
#   3. Hamming window applied to every frame
#   4. FFT
#   5. Mel-frequency cepstrum
# Only the window function is passed explicitly; the other settings are
# whatever mfcc() uses by default.
features = defaultdict(list)
for label, waveforms in db.items():
    features[label] = [mfcc(waveform, sr, winfunc=np.hamming) for waveform in waveforms]

In [37]:
# Hold out the last TEST_FRACTION of each label's utterances for testing.
#   train[label]: one 2-D array with the frames of all training utterances
#                 stacked along axis 0.
#   test[label]:  list of per-utterance feature arrays, kept separate so each
#                 utterance can be classified on its own.
TEST_FRACTION = 0.2
train = dict()
test = dict()
for label, utterances in features.items():
    # Size the hold-out from this label's own utterance count instead of
    # assuming every label has as many recordings as 'apple'.
    test_size = int(TEST_FRACTION * len(utterances))
    # Slice with an explicit index: with test_size == 0 the negative-index form
    # (`[:-0]` / `[-0:]`) would put nothing in train and everything in test.
    split_at = len(utterances) - test_size
    train_part = utterances[:split_at]
    test[label] = list(utterances[split_at:])
    # A single concatenate avoids re-copying the growing array on every append.
    train[label] = np.concatenate(train_part, axis=0) if train_part else np.array([])

In [38]:
# Shape of the 'apple' training matrix built by stacking its training utterances along axis 0.
train['apple'].shape

(474, 13)

In [39]:
# Train one Gaussian-emission HMM per label on that label's training frames.
N_STATES = 5   # hidden states per word model
N_ITER = 100   # upper bound on EM iterations
SEED = 0       # fixed seed so model initialisation is repeatable across runs

# NOTE(review): train[label] is every training utterance stacked into one
# array and fit() is called without `lengths`, so the stack is treated as a
# single sequence. Passing per-utterance lengths would need the split cell to
# keep them; confirm whether that is wanted.
models = dict()
for label in features.keys():
    models[label] = hmm.GaussianHMM(
        n_components=N_STATES,
        n_iter=N_ITER,
        random_state=SEED,
    ).fit(train[label])

In [40]:
# Show the fitted model stored for each label.
models

{'apple': GaussianHMM(n_components=5, n_iter=100),
 'banana': GaussianHMM(n_components=5, n_iter=100),
 'kiwi': GaussianHMM(n_components=5, n_iter=100),
 'lime': GaussianHMM(n_components=5, n_iter=100),
 'orange': GaussianHMM(n_components=5, n_iter=100),
 'peach': GaussianHMM(n_components=5, n_iter=100),
 'pineapple': GaussianHMM(n_components=5, n_iter=100)}

In [41]:
def predict_speech(sample, model_bank=None):
    """Return the label whose model gives `sample` the highest decode score.

    Parameters
    ----------
    sample : array-like
        Feature matrix of one utterance, passed unchanged to each model's
        ``decode`` method.
    model_bank : mapping of label -> model, optional
        Models to compare. Each must expose ``decode(sample)`` returning a
        ``(log_prob, state_sequence)`` pair. Defaults to the notebook-level
        ``models`` dict when omitted.

    Returns
    -------
    The key of the best-scoring model. Ties go to the first label in the
    mapping's iteration order.

    Raises
    ------
    ValueError
        If there are no models to compare against.
    """
    bank = models if model_bank is None else model_bank
    if not bank:
        raise ValueError("no trained models to score against")
    labels = list(bank.keys())
    # decode() also returns the state path; only the log-probability of that
    # path is used to rank the models.
    scores = [bank[label].decode(sample)[0] for label in labels]
    return labels[int(np.argmax(scores))]

In [42]:
# Classify every held-out utterance and report the fraction whose predicted
# label matches the label it was stored under in `test`.
len_data = 0
right_predicted = 0
for label, data in test.items():
    for datum in data:
        pred = predict_speech(datum)
        len_data += 1
        right_predicted += int(pred == label)
# Guard the division: with no held-out utterances there is nothing to average.
if len_data:
    print("Accuracy: %.2f" % (right_predicted / len_data))
else:
    print("Accuracy: n/a (no test samples)")

Accuracy: 1.00
