In [1]:
import os
class_names = ['dup', 'giu', 'nha', 'phai', 'trai']
states = [15, 12, 12, 15, 15]

length = 0
for d in class_names:
    length += len(os.listdir("data/" + d))
print(length)
!pip install librosa
!pip install hmmlearn

193


In [2]:
import librosa
import numpy as np
import os
import math
from sklearn.cluster import KMeans
import hmmlearn.hmm

def get_mfcc(file_path):
    y, sr = librosa.load(file_path) # read .wav file
    hop_length = math.floor(sr*0.010) # 10ms hop
    win_length = math.floor(sr*0.025) # 25ms frame
    # mfcc is 12 x T matrix
    mfcc = librosa.feature.mfcc(
        y, sr, n_mfcc=12, n_fft=1024,
        hop_length=hop_length, win_length=win_length)
    # substract mean from mfcc --> normalize mfcc
    mfcc = mfcc - np.mean(mfcc, axis=1).reshape((-1,1)) 
    # delta feature 1st order and 2nd order
    delta1 = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # X is 36 x T
    X = np.concatenate([mfcc, delta1, delta2], axis=0) # O^r
    # return T x 36 (transpose of X)
    return X.T # hmmlearn use T x N matrix


In [3]:
all_data = {}
all_labels = {}
for cname in class_names:
    file_paths = [os.path.join("data", cname, i) for i in os.listdir(os.path.join('data', cname)) if i.endswith('.wav')]
    data = [get_mfcc(file_path) for file_path in file_paths]
    all_data[cname] = data
    all_labels[cname] = [class_names.index(cname) for i in range(len(file_paths))]

In [4]:
from sklearn.model_selection import train_test_split

X = {'train': {}, 'test': {}}
y = {'train': {}, 'test': {}}
for cname in class_names:
    x_train, x_test, _, y_test = train_test_split(
        all_data[cname], all_labels[cname], 
        test_size = 0.2, 
        random_state=42
    )
    X['train'][cname] = x_train
    X['test'][cname] = x_test
    y['test'][cname] = y_test

In [5]:
for cname in class_names:
    print(cname,len(X['train'][cname]), len(X['test'][cname]), len(y['test'][cname]))

dup 24 7 7
giu 35 9 9
nha 29 8 8
phai 31 8 8
trai 25 7 7


In [6]:
import hmmlearn.hmm as hmm

model = {}
for idx, cname in enumerate(class_names):
    start_prob = np.full(states[idx], 0.0)
    start_prob[0] = 1.0
    trans_matrix = np.full((states[idx], states[idx]), 0.0)
    p = 0.5
    np.fill_diagonal(trans_matrix, p)
    np.fill_diagonal(trans_matrix[0:, 1:], 1 - p)
    trans_matrix[-1, -1] = 1.0
    
    #trans matrix
    print(cname)
    print(trans_matrix) 

    model[cname] = hmm.GaussianHMM(
        n_components=states[idx], 
        verbose=True, 
        n_iter=300, 
        startprob_prior=start_prob, 
        transmat_prior=trans_matrix,
        params='stmc',
        init_params='mc',
        random_state=42
    )

    model[cname].fit(X=np.vstack(X['train'][cname]), lengths=[x.shape[0] for x in X['train'][cname]])

dup
[[0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -310311.6093             +nan
         2     -262620.3472      +47691.2621
         3     -231483.5173      +31136.8299
         4     -220804.3797      +10679.1375
         5     -213286.7149       +7517.6648
         6     -203550.1766       +9736.5383
         7     -191719.4265      +11830.7501
         8     -173418.0094      +18301.4171
         9     -159651.4324      +13766.5769
        10     -159068.0342        +583.3982
        11     -159002.8854         +65.1488
        12     -158984.0749         +18.8106
        13     -158951.3331         +32.7418
        14     -158915.9227         +35.4104
        15     -158903.4808         +12.4419
        16     -158896.6651          +6.8157
        17     -158892.2660          +4.3991
        18     -158885.1355          +7.1305
        19     -158879.8794          +5.2561
        20     -158875.3590          +4.5204
        21     -158872.6597          +2.6992
        22     -158869.4488          +3.2110
        23

giu
[[0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -494285.6849             +nan
         2     -435726.5564      +58559.1285
         3     -395456.0147      +40270.5417
         4     -358402.0799      +37053.9347
         5     -336638.5746      +21763.5053
         6     -306626.4740      +30012.1006
         7     -269418.9672      +37207.5068
         8     -257436.0053      +11982.9619
         9     -256128.2209       +1307.7845
        10     -255325.8912        +802.3297
        11     -255005.2865        +320.6047
        12     -254878.9013        +126.3851
        13     -254796.7231         +82.1783
        14     -254720.8057         +75.9174
        15     -254641.6324         +79.1733
        16     -254568.3451         +73.2873
        17     -254491.1599         +77.1852
        18     -254415.9243         +75.2356
        19     -254361.9067         +54.0175
        20     -254335.9741         +25.9326
        21     -254324.5772         +11.3969
        22     -254318.6246          +5.9526
        23

nha
[[0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -363218.9882             +nan
         2     -322852.9407      +40366.0475
         3     -301295.1138      +21557.8268
         4     -291524.9437       +9770.1701
         5     -282703.9938       +8820.9499
         6     -269235.4787      +13468.5152
         7     -234172.0181      +35063.4605
         8     -219773.6432      +14398.3749
         9     -219655.6082        +118.0351
        10     -219538.6903        +116.9178
        11     -219418.1535        +120.5368
        12     -219300.3444        +117.8092
        13     -219177.6414        +122.7029
        14     -219051.9529        +125.6885
        15     -218924.5903        +127.3626
        16     -218773.7522        +150.8381
        17     -218677.4857         +96.2665
        18     -218605.7750         +71.7107
        19     -218539.0371         +66.7379
        20     -218483.1535         +55.8836
        21     -218436.6576         +46.4959
        22     -218399.8989         +36.7587
        23

phai
[[0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -409405.0484             +nan
         2     -356201.0836      +53203.9648
         3     -319758.3169      +36442.7667
         4     -281769.5115      +37988.8054
         5     -234868.7486      +46900.7629
         6     -174882.6601      +59986.0885
         7     -174543.1156        +339.5445
         8     -174238.2990        +304.8166
         9     -173809.3846        +428.9144
        10     -173567.8103        +241.5743
        11     -173383.2873        +184.5230
        12     -173149.3083        +233.9790
        13     -173054.8108         +94.4975
        14     -173023.8694         +30.9414
        15     -173007.2134         +16.6561
        16     -172997.6890          +9.5244
        17     -172992.8640          +4.8250
        18     -172991.0745          +1.7894
        19     -172988.5023          +2.5723
        20     -172985.9892          +2.5131
        21     -172983.3365          +2.6527
        22     -172981.4115          +1.9250
        23

trai
[[0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -330065.5547             +nan
         2     -275522.1498      +54543.4049
         3     -230650.4058      +44871.7440
         4     -209080.8199      +21569.5858
         5     -176041.6835      +33039.1364
         6     -100060.0053      +75981.6783
         7      -97974.1161       +2085.8891
         8      -97787.5615        +186.5547
         9      -97573.3681        +214.1933
        10      -97392.9038        +180.4644
        11      -97159.6429        +233.2609
        12      -96921.5911        +238.0518
        13      -96787.2067        +134.3843
        14      -96713.9395         +73.2673
        15      -96697.8257         +16.1138
        16      -96681.0275         +16.7982
        17      -96663.5633         +17.4642
        18      -96652.6344         +10.9289
        19      -96624.3424         +28.2920
        20      -96602.9509         +21.3914
        21      -96586.9163         +16.0347
        22      -96581.6845          +5.2317
        23

In [7]:
import pickle

# save model
for cname in class_names:
    name = f'models_train\model_{cname}.pkl'
    with open(name, 'wb') as file: 
        pickle.dump(model[cname], file)

In [8]:
import pickle, os
import numpy as np

from sklearn.metrics import classification_report

In [9]:
y_true = []
y_pred = []
for cname in class_names:
    for mfcc, target in zip(X['test'][cname], y['test'][cname]):
        scores = [model[cname].score(mfcc) for cname in class_names]
        pred = np.argmax(scores)
        y_pred.append(pred)
        y_true.append(target)
    print((np.array(y_true) == np.array(y_pred)).sum()/len(y_true))
print(y_true)
print(y_pred)


1.0
0.9375
0.8333333333333334
0.875
0.8205128205128205
[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4]
[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 3, 2, 2, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 4, 3]


In [10]:
report = classification_report(y_true, y_pred, target_names=class_names)
print(report)

              precision    recall  f1-score   support

         dup       1.00      1.00      1.00         7
         giu       0.89      0.89      0.89         9
         nha       0.83      0.62      0.71         8
        phai       0.62      1.00      0.76         8
        trai       1.00      0.57      0.73         7

    accuracy                           0.82        39
   macro avg       0.87      0.82      0.82        39
weighted avg       0.86      0.82      0.82        39



In [11]:
#loadmodels
import pickle

model_train = {}
for key in class_names:
    name = f"models\model_{key}.pkl"
    with open(name, 'rb') as file:
        model_train[key] = pickle.load(file)

In [12]:
print(model_train)

{'dup': GaussianHMM(init_params='mc', n_components=3, n_iter=300, random_state=42,
            startprob_prior=array([1., 0., 0.]),
            transmat_prior=array([[0.5, 0.5, 0. ],
       [0. , 0.5, 0.5],
       [0. , 0. , 1. ]]),
            verbose=True), 'giu': GaussianHMM(init_params='mc', n_components=3, n_iter=300, random_state=42,
            startprob_prior=array([1., 0., 0.]),
            transmat_prior=array([[0.5, 0.5, 0. ],
       [0. , 0.5, 0.5],
       [0. , 0. , 1. ]]),
            verbose=True), 'nha': GaussianHMM(init_params='mc', n_components=6, n_iter=300, random_state=42,
            startprob_prior=array([1., 0., 0., 0., 0., 0.]),
            transmat_prior=array([[0.5, 0.5, 0. , 0. , 0. , 0. ],
       [0. , 0.5, 0.5, 0. , 0. , 0. ],
       [0. , 0. , 0.5, 0.5, 0. , 0. ],
       [0. , 0. , 0. , 0.5, 0.5, 0. ],
       [0. , 0. , 0. , 0. , 0.5, 0.5],
       [0. , 0. , 0. , 0. , 0. , 1. ]]),
            verbose=True), 'phai': GaussianHMM(init_params='mc', n_componen

In [22]:
import os
import numpy as np
import scipy.io.wavfile as wav
import matplotlib.pyplot as plt
from utils.estnoise_ms import * 
import math
import pyaudio
import wave
from array import array

FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
RECORD_SECONDS = 1
FILE_NAME="RECORDING.wav"

def _calculate_frequencies(audio_data,sr):
        data_freq = np.fft.fftfreq(len(audio_data),1.0/sr)
        data_freq = data_freq[1:]
        return data_freq

def _calculate_amplitude(audio_data):
        data_ampl = np.abs(np.fft.rfft(audio_data,axis=0))
        data_ampl = data_ampl[1:]
        return data_ampl

def _calculate_energy(data):
        data_amplitude = _calculate_amplitude(data)
        data_energy = data_amplitude ** 2
        return data_energy

def _connect_energy_with_frequencies(data_freq, data_energy):
        energy_freq = {}
        for (i, freq) in enumerate(data_freq):
            if abs(freq) not in energy_freq:
                energy_freq[abs(freq)] = data_energy[i] * 2
        return energy_freq

def _calculate_normalized_energy(data,sr):
        data_freq = _calculate_frequencies(data,sr)
        data_energy = _calculate_energy(data)
        #data_energy = self._znormalize_energy(data_energy) #znorm brings worse results
        energy_freq = _connect_energy_with_frequencies(data_freq, data_energy)
        return energy_freq

def _sum_energy_in_band(energy_frequencies, start_band, end_band):
        sum_energy = 0
        for f in energy_frequencies.keys():
            if start_band<f<end_band:
               sum_energy += energy_frequencies[f]
        return sum_energy


def count_THESHOLD():
    audio = pyaudio.PyAudio()

    stream = audio.open(format=FORMAT, channels=CHANNELS,
                rate=RATE, input=True,
                frames_per_buffer=CHUNK)
    print ("recording...")        
    frames = array('h')
    
    for i in range(0, int(RATE/CHUNK*1)):
              data = stream.read(CHUNK)
              data_chunk = array('h', data)
              frames.extend(data_chunk)
             
    stream.stop_stream()
    stream.close()
    audio.terminate()

    print("Done")

    energy_freq = _calculate_normalized_energy(frames, RATE) 
    
    
    sum_full_energy = sum(energy_freq.values()) / 42 
    
    
    return sum_full_energy

def record(THESHOLD):
    
    audio = pyaudio.PyAudio()

    stream = audio.open(format=FORMAT, channels=CHANNELS,
                rate=RATE, input=True,
                frames_per_buffer=CHUNK)
    print ("recording...")        
    frames = array('h')
    i = 0

    while True:
        data = stream.read(CHUNK)
        data_chunk = array('h', data)
        frames.extend(data_chunk)
        i += 1
        if i == 4:
            energy_freq = _calculate_normalized_energy(frames, RATE) 
            sum_full_energy = sum(energy_freq.values()) / 4
            if sum_full_energy / THESHOLD > 1.2:
                print(sum_full_energy)
                continue
            else:
                i = 0
                frames = array('h')
        if i == 86:
            break    

    stream.stop_stream()
    stream.close()
    audio.terminate()
     
    wavfile=wave.open(FILE_NAME,'wb')
    wavfile.setnchannels(CHANNELS)
    wavfile.setsampwidth(audio.get_sample_size(FORMAT)) 
    wavfile.setframerate(RATE)
    wavfile.writeframes(frames)
    wavfile.close() 
    

In [23]:



def predict():
        #Trim silence
#         sound = AudioSegment.from_file('record.wav', format="wav")

#         start_trim = detect_leading_silence(sound)
#         end_trim = detect_leading_silence(sound.reverse())

#         duration = len(sound)    

#         trimmed_sound = sound[start_trim:duration-end_trim]    
#         trimmed_sound.export("trimmed.wav", format="wav")

        #Predict
        record_mfcc = get_mfcc("RECORDING.wav")
        scores = [model[cname].score(record_mfcc) for cname in class_names]
        print(scores)
        predict_word = np.argmax(scores)
        #class_names_vie = ['dup', 'giu', 'nha', 'phai', 'trai']
        print("Kết quả dự đoán: ", class_names[predict_word])

In [32]:
record(count_THESHOLD())
predict() 

recording...
Done
recording...
1243346258488.003
[-18749.731542935162, -18025.45649067457, -17921.277545196543, -18218.95474935066, -18482.043998181372]
Kết quả dự đoán:  nha
