In [1]:
import os
class_names = ['cothe', 'khong', 'nguoi', 'toi', 'va']
states = [12, 9, 9, 9, 6]

length = 0
for d in class_names:
    length += len(os.listdir("content/" + d))
print(length)

366


In [2]:
import librosa
import numpy as np
import os
import math
from sklearn.cluster import KMeans
import hmmlearn.hmm

def get_mfcc(file_path):
    y, sr = librosa.load(file_path) # read .wav file
    hop_length = math.floor(sr*0.010) # 10ms hop
    win_length = math.floor(sr*0.025) # 25ms frame
    # mfcc is 12 x T matrix
    mfcc = librosa.feature.mfcc(
        y, sr, n_mfcc=12, n_fft=1024,
        hop_length=hop_length, win_length=win_length)
    # substract mean from mfcc --> normalize mfcc
    mfcc = mfcc - np.mean(mfcc, axis=1).reshape((-1,1)) 
    # delta feature 1st order and 2nd order
    delta1 = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # X is 36 x T
    X = np.concatenate([mfcc, delta1, delta2], axis=0) # O^r
    # return T x 36 (transpose of X)
    return X.T # hmmlearn use T x N matrix


In [3]:
all_data = {}
all_labels = {}
for cname in class_names:
    file_paths = [os.path.join("content", cname, i) for i in os.listdir(os.path.join('content', cname)) if i.endswith('.wav')]
    data = [get_mfcc(file_path) for file_path in file_paths]
    all_data[cname] = data
    all_labels[cname] = [class_names.index(cname) for i in range(len(file_paths))]

In [4]:
from sklearn.model_selection import train_test_split

X = {'train': {}, 'test': {}}
y = {'train': {}, 'test': {}}
for cname in class_names:
    x_train, x_test, _, y_test = train_test_split(
        all_data[cname], all_labels[cname], 
        test_size = 0.33, 
        random_state=42
    )
    X['train'][cname] = x_train
    X['test'][cname] = x_test
    y['test'][cname] = y_test

In [5]:
for cname in class_names:
    print(cname,len(X['train'][cname]), len(X['test'][cname]), len(y['test'][cname]))

cothe 62 31 31
khong 40 20 20
nguoi 50 26 26
toi 43 22 22
va 48 24 24


In [6]:
import hmmlearn.hmm as hmm

model = {}
for idx, cname in enumerate(class_names):
    start_prob = np.full(states[idx], 0.0)
    start_prob[0] = 1.0
    trans_matrix = np.full((states[idx], states[idx]), 0.0)
    p = 0.5
    np.fill_diagonal(trans_matrix, p)
    np.fill_diagonal(trans_matrix[0:, 1:], 1 - p)
    trans_matrix[-1, -1] = 1.0
    
    #trans matrix
    print(trans_matrix) 

    model[cname] = hmm.GaussianHMM(
        n_components=states[idx], 
        verbose=True, 
        n_iter=300, 
        startprob_prior=start_prob, 
        transmat_prior=trans_matrix,
        params='stmc',
        init_params='mc',
        random_state=42
    )

    model[cname].fit(X=np.vstack(X['train'][cname]), lengths=[x.shape[0] for x in X['train'][cname]])

[[0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -220034.4359             +nan
         2     -209808.7649      +10225.6711
         3     -208366.3818       +1442.3831
         4     -207824.7754        +541.6064
         5     -207479.6624        +345.1131
         6     -207262.7332        +216.9292
         7     -207116.5672        +146.1660
         8     -207054.1379         +62.4293
         9     -207018.9461         +35.1919
        10     -206998.3370         +20.6091
        11     -206973.3910         +24.9460
        12     -206906.5560         +66.8350
        13     -206849.4925         +57.0634
        14     -206799.5544         +49.9381
        15     -206771.3315         +28.2229
        16     -206755.6077         +15.7239
        17     -206747.5960          +8.0117
        18     -206740.8678          +6.7282
        19     -206733.9771          +6.8907
        20     -206728.5621          +5.4150
        21     -206726.5309          +2.0312
        22     -206725.5712          +0.9597
        23

[[0.5 0.5 0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  0.  1. ]]



         2     -187535.8366      +35510.9235
         3     -124510.3179      +63025.5187
         4      -52829.5913      +71680.7266
         5      -52596.3078        +233.2835
         6      -52327.3328        +268.9750
         7      -52214.4720        +112.8609
         8      -52167.1035         +47.3684
         9      -52160.3669          +6.7366
        10      -52156.3322          +4.0347
        11      -52146.7284          +9.6038
        12      -52144.7482          +1.9802
        13      -52144.5379          +0.2103
        14      -52144.1018          +0.4361
        15      -52142.6535          +1.4483
        16      -52141.7044          +0.9492
        17      -52141.6477          +0.0567
        18      -52141.6410          +0.0067


[[0.5 0.5 0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -129164.7315             +nan
         2     -122314.6084       +6850.1231
         3     -121913.6436        +400.9647
         4     -121789.6269        +124.0167
         5     -121744.5823         +45.0446
         6     -121699.9330         +44.6493
         7     -121667.1387         +32.7943
         8     -121634.7824         +32.3563
         9     -121616.5382         +18.2442
        10     -121610.2878          +6.2504
        11     -121604.9867          +5.3011
        12     -121599.4591          +5.5276
        13     -121596.1221          +3.3370
        14     -121595.5406          +0.5815
        15     -121594.9543          +0.5864
        16     -121594.1569          +0.7974
        17     -121593.6651          +0.4918
        18     -121593.5279          +0.1372
        19     -121593.4587          +0.0692
        20     -121593.4054          +0.0533
        21     -121593.3608          +0.0446
        22     -121593.3222          +0.0386
        23

[[0.5 0.5 0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -112166.6267             +nan
         2     -105725.9501       +6440.6766
         3     -104774.2684        +951.6817
         4     -104632.7834        +141.4850
         5     -104658.4511         -25.6677


[[0.5 0.5 0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  1. ]]


         1     -115615.6869             +nan
         2     -111084.7404       +4530.9466
         3     -109773.5095       +1311.2308
         4     -109278.6511        +494.8584
         5     -109067.2849        +211.3663
         6     -108974.1498         +93.1351
         7     -108911.4847         +62.6651
         8     -108861.0541         +50.4306
         9     -108780.7458         +80.3083
        10     -108745.8405         +34.9053
        11     -108741.0728          +4.7677
        12     -108736.9570          +4.1158
        13     -108687.5459         +49.4111
        14     -108683.6157          +3.9302
        15     -108676.8121          +6.8036
        16     -108674.4786          +2.3335
        17     -108674.0528          +0.4258
        18     -108673.9529          +0.0999
        19     -108673.8999          +0.0530
        20     -108673.8644          +0.0355
        21     -108673.8387          +0.0257
        22     -108673.8199          +0.0188
        23

In [7]:
import pickle

# save model
for cname in class_names:
    name = f'model_{cname}.pkl'
    with open(name, 'wb') as file: 
        pickle.dump(model[cname], file)

In [8]:
import pickle, os
import numpy as np

from sklearn.metrics import classification_report

In [9]:
y_true = []
y_pred = []
for cname in class_names:
    for mfcc, target in zip(X['test'][cname], y['test'][cname]):
        scores = [model[cname].score(mfcc) for cname in class_names]
        pred = np.argmax(scores)
        y_pred.append(pred)
        y_true.append(target)
print(y_true)
print(y_pred)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


In [10]:
report = classification_report(y_true, y_pred, target_names=class_names)
print(report)

              precision    recall  f1-score   support

       cothe       0.97      0.97      0.97        31
       khong       0.95      0.95      0.95        20
       nguoi       0.96      1.00      0.98        26
         toi       1.00      1.00      1.00        22
          va       1.00      0.96      0.98        24

    accuracy                           0.98       123
   macro avg       0.98      0.98      0.98       123
weighted avg       0.98      0.98      0.98       123



In [11]:
model = {}
for key in class_names:
    name = f"model_{key}.pkl"
    with open(name, 'rb') as file:
        model[key] = pickle.load(file)

In [12]:
print(model)

{'cothe': GaussianHMM(init_params='mc', n_components=12, n_iter=300, random_state=42,
            startprob_prior=array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
            transmat_prior=array([[0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0.5, 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0.5, 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0.5, 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0.5, 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0.5],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 

In [13]:
from tkinter import messagebox
import winsound

from pydub import AudioSegment

import ffmpeg

#Thay đổi threshold dựa vào tạp âm, càng ồn thì threshold càng lớn
def detect_leading_silence(sound, silence_threshold=-42.0, chunk_size=10):
    '''
    sound is a pydub.AudioSegment
    silence_threshold in dB
    chunk_size in ms

    iterate over chunks until you find the first one with sound
    '''
    trim_ms = 0 # ms

    assert chunk_size > 0 # to avoid infinite loop
    while sound[trim_ms:trim_ms+chunk_size].dBFS < silence_threshold and trim_ms < len(sound):
        trim_ms += chunk_size

    return trim_ms

def record():
    import pyaudio
    import wave
    from base64 import b64decode

    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 22050
    RECORD_SECONDS = 2
    WAVE_OUTPUT_FILENAME = "record.wav"

    p = pyaudio.PyAudio()

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    frames = []

    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)

    stream.stop_stream()
    stream.close()
    p.terminate()

    wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()

def play():    
    filename = 'record.wav'
    winsound.PlaySound(filename, winsound.SND_FILENAME)
    
def playtrimmed():    
    filename = 'trimmed.wav'
    winsound.PlaySound(filename, winsound.SND_FILENAME)

def predict():
    #Trim silence
    sound = AudioSegment.from_file("record.wav", format="wav")

    start_trim = detect_leading_silence(sound)
    end_trim = detect_leading_silence(sound.reverse())

    duration = len(sound)    
    
    trimmed_sound = sound[start_trim:duration-end_trim]    
    trimmed_sound.export("trimmed.wav", format="wav")
    
    #Predict
    record_mfcc = get_mfcc("trimmed.wav")
    scores = [model[cname].score(record_mfcc) for cname in class_names]
    pred = np.argmax(scores)
    messagebox.showinfo("result", class_names[pred])




In [16]:
import tkinter as tk

window = tk.Tk()
window.geometry("300x200")
window.title("Speech recognition")

frame0 = tk.Frame(master=window)
frame0.pack()

frame1 = tk.Frame(master=window)
frame1.pack()

frame2 = tk.Frame(master=window)
frame2.pack()

label = tk.Label(master=frame0, text="Speech recognition")
label.pack(padx=5, pady=10)

btn_record = tk.Button(master=frame1, width=13, height=2, text="record", command=record)
btn_record.pack(side=tk.LEFT, padx=5, pady=5)

btn_playback = tk.Button(master=frame1, width=13, height=2, text="playback", command=play)
btn_playback.pack(side=tk.LEFT, padx=5, pady=5)

btn_predict = tk.Button(master=frame2, width=13, height=2, text="trim & predict", command=predict)
btn_predict.pack(side=tk.LEFT, padx=5, pady=5)

btn_playback = tk.Button(master=frame2, width=13, height=2, text="playbacktrimmed", command=playtrimmed)
btn_playback.pack(side=tk.LEFT, padx=5, pady=5)



window.mainloop()