In [1]:
import os
# Labels of the spoken classes; each one has its own sub-folder under "datatrim/".
class_names = ['l', 'n']
# Number of hidden states used for each class's HMM, in the same order as class_names.
states = [9, 9]

# Total number of directory entries across all class folders.
length = sum(len(os.listdir("datatrim/" + d)) for d in class_names)
print(length)

In [2]:
import librosa
import numpy as np
import os
import math
from sklearn.cluster import KMeans
import hmmlearn.hmm

def get_mfcc(file_path):
    """Extract mean-normalised MFCCs plus first- and second-order deltas from an audio file.

    Parameters
    ----------
    file_path : str
        Path handed to ``librosa.load`` (the notebook uses ``.wav`` files).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(T, 36)``: one row per analysis frame, holding the 12 MFCCs
        followed by their 12 delta and 12 delta-delta coefficients.
    """
    y, sr = librosa.load(file_path)  # read .wav file
    hop_length = math.floor(sr * 0.010)  # 10 ms hop
    win_length = math.floor(sr * 0.025)  # 25 ms frame
    # mfcc is a 12 x T matrix. The signal and sample rate are passed by keyword
    # because librosa >= 0.10 no longer accepts them as positional arguments.
    mfcc = librosa.feature.mfcc(
        y=y, sr=sr, n_mfcc=12, n_fft=1024,
        hop_length=hop_length, win_length=win_length)
    # subtract each coefficient's mean over time --> normalize mfcc
    mfcc = mfcc - np.mean(mfcc, axis=1, keepdims=True)
    # delta features, 1st and 2nd order
    delta1 = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # X is 36 x T
    X = np.concatenate([mfcc, delta1, delta2], axis=0)
    # return T x 36 (transpose of X): hmmlearn expects one row per frame
    return X.T


In [45]:
# Extract features for every .wav file of every class.
#   all_data[cname]   -> list of per-file feature matrices returned by get_mfcc
#   all_labels[cname] -> the class index repeated once per file
all_data = {}
all_labels = {}
for cname in class_names:
    class_dir = os.path.join("datatrim", cname)
    # os.listdir returns entries in arbitrary order; sort them so the seeded
    # train/test split picks the same files on every machine and every run.
    file_paths = [
        os.path.join(class_dir, fname)
        for fname in sorted(os.listdir(class_dir))
        if fname.endswith('.wav')
    ]
    all_data[cname] = [get_mfcc(file_path) for file_path in file_paths]
    all_labels[cname] = [class_names.index(cname)] * len(file_paths)
print(all_labels)

{'l': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'n': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [4]:
from sklearn.model_selection import train_test_split

# Per-class hold-out split: 25% of each class's recordings are reserved for testing.
X = {'train': {}, 'test': {}}
y = {'train': {}, 'test': {}}
for cname in class_names:
    features, labels = all_data[cname], all_labels[cname]
    split = train_test_split(features, labels, test_size=0.25, random_state=42)
    # split is (x_train, x_test, y_train, y_test); the training labels are not stored.
    X['train'][cname], X['test'][cname], _, y['test'][cname] = split

In [5]:
# Per class: number of training sequences, test sequences and test labels.
for cname in class_names:
    n_train = len(X['train'][cname])
    n_test = len(X['test'][cname])
    n_test_labels = len(y['test'][cname])
    print(cname, n_train, n_test, n_test_labels)

l 37 13 13
n 39 14 14


In [8]:
import hmmlearn.hmm as hmm

def _left_to_right_transmat(n_states):
    """Return an n_states x n_states matrix with 0.5 on the diagonal and 0.5 on the
    first super-diagonal; the last row keeps all of its mass (1.0) on the last state."""
    mat = 0.5 * np.eye(n_states) + 0.5 * np.eye(n_states, k=1)
    mat[-1, -1] = 1.0
    return mat


# Train one GaussianHMM per class, using only that class's training recordings.
model = {}
for idx, cname in enumerate(class_names):
    n_states = states[idx]
    start_prob = np.zeros(n_states)
    start_prob[0] = 1

    # NOTE(review): hmmlearn documents startprob_prior / transmat_prior as Dirichlet
    # prior parameters, not as initial values. If a hard left-to-right topology is
    # intended, assign model.startprob_ / model.transmat_ before fit() instead.
    # Left unchanged here so the trained models and reported scores stay the same.
    model[cname] = hmm.GaussianHMM(
        n_components=n_states,
        verbose=True,
        n_iter=300,
        startprob_prior=start_prob,
        transmat_prior=_left_to_right_transmat(n_states),
        params='stmc',
        init_params='mc',
        random_state=42,
    )

    # fit() takes all sequences stacked row-wise plus the length of each sequence.
    train_seqs = X['train'][cname]
    model[cname].fit(X=np.vstack(train_seqs), lengths=[seq.shape[0] for seq in train_seqs])

         1     -135621.0248             +nan
         2     -119555.3012      +16065.7236
         3     -118267.8231       +1287.4780
         4     -118059.8318        +207.9913
         5     -117960.8672         +98.9646
         6     -117923.8994         +36.9678
         7     -117906.2718         +17.6276
         8     -117897.8039          +8.4678
         9     -117886.4904         +11.3135
        10     -117851.2131         +35.2773
        11     -117835.2809         +15.9322
        12     -117838.9078          -3.6269
         1     -144563.6046             +nan
         2     -126638.2679      +17925.3367
         3     -125270.2022       +1368.0657
         4     -125011.9718        +258.2304
         5     -124792.5329        +219.4389
         6     -124626.9085        +165.6243
         7     -124518.1454        +108.7631
         8     -124431.8887         +86.2568
         9     -124384.1264         +47.7623
        10     -124342.4208         +41.7056
        11

In [9]:
import pickle

# Save each trained model to models/model_<class>.pickle
os.makedirs("models", exist_ok=True)  # create the output folder on a fresh checkout
for cname in class_names:
    # Build the path with os.path.join: the previous 'models\model_...' literal relied
    # on an invalid escape sequence and on the Windows path separator.
    name = os.path.join("models", f"model_{cname}.pickle")
    print(model[cname])
    with open(name, 'wb') as file:
        pickle.dump(model[cname], file)

GaussianHMM(init_params='mc', n_components=9, n_iter=300, random_state=42,
            startprob_prior=array([1., 0., 0., 0., 0., 0., 0., 0., 0.]),
            transmat_prior=array([[0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0.5, 0.5, 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0.5, 0.5, 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0.5, 0.5, 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0.5, 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0.5],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. ]]),
            verbose=True)
GaussianHMM(init_params='mc', n_components=9, n_iter=300, random_state=42,
            startprob_prior=array([1., 0., 0., 0., 0., 0., 0., 0., 0.]),
            transmat_prior=array([[0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0.5, 0.5, 

In [10]:
import pickle, os
import numpy as np

from sklearn.metrics import classification_report

In [11]:
# Score every held-out sequence against every class model; the predicted class is
# the index of the model with the highest log-likelihood.
y_true, y_pred = [], []
for true_class in class_names:
    samples = X['test'][true_class]
    targets = y['test'][true_class]
    for features, target in zip(samples, targets):
        scores = [model[candidate].score(features) for candidate in class_names]
        print(scores)
        y_pred.append(np.argmax(scores))
        y_true.append(target)
print(y_true)
print(y_pred)

[-3321.0780409043027, -3551.8825519768475]
[-3598.008388383607, -3629.0256626679848]
[-3483.446027928627, -3722.2023257690457]
[-3180.914977923364, -3209.679604779526]
[-2919.224240525048, -3036.4069514583184]
[-3004.1852750895455, -3199.1156515301604]
[-2824.5525964984713, -3028.4247024097567]
[-3327.156134690217, -3451.2998291038803]
[-3739.804923762279, -3873.3870776654917]
[-3338.272901829514, -3386.414512403117]
[-3328.4855508014475, -3430.108590071496]
[-2280.152608221956, -2456.5268320782225]
[-3045.289058800544, -3232.1105199888675]
[-3362.0487063110127, -3089.4850803950603]
[-3393.248787937675, -3189.712240026204]
[-4358.872877173113, -4205.581058486786]
[-3076.4101789410215, -2850.5907283109946]
[-2925.587879967438, -2778.761026128627]
[-3068.053080101462, -2871.507771631816]
[-2935.575642807935, -2727.3151022614707]
[-3214.8386274640475, -3092.4862992666917]
[-3360.276340644684, -3132.1880122153866]
[-3048.127928832758, -2767.523269750298]
[-3330.600707026222, -3118.75571502

In [12]:
# Per-class precision / recall / F1 on the held-out set, labelled with the class names.
report = classification_report(
    y_true,
    y_pred,
    target_names=class_names,
)
print(report)

              precision    recall  f1-score   support

           l       1.00      1.00      1.00        13
           n       1.00      1.00      1.00        14

    accuracy                           1.00        27
   macro avg       1.00      1.00      1.00        27
weighted avg       1.00      1.00      1.00        27



In [13]:
#loadmodels
import pickle

# Reload the per-class models from disk.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
model = {}
for key in class_names:
    # The save cell writes "model_<class>.pickle"; the previous ".pkl" suffix pointed
    # at files that are never created and raised FileNotFoundError.
    name = os.path.join("models", f"model_{key}.pickle")
    with open(name, 'rb') as file:
        model[key] = pickle.load(file)

In [14]:
from tkinter import messagebox
import winsound

from pydub import AudioSegment

import ffmpeg

# Tune the threshold to the background noise: the noisier the room, the larger it should be.
def detect_leading_silence(sound, silence_threshold=-35.0, chunk_size=10):
    '''
    Return how many milliseconds of silence precede the first audible chunk.

    sound is a pydub.AudioSegment
    silence_threshold in dB
    chunk_size in ms

    Chunks are scanned from the start until one is at or above the threshold.
    The result never exceeds len(sound), including for a clip that is silent
    throughout or empty.
    '''
    assert chunk_size > 0 # to avoid infinite loop

    total_ms = len(sound)
    trim_ms = 0 # ms
    # Check the bound first so no slice is taken past the end of the clip.
    while trim_ms < total_ms and sound[trim_ms:trim_ms+chunk_size].dBFS < silence_threshold:
        trim_ms += chunk_size

    # The last step may overshoot when the length is not a multiple of chunk_size.
    return min(trim_ms, total_ms)

def record():
    """Record RECORD_SECONDS of mono 16-bit audio through PyAudio and save it as ``record.wav``.

    The input stream and the PyAudio session are released even if reading fails,
    and the wave file is closed by a context manager.
    """
    import pyaudio
    import wave

    CHUNK = 1024  # frames requested per read
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    RECORD_SECONDS = 2
    WAVE_OUTPUT_FILENAME = "record.wav"

    p = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio session is still open.
        sample_width = p.get_sample_size(FORMAT)

        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            n_reads = int(RATE / CHUNK * RECORD_SECONDS)
            frames = [stream.read(CHUNK) for _ in range(n_reads)]
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))

def play():
    """Play the raw recording ``record.wav`` through winsound."""
    winsound.PlaySound('record.wav', winsound.SND_FILENAME)
    
def playtrimmed():
    """Play the silence-trimmed recording ``trimmed.wav`` through winsound."""
    winsound.PlaySound('trimmed.wav', winsound.SND_FILENAME)

def predict():
    """Trim silence from ``record.wav``, classify what is left and show the label.

    The trimmed audio is written to ``trimmed.wav``, its features are scored by
    every model in ``model``, and the best-scoring class name is shown in a
    message box. If trimming leaves nothing, a warning is shown instead and
    ``trimmed.wav`` is not rewritten.
    """
    # Trim silence
    sound = AudioSegment.from_file("record.wav", format="wav")

    # The leading silence of the reversed clip is the trailing silence of the original.
    start_trim = detect_leading_silence(sound)
    end_trim = detect_leading_silence(sound.reverse())

    duration = len(sound)

    # A recording that is silent throughout would otherwise hand an empty clip
    # to get_mfcc; tell the user instead.
    if start_trim + end_trim >= duration:
        messagebox.showwarning("result", "No speech detected - please record again.")
        return

    trimmed_sound = sound[start_trim:duration - end_trim]
    trimmed_sound.export("trimmed.wav", format="wav")

    # Predict: the class whose model gives the highest score wins.
    record_mfcc = get_mfcc("trimmed.wav")
    scores = [model[cname].score(record_mfcc) for cname in class_names]
    pred = np.argmax(scores)
    messagebox.showinfo("result", class_names[pred])


In [15]:
import tkinter as tk

def _add_button(parent, caption, callback):
    """Create a width=13, height=2 button inside `parent` and pack it to the left."""
    button = tk.Button(master=parent, width=13, height=2, text=caption, command=callback)
    button.pack(side=tk.LEFT, padx=5, pady=5)
    return button


window = tk.Tk()
window.geometry("300x200")
window.title("Speech recognition")

# Three stacked rows: title, record/playback, predict/playback-trimmed.
frame0, frame1, frame2 = (tk.Frame(master=window) for _ in range(3))
for row in (frame0, frame1, frame2):
    row.pack()

label = tk.Label(master=frame0, text="Speech recognition")
label.pack(padx=5, pady=10)

btn_record = _add_button(frame1, "record", record)
btn_playback = _add_button(frame1, "playback", play)

btn_predict = _add_button(frame2, "trim & predict", predict)
btn_playback_trimmed = _add_button(frame2, "playbacktrimmed", playtrimmed)

# Hand control to Tk; this call returns only when the window is closed.
window.mainloop()

In [16]:
from pydub import AudioSegment
from pydub.silence import split_on_silence

sound_file = AudioSegment.from_wav("data.wav")
audio_chunks = split_on_silence(
    sound_file,
    # a pause must last at least 50 ms to be used as a split point
    min_silence_len=50,
    # consider it silent if quieter than -16 dBFS
    silence_thresh=-16,
)

# Create the output folder up front so the exports cannot fail on a missing directory.
os.makedirs("./splitAudio", exist_ok=True)
for i, chunk in enumerate(audio_chunks):
    out_file = "./splitAudio/chunk{0}.wav".format(i)
    print ("exporting", out_file)
    chunk.export(out_file, format="wav")

exporting ./splitAudio/chunk0.wav
exporting ./splitAudio/chunk1.wav
exporting ./splitAudio/chunk2.wav
exporting ./splitAudio/chunk3.wav
exporting ./splitAudio/chunk4.wav
exporting ./splitAudio/chunk5.wav


In [42]:
from pydub import AudioSegment
from pydub.utils import make_chunks

myaudio = AudioSegment.from_file("./splitAudio/chunk4.wav" , "wav")
chunk_length_ms = 200 # pydub calculates in millisec
chunks = make_chunks(myaudio, chunk_length_ms)

# Export all of the individual chunks as wav files.
# Create the output folder up front so the exports cannot fail on a missing directory.
os.makedirs("./letter", exist_ok=True)
for i, chunk in enumerate(chunks):
    chunk_name = "./letter/chunk{0}.wav".format(i)
    print ("exporting", chunk_name)
    chunk.export(chunk_name, format="wav")

exporting ./letter/chunk0.wav
exporting ./letter/chunk1.wav
exporting ./letter/chunk2.wav


In [43]:
# Extract the features once; they do not depend on which class model scores them.
chunk_mfcc = get_mfcc("./letter/chunk0.wav")
scores = [model[cname].score(chunk_mfcc) for cname in class_names]
print(scores)
# The predicted class is the one whose model gives the highest score.
pred = np.argmax(scores)
print(class_names[pred])

[-3245.9259397059077, -2953.908661911676]
n


In [19]:
# Round-trip the whole {class name: model} dict through a single pickle file
# and print the reloaded copy.
with open('models.pickle', 'wb') as sink:
    pickle.dump(model, sink)
with open('models.pickle', 'rb') as source:
    modelss = pickle.load(source)
print(modelss)

{'l': GaussianHMM(init_params='mc', n_components=9, n_iter=300, random_state=42,
            startprob_prior=array([1., 0., 0., 0., 0., 0., 0., 0., 0.]),
            transmat_prior=array([[0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0.5, 0.5, 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0.5, 0.5, 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0.5, 0.5, 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0.5, 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0.5],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 1. ]]),
            verbose=True), 'n': GaussianHMM(init_params='mc', n_components=9, n_iter=300, random_state=42,
            startprob_prior=array([1., 0., 0., 0., 0., 0., 0., 0., 0.]),
            transmat_prior=array([[0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. 