In [7]:
import os
class_names = ['cothe', 'khong', 'nguoi', 'toi', 'va']
states = [12, 9, 9, 9, 6]

length = 0
for d in class_names:
    length += len(os.listdir("content/" + d))
print(length)

502


In [8]:
import librosa
import numpy as np
import os
import math
from sklearn.cluster import KMeans
import hmmlearn.hmm

def get_mfcc(file_path):
    y, sr = librosa.load(file_path) # read .wav file
    hop_length = math.floor(sr*0.010) # 10ms hop
    win_length = math.floor(sr*0.025) # 25ms frame
    # mfcc is 12 x T matrix
    mfcc = librosa.feature.mfcc(
        y, sr, n_mfcc=12, n_fft=1024,
        hop_length=hop_length, win_length=win_length)
    # substract mean from mfcc --> normalize mfcc
    mfcc = mfcc - np.mean(mfcc, axis=1).reshape((-1,1)) 
    # delta feature 1st order and 2nd order
    delta1 = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # X is 36 x T
    X = np.concatenate([mfcc, delta1, delta2], axis=0) # O^r
    # return T x 36 (transpose of X)
    return X.T # hmmlearn use T x N matrix


In [9]:
all_data = {}
all_labels = {}
for cname in class_names:
    file_paths = [os.path.join("content", cname, i) for i in os.listdir(os.path.join('content', cname)) if i.endswith('.wav')]
    data = [get_mfcc(file_path) for file_path in file_paths]
    all_data[cname] = data
    all_labels[cname] = [class_names.index(cname) for i in range(len(file_paths))]

In [10]:
from sklearn.model_selection import train_test_split

X = {'train': {}, 'test': {}}
y = {'train': {}, 'test': {}}
for cname in class_names:
    x_train, x_test, _, y_test = train_test_split(
        all_data[cname], all_labels[cname], 
        test_size = 0.33, 
        random_state=42
    )
    X['train'][cname] = x_train
    X['test'][cname] = x_test
    y['test'][cname] = y_test

In [11]:
for cname in class_names:
    print(cname,len(X['train'][cname]), len(X['test'][cname]), len(y['test'][cname]))

cothe 67 33 33
khong 67 33 33
nguoi 67 33 33
toi 67 33 33
va 67 33 33


In [12]:
import hmmlearn.hmm as hmm

model = {}
for idx, cname in enumerate(class_names):
    start_prob = np.full(states[idx], 0.0)
    start_prob[0] = 1.0
    trans_matrix = np.full((states[idx], states[idx]), 0.0)
    p = 0.5
    np.fill_diagonal(trans_matrix, p)
    np.fill_diagonal(trans_matrix[0:, 1:], 1 - p)
    trans_matrix[-1, -1] = 1.0
    
    #trans matrix
    print(trans_matrix) 

    model[cname] = hmm.GaussianHMM(
        n_components=states[idx], 
        verbose=True, 
        n_iter=300, 
        startprob_prior=start_prob, 
        transmat_prior=trans_matrix,
        params='stmc',
        init_params='mc',
        random_state=42
    )

    model[cname].fit(X=np.vstack(X['train'][cname]), lengths=[x.shape[0] for x in X['train'][cname]])

[[0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -236206.7593             +nan
         2     -225730.5762      +10476.1831
         3     -224595.5366       +1135.0396
         4     -224222.8646        +372.6720
         5     -223985.4150        +237.4496
         6     -223789.8347        +195.5803
         7     -223634.0927        +155.7419
         8     -223522.8613        +111.2314
         9     -223416.8431        +106.0182
        10     -223313.2759        +103.5672
        11     -223126.7330        +186.5429
        12     -222824.1845        +302.5485
        13     -222565.5229        +258.6615
        14     -222355.3370        +210.1859
        15     -222219.3829        +135.9542
        16     -222067.8653        +151.5176
        17     -221935.4529        +132.4124
        18     -221854.4594         +80.9935
        19     -221803.8165         +50.6429
        20     -221772.6834         +31.1331
        21     -221743.7898         +28.8936
        22     -221731.4778         +12.3121
        23

[[0.5 0.5 0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -423512.5011             +nan
         2     -327424.9771      +96087.5240
         3     -145456.3017     +181968.6754
         4      -63440.4643      +82015.8375
         5      -63166.4316        +274.0327
         6      -63087.7807         +78.6509
         7      -63078.0856          +9.6950
         8      -62998.1830         +79.9027
         9      -62931.9984         +66.1845
        10      -62731.9636        +200.0349
        11      -62414.1138        +317.8498
        12      -62222.2139        +191.8999
        13      -62186.7586         +35.4553
        14      -62176.3125         +10.4461
        15      -62171.7087          +4.6039
        16      -62190.9091         -19.2005


[[0.5 0.5 0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -161939.4949             +nan
         2     -153966.7282       +7972.7667
         3     -153417.8966        +548.8316
         4     -153250.1910        +167.7055
         5     -153238.3402         +11.8509
         6     -153128.7955        +109.5447
         7     -153051.9377         +76.8578
         8     -152968.3780         +83.5597
         9     -152903.6441         +64.7340
        10     -152870.6810         +32.9631
        11     -152852.3755         +18.3055
        12     -152836.1746         +16.2009
        13     -152810.3262         +25.8484
        14     -152798.2179         +12.1084
        15     -152786.0615         +12.1563
        16     -152772.8116         +13.2500
        17     -152758.5736         +14.2380
        18     -152743.6948         +14.8788
        19     -152733.0769         +10.6178
        20     -152724.4345          +8.6425
        21     -152716.2112          +8.2233
        22     -152701.9163         +14.2949
        23

[[0.5 0.5 0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.5 0.5 0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  0.  0.  0.  1. ]]


         1     -148978.6442             +nan
         2     -141586.7323       +7391.9119
         3     -140344.9342       +1241.7981
         4     -139970.8155        +374.1187
         5     -139790.6343        +180.1812
         6     -139688.0413        +102.5930
         7     -139604.0981         +83.9432
         8     -139518.4337         +85.6644
         9     -139475.3615         +43.0722
        10     -139445.8301         +29.5314
        11     -139427.0741         +18.7560
        12     -139415.5296         +11.5445
        13     -139400.1029         +15.4268
        14     -139381.7746         +18.3282
        15     -139362.3138         +19.4608
        16     -139341.2046         +21.1092
        17     -139328.8394         +12.3652
        18     -139320.5686          +8.2708
        19     -139315.2832          +5.2854
        20     -139311.8707          +3.4125
        21     -139308.7083          +3.1624
        22     -139306.0030          +2.7053
        23

[[0.5 0.5 0.  0.  0.  0. ]
 [0.  0.5 0.5 0.  0.  0. ]
 [0.  0.  0.5 0.5 0.  0. ]
 [0.  0.  0.  0.5 0.5 0. ]
 [0.  0.  0.  0.  0.5 0.5]
 [0.  0.  0.  0.  0.  1. ]]


         1     -139713.7176             +nan
         2     -136023.0503       +3690.6673
         3     -134984.1512       +1038.8991
         4     -134205.7008        +778.4504
         5     -133664.2259        +541.4748
         6     -133462.9905        +201.2355
         7     -133417.3865         +45.6040
         8     -133388.2921         +29.0944
         9     -133360.0712         +28.2209
        10     -133322.9638         +37.1073
        11     -133299.0933         +23.8705
        12     -133286.3805         +12.7128
        13     -133284.9726          +1.4079
        14     -133290.1972          -5.2247


In [13]:
import pickle

# save model
for cname in class_names:
    name = f'model_{cname}.pkl'
    with open(name, 'wb') as file: 
        pickle.dump(model[cname], file)

In [14]:
import pickle, os
import numpy as np

from sklearn.metrics import classification_report

In [15]:
y_true = []
y_pred = []
for cname in class_names:
    for mfcc, target in zip(X['test'][cname], y['test'][cname]):
        scores = [model[cname].score(mfcc) for cname in class_names]
        pred = np.argmax(scores)
        y_pred.append(pred)
        y_true.append(target)
print(y_true)
print(y_pred)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 0, 4, 4, 4, 4, 2, 4, 4, 4, 4]


In [16]:
report = classification_report(y_true, y_pred, target_names=class_names)
print(report)

              precision    recall  f1-score   support

       cothe       0.97      0.97      0.97        33
       khong       0.97      1.00      0.99        33
       nguoi       0.94      0.94      0.94        33
         toi       0.94      1.00      0.97        33
          va       0.97      0.88      0.92        33

    accuracy                           0.96       165
   macro avg       0.96      0.96      0.96       165
weighted avg       0.96      0.96      0.96       165



In [17]:
model = {}
for key in class_names:
    name = f"model_{key}.pkl"
    with open(name, 'rb') as file:
        model[key] = pickle.load(file)

In [18]:
print(model)

{'cothe': GaussianHMM(init_params='mc', n_components=12, n_iter=300, random_state=42,
            startprob_prior=array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
            transmat_prior=array([[0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0.5, 0.5, 0. , 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0.5, 0. , 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0.5, 0. , 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0.5, 0. , 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0.5, 0. ],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.5, 0.5],
       [0. , 0. , 0. , 0. , 0. , 0. , 0. , 

In [33]:
record_mfcc = get_mfcc("mashup.wav")
scores = [model[cname].score(record_mfcc) for cname in class_names]
pred = np.argmax(scores)
print(class_names[pred])

toi


In [30]:
from tkinter import messagebox


def record():
    


def predict():
    record_mfcc = get_mfcc("record.wav")
    scores = [model[cname].score(record_mfcc) for cname in class_names]
    pred = np.argmax(scores)
    messagebox.showinfo("result", class_names[pred])


In [31]:
import tkinter as tk

window = tk.Tk()
window.geometry("300x200")
window.title("record and predict")

frame1 = tk.Frame(master=window)
frame1.pack()

frame2 = tk.Frame(master=window)
frame2.pack()

btn_record = tk.Button(master=frame1, text="record", command=record)
btn_record.pack(side=tk.LEFT)

btn_stop = tk.Button(master=frame1, text="stop")
btn_stop.pack(side=tk.LEFT)

btn_playback = tk.Button(master=frame2, text="playback")
btn_playback.pack(side=tk.LEFT)

btn_predict = tk.Button(master=frame2, text="predict", command=predict)
btn_predict.pack(side=tk.LEFT)

window.mainloop()

* recording


Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\Admin\.conda\envs\voice\lib\tkinter\__init__.py", line 1705, in __call__
    return self.func(*args)
  File "<ipython-input-30-cbc0861442a6>", line 27, in record
    binary = b64decode(data.split(',')[1])
AttributeError: 'Stream' object has no attribute 'split'


In [53]:
import pyaudio
import wave
from base64 import b64decode

CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 22050
RECORD_SECONDS = 2
WAVE_OUTPUT_FILENAME = "output.wav"

p = pyaudio.PyAudio()

stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK)

print("* recording")



frames = []

for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data)



print("* done recording")

stream.stop_stream()
stream.close()
p.terminate()

wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
wf.setnchannels(CHANNELS)
wf.setsampwidth(p.get_sample_size(FORMAT))
wf.setframerate(RATE)
wf.writeframes(b''.join(frames))
wf.close()


* recording
* done recording


In [54]:
from pydub import AudioSegment
import ffmpeg

def detect_leading_silence(sound, silence_threshold=-50.0, chunk_size=10):
    '''
    sound is a pydub.AudioSegment
    silence_threshold in dB
    chunk_size in ms

    iterate over chunks until you find the first one with sound
    '''
    trim_ms = 0 # ms

    assert chunk_size > 0 # to avoid infinite loop
    while sound[trim_ms:trim_ms+chunk_size].dBFS < silence_threshold and trim_ms < len(sound):
        trim_ms += chunk_size

    return trim_ms

sound = AudioSegment.from_file("output.wav", format="wav")

start_trim = detect_leading_silence(sound)
end_trim = detect_leading_silence(sound.reverse())

duration = len(sound)    
trimmed_sound = sound[start_trim:duration-end_trim]

trimmed_sound.export("mashup.wav", format="wav")

<_io.BufferedRandom name='mashup.wav'>

In [55]:
record_mfcc = get_mfcc("mashup.wav")
scores = [model[cname].score(record_mfcc) for cname in class_names]
pred = np.argmax(scores)
print(class_names[pred])

va
