<a href="https://colab.research.google.com/github/vlozg/speech_hmm/blob/main/Test_DiagHMM_021.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LƯU Ý: NOTEBOOK NÀY CHỈ DÙNG ĐỂ SHOW KẾT QUẢ CHẠY, KHÔNG NÊN CHẠY LẠI NOTEBOOK NÀY 
(VÌ TRONG NÀY CÓ CODE LƯU LẠI PRETRAINED MODEL LÊN DRIVE SẼ BỊ XÓA)

# Speech to text with HMM

- **Bài toán**: Chuyển giọng nói thành văn bản
    - **Input**: Đoạn ghi âm chứa nội dung là các số từ 0 đến 9
    - **Output**: Phân lớp của đoạn ghi âm

# Các biến thiết lập cho thử nghiệm

In [None]:
# Number of MFCC coefficients kept per frame (read by extract_mfcc)
n_mfcc_ceptrum = 12
# Highest delta order appended to the MFCCs (0 = none, 1 = delta, 2 = delta and delta-delta)
n_delta_features = 1
# Passed to train_GMMHMM as the number of Gaussian components per hidden state
n_mixtures = 9
# Fraction of the FSDD files that makeTrainTestDir copies into the train folder
fsdd_split = 0.3
# Same fraction for the Wolfram dataset; its train data is merged in only when this is > 0
wolfram_split = 0.3
# Tag embedded in the titles of the model files uploaded to Drive
experiment_id = '021'

In [None]:
# Feature size per frame: the base MFCCs plus one equally sized block per delta order
n_mfcc_features = n_mfcc_ceptrum * (1+n_delta_features)
n_mfcc_features

24

# Import và cài đặt thư viện

In [None]:
# cài lib. note: cài xong phải restart runtime
!pip install pydub
!pip install pomegranate



In [None]:
# Xác thực google để upload/download qua google drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Quản lý file, folder
import os
from shutil import copyfile, rmtree
import random

# Xử lý audio
import librosa
import librosa.display
from scipy.io import wavfile

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pomegranate # Thư viện cho mô hình xác suất
from pomegranate import *

# Tải dữ liệu và lấy xác thực Google

Dữ liệu dùng để huấn luyện và đánh giá, còn xác thực google thì dùng để upload/download mô hình trên drive.

In [None]:
%%capture
# Download the Wolfram dataset (skipped when the archive is already present)
if not os.path.isfile('./dataset_1_wolfram.zip'):
  !gdown --id 115tIAitBNeJC0DwrP-ZyJ6RS3TyWN0qD
  !unzip -o dataset_1_wolfram.zip

# Download the FSDD dataset (skipped when the archive is already present)
if not os.path.isfile('./dataset_2_FSDD.zip'):
  !gdown --id 1Ua9zlPBc0Fv4xGHSQTb7eIvUh_dqFI6P
  !unzip -o dataset_2_FSDD.zip

# Download the self-recorded audio (no existence check, so it is fetched on every run)
!gdown --id 1lH_k1AYMVlJvodtZdD7OK2zkdPXxlW9i

In [None]:
# Authenticate with Google so model files can be uploaded to / downloaded from Drive
auth.authenticate_user()
gauth =  GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Hàm xử lý âm thanh

In [None]:
def minmax_scale(wave):
  """Linearly rescale `wave` so its minimum maps to -0.5 and its maximum to 0.5.

  A constant input has no range to stretch; it is mapped to all zeros
  instead of dividing by zero and returning NaNs.
  """
  lowest = wave.min()
  span = wave.max() - lowest
  if span == 0:
    return np.zeros(wave.shape, dtype=float)
  return (wave - lowest) / span - 0.5

def standard_scale(wave):
  """Standardise `wave` to zero mean and unit standard deviation.

  When the standard deviation is zero (constant input) only the mean is
  removed, so the result is all zeros rather than NaNs from a 0/0 division.
  """
  centered = wave - wave.mean()
  spread = wave.std()
  if spread == 0:
    return centered
  return centered / spread

def scaleAddNoise(wave):
  """Standardise `wave` with standard_scale, then add Gaussian white noise scaled by 0.2."""
  scaled = standard_scale(wave)
  noise = np.random.normal(size=wave.shape)
  return scaled + 0.2*noise

def read_audio(full_audio_path):
  """Read an audio file as a mono floating-point waveform sampled at 16 kHz.

  Files that are already at 16 kHz are read with scipy (noted by the original
  author as 5-10x faster than librosa) and normalised here. Any other sample
  rate is loaded through librosa, which resamples to 16 kHz.

  Args:
    full_audio_path: path of the audio file to read.

  Returns:
    (wave, sample_rate): `wave` is a 1-D float np.array and `sample_rate`
    is 16000. Note the order: waveform first, sample rate second.
  """
  target_rate = 16000
  sample_rate, wave = wavfile.read(full_audio_path)
  if sample_rate != target_rate:
    # librosa.load resamples to the requested rate; by default it also
    # downmixes to mono and returns floats.
    wave, sample_rate = librosa.load(full_audio_path, sr=target_rate)
    return wave, sample_rate

  if np.issubdtype(wave.dtype, np.integer):
    info = np.iinfo(wave.dtype)
    if info.min == 0:
      # Unsigned PCM (8-bit WAV) is centred on the middle of its range.
      midpoint = (info.max + 1) / 2
      wave = (wave - midpoint) / midpoint
    else:
      # Signed PCM: divide by the full-scale magnitude (32768 for int16).
      wave = wave / float(-info.min)
  # Floating-point WAV data is left as is; dividing it by 32768 would shrink it.

  if wave.ndim > 1:
    # scipy returns (samples, channels) for multi-channel files; average the
    # channels so both code paths return a 1-D waveform.
    wave = wave.mean(axis=1)
  return wave, sample_rate

def read_process_audio(full_audio_path):
  """Load an audio file, pad both ends with zeros, then rescale it and add white noise.

  Returns:
    (wave, sample_rate): the processed waveform (np.array) followed by the
    sample rate returned by read_audio.
  """
  edge_samples = 2000  # number of zero samples added at each end
  raw_wave, sample_rate = read_audio(full_audio_path)
  padded = np.pad(raw_wave, (edge_samples, edge_samples), 'constant', constant_values=(0.0, 0.0))
  return scaleAddNoise(padded), sample_rate

def unvoiced_frame(wave, sample_rate, min_len = 10):
  """Build a boolean mask selecting the RMS frames to keep.

  The per-frame RMS energy is standardised and passed through a logistic
  function; frames scoring above a threshold are kept. The threshold starts
  at 0.4 and is lowered in steps of 0.05 until at least `min_len` frames are
  kept, so that enough frames remain to train an HMM.

  Args:
    wave: waveform passed to librosa.feature.rms.
    sample_rate: not used by this function; kept for call compatibility.
    min_len: minimum number of frames the mask should select.

  Returns:
    Boolean np.array with one entry per RMS frame. When `min_len` cannot be
    reached (fewer frames than `min_len`, or NaN scores), every frame is
    selected.
  """
  rms = librosa.feature.rms(y=wave)[0]
  r_normalized = standard_scale(rms)
  p = np.exp(r_normalized) / (1 + np.exp(r_normalized))
  # Lower the threshold step by step while too few frames are selected
  thresh = 0.4
  slice_ = p > thresh
  while (slice_.sum() < min_len):
    if thresh <= 0 or slice_.all():
      # Logistic scores lie in (0, 1), so a threshold at or below zero (or a
      # mask that is already all True) cannot gain frames by going lower.
      # Keep every frame instead of looping forever.
      return np.ones(p.shape, dtype=bool)
    thresh-=0.05
    slice_ = p > thresh
  return slice_

def extract_mfcc(wave, sample_rate, trim=True):
  """Compute scaled MFCC features with one row per frame.

  A 40-band mel spectrogram is computed; when `trim` is set only the frames
  selected by unvoiced_frame are kept. The spectrogram is converted to dB and
  to n_mfcc_ceptrum+2 MFCCs, the first two coefficients are dropped, and the
  transposed result is divided by 800.
  """
  mel_power = librosa.feature.melspectrogram(y=wave, sr=sample_rate, n_mels=40)
  if trim:
    keep = unvoiced_frame(wave, sample_rate)
    mel_power = mel_power[:, keep]
  mel_db = librosa.power_to_db(mel_power)
  coeffs = librosa.feature.mfcc(S=mel_db, n_mfcc=n_mfcc_ceptrum+2, lifter=40)
  return coeffs[2:, :].T / 800

def mfcc_delta_features(mfcc, order, axis=0):
  """Return the `order`-th delta (time derivative) of an MFCC matrix.

  extract_mfcc returns its features transposed, i.e. as (frames,
  coefficients), so the time axis is axis 0. librosa.feature.delta works along
  the last axis by default, which for this layout would differentiate across
  coefficients instead of across time; the axis is therefore passed
  explicitly.

  NOTE: models trained on features produced before this fix saw different
  delta values and should be retrained.

  Args:
    mfcc: feature matrix laid out as (frames, coefficients).
    order: derivative order; 0 returns `mfcc` unchanged.
    axis: axis that indexes time (0 for the layout above).

  Returns:
    np.array with the same shape as `mfcc`.
  """
  if order==0:
    return mfcc
  return librosa.feature.delta(mfcc, order=order, axis=axis)

def full_mfcc_from_file(full_audio_path, trim=True):
  """Read an audio file and return its MFCCs with the configured delta blocks appended.

  The base MFCCs come first; a first-order delta block is appended when
  n_delta_features >= 1 and a second-order block when n_delta_features >= 2.
  """
  wave, sr = read_process_audio(full_audio_path)
  base = extract_mfcc(wave, sr, trim)
  features = base
  for delta_order in (1, 2):
    if n_delta_features >= delta_order:
      delta_block = mfcc_delta_features(base, delta_order)
      features = np.hstack([features, delta_block])
  return features

# Hàm bổ trợ

In [None]:
def buildDataSet(dir, trim=True):
    """Extract features for every .wav file in `dir`, grouped by label.

    The label is the text between the first and second underscore of the file
    name stem (the part before the first dot), e.g. `12_7.wav` has label `7`.

    Args:
        dir: directory holding the .wav files; a trailing slash is optional.
        trim: forwarded to full_mfcc_from_file.

    Returns:
        dict mapping each label (str) to the list of feature arrays of its files.
    """
    dataset = {}
    for fileName in os.listdir(dir):
        # Only .wav files are part of the dataset
        if os.path.splitext(fileName)[1] != '.wav':
            continue
        stem = fileName.split('.')[0]
        label = stem.split('_')[1]
        # os.path.join inserts the separator only when `dir` lacks one
        feature = full_mfcc_from_file(os.path.join(dir, fileName), trim)
        dataset.setdefault(label, []).append(feature)
    return dataset

def makeTrainTestDir(mainFolder, filenames, rate):
    """Recreate `mainFolder` and copy the files into its train/test subfolders.

    `mainFolder`, `mainFolder/train_audio` and `mainFolder/test_audio` are
    created; a folder that already exists is deleted and recreated. The files
    are shuffled with a fixed seed (1) so the split is the same on every run.

    Args:
        mainFolder: root folder of the split.
        filenames: list of (source_path, target_file_name) pairs. The list is
            not modified.
        rate: fraction of the files copied to train_audio; the rest go to
            test_audio.
    """
    paths = [f'{mainFolder}',
             f'{mainFolder}/train_audio',
             f'{mainFolder}/test_audio']

    for path in paths:
        try:
            os.mkdir(path)
        except FileExistsError:
            # Only an already existing folder is replaced; other errors propagate
            rmtree(path)
            os.mkdir(path)

    # A private generator seeded with 1 yields the same order as
    # random.seed(1) + random.shuffle without reseeding the global generator,
    # and shuffling a copy leaves the caller's list untouched.
    shuffled = list(filenames)
    random.Random(1).shuffle(shuffled)
    splitPoint = int(len(shuffled)*rate)

    for source, target in shuffled[:splitPoint]:
        copyfile(source, f"{paths[1]}/{target}")

    for source, target in shuffled[splitPoint:]:
        copyfile(source, f"{paths[2]}/{target}")

# Setup thư mục chứa data

In [None]:
def formatFilenameFSDD(dir):
    """Pair every file in `dir` with its target name `<index>_<label>.wav`.

    The label is the part of the original file name before the first
    underscore; the index is the file's position in the os.listdir order.

    Returns:
        list of (source_path, target_file_name) tuples.
    """
    return [
        (f"{dir}/{original}", f"{index}_{original.split('_')[0]}.wav")
        for index, original in enumerate(os.listdir(dir))
    ]

# Collect the FSDD files and split them into fsdd/train_audio and fsdd/test_audio
filenames = formatFilenameFSDD('./dataset_2_FSDD')
makeTrainTestDir('fsdd', filenames, fsdd_split)

In [None]:
def formatFilenameWolfram(dir):
    """Pair every file below `dir` with its target name `<index>_<label>.wav`.

    `dir` holds one subfolder per class; the label is the part of the
    subfolder name before the first underscore. The index is a running
    counter over all files in traversal order.

    Returns:
        list of (source_path, target_file_name) tuples.
    """
    pairs = []
    for folder in os.listdir(dir):
        label = folder.split('_')[0]
        for filename in os.listdir(f'{dir}/{folder}'):
            # len(pairs) is the number of files seen so far, i.e. the running index
            pairs.append((f"{dir}/{folder}/{filename}", f"{len(pairs)}_{label}.wav"))
    return pairs

# Collect the Wolfram files and split them into wolfram/train_audio and wolfram/test_audio
filenames = formatFilenameWolfram('./dataset_1_wolfram')
makeTrainTestDir('wolfram', filenames, wolfram_split)

# Huấn luyện mô hình (có thể bỏ qua vì mô hình đã save trên drive)

## Hàm train mô hình

In [None]:
def Generate_DiagGMM(full_fset, n_features, n_states=5, n_cmps=3):
  """Create one randomly initialised emission distribution per hidden state.

  Each state gets a mixture of `n_cmps` independent-component (diagonal)
  Gaussians, or a single such Gaussian when `n_cmps` is 1 or less. The two
  parameters of every 1-D Gaussian are drawn uniformly from [0, 1).

  Args:
    full_fset: not used by this function.
    n_features: dimensionality of one observation.
    n_states: number of distributions to create.
    n_cmps: number of mixture components per state.

  Returns:
    list with `n_states` distributions.
  """
  def random_diag_gaussian():
    # One independent 1-D Gaussian per feature
    return IndependentComponentsDistribution(tuple(
        NormalDistribution(*np.random.random(2))
        for _ in range(n_features)
        ))

  def state_emission():
    if n_cmps > 1:
      return GeneralMixtureModel([random_diag_gaussian() for _ in range(n_cmps)])
    return random_diag_gaussian()

  return [state_emission() for _ in range(n_states)]

In [None]:
# Generate progressive HMM model
def left_right_GMMHMM(seed_sample, x_dim, n_states=10, n_modals=9, diag=True, random=0):
  """Build an untrained left-to-right HMM with mixture emissions.

  Every state has a self-transition and a transition to the next state, both
  with probability 0.5; the last state transitions to itself or to the end
  state. The model always starts in the first state.

  Args:
    seed_sample: forwarded to Generate_DiagGMM when `diag` is set.
    x_dim: dimensionality of one observation.
    n_states: number of hidden states (at least 1).
    n_modals: number of mixture components per state.
    diag: use diagonal Gaussians from Generate_DiagGMM; otherwise blank
      full-covariance Gaussians.
    random: accepted for backward compatibility; it has no effect on the
      model that is built.

  Returns:
    The baked HiddenMarkovModel.

  Raises:
    ValueError: if `n_states` is smaller than 1.
  """
  if n_states < 1:
    raise ValueError("n_states must be at least 1")

  model = HiddenMarkovModel()
  if diag:
    states = [State(gmm, name=f"H{i}") for i, gmm in enumerate(Generate_DiagGMM(seed_sample, x_dim, n_states, n_modals))]
  else:
    states = [State( GeneralMixtureModel([MultivariateGaussianDistribution.blank(x_dim) for i in range(n_modals)]),
                    name=f"H{i}" ) for i in range(n_states)]
  model.add_states(states)
  model.add_transition(model.start, states[0], 1)
  for i in range(n_states-1):
    model.add_transition(states[i], states[i], 0.5)
    model.add_transition(states[i], states[i+1], 0.5)
  model.add_transition(states[n_states-1], states[n_states-1], 0.5)
  model.add_transition(states[n_states-1], model.end, 0.5)
  model.bake()
  return model

In [None]:
tol = 1000  # maximum number of random re-initialisations tried per label
def train_GMMHMM(dataset, input_dim, n_hidden_state, n_gauss_modal, diag=True, failed_label_return=False, report_fail=None):
    """Train one HMM per label, restarting from a new random model on NaN.

    For every label a fresh model is built and fitted. If the last recorded
    improvement of the fit is NaN the attempt is discarded and a new model is
    tried, up to `tol` times.

    Args:
        dataset: dict mapping label -> list of feature sequences.
        input_dim: dimensionality of one observation.
        n_hidden_state: number of hidden states per model.
        n_gauss_modal: number of mixture components per state.
        diag: forwarded to left_right_GMMHMM.
        failed_label_return: when True, labels for which no attempt succeeded
            are appended to `report_fail`.
        report_fail: list receiving the failed labels; ignored when None.

    Returns:
        dict mapping label -> trained model, or None for a label whose
        attempts all failed.
    """
    GMMHMM_Models = dict()

    for label in dataset.keys():
        print(f"Training model detect {label}")
        model = None
        for i in range(tol):
          candidate = left_right_GMMHMM(dataset[label][0], input_dim, n_hidden_state, n_gauss_modal, diag=diag)
          _, imprv = candidate.fit(dataset[label], verbose=True, multiple_check_input=False,return_history=True)  # get optimal parameters
          if not np.isnan(imprv.improvements[-1]):
            model = candidate
            break
        # report_fail may legitimately be omitted; then there is nothing to append to
        if model is None and failed_label_return and report_fail is not None:
          report_fail.append(label)
        GMMHMM_Models[label] = model

    return GMMHMM_Models

In [None]:
def retrain_specific_class(hmmModels, label, dataset, input_dim, n_hidden_state, n_gauss_modal, diag=True):
  """Retrain the model of a single label and store it in `hmmModels`.

  A fresh model is built and fitted up to `tol` times; the first attempt whose
  last recorded improvement is not NaN is kept. When every attempt fails the
  entry is set to None (the same convention train_GMMHMM uses) rather than to
  a model whose training ended in NaN.

  Args:
    hmmModels: dict mapping label -> model; updated in place.
    label: label whose model is retrained.
    dataset: dict mapping label -> list of feature sequences.
    input_dim: dimensionality of one observation.
    n_hidden_state: number of hidden states.
    n_gauss_modal: number of mixture components per state.
    diag: forwarded to left_right_GMMHMM.

  Returns:
    The same `hmmModels` dict.
  """
  model = None
  for i in range(tol):
    candidate = left_right_GMMHMM(dataset[label][0], input_dim, n_hidden_state, n_gauss_modal, diag=diag)
    _, imprv = candidate.fit(dataset[label], verbose=True, multiple_check_input=False,return_history=True)  # get optimal parameters
    if not np.isnan(imprv.improvements[-1]):
      model = candidate
      break
  hmmModels[label] = model
  return hmmModels

## Hàm test mô hình

In [None]:
# test model
def evaluateModel(testDataset, model):
  """Print the per-label and overall recognition rate of the per-label models.

  The models are combined, in sorted label order, into one BayesClassifier.
  A prediction counts as correct when the predicted class index equals the
  position of the true label in that sorted order.

  Args:
    testDataset: dict mapping label -> list of feature sequences. Nothing is
      printed when it is empty.
    model: dict mapping label -> trained model.
  """
  if (len(testDataset) == 0):
    return
  class_labels = sorted(model)
  digit_clf = BayesClassifier([model[lbl] for lbl in class_labels])
  # The classifier predicts positions in class_labels, so labels are mapped to
  # those positions instead of assuming that int(label) equals the position.
  class_index = {lbl: idx for idx, lbl in enumerate(class_labels)}
  true_cnt = 0
  total = 0
  for label in sorted(testDataset.keys()):
      features = np.array(testDataset[label], dtype='object')
      iter_total = len(features)
      if iter_total == 0:
        # Nothing to score for this label; also avoids dividing by zero below
        continue
      pred = digit_clf.predict(features)
      # -1 never matches a predicted index, so unknown labels score zero
      iter_cnt = (pred == class_index.get(label, -1)).sum()
      total += iter_total
      true_cnt += iter_cnt
      print(f"{label}: {iter_cnt}/{iter_total} ({iter_cnt/iter_total})")
  if total == 0:
    return
  print("Final recognition rate is %.2f"%(100.0*true_cnt/total), "%")

## Đọc và tiền xử lý data

In [None]:
# Build the FSDD feature sets: the training split first, then the held-out split
master_path = 'fsdd'
trainDir = f'{master_path}/train_audio/'
testDir = f'{master_path}/test_audio/'

trainDataSet_fsdd = buildDataSet(trainDir)
print("Finish prepare the training data")

testDataSet_fsdd = buildDataSet(testDir)
print("Finish prepare the test data")

Finish prepare the training data
Finish prepare the test data


In [None]:
# Build the Wolfram feature sets: the training split first, then the held-out split
master_path = 'wolfram'
trainDir = f'{master_path}/train_audio/'
testDir = f'{master_path}/test_audio/'

trainDataSet_wolfram = buildDataSet(trainDir)
print("Finish prepare the training data")

testDataSet_wolfram = buildDataSet(testDir)
print("Finish prepare the test data")

Finish prepare the training data
Finish prepare the test data


In [None]:
# Start from a copy of the FSDD training set. Assigning the dict itself would
# make trainDataSet an alias of trainDataSet_fsdd, so the merge below would
# modify the FSDD set and append the Wolfram data again on every re-run.
trainDataSet = {label: list(features) for label, features in trainDataSet_fsdd.items()}
if wolfram_split > 0:
  for label in trainDataSet:
    # A label missing from the Wolfram set simply contributes nothing
    trainDataSet[label] = trainDataSet[label] + trainDataSet_wolfram.get(label, [])

## **Mô hình 1**
- 10 hidden states
- Multivariate Diagonal Gauss cho emission probs

In [None]:
# Check that the shortest sequence of every label is not shorter than the number of hidden states
for label in trainDataSet.keys():
  print(min(map(len, trainDataSet[label])))

10
10
10
10
10
10
10
10
10
10


In [None]:
%%time
# Train one 10-state model per label and collect the labels that never converged
failed_label = []
hmmModels = train_GMMHMM(
    trainDataSet,
    n_mfcc_features,
    10,
    n_mixtures,
    failed_label_return=True,
    report_fail=failed_label,
)
print("Finish training of the GMM_HMM models for digits 0-9")
print(failed_label)
assert not failed_label

Training model detect 1
[1] Improvement: 1066547.7706669972	Time (s): 2.118
[2] Improvement: nan	Time (s): 2.093
Total Training Improvement: nan
Total Training Time (s): 6.4154
[1] Improvement: 988641.2104113591	Time (s): 2.109
[2] Improvement: nan	Time (s): 2.084
Total Training Improvement: nan
Total Training Time (s): 6.3050
[1] Improvement: 1030361.5791924741	Time (s): 2.143
[2] Improvement: nan	Time (s): 2.103
Total Training Improvement: nan
Total Training Time (s): 6.4111
[1] Improvement: 1099018.245530284	Time (s): 2.149
[2] Improvement: 40915.80334148987	Time (s): 2.127
[3] Improvement: nan	Time (s): 2.149
Total Training Improvement: nan
Total Training Time (s): 8.6035
[1] Improvement: 1159463.3936302299	Time (s): 2.144
[2] Improvement: nan	Time (s): 2.092
Total Training Improvement: nan
Total Training Time (s): 6.3813
[1] Improvement: nan	Time (s): 2.173
Total Training Improvement: nan
Total Training Time (s): 4.2868
[1] Improvement: 1166751.4810570413	Time (s): 2.132
[2] Impro

In [None]:
# Report recognition rates on the training data and on both held-out sets
for eval_title, eval_set in (
    ("Evaluate on train set", trainDataSet),
    ("Evaluate on rest of FSDD set", testDataSet_fsdd),
    ("Evaluate on wolfram test set", testDataSet_wolfram),
):
  print(eval_title)
  evaluateModel(eval_set, hmmModels)

Evaluate on train set
0: 637/795 (0.8012578616352202)
1: 666/837 (0.7956989247311828)
2: 504/745 (0.676510067114094)
3: 557/793 (0.7023959646910467)
4: 635/803 (0.7907845579078456)
5: 678/812 (0.8349753694581281)
6: 700/846 (0.8274231678486997)
7: 635/789 (0.8048162230671736)
8: 556/795 (0.6993710691823899)
9: 615/784 (0.7844387755102041)
Final recognition rate is 77.30 %
Evaluate on rest of FSDD set
0: 177/211 (0.8388625592417062)
1: 156/208 (0.75)
2: 154/214 (0.719626168224299)
3: 117/218 (0.536697247706422)
4: 199/214 (0.9299065420560748)
5: 197/203 (0.9704433497536946)
6: 122/199 (0.6130653266331658)
7: 199/234 (0.8504273504273504)
8: 142/197 (0.7208121827411168)
9: 126/202 (0.6237623762376238)
Final recognition rate is 75.67 %
Evaluate on wolfram test set
0: 1312/1670 (0.7856287425149701)
1: 1243/1625 (0.7649230769230769)
2: 1162/1714 (0.6779463243873979)
3: 1153/1645 (0.7009118541033434)
4: 1276/1655 (0.7709969788519637)
5: 1304/1642 (0.7941534713763703)
6: 1338/1624 (0.823891625

In [None]:
# Save the models: upload each one to the Drive folder as a JSON file
for model_label in hmmModels.keys():
  metadata = {
      'title': f'hmm[{model_label}]_{n_mfcc_features}_10_{n_mixtures}[{experiment_id}].json',
      'parents': [{'id': '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'}],
  }
  file = drive.CreateFile(metadata)
  file.SetContentString(hmmModels[model_label].to_json())
  file.Upload()

## **Mô hình 2**
- 5 hidden states
- Multivariate Diagonal Gauss cho emission probs

In [None]:
%%time
# Train one 5-state model per label and collect the labels that never converged
failed_label = []
hmmModels = train_GMMHMM(
    trainDataSet,
    n_mfcc_features,
    5,
    n_mixtures,
    failed_label_return=True,
    report_fail=failed_label,
)
print("Finish training of the GMM_HMM models for digits 0-9")
print(failed_label)
assert not failed_label

Training model detect 1
[1] Improvement: 955195.7674027572	Time (s): 1.105
[2] Improvement: 31879.552070218488	Time (s): 1.145
[3] Improvement: 13741.09414319531	Time (s): 1.105
[4] Improvement: 5057.048558901879	Time (s): 1.106
[5] Improvement: 3268.455333042657	Time (s): 1.101
[6] Improvement: 3016.0568609527545	Time (s): 1.113
[7] Improvement: 3981.786601045169	Time (s): 1.109
[8] Improvement: 3764.351430685958	Time (s): 1.092
[9] Improvement: 2374.506513570319	Time (s): 1.08
[10] Improvement: 1362.2894766811514	Time (s): 1.109
[11] Improvement: 1357.6466141362907	Time (s): 1.086
[12] Improvement: 1451.2938166076783	Time (s): 1.069
[13] Improvement: 951.4202190180076	Time (s): 1.079
[14] Improvement: 550.0272656984162	Time (s): 1.099
[15] Improvement: 380.52204711711966	Time (s): 1.07
[16] Improvement: 253.86444995226339	Time (s): 1.082
[17] Improvement: 158.03198806929868	Time (s): 1.045
[18] Improvement: 96.7249335644301	Time (s): 1.055
[19] Improvement: 77.3673048322089	Time (s):

In [None]:
# Report recognition rates on the training data and on both held-out sets
for eval_title, eval_set in (
    ("Evaluate on train set", trainDataSet),
    ("Evaluate on rest of FSDD set", testDataSet_fsdd),
    ("Evaluate on wolfram test set", testDataSet_wolfram),
):
  print(eval_title)
  evaluateModel(eval_set, hmmModels)

Evaluate on train set
0: 532/795 (0.6691823899371069)
1: 423/837 (0.5053763440860215)
2: 460/745 (0.6174496644295302)
3: 505/793 (0.6368221941992434)
4: 671/803 (0.8356164383561644)
5: 603/812 (0.7426108374384236)
6: 725/846 (0.8569739952718676)
7: 498/789 (0.6311787072243346)
8: 584/795 (0.7345911949685534)
9: 665/784 (0.8482142857142857)
Final recognition rate is 70.83 %
Evaluate on rest of FSDD set
0: 159/211 (0.7535545023696683)
1: 91/208 (0.4375)
2: 129/214 (0.602803738317757)
3: 114/218 (0.5229357798165137)
4: 198/214 (0.9252336448598131)
5: 170/203 (0.8374384236453202)
6: 121/199 (0.6080402010050251)
7: 158/234 (0.6752136752136753)
8: 167/197 (0.8477157360406091)
9: 145/202 (0.7178217821782178)
Final recognition rate is 69.14 %
Evaluate on wolfram test set
0: 1106/1670 (0.6622754491017964)
1: 795/1625 (0.48923076923076925)
2: 982/1714 (0.572928821470245)
3: 1000/1645 (0.60790273556231)
4: 1324/1655 (0.8)
5: 1200/1642 (0.730816077953715)
6: 1425/1624 (0.8774630541871922)
7: 1068/

In [None]:
# Save the models: upload each one to the Drive folder as a JSON file
for model_label in hmmModels.keys():
  metadata = {
      'title': f'hmm[{model_label}]_{n_mfcc_features}_5_{n_mixtures}[{experiment_id}].json',
      'parents': [{'id': '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'}],
  }
  file = drive.CreateFile(metadata)
  file.SetContentString(hmmModels[model_label].to_json())
  file.Upload()

## **Mô hình 3**
- 3 hidden states
- Multivariate Diagonal Gauss cho emission probs

In [None]:
%%time
# Train one 3-state model per label and collect the labels that never converged
failed_label = []
hmmModels = train_GMMHMM(
    trainDataSet,
    n_mfcc_features,
    3,
    n_mixtures,
    failed_label_return=True,
    report_fail=failed_label,
)
print("Finish training of the GMM_HMM models for digits 0-9")
print(failed_label)
assert not failed_label

Training model detect 1
[1] Improvement: 1220217.671184714	Time (s): 0.6979
[2] Improvement: 25526.873210724443	Time (s): 0.6649
[3] Improvement: 17953.528052995098	Time (s): 0.6752
[4] Improvement: 11265.644176566158	Time (s): 0.6729
[5] Improvement: 8323.74458606448	Time (s): 0.6628
[6] Improvement: 7869.979322778527	Time (s): 0.6758
[7] Improvement: 6074.987892573001	Time (s): 0.6596
[8] Improvement: 3453.519978838158	Time (s): 0.6513
[9] Improvement: 1811.9573465511203	Time (s): 0.6539
[10] Improvement: 976.6637570433086	Time (s): 0.6667
[11] Improvement: 501.1786977595184	Time (s): 0.6743
[12] Improvement: 242.15447833866347	Time (s): 0.6595
[13] Improvement: 127.04867449449375	Time (s): 0.67
[14] Improvement: 68.69685349229258	Time (s): 0.6569
[15] Improvement: 42.50138792360667	Time (s): 0.6555
[16] Improvement: 21.395048126927577	Time (s): 0.6569
[17] Improvement: 15.841081608901732	Time (s): 0.6682
[18] Improvement: 13.863532221410424	Time (s): 0.6571
[19] Improvement: 12.6823

In [None]:
# Report recognition rates on the training data and on both held-out sets
for eval_title, eval_set in (
    ("Evaluate on train set", trainDataSet),
    ("Evaluate on rest of FSDD set", testDataSet_fsdd),
    ("Evaluate on wolfram test set", testDataSet_wolfram),
):
  print(eval_title)
  evaluateModel(eval_set, hmmModels)

Evaluate on train set
0: 654/795 (0.8226415094339623)
1: 493/837 (0.5890083632019116)
2: 500/745 (0.6711409395973155)
3: 404/793 (0.5094577553593947)
4: 588/803 (0.7322540473225405)
5: 705/812 (0.8682266009852216)
6: 539/846 (0.6371158392434988)
7: 570/789 (0.7224334600760456)
8: 140/795 (0.1761006289308176)
9: 306/784 (0.3903061224489796)
Final recognition rate is 61.25 %
Evaluate on rest of FSDD set
0: 163/211 (0.7725118483412322)
1: 69/208 (0.3317307692307692)
2: 170/214 (0.794392523364486)
3: 81/218 (0.37155963302752293)
4: 179/214 (0.8364485981308412)
5: 197/203 (0.9704433497536946)
6: 112/199 (0.5628140703517588)
7: 156/234 (0.6666666666666666)
8: 16/197 (0.08121827411167512)
9: 42/202 (0.2079207920792079)
Final recognition rate is 56.43 %
Evaluate on wolfram test set
0: 1372/1670 (0.8215568862275449)
1: 984/1625 (0.6055384615384616)
2: 1110/1714 (0.6476079346557759)
3: 810/1645 (0.49240121580547114)
4: 1179/1655 (0.7123867069486405)
5: 1394/1642 (0.8489646772228989)
6: 1040/1624

In [None]:
# Save the models: upload each one to the Drive folder as a JSON file
for model_label in hmmModels.keys():
  metadata = {
      'title': f'hmm[{model_label}]_{n_mfcc_features}_3_{n_mixtures}[{experiment_id}].json',
      'parents': [{'id': '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'}],
  }
  file = drive.CreateFile(metadata)
  file.SetContentString(hmmModels[model_label].to_json())
  file.Upload()