<a href="https://colab.research.google.com/github/vlozg/speech_hmm/blob/main/Test_DiagHMM_020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LƯU Ý: NOTEBOOK NÀY CHỈ DÙNG ĐỂ SHOW KẾT QUẢ CHẠY, KHÔNG NÊN CHẠY LẠI NOTEBOOK NÀY 
(VÌ TRONG NÀY CÓ CODE LƯU LẠI PRETRAINED MODEL LÊN DRIVE SẼ BỊ XÓA)

# Speech to text with HMM

- **Bài toán**: Chuyển giọng nói thành văn bản
    - **Input**: Đoạn ghi âm chứa nội dung là các số từ 0 đến 9
    - **Output**: Phân lớp của đoạn ghi âm

# Các biến thiết lập cho thử nghiệm

In [None]:
# Number of cepstral coefficients kept per frame (see extract_mfcc).
n_mfcc_ceptrum = 12
# Number of delta orders appended to the static MFCCs (0, 1 or 2; see full_mfcc_from_file).
n_delta_features = 1
# Number of mixture components per HMM state (passed to train_GMMHMM as n_gauss_modal).
n_mixtures = 9
# Fraction of the FSDD files copied into the training folder; the rest go to the test folder.
fsdd_split = 0.4
# Same fraction for the Wolfram dataset; 0 sends every file to the test folder.
wolfram_split = 0
# Experiment tag embedded in the titles of the model files saved to Drive.
experiment_id = '020'

In [None]:
# Feature dimension per frame: the static coefficients plus one equally sized block per delta order.
n_mfcc_features = n_mfcc_ceptrum * (1+n_delta_features)
n_mfcc_features

24

# Import và cài đặt thư viện

In [None]:
# cài lib. note: cài xong phải restart runtime
!pip install pydub
!pip install pomegranate



In [None]:
# Xác thực google để upload/download qua google drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Quản lý file, folder
import os
from shutil import copyfile, rmtree
import random

# Xử lý audio
import librosa
import librosa.display
from scipy.io import wavfile

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pomegranate # Thư viện cho mô hình xác suất
from pomegranate import *

# Tải dữ liệu và lấy xác thực Google

Dữ liệu dùng để huấn luyện và đánh giá, còn xác thực google thì dùng để upload/download mô hình trên drive.

In [None]:
%%capture
# download wolfram
if not os.path.isfile('./dataset_1_wolfram.zip'):
  !gdown --id 115tIAitBNeJC0DwrP-ZyJ6RS3TyWN0qD
  !unzip -o dataset_1_wolfram.zip

# dowload FSDD
if not os.path.isfile('./dataset_2_FSDD.zip'):
  !gdown --id 1Ua9zlPBc0Fv4xGHSQTb7eIvUh_dqFI6P
  !unzip -o dataset_2_FSDD.zip

# download self recorded audio
!gdown --id 1lH_k1AYMVlJvodtZdD7OK2zkdPXxlW9i

In [None]:
# Authenticate with Google so files can be uploaded to / downloaded from Drive.
auth.authenticate_user()
gauth =  GoogleAuth()
# Reuse the Colab application-default credentials for the PyDrive client.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Hàm xử lý âm thanh

In [None]:
def minmax_scale(wave):
  """Rescale `wave` linearly so its values span [-0.5, 0.5].

  Args:
    wave(np.array): input signal (must be non-empty).

  Return:
    np.array: rescaled signal; all zeros when `wave` is constant.
  """
  lowest = wave.min()
  value_range = wave.max() - lowest
  if value_range == 0:
    # A constant signal has no range; returning the mid-point avoids 0/0 -> NaN.
    return np.zeros(wave.shape, dtype=float)
  return ((wave - lowest) / value_range - 0.5)

def standard_scale(wave):
  """Standardize `wave` to zero mean and unit standard deviation.

  Args:
    wave(np.array): input signal.

  Return:
    np.array: standardized signal; all zeros when `wave` has zero variance.
  """
  centered = wave - wave.mean()
  spread = wave.std()
  if spread == 0:
    # Zero variance (e.g. pure silence): dividing would yield NaN everywhere,
    # so return the centered signal, which is identically zero.
    return np.zeros(wave.shape, dtype=float)
  return (centered / spread)

def scaleAddNoise(wave):
  """Standardize `wave`, then add Gaussian noise scaled by 0.2.

  The noise comes from NumPy's global random generator, so the result differs
  between calls unless that generator is seeded by the caller.
  """
  white_noise = np.random.normal(size=wave.shape)
  scaled = standard_scale(wave)
  return scaled + 0.2*white_noise

def read_audio(full_audio_path):
  '''
    Read an audio file as a mono floating-point waveform at 16000 Hz.

    Files already sampled at 16000 Hz are read with scipy (librosa's loader is
    5-10 times slower) and converted to floats according to their sample
    dtype. Any other sample rate is resampled to 16000 Hz by librosa.

    Return:
      wave(np.array): 1-D waveform
      sample_rate(int): always 16000
  '''
  sample_rate, wave =  wavfile.read(full_audio_path)
  if sample_rate != 16000:
    # All audio must share the same sample rate, so let librosa resample.
    wave, sample_rate =  librosa.load(full_audio_path, sr=16000)
    return wave, sample_rate

  if np.issubdtype(wave.dtype, np.unsignedinteger):
    # Unsigned PCM (8-bit WAV) is centred on the middle of its range.
    half_range = (np.iinfo(wave.dtype).max + 1) / 2
    wave = (wave - half_range) / half_range
  elif np.issubdtype(wave.dtype, np.integer):
    # Signed PCM: divide by the dtype's full scale (32768 for int16).
    wave = wave / (np.iinfo(wave.dtype).max + 1)
  # Floating-point WAV data is left unscaled.

  if wave.ndim > 1:
    # Average the channels so this path returns mono like librosa.load does.
    wave = wave.mean(axis=1)
  return wave, sample_rate

def read_process_audio(full_audio_path):
  '''
    Read an audio file, pad 2000 zero samples at each end, then standardize
    the waveform and add white noise (see scaleAddNoise).

    Return:
      wave(np.array): processed waveform
      sample_rate(int): sample rate reported by read_audio
  '''
  raw_wave, sample_rate = read_audio(full_audio_path)
  padded_wave = np.pad(raw_wave, (2000,2000), 'constant', constant_values=(0.0,0.0))
  return scaleAddNoise(padded_wave), sample_rate

def unvoiced_frame(wave, sample_rate, min_len = 10):
  """Build a boolean mask of the frames to keep, based on frame energy.

  The RMS energy of each frame is standardized and passed through a sigmoid;
  frames whose score exceeds a threshold are kept (True). The threshold starts
  at 0.4 and is lowered in steps of 0.05 until at least `min_len` frames are
  kept, so the sequence is long enough to train the HMM.

  Args:
    wave(np.array): waveform.
    sample_rate(int): unused; kept for interface compatibility.
    min_len(int): minimum number of frames to keep when the clip has that many.

  Return:
    np.array of bool, one entry per RMS frame. When the clip has fewer than
    `min_len` frames, or its energy is completely flat, every frame is kept.
  """
  rms = librosa.feature.rms(y=wave)[0]
  r_normalized = standard_scale(rms)
  p = np.exp(r_normalized) / (1 + np.exp(r_normalized))
  if np.isnan(p).all():
    # Flat energy makes the standardization undefined (0/0); there is nothing
    # to rank frames by, so keep all of them instead of looping forever.
    return np.ones(rms.shape, dtype=bool)
  # Lower the threshold gradually if fewer than min_len frames are selected.
  thresh = 0.4
  slice_ = p > thresh
  # p is a sigmoid output, so once thresh is no longer positive no further
  # frame can be added; stopping there guarantees termination for short clips.
  while (slice_.sum() < min_len) and thresh > 0:
    thresh-=0.05
    slice_ = p > thresh
  return slice_

def extract_mfcc(wave, sample_rate, trim=True):
  """Compute the scaled MFCC matrix of a waveform, one row per frame.

  A 40-band mel spectrogram is computed, optionally restricted to the frames
  selected by unvoiced_frame, converted to dB and turned into
  `n_mfcc_ceptrum + 2` cepstral coefficients (lifter=40). The first two
  coefficients are dropped, the matrix is transposed and divided by 800.

  Args:
    wave(np.array): waveform.
    sample_rate(int): sample rate of `wave`.
    trim(bool): keep only the frames selected by unvoiced_frame.
  """
  mel_power = librosa.feature.melspectrogram(y=wave, sr=sample_rate, n_mels=40)
  if trim:
    keep_mask = unvoiced_frame(wave, sample_rate)
    mel_power = mel_power[:,keep_mask]
  log_mel = librosa.power_to_db(mel_power)
  cepstra = librosa.feature.mfcc(S=log_mel, n_mfcc=n_mfcc_ceptrum+2, lifter=40)
  return cepstra[2:,:].T/800

def mfcc_delta_features(mfcc, order, axis=0):
  """Compute delta (time-derivative) features of an MFCC matrix.

  Args:
    mfcc(np.array): feature matrix. In this notebook it is laid out as
      (frames, coefficients) because extract_mfcc transposes librosa's output.
    order(int): derivative order; 0 returns `mfcc` itself unchanged.
    axis(int): axis that indexes time. Defaults to 0 to match the layout
      above. librosa's own default (-1) would differentiate across the
      coefficients of a single frame instead of across time.

  Return:
    np.array with the same shape as `mfcc`.
  """
  if order==0:
    return mfcc
  width = 9  # librosa's default smoothing window, in frames
  # 'interp' edge handling requires at least `width` frames; shorter sequences
  # fall back to 'nearest' padding instead of raising.
  mode = 'interp' if mfcc.shape[axis] >= width else 'nearest'
  return librosa.feature.delta(mfcc, width=width, order=order, axis=axis, mode=mode)

def full_mfcc_from_file(full_audio_path, trim=True):
  """Load an audio file and return its MFCCs with the configured delta blocks.

  The static MFCCs come first, followed by the first-order deltas when
  `n_delta_features >= 1` and the second-order deltas when
  `n_delta_features >= 2`, stacked side by side along the feature axis.
  """
  wave, sr = read_process_audio(full_audio_path)
  static = extract_mfcc(wave, sr, trim)
  blocks = [static]
  for delta_order in (1, 2):
    if n_delta_features >= delta_order:
      blocks.append(mfcc_delta_features(static, delta_order))
  if len(blocks) == 1:
    # No deltas requested: hand back the static matrix itself.
    return static
  return np.hstack(blocks)

# Hàm bổ trợ

In [None]:
def buildDataSet(dir, trim=True):
    """Extract features for every .wav file in `dir`, grouped by label.

    File names are expected to look like `<index>_<label>.wav`; the label is
    the second underscore-separated field of the name before the first dot.

    Args:
        dir(str): folder containing the audio files (a trailing slash is optional).
        trim(bool): forwarded to full_mfcc_from_file.

    Return:
        dict mapping each label (str) to the list of feature matrices of its files.
    """
    dataset = {}
    # Sorted so that the order of the sequences does not depend on the file system.
    for fileName in sorted(os.listdir(dir)):
        if os.path.splitext(fileName)[1] != '.wav':
            continue
        tmp = fileName.split('.')[0]
        label = tmp.split('_')[1]
        # os.path.join works whether or not `dir` ends with a path separator.
        feature = full_mfcc_from_file(os.path.join(dir, fileName), trim)
        dataset.setdefault(label, []).append(feature)
    return dataset

def makeTrainTestDir(mainFolder, filenames, rate):
    """Split files into `<mainFolder>/train_audio` and `<mainFolder>/test_audio`.

    `mainFolder` is recreated from scratch on every call. The files are
    shuffled with a fixed seed (1); the first `int(len(filenames) * rate)` of
    them are copied to the training folder and the rest to the test folder.

    Args:
        mainFolder(str): output folder; deleted first if it already exists.
        filenames(list): (source_path, target_file_name) pairs. Not modified.
        rate(float): fraction of the files that goes to the training folder.
    """
    paths = [f'{mainFolder}',
             f'{mainFolder}/train_audio',
             f'{mainFolder}/test_audio']

    for path in paths:
        # Remove only what is known to exist instead of hiding every possible
        # mkdir error behind a bare except.
        if os.path.isdir(path):
            rmtree(path)
        os.mkdir(path)

    # A private generator seeded with 1 yields the same order as
    # random.seed(1) + random.shuffle without touching the global random
    # state; shuffling a copy leaves the caller's list intact.
    shuffled = list(filenames)
    random.Random(1).shuffle(shuffled)
    splitPoint = int(len(shuffled)*rate)

    for position, (source, target) in enumerate(shuffled):
        destination = paths[1] if position < splitPoint else paths[2]
        copyfile(source, f"{destination}/{target}")

# Setup thư mục chứa data

In [None]:
def formatFilenameFSDD(dir):
    """List the FSDD files together with their `<index>_<label>.wav` target names.

    The label is the part of the original file name before the first
    underscore; the index is the position of the file in the sorted listing.

    Args:
        dir(str): folder containing the FSDD recordings.

    Return:
        list of (source_path, target_file_name) tuples.
    """
    # Sorted so the seeded shuffle in makeTrainTestDir produces the same split
    # on every machine; os.listdir returns entries in arbitrary order.
    # Entries that are not regular files (e.g. sub-folders) are skipped.
    recordings = [name for name in sorted(os.listdir(dir))
                  if os.path.isfile(f"{dir}/{name}")]
    return [(f"{dir}/{name}", str(count) + '_' + name.split('_')[0] + '.wav')
            for count, name in enumerate(recordings)]

# Copy the FSDD recordings into fsdd/train_audio and fsdd/test_audio;
# fsdd_split is the fraction of files that goes to the training folder.
filenames = formatFilenameFSDD('./dataset_2_FSDD')
makeTrainTestDir('fsdd', filenames, fsdd_split)

In [None]:
def formatFilenameWolfram(dir):
    """List the Wolfram files together with their `<index>_<label>.wav` target names.

    The dataset is expected to hold one sub-folder per class; the label is the
    part of the sub-folder name before the first underscore and the index is a
    running counter over all files.

    Args:
        dir(str): root folder of the Wolfram dataset.

    Return:
        list of (source_path, target_file_name) tuples.
    """
    filenames = []
    # Sorted at both levels so the seeded shuffle in makeTrainTestDir yields a
    # reproducible split; os.listdir returns entries in arbitrary order.
    for folder in sorted(os.listdir(dir)):
        folder_path = f'{dir}/{folder}'
        if not os.path.isdir(folder_path):
            # A stray file next to the class folders would make os.listdir fail.
            continue
        label = folder.split('_')[0]
        for filename in sorted(os.listdir(folder_path)):
            target = str(len(filenames)) + '_' + label + '.wav'
            filenames.append((f"{folder_path}/{filename}", target))

    return filenames

# Copy the Wolfram recordings into wolfram/train_audio and wolfram/test_audio;
# wolfram_split is the fraction of files that goes to the training folder.
filenames = formatFilenameWolfram('./dataset_1_wolfram')
makeTrainTestDir('wolfram', filenames, wolfram_split)

# Huấn luyện mô hình (có thể bỏ qua vì mô hình đã save trên drive)

## Hàm train mô hình

In [None]:
def Generate_DiagGMM(full_fset, n_features, n_states=5, n_cmps=3):
  """Create one randomly initialized diagonal emission distribution per state.

  Args:
    full_fset: unused; kept for interface compatibility.
    n_features(int): dimension of the feature vectors.
    n_states(int): number of distributions to create.
    n_cmps(int): mixture components per state. With a value above 1 each state
      gets a GeneralMixtureModel of that many components; otherwise it gets a
      single IndependentComponentsDistribution.

  Return:
    list with `n_states` distributions.
  """
  def random_diag_component():
    # One NormalDistribution per feature; both of its constructor arguments
    # are drawn uniformly from [0, 1) with NumPy's global generator.
    return IndependentComponentsDistribution(tuple(
        NormalDistribution(*np.random.random(2))
        for _ in range(n_features)
        ))

  emissions = []
  for _ in range(n_states):
    if n_cmps > 1:
      emission = GeneralMixtureModel([random_diag_component() for _ in range(n_cmps)])
    else:
      emission = random_diag_component()
    emissions.append(emission)

  return emissions

In [None]:
def left_right_GMMHMM(seed_sample, x_dim, n_states=10, n_modals=9, diag=True, random=0):
  """Build a left-to-right (progressive) HMM with mixture emissions.

  Every state either stays where it is or advances to the next state, each
  with probability 0.5; the last state advances to the model's end state.

  Args:
    seed_sample: only forwarded to Generate_DiagGMM, which ignores it.
    x_dim(int): dimension of the feature vectors.
    n_states(int): number of hidden states.
    n_modals(int): number of mixture components per state.
    diag(bool): True for diagonal-covariance components (Generate_DiagGMM),
      False for blank full multivariate Gaussian components.
    random: unused; kept for interface compatibility. The diagonal
      initialization draws from NumPy's global random generator.

  Return:
    The baked HiddenMarkovModel.
  """
  model = HiddenMarkovModel()
  if diag:
    emissions = Generate_DiagGMM(seed_sample, x_dim, n_states, n_modals)
    states = [State(gmm, name=f"H{i}") for i, gmm in enumerate(emissions)]
  else:
    states = [State( GeneralMixtureModel([MultivariateGaussianDistribution.blank(x_dim) for _ in range(n_modals)]),
                    name=f"H{i}" ) for i in range(n_states)]
  model.add_states(states)
  # Decoding always starts in the first state.
  model.add_transition(model.start, states[0], 1)
  for i in range(n_states-1):
    model.add_transition(states[i], states[i], 0.5)
    model.add_transition(states[i], states[i+1], 0.5)
  # The last state loops on itself or terminates the sequence.
  model.add_transition(states[n_states-1], states[n_states-1], 0.5)
  model.add_transition(states[n_states-1], model.end, 0.5)
  model.bake()
  return model

In [None]:
# Maximum number of random re-initializations tried per model before giving up.
tol = 1000
def train_GMMHMM(dataset, input_dim, n_hidden_state, n_gauss_modal, diag=True, failed_label_return=False, report_fail=None):
    """Train one left-to-right GMM-HMM per label.

    Training restarts from a new random initialization (up to `tol` times)
    whenever the last reported improvement of a fit is NaN.

    Args:
        dataset(dict): label -> list of feature sequences.
        input_dim(int): dimension of the feature vectors.
        n_hidden_state(int): number of hidden states per model.
        n_gauss_modal(int): number of mixture components per state.
        diag(bool): forwarded to left_right_GMMHMM.
        failed_label_return(bool): when True, labels whose training never
            succeeded are appended to `report_fail`.
        report_fail(list): output list for the failed labels; nothing is
            recorded when it is None.

    Return:
        dict mapping each label to its trained model, or to None when every
        attempt ended with a NaN improvement.
    """
    GMMHMM_Models = dict()

    for label in dataset.keys():
        print(f"Training model detect {label}")
        model = None
        for attempt in range(tol):
            candidate = left_right_GMMHMM(dataset[label][0], input_dim, n_hidden_state, n_gauss_modal, diag=diag)
            _, imprv = candidate.fit(dataset[label], verbose=True, multiple_check_input=False,return_history=True)  # get optimal parameters
            if not np.isnan(imprv.improvements[-1]):
                model = candidate
                break
        # Guard against the default report_fail=None, which used to raise
        # AttributeError when failed_label_return was True.
        if model is None and failed_label_return and report_fail is not None:
            report_fail.append(label)
        GMMHMM_Models[label] = model

    return GMMHMM_Models

In [None]:
def retrain_specific_class(hmmModels, label, dataset, input_dim, n_hidden_state, n_gauss_modal, diag=True):
  """Retrain the model of a single label and store it in `hmmModels`.

  Training restarts from a new random initialization (up to `tol` times)
  whenever the last reported improvement of a fit is NaN.

  Args:
    hmmModels(dict): label -> model; updated in place.
    label: key of the model to retrain.
    dataset(dict): label -> list of feature sequences.
    input_dim(int): dimension of the feature vectors.
    n_hidden_state(int): number of hidden states.
    n_gauss_modal(int): number of mixture components per state.
    diag(bool): forwarded to left_right_GMMHMM.

  Return:
    The same `hmmModels` dict. `hmmModels[label]` is None when every attempt
    ended with a NaN improvement, matching what train_GMMHMM stores, so a
    diverged model is never kept.
  """
  model = None
  for attempt in range(tol):
    candidate = left_right_GMMHMM(dataset[label][0], input_dim, n_hidden_state, n_gauss_modal, diag=diag)
    _, imprv = candidate.fit(dataset[label], verbose=True, multiple_check_input=False,return_history=True)  # get optimal parameters
    if not np.isnan(imprv.improvements[-1]):
      model = candidate
      break
  hmmModels[label] = model
  return hmmModels

## Hàm test mô hình

In [None]:
def evaluateModel(testDataset, model):
  """Print the per-label and overall recognition rate of the trained models.

  Args:
    testDataset(dict): label -> list of feature sequences. Nothing is printed
      when it is empty.
    model(dict): label -> trained HMM. The models are handed to a
      BayesClassifier in sorted label order.

  Return:
    None; the results are printed.
  """
  if (len(testDataset) == 0):
    return
  ordered_labels = sorted(model)
  # The classifier predicts positions in `ordered_labels`. Mapping each label
  # to its position avoids assuming that int(label) equals that position,
  # which only holds for labels "0".."n-1" with nothing missing.
  class_index = {name: position for position, name in enumerate(ordered_labels)}
  digit_clf = BayesClassifier([model[name] for name in ordered_labels])
  true_cnt = 0
  total = 0
  for label in sorted(testDataset.keys()):
      features = np.array(testDataset[label], dtype='object')
      pred = digit_clf.predict(features)
      # A label without a trained model can never be predicted (index -1).
      iter_cnt = (pred == class_index.get(label, -1)).sum()
      iter_total = len(features)
      total += iter_total
      true_cnt += iter_cnt
      print(f"{label}: {iter_cnt}/{iter_total} ({iter_cnt/iter_total})")
  print("Final recognition rate is %.2f"%(100.0*true_cnt/total), "%")

## Đọc và tiền xử lý data

In [None]:
# prepare data for training
# Features are extracted from the FSDD split created by makeTrainTestDir.
master_path = 'fsdd'

trainDir = master_path + '/train_audio/'
trainDataSet_fsdd = buildDataSet(trainDir)
print("Finish prepare the training data")

# prepare data for testing
testDir = master_path + '/test_audio/'
testDataSet_fsdd = buildDataSet(testDir)
print("Finish prepare the test data")

Finish prepare the training data
Finish prepare the test data


In [None]:
# prepare data for training
# Same steps as for FSDD; master_path, trainDir and testDir are overwritten here.
master_path = 'wolfram'

trainDir = master_path + '/train_audio/'
trainDataSet_wolfram = buildDataSet(trainDir)
print("Finish prepare the training data")

# prepare data for testing
testDir = master_path + '/test_audio/'
testDataSet_wolfram = buildDataSet(testDir)
print("Finish prepare the test data")

Finish prepare the training data
Finish prepare the test data


In [None]:
# Build the training set as a new dict (with new lists) so trainDataSet_fsdd is
# left untouched and re-running this cell cannot append the Wolfram data twice.
trainDataSet = {label: list(sequences) for label, sequences in trainDataSet_fsdd.items()}
if wolfram_split > 0:
  for label in trainDataSet:
    # A label missing from the Wolfram training split contributes nothing.
    trainDataSet[label] = trainDataSet[label] + trainDataSet_wolfram.get(label, [])

## **Mô hình 1**
- 10 hidden states
- Multivariate Diagonal Gauss cho emission probs

In [None]:
# Kiểm tra đảm bảo chuỗi có độ dài nhỏ nhất không nhỏ hơn số state
# Print, for each label, the number of frames in its shortest training sequence.
for label in trainDataSet.keys():
  print(min(map(len, trainDataSet[label])))

10
10
10
10
10
10
10
10
10
10


In [None]:
%%time
# train
# One HMM per label with 10 hidden states; labels whose training never
# succeeded are collected in failed_label.
failed_label = []
hmmModels = train_GMMHMM(trainDataSet, n_mfcc_features,10,n_mixtures, failed_label_return=True, report_fail=failed_label)
print("Finish training of the GMM_HMM models for digits 0-9")
print(failed_label)
# Stop here if any label has no usable model (its entry would be None).
assert len(failed_label) == 0

Training model detect 1
[1] Improvement: 133613.65776905153	Time (s): 0.2925
[2] Improvement: nan	Time (s): 0.2882
Total Training Improvement: nan
Total Training Time (s): 0.8629
[1] Improvement: 140688.9087866611	Time (s): 0.3113
[2] Improvement: 4824.386682577126	Time (s): 0.2803
[3] Improvement: nan	Time (s): 0.2932
Total Training Improvement: nan
Total Training Time (s): 1.1721
[1] Improvement: nan	Time (s): 0.2882
Total Training Improvement: nan
Total Training Time (s): 0.5728
[1] Improvement: 147250.44034880365	Time (s): 0.2767
[2] Improvement: 4148.131902101188	Time (s): 0.3017
[3] Improvement: nan	Time (s): 0.2875
Total Training Improvement: nan
Total Training Time (s): 1.1577
[1] Improvement: 163999.37637351055	Time (s): 0.3024
[2] Improvement: 3489.89294257827	Time (s): 0.3087
[3] Improvement: nan	Time (s): 0.306
Total Training Improvement: nan
Total Training Time (s): 1.2130
[1] Improvement: 129567.52403668276	Time (s): 0.3028
[2] Improvement: nan	Time (s): 0.3096
Total Trai

In [None]:
# Report the recognition rate on each data set in turn.
for heading, eval_set in (
    ("Evaluate on train set", trainDataSet),
    ("Evaluate on rest of FSDD set", testDataSet_fsdd),
    ("Evaluate on wolfram test set", testDataSet_wolfram),
):
  print(heading)
  evaluateModel(eval_set, hmmModels)

Evaluate on train set
0: 119/122 (0.9754098360655737)
1: 110/117 (0.9401709401709402)
2: 114/116 (0.9827586206896551)
3: 67/104 (0.6442307692307693)
4: 122/126 (0.9682539682539683)
5: 128/130 (0.9846153846153847)
6: 119/143 (0.8321678321678322)
7: 89/94 (0.9468085106382979)
8: 106/129 (0.8217054263565892)
9: 116/119 (0.9747899159663865)
Final recognition rate is 90.83 %
Evaluate on rest of FSDD set
0: 169/178 (0.949438202247191)
1: 178/183 (0.9726775956284153)
2: 173/184 (0.9402173913043478)
3: 101/196 (0.5153061224489796)
4: 169/174 (0.9712643678160919)
5: 164/170 (0.9647058823529412)
6: 111/157 (0.7070063694267515)
7: 194/206 (0.941747572815534)
8: 128/171 (0.7485380116959064)
9: 171/181 (0.9447513812154696)
Final recognition rate is 86.56 %
Evaluate on wolfram test set
0: 1287/2376 (0.5416666666666666)
1: 1801/2370 (0.759915611814346)
2: 1267/2373 (0.5339233038348082)
3: 936/2356 (0.39728353140916806)
4: 1326/2372 (0.5590219224283305)
5: 1707/2357 (0.7242257106491302)
6: 800/2369 (0

In [None]:
# Save the trained models to Drive, one JSON file per label.
for model_label, trained_model in hmmModels.items():
  metadata = {
      'title': f'hmm[{model_label}]_{n_mfcc_features}_10_{n_mixtures}[{experiment_id}].json',
      'parents': [{'id': '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'}],
  }
  file = drive.CreateFile(metadata)
  file.SetContentString(trained_model.to_json())
  file.Upload()

## **Mô hình 2**
- 5 hidden states
- Multivariate Diagonal Gauss cho emission probs

In [None]:
%%time
# train
# One HMM per label with 5 hidden states; labels whose training never
# succeeded are collected in failed_label.
failed_label = []
hmmModels = train_GMMHMM(trainDataSet, n_mfcc_features,5,n_mixtures, failed_label_return=True, report_fail=failed_label)
print("Finish training of the GMM_HMM models for digits 0-9")
print(failed_label)
# Stop here if any label has no usable model (its entry would be None).
assert len(failed_label) == 0

Training model detect 1
[1] Improvement: 153306.7662908206	Time (s): 0.1453
[2] Improvement: nan	Time (s): 0.1431
Total Training Improvement: nan
Total Training Time (s): 0.4410
[1] Improvement: 141412.79770113016	Time (s): 0.1479
[2] Improvement: 4243.432750862106	Time (s): 0.1466
[3] Improvement: 2106.9752734469803	Time (s): 0.1542
[4] Improvement: 713.3519424126425	Time (s): 0.151
[5] Improvement: 485.7233352471085	Time (s): 0.1418
[6] Improvement: 781.7618298441521	Time (s): 0.1421
[7] Improvement: nan	Time (s): 0.1488
Total Training Improvement: nan
Total Training Time (s): 1.1901
[1] Improvement: 142664.63651415065	Time (s): 0.1463
[2] Improvement: 3533.852563879642	Time (s): 0.1652
[3] Improvement: 2405.3453710372705	Time (s): 0.1499
[4] Improvement: 1163.3795011210896	Time (s): 0.1444
[5] Improvement: 474.8969606862811	Time (s): 0.1458
[6] Improvement: 318.12277766405896	Time (s): 0.1552
[7] Improvement: 358.759285415028	Time (s): 0.1435
[8] Improvement: 382.01221061819524	Time

In [None]:
# Report the recognition rate on each data set in turn.
for heading, eval_set in (
    ("Evaluate on train set", trainDataSet),
    ("Evaluate on rest of FSDD set", testDataSet_fsdd),
    ("Evaluate on wolfram test set", testDataSet_wolfram),
):
  print(heading)
  evaluateModel(eval_set, hmmModels)

Evaluate on train set
0: 120/122 (0.9836065573770492)
1: 108/117 (0.9230769230769231)
2: 90/116 (0.7758620689655172)
3: 75/104 (0.7211538461538461)
4: 120/126 (0.9523809523809523)
5: 127/130 (0.9769230769230769)
6: 138/143 (0.965034965034965)
7: 89/94 (0.9468085106382979)
8: 86/129 (0.6666666666666666)
9: 95/119 (0.7983193277310925)
Final recognition rate is 87.33 %
Evaluate on rest of FSDD set
0: 172/178 (0.9662921348314607)
1: 175/183 (0.9562841530054644)
2: 143/184 (0.7771739130434783)
3: 119/196 (0.6071428571428571)
4: 165/174 (0.9482758620689655)
5: 166/170 (0.9764705882352941)
6: 139/157 (0.8853503184713376)
7: 193/206 (0.9368932038834952)
8: 104/171 (0.6081871345029239)
9: 143/181 (0.7900552486187845)
Final recognition rate is 84.39 %
Evaluate on wolfram test set
0: 1277/2376 (0.5374579124579124)
1: 1577/2370 (0.6654008438818565)
2: 1121/2373 (0.4723978086809945)
3: 1094/2356 (0.46434634974533107)
4: 1027/2372 (0.43296795952782463)
5: 1611/2357 (0.6834959694526941)
6: 569/2369 (

In [None]:
# Save the trained models to Drive, one JSON file per label.
for model_label, trained_model in hmmModels.items():
  metadata = {
      'title': f'hmm[{model_label}]_{n_mfcc_features}_5_{n_mixtures}[{experiment_id}].json',
      'parents': [{'id': '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'}],
  }
  file = drive.CreateFile(metadata)
  file.SetContentString(trained_model.to_json())
  file.Upload()

## **Mô hình 3**
- 3 hidden states
- Multivariate Diagonal Gauss cho emission probs

In [None]:
%%time
# train
# One HMM per label with 3 hidden states; labels whose training never
# succeeded are collected in failed_label.
failed_label = []
hmmModels = train_GMMHMM(trainDataSet, n_mfcc_features,3,n_mixtures, failed_label_return=True, report_fail=failed_label)
print("Finish training of the GMM_HMM models for digits 0-9")
print(failed_label)
# Stop here if any label has no usable model (its entry would be None).
assert len(failed_label) == 0

Training model detect 1
[1] Improvement: 119054.62394297804	Time (s): 0.1049
[2] Improvement: 2987.06826514726	Time (s): 0.09264
[3] Improvement: 2758.5039996640407	Time (s): 0.1309
[4] Improvement: 1228.8065141380357	Time (s): 0.0975
[5] Improvement: 852.3898302315793	Time (s): 0.09152
[6] Improvement: 454.7878057392809	Time (s): 0.09848
[7] Improvement: 416.09007217994076	Time (s): 0.09317
[8] Improvement: 604.7642183359712	Time (s): 0.1136
[9] Improvement: 1016.5439606706641	Time (s): 0.09854
[10] Improvement: 625.3444812002272	Time (s): 0.08771
[11] Improvement: 70.38333603217325	Time (s): 0.1005
[12] Improvement: 21.23152227398532	Time (s): 0.08308
[13] Improvement: 24.399538906276575	Time (s): 0.092
[14] Improvement: 49.548532987362705	Time (s): 0.09568
[15] Improvement: 56.80452591426729	Time (s): 0.09112
[16] Improvement: 60.63731000754342	Time (s): 0.08968
[17] Improvement: 108.86909035957069	Time (s): 0.08877
[18] Improvement: 216.53511274413904	Time (s): 0.08205
[19] Improve

In [None]:
# Report the recognition rate on each data set in turn.
for heading, eval_set in (
    ("Evaluate on train set", trainDataSet),
    ("Evaluate on rest of FSDD set", testDataSet_fsdd),
    ("Evaluate on wolfram test set", testDataSet_wolfram),
):
  print(heading)
  evaluateModel(eval_set, hmmModels)

Evaluate on train set
0: 111/122 (0.9098360655737705)
1: 111/117 (0.9487179487179487)
2: 89/116 (0.7672413793103449)
3: 94/104 (0.9038461538461539)
4: 80/126 (0.6349206349206349)
5: 116/130 (0.8923076923076924)
6: 91/143 (0.6363636363636364)
7: 91/94 (0.9680851063829787)
8: 118/129 (0.9147286821705426)
9: 100/119 (0.8403361344537815)
Final recognition rate is 83.42 %
Evaluate on rest of FSDD set
0: 158/178 (0.8876404494382022)
1: 177/183 (0.9672131147540983)
2: 134/184 (0.7282608695652174)
3: 139/196 (0.7091836734693877)
4: 97/174 (0.5574712643678161)
5: 158/170 (0.9294117647058824)
6: 99/157 (0.6305732484076433)
7: 192/206 (0.9320388349514563)
8: 140/171 (0.8187134502923976)
9: 141/181 (0.7790055248618785)
Final recognition rate is 79.72 %
Evaluate on wolfram test set
0: 1044/2376 (0.4393939393939394)
1: 1745/2370 (0.7362869198312236)
2: 859/2373 (0.3619890434049726)
3: 1029/2356 (0.4367572156196944)
4: 649/2372 (0.2736087689713322)
5: 1493/2357 (0.6334323292320747)
6: 324/2369 (0.136

In [None]:
# Save the trained models to Drive, one JSON file per label.
for model_label, trained_model in hmmModels.items():
  metadata = {
      'title': f'hmm[{model_label}]_{n_mfcc_features}_3_{n_mixtures}[{experiment_id}].json',
      'parents': [{'id': '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'}],
  }
  file = drive.CreateFile(metadata)
  file.SetContentString(trained_model.to_json())
  file.Upload()