<a href="https://colab.research.google.com/github/vlozg/speech_hmm/blob/main/Test_DiagHMM_018.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LƯU Ý: NOTEBOOK NÀY CHỈ DÙNG ĐỂ SHOW KẾT QUẢ CHẠY, KHÔNG NÊN CHẠY LẠI NOTEBOOK NÀY 
(VÌ TRONG NÀY CÓ CODE LƯU LẠI PRETRAINED MODEL LÊN DRIVE SẼ BỊ XÓA)

# Speech to text with HMM

- **Bài toán**: Chuyển giọng nói thành văn bản
    - **Input**: Đoạn ghi âm chứa nội dung là các số từ 0 đến 9
    - **Output**: Phân lớp của đoạn ghi âm

# Các biến thiết lập cho thử nghiệm

In [None]:
# Experiment configuration read by the cells below.
n_mfcc_ceptrum = 12     # cepstral coefficients kept per frame (extract_mfcc computes 2 extra and drops the first 2)
n_delta_features = 1    # highest delta order appended by full_mfcc_from_file (only orders 1 and 2 are handled there)
n_mixtures = 3          # passed to train_GMMHMM as the number of mixture components per hidden state
fsdd_split = 0.4        # fraction of FSDD files copied to train_audio; the rest go to test_audio
wolfram_split = 0       # same fraction for the Wolfram set; 0 sends every file to test_audio
experiment_id = '018'   # tag embedded in the file names of the models uploaded to Drive

In [None]:
# Width of one feature vector: the static coefficients plus one same-sized block per delta order.
n_mfcc_features = n_mfcc_ceptrum * (1+n_delta_features)
n_mfcc_features

24

# Import và cài đặt thư viện

In [None]:
# cài lib. note: cài xong phải restart runtime
!pip install pydub
!pip install pomegranate



In [None]:
# Xác thực google để upload/download qua google drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Quản lý file, folder
import os
from shutil import copyfile, rmtree
import random

# Xử lý audio
import librosa
import librosa.display
from scipy.io import wavfile

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pomegranate # Thư viện cho mô hình xác suất
from pomegranate import *

# Tải dữ liệu và lấy xác thực Google

Dữ liệu dùng để huấn luyện và đánh giá, còn xác thực google thì dùng để upload/download mô hình trên drive.

In [None]:
%%capture
# Download and unpack the Wolfram dataset, unless the archive is already present locally.
if not os.path.isfile('./dataset_1_wolfram.zip'):
  !gdown --id 115tIAitBNeJC0DwrP-ZyJ6RS3TyWN0qD
  !unzip -o dataset_1_wolfram.zip

# Download and unpack FSDD, unless the archive is already present locally.
if not os.path.isfile('./dataset_2_FSDD.zip'):
  !gdown --id 1Ua9zlPBc0Fv4xGHSQTb7eIvUh_dqFI6P
  !unzip -o dataset_2_FSDD.zip

# Download the self-recorded audio (unconditional: fetched again on every run).
!gdown --id 1lH_k1AYMVlJvodtZdD7OK2zkdPXxlW9i

In [None]:
# Lấy xác thực google để upload/download file
# Authenticate with Google so that files can be uploaded to / downloaded from Drive.
auth.authenticate_user()
gauth =  GoogleAuth()
# Reuse the application-default credentials for PyDrive.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Hàm xử lý âm thanh

In [None]:
def minmax_scale(wave):
  """Rescale ``wave`` linearly so its minimum maps to -0.5 and its maximum to 0.5.

  Args:
    wave(np.array): non-empty array of samples.

  Return:
    np.array: rescaled samples. A constant input has no range to stretch, so
      an all-zero array of the same shape is returned instead of NaN.
  """
  low = wave.min()
  span = wave.max() - low
  if span == 0:
    # Avoid 0/0: a flat signal sits at the centre of the target interval.
    return np.zeros(wave.shape)
  return ((wave - low) / span - 0.5)

def standard_scale(wave):
  """Standardise ``wave`` to zero mean and unit standard deviation.

  Args:
    wave(np.array): non-empty array of samples.

  Return:
    np.array: ``(wave - mean) / std``. When the standard deviation is exactly
      zero (constant input, e.g. a silent clip) only the mean is removed, so
      the result is all zeros rather than NaN.
  """
  centered = wave - wave.mean()
  spread = wave.std()
  if spread == 0:
    # Dividing would give 0/0 = NaN, which would poison every later feature.
    return centered
  return (centered / spread)

def scaleAddNoise(wave):
  """Standardise ``wave`` and add Gaussian white noise scaled by 0.2.

  The noise is drawn from NumPy's global random state, so the result is only
  reproducible if that state has been seeded by the caller.
  """
  scaled = standard_scale(wave)
  noise = np.random.normal(size=wave.shape)
  return scaled + 0.2*noise

def read_audio(full_audio_path):
  """Read a WAV file as a floating-point mono waveform sampled at 16 kHz.

  Files already at 16 kHz are read with scipy (the original author notes it is
  roughly 5-10x faster than librosa) and converted to float here. Any other
  sample rate is delegated to ``librosa.load``, which resamples to 16 kHz.

  Args:
    full_audio_path(str): path of the WAV file.

  Return:
    wave(np.array): waveform as floating-point samples
    sample_rate(int): 16000
  """
  sample_rate, wave = wavfile.read(full_audio_path)
  if sample_rate != 16000:
    # Every clip must share the same sample rate, so let librosa resample.
    wave, sample_rate = librosa.load(full_audio_path, sr=16000)
    return wave, sample_rate

  # Convert integer PCM to floats using the full-scale value of its dtype.
  # For int16 this is the original `wave/32768`.
  if wave.dtype == np.uint8:
    # 8-bit WAV data is unsigned and centred on 128.
    wave = (wave.astype(np.float64) - 128.0) / 128.0
  elif np.issubdtype(wave.dtype, np.integer):
    wave = wave / float(np.iinfo(wave.dtype).max + 1)
  # Floating-point WAV data is already normalised and is left untouched.

  if wave.ndim > 1:
    # scipy returns (samples, channels); average the channels to mono.
    wave = wave.mean(axis=1)
  return wave, sample_rate

def read_process_audio(full_audio_path):
  """Read an audio file, pad both ends with silence, then scale and add noise.

  2000 zero samples are added before and after the waveform returned by
  ``read_audio``; the padded signal is then passed through ``scaleAddNoise``.

  Return:
    wave(np.array): processed waveform
    sample_rate(int): sample rate reported by ``read_audio``
  """
  raw_wave, sample_rate = read_audio(full_audio_path)
  padded = np.pad(raw_wave, (2000,2000), 'constant', constant_values=(0.0,0.0))
  return scaleAddNoise(padded), sample_rate

def unvoiced_frame(wave, sample_rate, min_len = 10):
  """Return a boolean mask selecting the higher-energy frames of ``wave``.

  Despite its name, the mask is True for frames to KEEP: frame RMS energy is
  standardised, squashed through a logistic function and compared with a
  threshold that starts at 0.4.

  Args:
    wave(np.array): waveform.
    sample_rate(int): unused; kept for interface compatibility.
    min_len(int): minimum number of frames to select when that many exist.

  Return:
    np.array of bool with one entry per RMS frame. If ``min_len`` frames cannot
    be selected (fewer frames than ``min_len``, or NaN energies from a constant
    signal), every frame is selected.
  """
  rms = librosa.feature.rms(y=wave)[0]
  r_normalized = standard_scale(rms)
  p = np.exp(r_normalized) / (1 + np.exp(r_normalized))
  # Lower the threshold step by step until enough frames remain to train the HMM.
  thresh = 0.4
  slice_ = p > thresh
  while (slice_.sum() < min_len):
    if thresh < 0:
      # p lies in (0, 1), so a negative threshold already accepts every finite
      # frame. Lowering it further cannot help; previously this looped forever.
      return np.ones(rms.shape, dtype=bool)
    thresh-=0.05
    slice_ = p > thresh
  return slice_

def extract_mfcc(wave, sample_rate, trim=True):
  """Compute a (frames, n_mfcc_ceptrum) MFCC matrix for ``wave``.

  A 40-band mel power spectrogram is computed; when ``trim`` is true only the
  frames selected by ``unvoiced_frame`` are kept. ``n_mfcc_ceptrum + 2``
  coefficients are then computed with ``lifter=40``, the first two are dropped,
  the matrix is transposed so rows are frames, and values are divided by 800.
  """
  mel_power = librosa.feature.melspectrogram(y=wave, sr=sample_rate, n_mels=40)
  if trim:
    keep = unvoiced_frame(wave, sample_rate)
    mel_power = mel_power[:,keep]
  log_mel = librosa.power_to_db(mel_power)
  coeffs = librosa.feature.mfcc(S=log_mel, n_mfcc=n_mfcc_ceptrum+2, lifter=40)
  return coeffs[2:,:].T/800

def mfcc_delta_features(mfcc, order):
  """Return the ``order``-th temporal delta of an MFCC matrix.

  Args:
    mfcc(np.array): matrix with one row per frame and one column per
      coefficient, as returned by ``extract_mfcc`` (which transposes).
    order(int): derivative order; 0 returns ``mfcc`` unchanged.

  Return:
    np.array with the same shape as ``mfcc``.

  Note:
    Deltas are taken along axis 0 (time). The previous code used librosa's
    default ``axis=-1``, which on this layout differentiates across cepstral
    coefficients. Models trained on the old features must be retrained.
  """
  if order==0:
    return mfcc
  width = 9  # librosa's default window, written out for the length check below
  # 'interp' needs at least `width` frames; use edge padding for shorter clips.
  mode = 'interp' if mfcc.shape[0] >= width else 'nearest'
  dmfcc = librosa.feature.delta(mfcc, width=width, order=order, axis=0, mode=mode)
  return dmfcc

def full_mfcc_from_file(full_audio_path, trim=True):
  """Load an audio file and return its MFCCs with delta blocks appended.

  The static MFCC matrix comes first. For each order in (1, 2) that does not
  exceed the global ``n_delta_features``, the delta of that order, computed
  from the static matrix, is stacked to its right.
  """
  wave, sr = read_process_audio(full_audio_path)
  static = extract_mfcc(wave, sr, trim)
  blocks = static
  for order in (1, 2):
    if n_delta_features >= order:
      blocks = np.hstack([blocks, mfcc_delta_features(static, order)])
  return blocks

# Hàm bổ trợ

In [None]:
def buildDataSet(dir, trim=True):
    """Extract features for every ``.wav`` file in ``dir``, grouped by label.

    File names must look like ``<index>_<label>.wav``; the label is the second
    underscore-separated field of the name before the first dot.

    Args:
        dir(str): directory holding the audio files, with or without a
            trailing path separator.
        trim(bool): forwarded to ``full_mfcc_from_file``.

    Return:
        dict mapping label (str) to the list of feature matrices for that label.
        Files are visited in sorted name order so the lists are deterministic.
    """
    dataset = {}
    # os.listdir order is arbitrary, so sort for reproducible datasets.
    for fileName in sorted(os.listdir(dir)):
        if os.path.splitext(fileName)[1] != '.wav':
            continue
        label = fileName.split('.')[0].split('_')[1]
        # os.path.join works whether or not `dir` ends with a separator.
        feature = full_mfcc_from_file(os.path.join(dir, fileName), trim)
        dataset.setdefault(label, []).append(feature)
    return dataset

def makeTrainTestDir(mainFolder, filenames, rate):
    """Create a fresh ``mainFolder`` holding a shuffled train/test split.

    Any existing ``mainFolder`` directory is deleted first. The files are then
    shuffled with a fixed seed and copied into ``mainFolder/train_audio`` (the
    first ``int(len(filenames) * rate)`` of them) and ``mainFolder/test_audio``
    (the rest).

    Args:
        mainFolder(str): root directory of the split; recreated on every call.
        filenames(list): ``(source_path, target_name)`` pairs. Not modified.
        rate(float): fraction of files that go to the training folder.
    """
    paths = [f'{mainFolder}',
             f'{mainFolder}/train_audio',
             f'{mainFolder}/test_audio']

    # Start from a clean tree. Unlike the former bare `except`, unrelated
    # errors (permissions, bad path type, ...) are no longer swallowed.
    if os.path.isdir(mainFolder):
        rmtree(mainFolder)
    for path in paths:
        os.makedirs(path, exist_ok=True)

    # A private generator seeded with 1 gives the same permutation as
    # `random.seed(1); random.shuffle(...)`, without reseeding the global
    # generator or reordering the caller's list.
    shuffled = list(filenames)
    random.Random(1).shuffle(shuffled)
    splitPoint = int(len(shuffled)*rate)

    for source, target in shuffled[:splitPoint]:
        copyfile(source, f"{paths[1]}/{target}")

    for source, target in shuffled[splitPoint:]:
        copyfile(source, f"{paths[2]}/{target}")

# Setup thư mục chứa data

In [None]:
def formatFilenameFSDD(dir):
    """List the FSDD files in ``dir`` with normalised target names.

    Each entry of ``dir`` yields a pair ``(source_path, target_name)`` where
    ``target_name`` is ``<running index>_<label>.wav`` and the label is the
    part of the original name before its first underscore.

    Entries are processed in sorted order: ``os.listdir`` returns them in
    arbitrary order, which would make the indices and the later seeded
    train/test split differ between machines.
    """
    return [
        (f"{dir}/{filename}", str(index) + '_' + filename.split('_')[0] + '.wav')
        for index, filename in enumerate(sorted(os.listdir(dir)))
    ]

# Rebuild the 'fsdd' folder: `fsdd_split` of the files go to train_audio, the rest to test_audio.
filenames = formatFilenameFSDD('./dataset_2_FSDD')
makeTrainTestDir('fsdd', filenames, fsdd_split)

In [None]:
def formatFilenameWolfram(dir):
    """List the Wolfram files under ``dir`` with normalised target names.

    ``dir`` is expected to contain one sub-folder per class. Every file inside
    a sub-folder yields ``(source_path, target_name)`` where ``target_name`` is
    ``<running index>_<label>.wav`` and the label is the part of the sub-folder
    name before its first underscore.

    Folders and files are visited in sorted order because ``os.listdir`` order
    is arbitrary. Entries of ``dir`` that are not directories are skipped.
    """
    filenames = []
    for folder in sorted(os.listdir(dir)):
        folder_path = f'{dir}/{folder}'
        if not os.path.isdir(folder_path):
            # A stray file here would make os.listdir raise.
            continue
        label = folder.split('_')[0]
        for filename in sorted(os.listdir(folder_path)):
            # len(filenames) is the running index across all folders.
            target = str(len(filenames)) + '_' + label + '.wav'
            filenames.append((f"{dir}/{folder}/{filename}", target))

    return filenames

# Rebuild the 'wolfram' folder; with wolfram_split = 0 every file lands in test_audio.
filenames = formatFilenameWolfram('./dataset_1_wolfram')
makeTrainTestDir('wolfram', filenames, wolfram_split)

# Huấn luyện mô hình (có thể bỏ qua vì mô hình đã save trên drive)

## Hàm train mô hình

In [None]:
def _random_diag_gaussian(n_features):
  """Build one diagonal Gaussian: ``n_features`` independent normals.

  Each normal receives two fresh values from ``np.random.random(2)`` as its
  constructor arguments.
  """
  return IndependentComponentsDistribution(tuple(
      NormalDistribution(*np.random.random(2))
      for _ in range(n_features)
      ))

def Generate_DiagGMM(full_fset, n_features, n_states=5, n_cmps=3):
  """Create randomly initialised emission distributions, one per hidden state.

  Args:
    full_fset: unused; kept for interface compatibility.
    n_features(int): dimensionality of one observation vector.
    n_states(int): number of distributions to generate.
    n_cmps(int): components per state. With more than one, each state gets a
      ``GeneralMixtureModel`` of diagonal Gaussians; otherwise a single
      diagonal Gaussian.

  Return:
    list with ``n_states`` distributions.
  """
  emissions = []
  for _ in range(n_states):
    if n_cmps > 1:
      components = [_random_diag_gaussian(n_features) for _ in range(n_cmps)]
      emission = GeneralMixtureModel(components)
    else:
      emission = _random_diag_gaussian(n_features)
    emissions.append(emission)

  return emissions

In [None]:
# Generate progressive HMM model
def left_right_GMMHMM(seed_sample, x_dim, n_states=10, n_modals=9, diag=True, random=0):
  """Build and bake a left-to-right HMM with mixture emissions.

  The chain enters at state 0. Every state either stays where it is or moves
  to the next state, each with initial probability 0.5; the last state moves
  to the model's end state instead.

  Args:
    seed_sample: forwarded to ``Generate_DiagGMM`` (which ignores it).
    x_dim(int): dimensionality of one observation vector.
    n_states(int): number of hidden states (must be at least 1).
    n_modals(int): number of mixture components per state.
    diag(bool): True for diagonal-covariance emissions from
      ``Generate_DiagGMM``; False for blank full-covariance Gaussian mixtures.
    random: unused; kept for interface compatibility. The former code built a
      ``RandomState`` and a lambda from it that nothing ever used, so
      initialisation was never seeded by this argument.

  Return:
    The baked ``HiddenMarkovModel``.
  """
  model = HiddenMarkovModel()
  if diag:
    states = [State(gmm, name=f"H{i}") for i, gmm in enumerate(Generate_DiagGMM(seed_sample, x_dim, n_states, n_modals))]
  else:
    states = [State( GeneralMixtureModel([MultivariateGaussianDistribution.blank(x_dim) for i in range(n_modals)]),
                    name=f"H{i}" ) for i in range(n_states)]
  model.add_states(states)
  model.add_transition(model.start, states[0], 1)
  for i in range(n_states-1):
    model.add_transition(states[i], states[i], 0.5)
    model.add_transition(states[i], states[i+1], 0.5)
  # The last state loops on itself or leaves the model.
  model.add_transition(states[n_states-1], states[n_states-1], 0.5)
  model.add_transition(states[n_states-1], model.end, 0.5)
  model.bake()
  return model

In [None]:
tol = 1000  # maximum number of random re-initialisations tried per label

def train_GMMHMM(dataset, input_dim, n_hidden_state, n_gauss_modal, diag=True, failed_label_return=False, report_fail=None):
    """Train one left-to-right GMM-HMM per label, retrying when training diverges.

    For each label a randomly initialised model is fitted. If the last recorded
    improvement is NaN the attempt is discarded and a new model is tried, up to
    ``tol`` times.

    Args:
        dataset(dict): label -> list of feature sequences.
        input_dim(int): dimensionality of one observation vector.
        n_hidden_state(int): number of hidden states per model.
        n_gauss_modal(int): number of mixture components per state.
        diag(bool): forwarded to ``left_right_GMMHMM``.
        failed_label_return(bool): when True, labels whose training never
            succeeded are appended to ``report_fail``.
        report_fail(list or None): list receiving failed labels; ignored when None.

    Return:
        dict mapping label to its trained model, or to None if every attempt failed.
    """
    GMMHMM_Models = dict()

    for label in dataset.keys():
        print(f"Training model detect {label}")
        # Start from None so a label never inherits the previous label's model.
        model = None
        for attempt in range(tol):
            candidate = left_right_GMMHMM(dataset[label][0], input_dim, n_hidden_state, n_gauss_modal, diag=diag)
            _, history = candidate.fit(dataset[label], verbose=True, multiple_check_input=False,return_history=True)  # get optimal parameters
            if not np.isnan(history.improvements[-1]):
                model = candidate
                break
        if model is None and failed_label_return and report_fail is not None:
            report_fail.append(label)
        GMMHMM_Models[label] = model

    return GMMHMM_Models

In [None]:
def retrain_specific_class(hmmModels, label, dataset, input_dim, n_hidden_state, n_gauss_modal, diag=True):
  """Retrain the model of a single label and store it in ``hmmModels``.

  Up to ``tol`` randomly initialised models are fitted; the first one whose
  last recorded improvement is not NaN is kept.

  Args:
    hmmModels(dict): label -> model; updated in place.
    label: key of ``dataset`` to retrain.
    dataset(dict): label -> list of feature sequences.
    input_dim, n_hidden_state, n_gauss_modal, diag: forwarded to
      ``left_right_GMMHMM``.

  Return:
    The same ``hmmModels`` dict. ``hmmModels[label]`` is None when every
    attempt diverged, matching ``train_GMMHMM``. Previously the last diverged
    model was stored silently.
  """
  converged = None
  for attempt in range(tol):
    candidate = left_right_GMMHMM(dataset[label][0], input_dim, n_hidden_state, n_gauss_modal, diag=diag)
    _, history = candidate.fit(dataset[label], verbose=True, multiple_check_input=False,return_history=True)  # get optimal parameters
    if not np.isnan(history.improvements[-1]):
      converged = candidate
      break
  hmmModels[label] = converged
  return hmmModels

## Hàm test mô hình

In [None]:
# test model
def evaluateModel(testDataset, model):
  """Print per-label and overall recognition rates of the per-label models.

  The models are combined, in sorted label order, into a ``BayesClassifier``.
  A prediction counts as correct when it equals the position of the true
  label in that sorted order. This matches the former ``int(label)``
  comparison when the labels are exactly '0'..'9', and stays correct for any
  other label set.

  Args:
    testDataset(dict): label -> list of feature sequences. Nothing is printed
      when it is empty; labels with no sequences are skipped.
    model(dict): label -> trained model.

  Return:
    None; results are printed.
  """
  if (len(testDataset) == 0):
    return
  labels = sorted(model.keys())
  digit_clf = BayesClassifier([model[name] for name in labels])
  class_index = {name: position for position, name in enumerate(labels)}
  true_cnt = 0
  total = 0
  for label in sorted(testDataset.keys()):
      iter_total = len(testDataset[label])
      if iter_total == 0:
          # Nothing to score; also avoids dividing by zero below.
          continue
      features = np.array(testDataset[label], dtype='object')
      pred = np.asarray(digit_clf.predict(features))
      # Labels without a model can never be predicted, hence the -1 sentinel.
      iter_cnt = (pred == class_index.get(label, -1)).sum()
      total += iter_total
      true_cnt += iter_cnt
      print(f"{label}: {iter_cnt}/{iter_total} ({iter_cnt/iter_total})")
  if total == 0:
    return
  print("Final recognition rate is %.2f"%(100.0*true_cnt/total), "%")

## Đọc và tiền xử lý data

In [None]:
# prepare data for training
master_path = 'fsdd'

trainDir = master_path + '/train_audio/'
trainDataSet_fsdd = buildDataSet(trainDir)
print("Finish prepare the training data")

# prepare data for testing
testDir = master_path + '/test_audio/'
testDataSet_fsdd = buildDataSet(testDir)
print("Finish prepare the test data")

Finish prepare the training data
Finish prepare the test data


In [None]:
# prepare data for training
master_path = 'wolfram'

trainDir = master_path + '/train_audio/'
trainDataSet_wolfram = buildDataSet(trainDir)
print("Finish prepare the training data")

# prepare data for testing
testDir = master_path + '/test_audio/'
testDataSet_wolfram = buildDataSet(testDir)
print("Finish prepare the test data")

Finish prepare the training data
Finish prepare the test data


In [None]:
# Build the training set from a shallow copy of the FSDD dict. Assigning the
# dict itself made both names refer to one object, so the merge below wrote
# into trainDataSet_fsdd and re-running the cell appended the Wolfram data again.
trainDataSet = dict(trainDataSet_fsdd)
if wolfram_split > 0:
  for label in trainDataSet_fsdd.keys():
    # Labels with no Wolfram training files contribute nothing.
    trainDataSet[label] = trainDataSet_fsdd[label] + trainDataSet_wolfram.get(label, [])

## **Mô hình 1**
- 10 hidden states
- Multivariate Diagonal Gauss cho emission probs

In [None]:
# Kiểm tra đảm bảo chuỗi có độ dài nhỏ nhất không nhỏ hơn số state
for label in trainDataSet.keys():
  print(min(map(len, trainDataSet[label])))

10
10
10
10
10
10
10
10
10
10


In [None]:
%%time
# Train one model per label with 10 hidden states and n_mixtures components per state.
failed_label = []
hmmModels = train_GMMHMM(trainDataSet, n_mfcc_features,10,n_mixtures, failed_label_return=True, report_fail=failed_label)
print("Finish training of the GMM_HMM models for digits 0-9")
print(failed_label)
# Stop here if any label could not be trained; the cells below need a model for each label.
assert len(failed_label) == 0

Training model detect 1
[1] Improvement: 223012.92915684244	Time (s): 0.1003
[2] Improvement: 594.6917095825193	Time (s): 0.1107
[3] Improvement: 294.2754099406593	Time (s): 0.09901
[4] Improvement: 264.3730702148896	Time (s): 0.1036
[5] Improvement: 216.31040015084727	Time (s): 0.1053
[6] Improvement: 120.18423231356428	Time (s): 0.1045
[7] Improvement: 18.035600331029855	Time (s): 0.1051
[8] Improvement: 3.8290075366094243	Time (s): 0.1021
[9] Improvement: 1.5809822182491189	Time (s): 0.1054
[10] Improvement: 89.50642723497003	Time (s): 0.09498
[11] Improvement: 506.22832272986125	Time (s): 0.09805
[12] Improvement: 246.4771529593272	Time (s): 0.1188
[13] Improvement: 13.633811930485535	Time (s): 0.1057
[14] Improvement: 1.4926527999341488	Time (s): 0.09434
[15] Improvement: 0.5159498854918638	Time (s): 0.09871
[16] Improvement: 0.24371048633474857	Time (s): 0.09812
[17] Improvement: 0.1470996140560601	Time (s): 0.09909
[18] Improvement: 0.10214550615637563	Time (s): 0.1205
[19] Impr

In [None]:
# Report recognition rates on the training data, the held-out FSDD files and the Wolfram files.
print("Evaluate on train set")
evaluateModel(trainDataSet, hmmModels)
print("Evaluate on rest of FSDD set")
evaluateModel(testDataSet_fsdd, hmmModels)
print("Evaluate on wolfram test set")
evaluateModel(testDataSet_wolfram, hmmModels)

Evaluate on train set
0: 119/122 (0.9754098360655737)
1: 101/117 (0.8632478632478633)
2: 99/116 (0.853448275862069)
3: 74/104 (0.7115384615384616)
4: 121/126 (0.9603174603174603)
5: 127/130 (0.9769230769230769)
6: 116/143 (0.8111888111888111)
7: 87/94 (0.925531914893617)
8: 111/129 (0.8604651162790697)
9: 101/119 (0.8487394957983193)
Final recognition rate is 88.00 %
Evaluate on rest of FSDD set
0: 171/178 (0.9606741573033708)
1: 167/183 (0.912568306010929)
2: 154/184 (0.8369565217391305)
3: 123/196 (0.6275510204081632)
4: 166/174 (0.9540229885057471)
5: 167/170 (0.9823529411764705)
6: 115/157 (0.732484076433121)
7: 194/206 (0.941747572815534)
8: 139/171 (0.8128654970760234)
9: 159/181 (0.8784530386740331)
Final recognition rate is 86.39 %
Evaluate on wolfram test set
0: 1377/2376 (0.5795454545454546)
1: 1626/2370 (0.6860759493670886)
2: 1211/2373 (0.5103244837758112)
3: 1108/2356 (0.4702886247877759)
4: 1385/2372 (0.5838954468802698)
5: 1744/2357 (0.7399236317352567)
6: 617/2369 (0.26

In [None]:
# Save each model to Google Drive as JSON.
# The title encodes label, feature width, state count (10), mixture count and experiment id;
# the parent folder id is hard-coded.
for model_label in hmmModels.keys():
  file = drive.CreateFile({'title': f'hmm[{model_label}]_{n_mfcc_features}_10_{n_mixtures}[{experiment_id}].json', 'parents': [{'id': '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'}]})
  file.SetContentString(hmmModels[model_label].to_json())
  file.Upload()

## **Mô hình 2**
- 5 hidden states
- Multivariate Diagonal Gauss cho emission probs

In [None]:
%%time
# Train one model per label with 5 hidden states and n_mixtures components per state.
# Note: this rebinds hmmModels, replacing the models of the previous experiment.
failed_label = []
hmmModels = train_GMMHMM(trainDataSet, n_mfcc_features,5,n_mixtures, failed_label_return=True, report_fail=failed_label)
print("Finish training of the GMM_HMM models for digits 0-9")
print(failed_label)
# Stop here if any label could not be trained; the cells below need a model for each label.
assert len(failed_label) == 0

Training model detect 1
[1] Improvement: 274047.3740660098	Time (s): 0.05055
[2] Improvement: 619.2435275646567	Time (s): 0.05116
[3] Improvement: nan	Time (s): 0.0645
Total Training Improvement: nan
Total Training Time (s): 0.2202
[1] Improvement: 135175.67821390368	Time (s): 0.04985
[2] Improvement: nan	Time (s): 0.04967
Total Training Improvement: nan
Total Training Time (s): 0.1507
[1] Improvement: 180886.24742034788	Time (s): 0.06292
[2] Improvement: 284.38208101992495	Time (s): 0.05055
[3] Improvement: 1448.8909827057796	Time (s): 0.05126
[4] Improvement: 1299.1390913276264	Time (s): 0.04981
[5] Improvement: 959.483162744029	Time (s): 0.05777
[6] Improvement: 618.4890177485358	Time (s): 0.05075
[7] Improvement: 478.4538733809022	Time (s): 0.04979
[8] Improvement: 378.83825373181025	Time (s): 0.04955
[9] Improvement: 185.4201610066375	Time (s): 0.06622
[10] Improvement: 141.12492339157325	Time (s): 0.05548
[11] Improvement: 152.3230243986909	Time (s): 0.0504
[12] Improvement: 147.

In [None]:
# Report recognition rates on the training data, the held-out FSDD files and the Wolfram files.
print("Evaluate on train set")
evaluateModel(trainDataSet, hmmModels)
print("Evaluate on rest of FSDD set")
evaluateModel(testDataSet_fsdd, hmmModels)
print("Evaluate on wolfram test set")
evaluateModel(testDataSet_wolfram, hmmModels)

Evaluate on train set
0: 119/122 (0.9754098360655737)
1: 95/117 (0.811965811965812)
2: 106/116 (0.9137931034482759)
3: 79/104 (0.7596153846153846)
4: 117/126 (0.9285714285714286)
5: 126/130 (0.9692307692307692)
6: 106/143 (0.7412587412587412)
7: 79/94 (0.8404255319148937)
8: 109/129 (0.8449612403100775)
9: 103/119 (0.865546218487395)
Final recognition rate is 86.58 %
Evaluate on rest of FSDD set
0: 170/178 (0.9550561797752809)
1: 167/183 (0.912568306010929)
2: 160/184 (0.8695652173913043)
3: 121/196 (0.6173469387755102)
4: 164/174 (0.9425287356321839)
5: 165/170 (0.9705882352941176)
6: 101/157 (0.643312101910828)
7: 181/206 (0.8786407766990292)
8: 141/171 (0.8245614035087719)
9: 156/181 (0.861878453038674)
Final recognition rate is 84.78 %
Evaluate on wolfram test set
0: 1217/2376 (0.5122053872053872)
1: 1800/2370 (0.759493670886076)
2: 1091/2373 (0.45975558364938895)
3: 1067/2356 (0.45288624787775894)
4: 1323/2372 (0.5577571669477235)
5: 1633/2357 (0.6928298684768774)
6: 487/2369 (0.2

In [None]:
# Save each model to Google Drive as JSON.
# The title encodes label, feature width, state count (5), mixture count and experiment id;
# the parent folder id is hard-coded.
for model_label in hmmModels.keys():
  file = drive.CreateFile({'title': f'hmm[{model_label}]_{n_mfcc_features}_5_{n_mixtures}[{experiment_id}].json', 'parents': [{'id': '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'}]})
  file.SetContentString(hmmModels[model_label].to_json())
  file.Upload()

## **Mô hình 3**
- 3 hidden states
- Multivariate Diagonal Gauss cho emission probs

In [None]:
%%time
# Train one model per label with 3 hidden states and n_mixtures components per state.
# Note: this rebinds hmmModels, replacing the models of the previous experiment.
failed_label = []
hmmModels = train_GMMHMM(trainDataSet, n_mfcc_features,3,n_mixtures, failed_label_return=True, report_fail=failed_label)
print("Finish training of the GMM_HMM models for digits 0-9")
print(failed_label)
# Stop here if any label could not be trained; the cells below need a model for each label.
assert len(failed_label) == 0

Training model detect 1
[1] Improvement: 218813.76963360823	Time (s): 0.03119
[2] Improvement: 606.0439601047547	Time (s): 0.03194
[3] Improvement: 65.38284167967504	Time (s): 0.03165
[4] Improvement: 37.31934196011571	Time (s): 0.03194
[5] Improvement: 47.071035076878616	Time (s): 0.03188
[6] Improvement: 67.08749546229956	Time (s): 0.0372
[7] Improvement: 95.71889893004845	Time (s): 0.03277
[8] Improvement: 123.07334782886028	Time (s): 0.03168
[9] Improvement: 93.1448066404555	Time (s): 0.03596
[10] Improvement: 54.499463101878064	Time (s): 0.03395
[11] Improvement: 22.724161553982412	Time (s): 0.0335
[12] Improvement: 21.802786669533816	Time (s): 0.03218
[13] Improvement: 27.915769897939754	Time (s): 0.03453
[14] Improvement: 500.6327404252952	Time (s): 0.03821
[15] Improvement: 804.8629317212908	Time (s): 0.0328
[16] Improvement: 445.20820364741667	Time (s): 0.03283
[17] Improvement: 321.64991144226224	Time (s): 0.03175
[18] Improvement: 216.61871018944657	Time (s): 0.03191
[19] Im

In [None]:
# Report recognition rates on the training data, the held-out FSDD files and the Wolfram files.
print("Evaluate on train set")
evaluateModel(trainDataSet, hmmModels)
print("Evaluate on rest of FSDD set")
evaluateModel(testDataSet_fsdd, hmmModels)
print("Evaluate on wolfram test set")
evaluateModel(testDataSet_wolfram, hmmModels)

Evaluate on train set
0: 115/122 (0.9426229508196722)
1: 82/117 (0.7008547008547008)
2: 96/116 (0.8275862068965517)
3: 25/104 (0.2403846153846154)
4: 117/126 (0.9285714285714286)
5: 126/130 (0.9692307692307692)
6: 48/143 (0.3356643356643357)
7: 76/94 (0.8085106382978723)
8: 129/129 (1.0)
9: 87/119 (0.7310924369747899)
Final recognition rate is 75.08 %
Evaluate on rest of FSDD set
0: 164/178 (0.9213483146067416)
1: 139/183 (0.7595628415300546)
2: 156/184 (0.8478260869565217)
3: 33/196 (0.1683673469387755)
4: 166/174 (0.9540229885057471)
5: 166/170 (0.9764705882352941)
6: 64/157 (0.40764331210191085)
7: 175/206 (0.8495145631067961)
8: 168/171 (0.9824561403508771)
9: 137/181 (0.7569060773480663)
Final recognition rate is 76.00 %
Evaluate on wolfram test set
0: 989/2376 (0.41624579124579125)
1: 1656/2370 (0.6987341772151898)
2: 1274/2373 (0.5368731563421829)
3: 450/2356 (0.19100169779286927)
4: 1136/2372 (0.47892074198988194)
5: 1673/2357 (0.7098005939753924)
6: 366/2369 (0.154495567750105

In [None]:
# Save each model to Google Drive as JSON.
# The title encodes label, feature width, state count (3), mixture count and experiment id;
# the parent folder id is hard-coded.
for model_label in hmmModels.keys():
  file = drive.CreateFile({'title': f'hmm[{model_label}]_{n_mfcc_features}_3_{n_mixtures}[{experiment_id}].json', 'parents': [{'id': '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'}]})
  file.SetContentString(hmmModels[model_label].to_json())
  file.Upload()