<a href="https://colab.research.google.com/github/vlozg/speech_hmm/blob/main/Test_DiagHMM_013.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LƯU Ý: NOTEBOOK NÀY CHỈ DÙNG ĐỂ SHOW KẾT QUẢ CHẠY, KHÔNG NÊN CHẠY LẠI NOTEBOOK NÀY 
(VÌ TRONG NÀY CÓ CODE LƯU LẠI PRETRAINED MODEL LÊN DRIVE SẼ BỊ XÓA)

# Speech to text with HMM

- **Bài toán**: Chuyển giọng nói thành văn bản
    - **Input**: Đoạn ghi âm chứa nội dung là các số từ 0 đến 9
    - **Output**: Phân lớp của đoạn ghi âm

# Các biến thiết lập cho thử nghiệm

In [None]:
# Number of cepstral coefficients kept per frame by extract_mfcc.
n_mfcc_ceptrum = 20
# Number of delta orders appended to the base MFCCs (full_mfcc_from_file handles 0, 1 or 2).
n_delta_features = 1
# Passed to train_GMMHMM as the number of Gaussian components per hidden state.
n_mixtures = 1
# Fraction of each dataset's files that makeTrainTestDir copies into train_audio;
# the remaining files go to test_audio.
fsdd_split = 0.3
wolfram_split = 0.3
# Tag embedded in the titles of the model files uploaded to Drive.
experiment_id = '013'

In [None]:
# Feature width per frame: the base MFCCs plus one equally sized block per delta order.
n_mfcc_features = n_mfcc_ceptrum * (1+n_delta_features)
n_mfcc_features

40

# Import và cài đặt thư viện

In [None]:
# cài lib. note: cài xong phải restart runtime
!pip install pydub
!pip install pomegranate



In [None]:
# Xác thực google để upload/download qua google drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Quản lý file, folder
import os
from shutil import copyfile, rmtree
import random

# Xử lý audio
import librosa
import librosa.display
from scipy.io import wavfile

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pomegranate # Thư viện cho mô hình xác suất
from pomegranate import *

# Tải dữ liệu và lấy xác thực Google

Dữ liệu dùng để huấn luyện và đánh giá, còn xác thực google thì dùng để upload/download mô hình trên drive.

In [None]:
%%capture
# Download and unpack the Wolfram dataset unless its archive is already present
if not os.path.isfile('./dataset_1_wolfram.zip'):
  !gdown --id 115tIAitBNeJC0DwrP-ZyJ6RS3TyWN0qD
  !unzip -o dataset_1_wolfram.zip

# Download and unpack FSDD unless its archive is already present
if not os.path.isfile('./dataset_2_FSDD.zip'):
  !gdown --id 1Ua9zlPBc0Fv4xGHSQTb7eIvUh_dqFI6P
  !unzip -o dataset_2_FSDD.zip

# Download the self-recorded audio (not guarded, so it is fetched on every run)
!gdown --id 1lH_k1AYMVlJvodtZdD7OK2zkdPXxlW9i

In [None]:
# Authenticate with Google so files can be uploaded to / downloaded from Drive
auth.authenticate_user()
gauth =  GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Hàm xử lý âm thanh

In [None]:
def scaleAddNoise(wave):
  """Standardise a waveform and add low-level Gaussian noise.

  The signal is shifted to zero mean and divided by its standard deviation,
  then standard-normal noise scaled by 0.2 is added to every sample. The
  noise comes from NumPy's global random state, so the result differs
  between calls unless that state is seeded.
  """
  centered = wave - wave.mean()
  standardized = centered / wave.std()
  noise = np.random.normal(size=wave.shape)
  return standardized + 0.2 * noise

def read_process_audio(full_audio_path):
  """Load an audio file as a 1-D, 16 kHz waveform ready for feature extraction.

  Args:
    full_audio_path: path of the audio file to read.

  Returns:
    (wave, sample_rate): the waveform padded with 2000 zero samples on each
    side and passed through scaleAddNoise, plus its sample rate.
  """
  # Every clip must end up at the same sample rate. scipy's reader is tried
  # first because it is several times faster than librosa's; librosa is only
  # used when the file has to be resampled to 16 kHz.
  sample_rate, wave = wavfile.read(full_audio_path)
  if sample_rate != 16000:
    wave, sample_rate = librosa.load(full_audio_path, sr=16000)
  else:
    # Scale integer samples to floats (32768 = 2**15).
    # NOTE(review): assumes 16-bit PCM input - confirm for other WAV encodings.
    wave = wave / 32768
    if wave.ndim > 1:
      # scipy returns (samples, channels) for multi-channel files; average the
      # channels so the 1-D padding below does not pad the channel axis too.
      wave = wave.mean(axis=1)
  wave = np.pad(wave, (2000, 2000), 'constant', constant_values=(0.0, 0.0))
  wave = scaleAddNoise(wave)
  return wave, sample_rate

def unvoiced_frame(wave, sample_rate, min_len = 10):
  """Return a boolean mask selecting the higher-energy frames of a waveform.

  The per-frame RMS is z-scored and squashed with a logistic function; frames
  whose score exceeds a threshold are kept. The threshold starts at 0.4 and is
  lowered in steps of 0.05 until at least `min_len` frames are selected or
  every frame is selected.

  Args:
    wave: 1-D waveform.
    sample_rate: unused; kept for interface compatibility.
    min_len: minimum number of frames to select when that many exist.

  Returns:
    Boolean array with one entry per RMS frame (True = keep the frame).
  """
  rms = librosa.feature.rms(y=wave)[0]
  spread = np.std(rms)
  if not np.isfinite(spread) or spread == 0:
    # A constant RMS cannot be z-scored (0/0 gives NaN and nothing would ever
    # pass the threshold), so keep every frame.
    return np.ones(rms.shape, dtype=bool)
  r_normalized = (rms - np.mean(rms)) / spread
  p = np.exp(r_normalized) / (1 + np.exp(r_normalized))
  thresh = 0.4
  slice_ = p > thresh
  # Stop once all frames are selected: a clip with fewer than `min_len` frames
  # can never reach the minimum and would otherwise loop forever.
  while slice_.sum() < min_len and not slice_.all():
    thresh -= 0.05
    slice_ = p > thresh
  return slice_

def extract_mfcc(wave, sample_rate, trim=True):
  """Compute scaled MFCC features for a waveform.

  A 40-band mel spectrogram is computed, optionally restricted to the frames
  selected by unvoiced_frame, converted to dB and turned into
  n_mfcc_ceptrum + 2 liftered cepstral coefficients. The first two
  coefficients are dropped and the result is transposed and divided by 800.

  Returns:
    Array with one row per frame and n_mfcc_ceptrum columns.
  """
  mel_power = librosa.feature.melspectrogram(y=wave, sr=sample_rate, n_mels=40)
  if trim:
    keep_mask = unvoiced_frame(wave, sample_rate)
    mel_power = mel_power[:, keep_mask]
  log_mel = librosa.power_to_db(mel_power)
  cepstra = librosa.feature.mfcc(S=log_mel, n_mfcc=n_mfcc_ceptrum+2, lifter=40)
  # Drop the first two coefficients, put frames on axis 0, then rescale.
  return cepstra[2:, :].T / 800

def mfcc_delta_features(mfcc, order):
  """Return the delta features of the given order for an MFCC matrix.

  Args:
    mfcc: array with frames on axis 0 and coefficients on axis 1 (the layout
      produced by extract_mfcc).
    order: derivative order; 0 returns `mfcc` itself.

  Returns:
    Array of the same shape as `mfcc`.
  """
  if order==0:
    return mfcc
  # Deltas are temporal derivatives, so they must run along the frame axis
  # (axis 0 here). librosa's default axis=-1 would differentiate across the
  # cepstral coefficients instead.
  n_frames = mfcc.shape[0]
  # librosa's default 'interp' mode rejects inputs shorter than its default
  # filter width of 9; fall back to edge padding for very short clips.
  mode = 'interp' if n_frames >= 9 else 'nearest'
  return librosa.feature.delta(mfcc, order=order, axis=0, mode=mode)

def full_mfcc_from_file(full_audio_path, trim=True):
  """Read an audio file and return its MFCCs with delta blocks appended.

  The base MFCCs are always included. The first-order deltas are appended when
  n_delta_features >= 1 and the second-order deltas when n_delta_features >= 2;
  higher orders are not produced.

  Returns:
    Array with one row per frame; blocks are stacked column-wise.
  """
  wave, sr = read_process_audio(full_audio_path)
  base = extract_mfcc(wave, sr, trim)
  blocks = [base]
  for order in (1, 2):
    if n_delta_features >= order:
      blocks.append(mfcc_delta_features(base, order))
  if len(blocks) == 1:
    # No deltas requested: hand back the base features unchanged.
    return base
  return np.hstack(blocks)

# Hàm bổ trợ

In [None]:
def buildDataSet(dir, trim=True):
    """Extract features for every .wav file in a directory, grouped by label.

    The label is the second underscore-separated field of the file name
    (before the first dot), e.g. "12_7.wav" has label "7".

    Args:
        dir: directory containing the audio files, with or without a
            trailing path separator.
        trim: forwarded to full_mfcc_from_file.

    Returns:
        dict mapping each label to the list of feature arrays of its files.
    """
    dataset = {}
    for fileName in os.listdir(dir):
        # Only .wav files are part of the dataset.
        if os.path.splitext(fileName)[1] != '.wav':
            continue
        label = fileName.split('.')[0].split('_')[1]
        # os.path.join keeps the path valid whether or not `dir` ends with '/'.
        feature = full_mfcc_from_file(os.path.join(dir, fileName), trim)
        dataset.setdefault(label, []).append(feature)
    return dataset

def makeTrainTestDir(mainFolder, filenames, rate):
    """Create a fresh train/test directory tree and copy the files into it.

    `mainFolder`, `mainFolder/train_audio` and `mainFolder/test_audio` are
    (re)created empty. The files are shuffled with a fixed seed; the first
    int(len(filenames) * rate) go to train_audio and the rest to test_audio.

    Args:
        mainFolder: root directory to create (existing content is deleted).
        filenames: sequence of (source_path, target_file_name) pairs. The
            sequence itself is not modified.
        rate: fraction of the files assigned to the training directory.
    """
    paths = [f'{mainFolder}',
             f'{mainFolder}/train_audio',
             f'{mainFolder}/test_audio']

    for path in paths:
        # Start from an empty directory; only an existing directory is
        # removed, any other failure of mkdir propagates to the caller.
        if os.path.isdir(path):
            rmtree(path)
        os.mkdir(path)

    # Shuffle a copy with a private, fixed-seed generator: same order as
    # random.seed(1) + random.shuffle, without touching the caller's list or
    # the global random state.
    shuffled = list(filenames)
    random.Random(1).shuffle(shuffled)
    splitPoint = int(len(shuffled)*rate)

    for source, target in shuffled[:splitPoint]:
        copyfile(source, f"{paths[1]}/{target}")

    for source, target in shuffled[splitPoint:]:
        copyfile(source, f"{paths[2]}/{target}")

# Setup thư mục chứa data

In [None]:
def formatFilenameFSDD(dir):
    """List the files of a flat dataset directory with normalised target names.

    Each entry of `dir` becomes a pair (source_path, "<index>_<label>.wav"),
    where <label> is the part of the original file name before the first
    underscore and <index> is the entry's position in the sorted listing.

    The listing is sorted because os.listdir returns entries in arbitrary
    order, which would make the indices (and any seeded shuffle of the
    result) differ between machines.
    """
    return [
        (f"{dir}/{filename}", f"{index}_{filename.split('_')[0]}.wav")
        for index, filename in enumerate(sorted(os.listdir(dir)))
    ]

# Index the FSDD recordings, then split them into fsdd/train_audio and fsdd/test_audio
filenames = formatFilenameFSDD('./dataset_2_FSDD')
makeTrainTestDir('fsdd', filenames, fsdd_split)

In [None]:
def formatFilenameWolfram(dir):
    """List the files of a one-folder-per-label dataset with normalised names.

    Every file inside every sub-folder of `dir` becomes a pair
    (source_path, "<index>_<label>.wav"), where <label> is the part of the
    folder name before the first underscore and <index> is a running counter
    over all files.

    Folders and files are visited in sorted order because os.listdir returns
    entries in arbitrary order, which would make the indices (and any seeded
    shuffle of the result) differ between machines.
    """
    filenames = []
    for folder in sorted(os.listdir(dir)):
        label = folder.split('_')[0]
        for filename in sorted(os.listdir(f'{dir}/{folder}')):
            # len(filenames) is the running index of the file being added.
            target = f"{len(filenames)}_{label}.wav"
            filenames.append((f"{dir}/{folder}/{filename}", target))

    return filenames

# Index the Wolfram recordings, then split them into wolfram/train_audio and wolfram/test_audio
filenames = formatFilenameWolfram('./dataset_1_wolfram')
makeTrainTestDir('wolfram', filenames, wolfram_split)

# Huấn luyện mô hình (có thể bỏ qua vì mô hình đã save trên drive)

## Hàm train mô hình

In [None]:
def Generate_DiagGMM(full_fset, n_features, n_states=5, n_cmps=3):
  """Create randomly initialised diagonal-Gaussian emission distributions.

  Every feature of every component gets a NormalDistribution whose mean is
  drawn uniformly from [-0.5, 0.5) and whose standard deviation is drawn
  uniformly from [0, 1).

  Args:
    full_fset: currently unused (the initialisation is not data-driven);
      kept for interface compatibility.
    n_features: number of feature dimensions.
    n_states: number of hidden states to create distributions for.
    n_cmps: number of Gaussian components per state.

  Returns:
    Tuple with one distribution per state: a GeneralMixtureModel of n_cmps
    IndependentComponentsDistribution objects when n_cmps > 1, otherwise a
    single IndependentComponentsDistribution.
  """
  # Reseed NumPy's global generator from OS entropy so that every call gets a
  # different initialisation.
  np.random.seed(None)
  # Last axis holds (mean, std) for each state / component / feature.
  dist_init = np.random.random((n_states, n_cmps, n_features, 2))
  dist_init[..., 0] -= 0.5  # center means to 0.0

  def diag_gaussian(state_i, cmp_i):
    # One independent 1-D normal per feature = diagonal-covariance Gaussian.
    return IndependentComponentsDistribution(tuple(
        NormalDistribution(*dist_init[state_i, cmp_i, feat_i, :])
        for feat_i in range(n_features)
    ))

  if n_cmps > 1:
    return tuple(
        GeneralMixtureModel([diag_gaussian(state_i, cmp_i) for cmp_i in range(n_cmps)])
        for state_i in range(n_states)
    )
  return tuple(diag_gaussian(state_i, 0) for state_i in range(n_states))

In [None]:
# Generate progressive HMM model
def left_right_GMMHMM(seed_sample, x_dim, n_states=10, n_modals=9, random=0):
  """Build a left-to-right HMM with randomly initialised emissions.

  The start state enters the first hidden state with probability 1. Every
  hidden state either stays where it is or advances to the next state, each
  with probability 0.5; the last state advances to the model's end state.

  Args:
    seed_sample: forwarded to Generate_DiagGMM.
    x_dim: number of feature dimensions of an observation.
    n_states: number of hidden states.
    n_modals: number of Gaussian components per state.
    random: currently unused; kept for interface compatibility.

  Returns:
    The baked HiddenMarkovModel.
  """
  model = HiddenMarkovModel()
  states = [ ]
  for i, state_model in enumerate(Generate_DiagGMM(seed_sample, x_dim, n_states, n_modals)):
    state = State( state_model, name=f"H{i}" )
    model.add_state(state)
    states.append(state)
  model.add_transition(model.start, states[0], 1)
  for i in range(n_states-1):
    model.add_transition(states[i], states[i], 0.5)
    model.add_transition(states[i], states[i+1], 0.5)
  # The last state loops on itself or leaves the model.
  model.add_transition(states[n_states-1], states[n_states-1], 0.5)
  model.add_transition(states[n_states-1], model.end, 0.5)
  model.bake()
  return model

In [None]:
def train_GMMHMM(dataset, input_dim, n_hidden_state, n_gauss_modal):
    """Train one left-to-right HMM per label.

    Args:
        dataset: dict mapping each label to its list of training sequences.
        input_dim: number of feature dimensions per frame.
        n_hidden_state: number of hidden states of every model.
        n_gauss_modal: number of Gaussian components per hidden state.

    Returns:
        dict mapping each label to its fitted model.
    """
    trained = {}
    for label, sequences in dataset.items():
        print(f"Training model detect {label}")
        hmm = left_right_GMMHMM(sequences[0], input_dim, n_hidden_state, n_gauss_modal)
        # Fit the model parameters on all sequences of this label.
        hmm.fit(sequences, verbose=True, multiple_check_input=False)
        trained[label] = hmm
    return trained

## Hàm test mô hình

In [None]:
# test model
def evaluateModel(testDataset, model):
  """Print the per-label and overall recognition rate of a set of models.

  The per-label models are combined into a BayesClassifier in sorted label
  order. Each sequence of `testDataset` is classified and the prediction is
  compared with the class index of its true label.

  Args:
    testDataset: dict mapping each label to its list of feature sequences.
    model: dict mapping each label to its trained model.

  Returns:
    None. Nothing is printed when `testDataset` is empty.
  """
  if (len(testDataset) == 0):
    return
  # The classifier predicts positions in its model list, so remember which
  # position each label got rather than assuming position == int(label).
  labels = sorted(model)
  class_index = {label: index for index, label in enumerate(labels)}
  digit_clf = BayesClassifier([model[label] for label in labels])
  true_cnt = 0
  total = 0
  for label in sorted(testDataset.keys()):
      features = np.array(testDataset[label], dtype='object')
      pred = digit_clf.predict(features)
      # A label without a model can never be predicted (-1 matches nothing).
      iter_cnt = (pred == class_index.get(label, -1)).sum()
      iter_total = len(features)
      total += iter_total
      true_cnt += iter_cnt
      print(f"{label}: {iter_cnt}/{iter_total} ({iter_cnt/iter_total})")
  print("Final recognition rate is %.2f"%(100.0*true_cnt/total), "%")

## Đọc và tiền xử lý data

In [None]:
# Extract features for the FSDD training files (directory passed with a trailing slash)
master_path = 'fsdd'

trainDir = master_path + '/train_audio/'
trainDataSet_fsdd = buildDataSet(trainDir)
print("Finish prepare the training data")

# Extract features for the FSDD test files
testDir = master_path + '/test_audio/'
testDataSet_fsdd = buildDataSet(testDir)
print("Finish prepare the test data")

Finish prepare the training data
Finish prepare the test data


In [None]:
# Extract features for the Wolfram training files (directory passed with a trailing slash)
master_path = 'wolfram'

trainDir = master_path + '/train_audio/'
trainDataSet_wolfram = buildDataSet(trainDir)
print("Finish prepare the training data")

# Extract features for the Wolfram test files
testDir = master_path + '/test_audio/'
testDataSet_wolfram = buildDataSet(testDir)
print("Finish prepare the test data")

Finish prepare the training data
Finish prepare the test data


In [None]:
# Combine the FSDD and Wolfram training sequences per label into a NEW dict.
# Building fresh lists leaves trainDataSet_fsdd untouched, so re-running this
# cell does not append the Wolfram data a second time. Labels missing from the
# Wolfram set simply contribute no extra sequences.
trainDataSet = {
  label: list(features) + (list(trainDataSet_wolfram.get(label, [])) if wolfram_split > 0 else [])
  for label, features in trainDataSet_fsdd.items()
}

## **Mô hình 1**
- 10 hidden states
- Multivariate Diagonal Gauss cho emission probs

In [None]:
# Sanity check: print the length of the shortest training sequence of every label
for label in trainDataSet.keys():
  print(min(map(len, trainDataSet[label])))

10
10
10
10
10
10
10
10
10
10


In [None]:
%%time
# Train one HMM per label with 10 hidden states and n_mixtures components per state
hmmModels = train_GMMHMM(trainDataSet, n_mfcc_features,10,n_mixtures)
print("Finish training of the GMM_HMM models for digits 0-9")

Training model detect 1
[1] Improvement: 20478931.559706055	Time (s): 0.345
[2] Improvement: 14168.130525864311	Time (s): 0.3636
[3] Improvement: 5056.998139132396	Time (s): 0.3651
[4] Improvement: 2973.599065323244	Time (s): 0.3457
[5] Improvement: 1796.935282592778	Time (s): 0.3598
[6] Improvement: 1375.9951692214236	Time (s): 0.3739
[7] Improvement: 820.5307753580855	Time (s): 0.3453
[8] Improvement: 240.08200422266964	Time (s): 0.348
[9] Improvement: 174.63922265230212	Time (s): 0.3488
[10] Improvement: 113.49629785795696	Time (s): 0.3506
[11] Improvement: 399.9217955148779	Time (s): 0.361
[12] Improvement: 953.3632163577713	Time (s): 0.3382
[13] Improvement: 408.499326767982	Time (s): 0.3559
[14] Improvement: 113.20571220910642	Time (s): 0.3613
[15] Improvement: 38.7025635590544	Time (s): 0.3481
[16] Improvement: 29.91300344571937	Time (s): 0.357
[17] Improvement: 48.022501484141685	Time (s): 0.3576
[18] Improvement: 44.18887689849362	Time (s): 0.3516
[19] Improvement: 41.26145169

In [None]:
# Report the recognition rate on the combined training set and on both held-out sets
print("Evaluate on train set")
evaluateModel(trainDataSet, hmmModels)
print("Evaluate on rest of FSDD set")
evaluateModel(testDataSet_fsdd, hmmModels)
print("Evaluate on wolfram test set")
evaluateModel(testDataSet_wolfram, hmmModels)

Evaluate on train set
1: (600/837)
2: (1061/1582)
6: (1689/2428)
8: (2234/3223)
0: (2901/4018)
7: (3480/4807)
9: (4071/5591)
5: (4743/6403)
3: (5293/7196)
4: (5892/7999)
Final recognition rate is 73.66 %
Evaluate on rest of FSDD set
0: (181/211)
1: (317/419)
3: (434/637)
9: (560/839)
7: (713/1073)
4: (887/1287)
5: (1083/1490)
8: (1190/1687)
2: (1311/1901)
6: (1417/2100)
Final recognition rate is 67.48 %
Evaluate on wolfram test set
4: (1188/1655)
8: (2331/3315)
1: (3464/4940)
2: (4518/6654)
6: (5821/8278)
0: (7185/9948)
3: (8273/11593)
7: (9524/13247)
5: (10860/14889)
9: (12140/16567)
Final recognition rate is 73.28 %


In [None]:
# Save the models: upload each one to the Drive folder as JSON.
# The "_10_" in the title is the hidden-state count used by the training cell above.
for model_label in hmmModels.keys():
  file = drive.CreateFile({'title': f'hmm[{model_label}]_{n_mfcc_features}_10_{n_mixtures}[{experiment_id}].json', 'parents': [{'id': '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'}]})
  file.SetContentString(hmmModels[model_label].to_json())
  file.Upload()

## **Mô hình 2**
- 5 hidden states
- Multivariate Diagonal Gauss cho emission probs

In [None]:
%%time
# Train one HMM per label with 5 hidden states and n_mixtures components per state
hmmModels = train_GMMHMM(trainDataSet, n_mfcc_features,5,n_mixtures)
print("Finish training of the GMM_HMM models for digits 0-9")

Training model detect 1
[1] Improvement: 2876580.6130467383	Time (s): 0.1885
[2] Improvement: 14758.788750157575	Time (s): 0.1876
[3] Improvement: 8046.504376378609	Time (s): 0.1966
[4] Improvement: 3950.0929279159755	Time (s): 0.2118
[5] Improvement: 3299.459992577904	Time (s): 0.2115
[6] Improvement: 3763.804006489576	Time (s): 0.2007
[7] Improvement: 2448.087964039645	Time (s): 0.194
[8] Improvement: 1507.763133393135	Time (s): 0.2018
[9] Improvement: 525.1382276046788	Time (s): 0.203
[10] Improvement: 155.6356644919142	Time (s): 0.2017
[11] Improvement: 46.76211839646567	Time (s): 0.2058
[12] Improvement: 12.227057698532008	Time (s): 0.1967
[13] Improvement: 4.395454776124097	Time (s): 0.2094
[14] Improvement: 2.2847922288347036	Time (s): 0.1923
[15] Improvement: 1.4695751704275608	Time (s): 0.2137
[16] Improvement: 776.4992896548938	Time (s): 0.1899
[17] Improvement: 6396.362451163819	Time (s): 0.1956
[18] Improvement: 1616.3879695897922	Time (s): 0.1917
[19] Improvement: 209.9011

In [None]:
# Report the recognition rate on the combined training set and on both held-out sets
print("Evaluate on train set")
evaluateModel(trainDataSet, hmmModels)
print("Evaluate on rest of FSDD set")
evaluateModel(testDataSet_fsdd, hmmModels)
print("Evaluate on wolfram test set")
evaluateModel(testDataSet_wolfram, hmmModels)

Evaluate on train set
1: (529/837)
2: (779/1582)
6: (1435/2428)
8: (1958/3223)
0: (2597/4018)
7: (3135/4807)
9: (3688/5591)
5: (4350/6403)
3: (4949/7196)
4: (5561/7999)
Final recognition rate is 69.52 %
Evaluate on rest of FSDD set
0: (201/211)
1: (288/419)
3: (433/637)
9: (566/839)
7: (724/1073)
4: (901/1287)
5: (1094/1490)
8: (1230/1687)
2: (1287/1901)
6: (1412/2100)
Final recognition rate is 67.24 %
Evaluate on wolfram test set
4: (1232/1655)
8: (2252/3315)
1: (3308/4940)
2: (3855/6654)
6: (5154/8278)
0: (6478/9948)
3: (7664/11593)
7: (8813/13247)
5: (10139/14889)
9: (11303/16567)
Final recognition rate is 68.23 %


In [None]:
# Save the models: upload each one to the Drive folder as JSON.
# The "_5_" in the title is the hidden-state count used by the training cell above.
for model_label in hmmModels.keys():
  file = drive.CreateFile({'title': f'hmm[{model_label}]_{n_mfcc_features}_5_{n_mixtures}[{experiment_id}].json', 'parents': [{'id': '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'}]})
  file.SetContentString(hmmModels[model_label].to_json())
  file.Upload()

## **Mô hình 3**
- 3 hidden states
- Multivariate Diagonal Gauss cho emission probs

In [None]:
%%time
# Train one HMM per label with 3 hidden states and n_mixtures components per state
hmmModels = train_GMMHMM(trainDataSet, n_mfcc_features,3,n_mixtures)
print("Finish training of the GMM_HMM models for digits 0-9")

Training model detect 1
[1] Improvement: 1495462.0770489196	Time (s): 0.1453
[2] Improvement: 8875.651223908062	Time (s): 0.1284
[3] Improvement: 3344.8654313499574	Time (s): 0.1353
[4] Improvement: 2361.3712463395204	Time (s): 0.1382
[5] Improvement: 2009.8475089024287	Time (s): 0.1597
[6] Improvement: 1463.1104154450586	Time (s): 0.1282
[7] Improvement: 1413.5905153042404	Time (s): 0.1469
[8] Improvement: 1256.3467483988497	Time (s): 0.1291
[9] Improvement: 1038.2746672614012	Time (s): 0.1319
[10] Improvement: 661.8676799485693	Time (s): 0.1393
[11] Improvement: 559.8382111227838	Time (s): 0.1422
[12] Improvement: 470.91489787376486	Time (s): 0.1378
[13] Improvement: 543.7067918528337	Time (s): 0.1437
[14] Improvement: 614.6394524568459	Time (s): 0.1333
[15] Improvement: 895.7862075380981	Time (s): 0.1408
[16] Improvement: 1044.0526873695198	Time (s): 0.1349
[17] Improvement: 956.230963386246	Time (s): 0.1307
[18] Improvement: 852.0133388234535	Time (s): 0.13
[19] Improvement: 867.80

In [None]:
# Report the recognition rate on the combined training set and on both held-out sets
print("Evaluate on train set")
evaluateModel(trainDataSet, hmmModels)
print("Evaluate on rest of FSDD set")
evaluateModel(testDataSet_fsdd, hmmModels)
print("Evaluate on wolfram test set")
evaluateModel(testDataSet_wolfram, hmmModels)

Evaluate on train set
1: (494/837)
2: (1002/1582)
6: (1582/2428)
8: (2104/3223)
0: (2685/4018)
7: (3177/4807)
9: (3683/5591)
5: (4307/6403)
3: (4413/7196)
4: (4957/7999)
Final recognition rate is 61.97 %
Evaluate on rest of FSDD set
0: (186/211)
1: (290/419)
3: (298/637)
9: (411/839)
7: (542/1073)
4: (698/1287)
5: (892/1490)
8: (1029/1687)
2: (1195/1901)
6: (1298/2100)
Final recognition rate is 61.81 %
Evaluate on wolfram test set
4: (1107/1655)
8: (2116/3315)
1: (3089/4940)
2: (4268/6654)
6: (5442/8278)
0: (6654/9948)
3: (6843/11593)
7: (7908/13247)
5: (9154/14889)
9: (10271/16567)
Final recognition rate is 62.00 %


In [None]:
# Save the models: upload each one to the Drive folder as JSON.
# The "_3_" in the title is the hidden-state count used by the training cell above.
for model_label in hmmModels.keys():
  file = drive.CreateFile({'title': f'hmm[{model_label}]_{n_mfcc_features}_3_{n_mixtures}[{experiment_id}].json', 'parents': [{'id': '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'}]})
  file.SetContentString(hmmModels[model_label].to_json())
  file.Upload()