<a href="https://colab.research.google.com/github/vlozg/speech_hmm/blob/main/Test_DiagHMM_003.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LƯU Ý: NOTEBOOK NÀY CHỈ DÙNG ĐỂ SHOW KẾT QUẢ CHẠY, KHÔNG NÊN CHẠY LẠI NOTEBOOK NÀY 
(VÌ TRONG NÀY CÓ CODE LƯU LẠI PRETRAINED MODEL LÊN DRIVE SẼ BỊ XÓA)

# Speech to text with HMM

- **Bài toán**: Chuyển giọng nói thành văn bản
    - **Input**: Đoạn ghi âm chứa nội dung là các số từ 0 đến 9
    - **Output**: Phân lớp của đoạn ghi âm

# Các biến thiết lập cho thử nghiệm

In [None]:
# Number of cepstral coefficients kept per frame
# (extract_mfcc requests n_mfcc_ceptrum+2 coefficients and drops the first two).
n_mfcc_ceptrum = 12
# Highest delta order appended to the static features
# (0 = none; full_mfcc_from_file handles orders 1 and 2).
n_delta_features = 1
# Fraction of each dataset that makeTrainTestDir copies into train_audio;
# the remainder goes to test_audio (1 = everything trains, 0 = everything tests).
fsdd_split = 1
wolfram_split = 0
# Tag embedded in the file names of the models saved to / loaded from Drive.
experiment_id = '003'

In [None]:
# Feature dimension per frame: the static coefficients plus one block of the
# same width for every delta order.
n_mfcc_features = n_mfcc_ceptrum * (1+n_delta_features)
n_mfcc_features

24

# Import và cài đặt thư viện

In [None]:
# cài lib. note: cài xong phải restart runtime
!pip install pydub
!pip install pomegranate



In [None]:
# Xác thực google để upload/download qua google drive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Quản lý file, folder
import os
from shutil import copyfile, rmtree
import random

# Xử lý audio
import librosa
import librosa.display
from scipy.io import wavfile

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pomegranate # Thư viện cho mô hình xác suất
from pomegranate import *

# Tải dữ liệu và lấy xác thực Google

Dữ liệu dùng để huấn luyện và đánh giá, còn xác thực google thì dùng để upload/download mô hình trên drive.

In [None]:
%%capture
# download wolfram (skipped when the zip is already in the working directory)
if not os.path.isfile('./dataset_1_wolfram.zip'):
  !gdown --id 115tIAitBNeJC0DwrP-ZyJ6RS3TyWN0qD
  !unzip -o dataset_1_wolfram.zip

# download FSDD (skipped when the zip is already in the working directory)
if not os.path.isfile('./dataset_2_FSDD.zip'):
  !gdown --id 1Ua9zlPBc0Fv4xGHSQTb7eIvUh_dqFI6P
  !unzip -o dataset_2_FSDD.zip

# download self recorded audio (unconditional: fetched again on every run)
!gdown --id 1lH_k1AYMVlJvodtZdD7OK2zkdPXxlW9i

In [None]:
# Obtain Google credentials so that files can be uploaded to / downloaded from Drive
auth.authenticate_user()
gauth =  GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Hàm xử lý âm thanh

In [None]:
def scaleAddNoise(wave, noise_scale=0.2, rng=None):
  """Standardise a waveform and add Gaussian noise.

  The signal is shifted to zero mean and divided by its standard deviation,
  then `noise_scale * N(0, 1)` noise of the same shape is added.

  Args:
    wave: array-like of samples.
    noise_scale: multiplier applied to the unit-variance noise (default 0.2).
    rng: optional object exposing `normal(size=...)`, e.g. a
      `np.random.RandomState` or `np.random.Generator`. When omitted the
      global `np.random` state is used.

  Returns:
    Float array with the same shape as `wave`.
  """
  wave = np.asarray(wave, dtype=float)
  centered = wave - wave.mean()
  spread = wave.std()
  # A constant (e.g. all-silent) signal has zero spread; dividing would give
  # 0/0 = NaN and poison every downstream feature, so leave it centred only.
  if spread > 0:
    centered = centered / spread
  draw_normal = np.random.normal if rng is None else rng.normal
  return centered + noise_scale * draw_normal(size=wave.shape)

def read_process_audio(full_audio_path):
  """Read an audio file as a mono 16 kHz waveform, pad it and add noise.

  The file is first read with scipy. Only when its sample rate is not
  16000 Hz is it re-read with librosa, which resamples it. The waveform is
  then zero-padded with 2000 samples on each side and passed through
  `scaleAddNoise`.

  Args:
    full_audio_path: path of the wav file.

  Returns:
    `(wave, sample_rate)`: a 1-D float array and the sample rate (16000).
  """
  # All audio must end up at the same sample rate.
  sample_rate, wave = wavfile.read(full_audio_path)
  if sample_rate != 16000:
    # librosa's loader is slow (scipy is 5-10x faster), so it is used only
    # when resampling is needed.
    wave, sample_rate = librosa.load(full_audio_path, sr=16000)
  else:
    # scipy returns (n_samples, n_channels) for multi-channel files. Average
    # the channels so the 1-D padding below is valid and both branches
    # produce a mono signal.
    if wave.ndim > 1:
      wave = wave.mean(axis=1)
    # Convert to float. NOTE(review): 32768 assumes 16-bit PCM; the absolute
    # scale is later removed by the standardisation in scaleAddNoise.
    wave = wave / 32768
  wave = np.pad(wave, (2000, 2000), 'constant', constant_values=(0.0, 0.0))
  wave = scaleAddNoise(wave)
  return wave, sample_rate

def extract_mfcc(wave, sample_rate):
  """Compute the scaled MFCC matrix of a waveform.

  Requests `n_mfcc_ceptrum + 2` coefficients from librosa (40 mel bands,
  lifter 40), discards the first two rows of the result, transposes it and
  divides every value by 800.

  Args:
    wave: audio samples passed to librosa as `y`.
    sample_rate: sample rate passed to librosa as `sr`.

  Returns:
    The transposed coefficient matrix with `n_mfcc_ceptrum` columns.
  """
  all_coeffs = librosa.feature.mfcc(
      y=wave,
      sr=sample_rate,
      n_mfcc=n_mfcc_ceptrum + 2,
      n_mels=40,
      lifter=40,
  )
  kept_coeffs = all_coeffs[2:, :]  # skip the first two rows
  return kept_coeffs.T / 800

def mfcc_delta_features(mfcc, order, axis=0):
  """Return the delta features of the given order for an MFCC matrix.

  Args:
    mfcc: feature matrix laid out as frames x coefficients, which is the
      layout `extract_mfcc` returns after its transpose.
    order: derivative order; 0 returns `mfcc` unchanged.
    axis: axis along which time runs. Defaults to 0 to match the
      frames x coefficients layout. librosa's own default (-1) would
      differentiate across the coefficients of one frame instead of across
      time.

  Returns:
    Array with the same shape as `mfcc`.

  Note:
    With librosa's default settings (width 9, 'interp' mode) the input needs
    at least 9 frames along `axis`.
  """
  if order == 0:
    return mfcc
  return librosa.feature.delta(mfcc, order=order, axis=axis)

def full_mfcc_from_file(full_audio_path):
  """Build the full feature matrix (static MFCCs plus deltas) for one file.

  The static MFCCs are always included. Delta blocks of order 1 and 2 are
  stacked to their right when `n_delta_features` is at least that order;
  orders above 2 are never added.

  Args:
    full_audio_path: path of the audio file.

  Returns:
    The static MFCC matrix, horizontally extended by the enabled delta blocks.
  """
  wave, sr = read_process_audio(full_audio_path)
  static = extract_mfcc(wave, sr)
  blocks = [static]
  for delta_order in (1, 2):
    if n_delta_features >= delta_order:
      blocks.append(mfcc_delta_features(static, delta_order))
  if len(blocks) == 1:
    return static
  return np.hstack(blocks)

# Hàm bổ trợ

In [None]:
def buildDataSet(dir):
    """Extract features for every wav file in a directory, grouped by label.

    File names are expected to look like `<index>_<label>.wav`; the label is
    the second underscore-separated token of the part before the first dot.

    Args:
        dir: directory holding the wav files, with or without a trailing
            separator.

    Returns:
        Dict mapping each label (str) to the list of feature matrices
        returned by `full_mfcc_from_file`, in sorted file-name order.

    Raises:
        IndexError: if a wav file name contains no underscore.
    """
    dataset = {}
    # Sorted so the sample order does not depend on the filesystem.
    for fileName in sorted(os.listdir(dir)):
        # Only wav audio files are considered.
        if os.path.splitext(fileName)[1] != '.wav':
            continue
        stem = fileName.split('.')[0]
        label = stem.split('_')[1]
        # os.path.join works whether or not `dir` ends with a separator.
        feature = full_mfcc_from_file(os.path.join(dir, fileName))
        dataset.setdefault(label, []).append(feature)
    return dataset

def makeTrainTestDir(mainFolder, filenames, rate):
    """Create `<mainFolder>/train_audio` and `/test_audio` and fill them.

    Any existing `mainFolder` tree is removed first. The files are shuffled
    with a fixed seed (1) and the first `int(len(filenames) * rate)` go to
    the training folder, the rest to the test folder.

    Args:
        mainFolder: root directory to (re)create.
        filenames: sequence of `(source_path, target_file_name)` pairs. The
            caller's sequence is not modified.
        rate: fraction of the files assigned to the training folder.
    """
    paths = [f'{mainFolder}',
             f'{mainFolder}/train_audio',
             f'{mainFolder}/test_audio']

    for path in paths:
        # Start from a clean directory. Only the "already exists" case is
        # handled; any other filesystem error propagates.
        if os.path.isdir(path):
            rmtree(path)
        os.makedirs(path)

    # Shuffle a copy with a private generator. Seed 1 gives the same
    # permutation as `random.seed(1); random.shuffle(...)`, without reseeding
    # the global generator or reordering the caller's list.
    shuffled = list(filenames)
    random.Random(1).shuffle(shuffled)
    splitPoint = int(len(shuffled) * rate)

    for entry in shuffled[:splitPoint]:
        copyfile(entry[0], f"{paths[1]}/{entry[1]}")

    for entry in shuffled[splitPoint:]:
        copyfile(entry[0], f"{paths[2]}/{entry[1]}")

# Setup thư mục chứa data

In [None]:
def formatFilenameFSDD(dir):
    """List the FSDD files with the normalised names they should be copied to.

    The label is the part of each file name before the first underscore.
    Each target name is `<running index>_<label>.wav`.

    Args:
        dir: directory holding the FSDD recordings.

    Returns:
        List of `(source_path, target_file_name)` tuples in sorted
        file-name order.
    """
    # Sorted so indices (and the seeded train/test split built from this
    # list) do not depend on the order the filesystem reports entries in.
    return [
        (f"{dir}/{filename}", f"{index}_{filename.split('_')[0]}.wav")
        for index, filename in enumerate(sorted(os.listdir(dir)))
    ]

# Copy the FSDD recordings into fsdd/train_audio and fsdd/test_audio according to fsdd_split.
filenames = formatFilenameFSDD('./dataset_2_FSDD')
makeTrainTestDir('fsdd', filenames, fsdd_split)

In [None]:
def formatFilenameWolfram(dir):
    """List the Wolfram files with the normalised names they should be copied to.

    The dataset is laid out as one sub-folder per class. The label is the
    part of the folder name before the first underscore. Each target name is
    `<running index>_<label>.wav`.

    Args:
        dir: root directory holding one sub-folder per class.

    Returns:
        List of `(source_path, target_file_name)` tuples, ordered by sorted
        folder name and then sorted file name.
    """
    filenames = []
    # Sorted so indices (and the seeded train/test split built from this
    # list) do not depend on the order the filesystem reports entries in.
    for folder in sorted(os.listdir(dir)):
        label = folder.split('_')[0]
        for filename in sorted(os.listdir(f'{dir}/{folder}')):
            # len(filenames) is the running index across all folders.
            target = f"{len(filenames)}_{label}.wav"
            filenames.append((f"{dir}/{folder}/{filename}", target))
    return filenames

# Copy the Wolfram recordings into wolfram/train_audio and wolfram/test_audio according to wolfram_split.
filenames = formatFilenameWolfram('./dataset_1_wolfram')
makeTrainTestDir('wolfram', filenames, wolfram_split)

# Huấn luyện mô hình (có thể bỏ qua vì mô hình đã save trên drive)

## Hàm train mô hình

In [None]:
def Generate_DiagGMM(full_fset, n_features, n_states=5, n_cmps=3):
  """Create randomly initialised diagonal-Gaussian emissions, one per state.

  Each component is an `IndependentComponentsDistribution` of `n_features`
  univariate `NormalDistribution`s. Means are drawn uniformly from
  [-0.5, 0.5) and standard deviations from [0, 1). With `n_cmps > 1` each
  state gets a `GeneralMixtureModel` of `n_cmps` components; otherwise it
  gets a single component.

  Args:
    full_fset: sample feature matrix. Currently unused: the initialisation
      does not look at the data. Kept so existing callers keep working.
    n_features: dimensionality of one observation.
    n_states: number of hidden states to create emissions for.
    n_cmps: number of mixture components per state.

  Returns:
    Tuple of `n_states` emission distributions.
  """
  # NOTE(review): reseeds NumPy's global generator from OS entropy, so every
  # call gets a different initialisation and any seed set earlier is lost.
  np.random.seed(None)
  # Last axis holds (mean, std) for every state / component / feature.
  dist_init = np.random.random((n_states, n_cmps, n_features, 2))
  dist_init[..., 0] -= 0.5  # centre the means on 0.0

  def diag_gaussian(state_i, cmp_i):
    # One diagonal-covariance Gaussian built from independent 1-D normals.
    return IndependentComponentsDistribution(tuple(
        NormalDistribution(*dist_init[state_i, cmp_i, feat_i, :])
        for feat_i in range(n_features)
    ))

  if n_cmps > 1:
    return tuple(
        GeneralMixtureModel([diag_gaussian(state_i, cmp_i) for cmp_i in range(n_cmps)])
        for state_i in range(n_states)
    )
  return tuple(diag_gaussian(state_i, 0) for state_i in range(n_states))

In [None]:
# Generate progressive HMM model
def left_right_GMMHMM(seed_sample, x_dim, n_states=10, n_modals=9, random=0):
  """Build a left-to-right HMM whose emissions come from `Generate_DiagGMM`.

  Topology: start -> H0 with probability 1. Every state either stays
  (0.5) or advances to the next state (0.5). The last state either stays
  (0.5) or moves to the end state (0.5).

  Args:
    seed_sample: sample feature matrix forwarded to `Generate_DiagGMM`.
    x_dim: dimensionality of one observation.
    n_states: number of hidden states.
    n_modals: number of mixture components per state.
    random: currently unused. Transition probabilities are always the fixed
      values above. Kept so existing callers keep working.

  Returns:
    The baked `HiddenMarkovModel`.
  """
  model = HiddenMarkovModel()
  emissions = Generate_DiagGMM(seed_sample, x_dim, n_states, n_modals)
  states = [State(state_model, name=f"H{i}") for i, state_model in enumerate(emissions)]
  model.add_states(states)
  model.add_transition(model.start, states[0], 1)
  for i in range(n_states-1):
    model.add_transition(states[i], states[i], 0.5)
    model.add_transition(states[i], states[i+1], 0.5)
  model.add_transition(states[n_states-1], states[n_states-1], 0.5)
  model.add_transition(states[n_states-1], model.end, 0.5)
  model.bake()
  return model

In [None]:
def train_GMMHMM(dataset, input_dim, n_hidden_state, n_gauss_modal):
    """Fit one left-to-right HMM per label.

    Args:
        dataset: dict mapping a label to its list of feature sequences.
        input_dim: dimensionality of one observation.
        n_hidden_state: number of hidden states in each model.
        n_gauss_modal: number of mixture components per state.

    Returns:
        Dict mapping each label to its fitted model.
    """
    trained = {}
    for digit, sequences in dataset.items():
        print(f"Training model detect {digit}")
        # The first sequence of the label is handed over as the seed sample.
        hmm = left_right_GMMHMM(sequences[0], input_dim, n_hidden_state, n_gauss_modal)
        hmm.fit(sequences, verbose=True, multiple_check_input=False)  # estimate the parameters
        trained[digit] = hmm
    return trained

## Hàm test mô hình

In [None]:
# test model
def evaluateModel(testDataSet, model):
    """Print per-label and overall recognition rates of a set of models.

    Each sample is assigned the label of the model that gives it the highest
    `log_probability`. One line `"<label>: (<correct>/<count>)"` is printed
    per label with that label's own counts, followed by the overall rate.

    Args:
        testDataSet: dict mapping a label to its list of feature sequences.
            When empty, nothing is printed.
        model: dict mapping a label to a model exposing
            `log_probability(sample)`.
    """
    if (len(testDataSet) == 0):
        return
    correct_total = 0
    total = 0
    for label, features in testDataSet.items():
        correct = 0
        for sample in features:
            scores = {
                model_label: candidate.log_probability(sample)
                for model_label, candidate in model.items()
            }
            if max(scores, key=scores.get) == label:
                correct += 1
        correct_total += correct
        total += len(features)
        # Report this label's own counts, not the running totals.
        print(f"{label}: ({correct}/{len(features)})")

    if total == 0:
        # Labels exist but none has samples: there is no rate to report.
        return
    print("Final recognition rate is %.2f"%(100.0*correct_total/total), "%")

## Đọc và tiền xử lý data

In [None]:
# prepare data for training
master_path = 'fsdd'

trainDir = master_path + '/train_audio/'
trainDataSet = buildDataSet(trainDir)
print("Finish prepare the training data")

# prepare data for testing
# (with fsdd_split = 1 every file is copied to train_audio, so this folder is
# empty and testDataSet is an empty dict)
testDir = master_path + '/test_audio/'
testDataSet = buildDataSet(testDir)
print("Finish prepare the test data")

Finish prepare the training data
Finish prepare the test data


## **Mô hình 1**
- 10 hidden states
- Multivariate Diagonal Gauss cho emission probs

In [None]:
%%time
# Train one HMM per digit: 10 hidden states, 1 Gaussian component per state
hmmModels = train_GMMHMM(trainDataSet, n_mfcc_features,10,1)
print("Finish training of the GMM_HMM models for digits 0-9")

Training model detect 0
[1] Improvement: 1085507.977875689	Time (s): 0.1207
[2] Improvement: 21670.63736847823	Time (s): 0.1142
[3] Improvement: 10434.356812377227	Time (s): 0.1272
[4] Improvement: 6075.652460476616	Time (s): 0.117
[5] Improvement: 3481.2958777504973	Time (s): 0.1266
[6] Improvement: 1719.836036804656	Time (s): 0.1165
[7] Improvement: 486.5038237683475	Time (s): 0.1158
[8] Improvement: 257.34611498832237	Time (s): 0.1375
[9] Improvement: 470.13447379058925	Time (s): 0.1168
[10] Improvement: 1758.2111946879304	Time (s): 0.1164
[11] Improvement: 2210.526662943943	Time (s): 0.1305
[12] Improvement: 538.1717384934891	Time (s): 0.1186
[13] Improvement: 144.84123421803815	Time (s): 0.1189
[14] Improvement: 58.567427345435135	Time (s): 0.12
[15] Improvement: 28.071113599988166	Time (s): 0.1331
[16] Improvement: 29.29329902830068	Time (s): 0.1232
[17] Improvement: 27.523424133774824	Time (s): 0.1155
[18] Improvement: 12.449945897853468	Time (s): 0.1179
[19] Improvement: 5.6983

In [None]:
# Recognition rate on the training split, then on the test split.
# evaluateModel prints nothing for an empty dataset, which is the case for
# testDataSet when fsdd_split = 1.
evaluateModel(trainDataSet, hmmModels)
evaluateModel(testDataSet, hmmModels)

0: (289/300)
1: (564/600)
3: (802/900)
9: (1061/1200)
2: (1322/1500)
7: (1609/1800)
6: (1797/2100)
8: (2089/2400)
4: (2373/2700)
5: (2665/3000)
Final recognition rate is 88.83 %


In [None]:
# Save the models: one JSON file per digit is uploaded to the Drive folder given in 'parents',
# named hmm[<digit>]_<n_features>_<states>_<mixtures>[<experiment_id>].json
for model_label in hmmModels.keys():
  file = drive.CreateFile({'title': f'hmm[{model_label}]_{n_mfcc_features}_10_1[{experiment_id}].json', 'parents': [{'id': '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'}]})
  file.SetContentString(hmmModels[model_label].to_json())
  file.Upload()

## **Mô hình 2**
- 5 hidden states
- Multivariate Diagonal Gauss cho emission probs

In [None]:
%%time
# Train one HMM per digit: 5 hidden states, 1 Gaussian component per state
# (this overwrites the hmmModels produced by the previous experiment)
hmmModels = train_GMMHMM(trainDataSet, n_mfcc_features,5,1)
print("Finish training of the GMM_HMM models for digits 0-9")

Training model detect 0
[1] Improvement: 1352061.0992885302	Time (s): 0.067
[2] Improvement: 29920.658146749425	Time (s): 0.07167
[3] Improvement: 3955.565207926149	Time (s): 0.08022
[4] Improvement: 2168.3975915202755	Time (s): 0.0833
[5] Improvement: 3914.662632855645	Time (s): 0.07352
[6] Improvement: 2349.5925377116073	Time (s): 0.06829
[7] Improvement: 719.739022299007	Time (s): 0.07092
[8] Improvement: 513.8422104595811	Time (s): 0.07185
[9] Improvement: 390.70234111783793	Time (s): 0.07521
[10] Improvement: 398.2286757331458	Time (s): 0.07265
[11] Improvement: 507.6466535498621	Time (s): 0.07473
[12] Improvement: 615.6030216776999	Time (s): 0.06643
[13] Improvement: 991.7962977138232	Time (s): 0.06707
[14] Improvement: 976.1698388964869	Time (s): 0.0817
[15] Improvement: 769.7503994976869	Time (s): 0.06995
[16] Improvement: 934.8044464359991	Time (s): 0.06694
[17] Improvement: 1119.6519251227146	Time (s): 0.07527
[18] Improvement: 1219.3324565070798	Time (s): 0.07759
[19] Improv

In [None]:
# Recognition rate on the training split, then on the test split.
# evaluateModel prints nothing for an empty dataset, which is the case for
# testDataSet when fsdd_split = 1.
evaluateModel(trainDataSet, hmmModels)
evaluateModel(testDataSet, hmmModels)

0: (271/300)
1: (534/600)
3: (764/900)
9: (993/1200)
2: (1178/1500)
7: (1457/1800)
6: (1564/2100)
8: (1742/2400)
4: (1996/2700)
5: (2274/3000)
Final recognition rate is 75.80 %


In [None]:
# Save the models: one JSON file per digit is uploaded to the Drive folder given in 'parents',
# named hmm[<digit>]_<n_features>_<states>_<mixtures>[<experiment_id>].json
for model_label in hmmModels.keys():
  file = drive.CreateFile({'title': f'hmm[{model_label}]_{n_mfcc_features}_5_1[{experiment_id}].json', 'parents': [{'id': '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'}]})
  file.SetContentString(hmmModels[model_label].to_json())
  file.Upload()

## **Mô hình 3**
- 3 hidden states
- Multivariate Diagonal Gauss cho emission probs

In [None]:
%%time
# Train one HMM per digit: 3 hidden states, 1 Gaussian component per state
# (this overwrites the hmmModels produced by the previous experiment)
hmmModels = train_GMMHMM(trainDataSet, n_mfcc_features,3,1)
print("Finish training of the GMM_HMM models for digits 0-9")

Training model detect 0
[1] Improvement: 952116.9911709531	Time (s): 0.0455
[2] Improvement: -4.0745362639427185e-10	Time (s): 0.0463
Total Training Improvement: 952116.9911709528
Total Training Time (s): 0.1584
Training model detect 1
[1] Improvement: 982742.7516663815	Time (s): 0.05153
[2] Improvement: 4750.036514986481	Time (s): 0.04976
[3] Improvement: 770.9778260166058	Time (s): 0.04469
[4] Improvement: 430.2690785501618	Time (s): 0.04631
[5] Improvement: 358.34904032276245	Time (s): 0.04556
[6] Improvement: 408.45727060578065	Time (s): 0.04877
[7] Improvement: 770.5155649979133	Time (s): 0.04676
[8] Improvement: 1494.903116714384	Time (s): 0.0487
[9] Improvement: 1985.7377452505752	Time (s): 0.04728
[10] Improvement: 1912.213257781812	Time (s): 0.054
[11] Improvement: 1307.9801112994319	Time (s): 0.06602
[12] Improvement: 639.3601672392106	Time (s): 0.05027
[13] Improvement: 208.3183276715572	Time (s): 0.05442
[14] Improvement: 57.21285968698794	Time (s): 0.0472
[15] Improvement:

In [None]:
# Recognition rate on the training split, then on the test split.
# evaluateModel prints nothing for an empty dataset, which is the case for
# testDataSet when fsdd_split = 1.
evaluateModel(trainDataSet, hmmModels)
evaluateModel(testDataSet, hmmModels)

0: (0/300)
1: (287/600)
3: (306/900)
9: (320/1200)
2: (438/1500)
7: (567/1800)
6: (774/2100)
8: (1037/2400)
4: (1123/2700)
5: (1340/3000)
Final recognition rate is 44.67 %


In [None]:
# Save the models: one JSON file per digit is uploaded to the Drive folder given in 'parents',
# named hmm[<digit>]_<n_features>_<states>_<mixtures>[<experiment_id>].json
for model_label in hmmModels.keys():
  file = drive.CreateFile({'title': f'hmm[{model_label}]_{n_mfcc_features}_3_1[{experiment_id}].json', 'parents': [{'id': '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'}]})
  file.SetContentString(hmmModels[model_label].to_json())
  file.Upload()

# Đọc vào model đã save

In [None]:
# download pre-trained models of the current experiment from the shared Drive folder
folder_id  = '1QPUr4vwYHu3n9iH3iQmnvDUt2Dgx4V3Y'
file_list = drive.ListFile({'q': "'{}' in parents and trashed=false".format(folder_id)}).GetList()

# Keep only this experiment's model files *before* numbering them, so the
# progress counter reports "k of the files actually downloaded" instead of a
# position within every file stored in the folder.
experiment_tag = f"[{experiment_id}]"
model_files = sorted(
    (f for f in file_list if f['title'].startswith("hmm") and experiment_tag in f['title']),
    key=lambda f: f['title'],
)
for i, file1 in enumerate(model_files, start=1):
  print('Downloading {} from GDrive ({}/{})'.format(file1['title'], i, len(model_files)))
  file1.GetContentFile(file1['title'])

Downloading hmm[0]_24_10_1[003].json from GDrive (13/192)
Downloading hmm[0]_24_3_1[003].json from GDrive (15/192)
Downloading hmm[0]_24_5_1[003].json from GDrive (17/192)
Downloading hmm[1]_24_10_1[003].json from GDrive (32/192)
Downloading hmm[1]_24_3_1[003].json from GDrive (34/192)
Downloading hmm[1]_24_5_1[003].json from GDrive (36/192)
Downloading hmm[2]_24_10_1[003].json from GDrive (51/192)
Downloading hmm[2]_24_3_1[003].json from GDrive (53/192)
Downloading hmm[2]_24_5_1[003].json from GDrive (55/192)
Downloading hmm[3]_24_10_1[003].json from GDrive (70/192)
Downloading hmm[3]_24_3_1[003].json from GDrive (72/192)
Downloading hmm[3]_24_5_1[003].json from GDrive (74/192)
Downloading hmm[4]_24_10_1[003].json from GDrive (89/192)
Downloading hmm[4]_24_3_1[003].json from GDrive (91/192)
Downloading hmm[4]_24_5_1[003].json from GDrive (93/192)
Downloading hmm[5]_24_10_1[003].json from GDrive (108/192)
Downloading hmm[5]_24_3_1[003].json from GDrive (110/192)
Downloading hmm[5]_24_5

In [None]:
# Read the saved models and build the classifier `digit_clf`.
# model_id has the form <n_features>_<n_states>_<n_mixtures>[<experiment id>]
def load_model_as_clf(model_id):
  """Load the ten per-digit HMMs of one experiment into a BayesClassifier.

  Reads `hmm[<digit>]_<model_id>.json` from the current working directory
  for the digits 0 to 9, in that order.

  Args:
    model_id: suffix identifying the model family, e.g. "24_10_1[003]".

  Returns:
    A `BayesClassifier` built from the ten models, ordered by digit.
  """
  per_digit_models = []
  for digit in range(10):
    json_path = f'hmm[{digit}]_{model_id}.json'
    with open(json_path, 'rt') as handle:
      serialized = handle.read()
    per_digit_models.append(HiddenMarkovModel().from_json(serialized))
  return BayesClassifier(per_digit_models)

# Thử model

### Kiểm thử với tập dữ liệu Wolfram

In [None]:
%%time
# Features of the Wolfram test split. Note the name: `testDataset` is a
# different variable from the FSDD `testDataSet` built earlier.
testDataset = buildDataSet('./wolfram/test_audio/')

CPU times: user 3min 52s, sys: 3min 14s, total: 7min 7s
Wall time: 3min 38s


In [None]:
def final_test(testDataset, digit_clf):
  """Print per-label and overall recognition rates of a classifier.

  Each sample is wrapped in a batch of one and passed to
  `digit_clf.predict`. The prediction is compared with `int(label)`.

  Args:
    testDataset: dict mapping a label (a string that parses as an int) to
      its list of feature sequences. When empty, nothing is printed.
    digit_clf: classifier exposing `predict(batch)` and returning one
      prediction per batch item.
  """
  if len(testDataset) == 0:
    # Same behaviour as evaluateModel: nothing to evaluate, nothing to report.
    return
  true_cnt = 0
  total = 0
  for label, features in testDataset.items():
    if len(features) == 0:
      # A label without samples has no rate; skip it rather than divide by 0.
      continue
    iter_cnt = 0
    for sample in features:
      pred = digit_clf.predict(np.array([sample]))[0]
      if pred == int(label):
        iter_cnt += 1
    true_cnt += iter_cnt
    total += len(features)
    print(f"{label}: {iter_cnt}/{len(features)} ({iter_cnt/len(features)})")

  if total == 0:
    return
  print("Final recognition rate is %.2f"%(100.0*true_cnt/total), "%")

In [None]:
# Evaluate the saved 10-state, 1-component models on the Wolfram test set.
final_test(testDataset, load_model_as_clf(f"{n_mfcc_features}_10_1[{experiment_id}]"))

4: 1230/2372 (0.5185497470489039)
8: 1160/2352 (0.4931972789115646)
7: 673/2377 (0.2831299957930164)
1: 1377/2370 (0.5810126582278481)
2: 968/2373 (0.40792246101980617)
3: 730/2356 (0.3098471986417657)
6: 1137/2369 (0.47994934571549175)
0: 1203/2376 (0.5063131313131313)
5: 1716/2357 (0.7280441238862961)
9: 1943/2364 (0.821912013536379)
Final recognition rate is 51.28 %


In [None]:
# Evaluate the saved 5-state, 1-component models on the Wolfram test set.
final_test(testDataset, load_model_as_clf(f"{n_mfcc_features}_5_1[{experiment_id}]"))

4: 1084/2372 (0.45699831365935917)
8: 451/2352 (0.1917517006802721)
7: 713/2377 (0.29995793016407235)
1: 1111/2370 (0.46877637130801686)
2: 616/2373 (0.25958702064896755)
3: 1006/2356 (0.42699490662139217)
6: 408/2369 (0.1722245673279865)
0: 1223/2376 (0.5147306397306397)
5: 1545/2357 (0.6554942723801442)
9: 1880/2364 (0.7952622673434856)
Final recognition rate is 42.41 %


In [None]:
# Evaluate the saved 3-state, 1-component models on the Wolfram test set.
final_test(testDataset, load_model_as_clf(f"{n_mfcc_features}_3_1[{experiment_id}]"))

4: 77/2372 (0.03246205733558179)
8: 1106/2352 (0.47023809523809523)
7: 154/2377 (0.06478754732856541)
1: 2112/2370 (0.8911392405063291)
2: 93/2373 (0.039190897597977246)
3: 32/2356 (0.013582342954159592)
6: 593/2369 (0.25031658927817646)
0: 1/2376 (0.00042087542087542086)
5: 365/2357 (0.15485787017394995)
9: 157/2364 (0.06641285956006768)
Final recognition rate is 19.82 %
