<a href="https://colab.research.google.com/github/tajtu1406/BT2-N15/blob/master/Speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
import librosa, librosa.display
import matplotlib.pyplot as plt
import numpy as np
import IPython.display as idsp

# MFCC Feature calculation

In [27]:
import os
import numpy as np
import librosa


def mfcc(audio_path):
    y, sr = librosa.load(audio_path)
    # MFCCs
    mfccs = librosa.feature.mfcc(y = y, sr = sr, n_mfcc = 13)
    ans = [mfccs]
    # Compute the first MFCCs derivatives
    delta_mfccs = librosa.feature.delta(mfccs, order = 1, mode ='nearest')
    ans.append(delta_mfccs)
    # Compute the second MFCCs derivatives
    delta2_mfccs = librosa.feature.delta(mfccs, order = 2, mode ='nearest')
    ans.append(delta2_mfccs) 
    return np.transpose(np.concatenate(ans, axis = 0),[1,0])

# DTW Implementation 
### Averaging trains template

In [4]:
from math import sqrt

def calculate_Euclid_distance(A, B):
    return sqrt(sum([(a - b) ** 2 for (a, b) in zip(A, B)]))


def dtw(M1, M2):
    # length of two sequences
    M1_len = len(M1)
    M2_len = len(M2)
    cost_0 = np.zeros((M1_len + 1, M2_len + 1))
    cost_0[0, 1:] = np.inf
    cost_0[1:, 0] = np.inf
    # Initialize the array size to M1_len * M2_len
    cost = cost_0[1:, 1:]
    for i in range(M1_len):
        for j in range(M2_len):
            cost[i, j] = calculate_Euclid_distance(M1[i], M2[j])
    # DP to calculate cost matrix
    for i in range(M1_len):
        for j in range(M2_len):
            cost[i, j] += min([cost_0[i, j], \
                               cost_0[min(i + 1, M1_len - 1), j], \
                               cost_0[i, min(j + 1, M2_len - 1)]])

    # calculate the warp path
    if len(M1) == 1:
        path = np.zeros(len(M2)), range(len(M2))
    elif len(M2) == 1:
        path = range(len(M1)), np.zeros(len(M1))
    else:
        i, j = np.array(cost_0.shape) - 2
        path_1, path_2 = [i], [j]
        while (i > 0) or (j > 0):
            arg_min = np.argmin((cost_0[i, j], cost_0[i, j + 1], cost_0[i + 1, j]))
            if arg_min == 0:
                i -= 1
                j -= 1
            elif arg_min == 1:
                i -= 1
            else:
                j -= 1
            path_1.insert(0, i)
            path_2.insert(0, j)
        path = np.array(path_1), np.array(path_2)
    # the minimum distance is the normalized distance
    return cost[-1, -1] / sum(cost.shape), path


def train_model_dtw(train_dir):
    model = []
    for label in ["len", "xuong", "trai", "phai", "a", "b", "nhay", "ban"]:
        label_dir = os.path.join(train_dir, str(label))
        file_list = os.listdir(label_dir)
        mfcc_list = []

        # read input files and calculate MFCC features
        for j in range(len(file_list)):
            file_path = os.path.join(label_dir, file_list[j])
            if file_path[-4:] == ".wav":
              file_mfcc = mfcc(file_path)
              mfcc_list.append(file_mfcc)

        # set the first sequence as master
        mfcc_count = np.zeros(len(mfcc_list[0]))
        mfcc_all = np.zeros(mfcc_list[0].shape)
        for i in range(len(mfcc_list)):
            # calculate the wrap path between master and each template
            _, path = dtw(mfcc_list[0], mfcc_list[i])
            for j in range(len(path[0])):
                mfcc_count[int(path[0][j])] += 1
                mfcc_all[int(path[0][j])] += mfcc_list[i][path[1][j]]

        # Generalization by averaging the templates
        final_mfcc = np.zeros(mfcc_all.shape)
        for i in range(len(mfcc_count)):
            for j in range(len(mfcc_all[i])):
                final_mfcc[i][j] = mfcc_all[i][j] / mfcc_count[i]
        model.append(final_mfcc)
    return model


def predict_dtw(model, file_path):
    mfcc_feat = mfcc(file_path)
    result = 0
    min_dist, _ = dtw(model[0], mfcc_feat)
    label = ["len", "xuong", "trai", "phai", "a", "b", "nhay", "ban"]
    for i in range(len(model)):
        dist, _ = dtw(model[i], mfcc_feat)
        if dist < min_dist:
            result = i
            min_dist = dist
    return label[result]

In [28]:
train_dir = "/content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/trainDTW"

model = train_model_dtw(train_dir)

In [29]:
pred_dir = "/content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/test/file128.wav"
sound, sr = librosa.load(pred_dir)

pred = predict_dtw(model, pred_dir)
print(pred)
idsp.Audio(data=sound, rate=sr)

nhay


# HMM

In [10]:
!pip install hmmlearn

Collecting hmmlearn
  Downloading hmmlearn-0.2.7-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (129 kB)
[?25l[K     |██▌                             | 10 kB 19.2 MB/s eta 0:00:01[K     |█████                           | 20 kB 26.2 MB/s eta 0:00:01[K     |███████▋                        | 30 kB 14.9 MB/s eta 0:00:01[K     |██████████                      | 40 kB 6.1 MB/s eta 0:00:01[K     |████████████▋                   | 51 kB 6.0 MB/s eta 0:00:01[K     |███████████████▏                | 61 kB 7.1 MB/s eta 0:00:01[K     |█████████████████▊              | 71 kB 7.6 MB/s eta 0:00:01[K     |████████████████████▏           | 81 kB 6.8 MB/s eta 0:00:01[K     |██████████████████████▊         | 92 kB 7.6 MB/s eta 0:00:01[K     |█████████████████████████▎      | 102 kB 6.9 MB/s eta 0:00:01[K     |███████████████████████████▊    | 112 kB 6.9 MB/s eta 0:00:01[K     |██████████████████████████████▎ | 122 kB 6.9 MB/s eta 0:00:01[K     |██████████████████████

In [11]:
from hmmlearn import hmm
import numpy as np

def train_model_hmm(train_dir):
    hmm_models = []
    # iterate through train dir
    for label in os.listdir(train_dir):
        label_dir = os.path.join(train_dir, label)
        
        # start training
        X = np.array([])
        train_files = [x for x in os.listdir(label_dir) if x.endswith('.wav')]
        for file_name in train_files:
            file_path = os.path.join(label_dir, file_name)
            print(label, file_path)
            try:
              features_mfcc = mfcc(file_path)
            
              # append mfcc to X
              if len(X) == 0:
                  X = features_mfcc
              else:
                  X = np.append(X, features_mfcc, axis=0)
            except:
              continue
              
        # HMM with Mixture of Gaussians
        model = hmm.GMMHMM(n_components=4, n_iter=1000)

        # fit model
        np.seterr(all='ignore')
        model.fit(X)
        hmm_models.append((model, label))
    return hmm_models


def predict_hmm(hmm_models, test_file):
    # get mfcc features
    features_mfcc = mfcc(test_file)

    # calculate the score and get the maximum score
    max_score = -float('inf')
    result = ""
    for item in hmm_models:
        model, label = item
        score = model.score(features_mfcc)
        if score > max_score:
            max_score = score
            result = label
    return result
    

In [12]:
train_path = "/content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train"
hmm_models = train_model_hmm(train_path)

len /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/len/file0.wav
len /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/len/file26.wav
len /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/len/file14.wav
len /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/len/file17.wav
len /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/len/file11.wav
len /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/len/file13.wav
len /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/len/file8.wav
len /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/len/file10.wav
len /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/len/file7.wav
len /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/len/file9.wav
len /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/len/file22.wav
len /content/drive/MyDrive/15 (nhóm có stt 14

  n_fft, y.shape[-1]


ban /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/ban/file134.wav
ban /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/ban/file171.wav
ban /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/ban/file242.wav
ban /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/ban/file276.wav
ban /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/ban/file316.wav
ban /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/ban/file175.wav
ban /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/ban/file345.wav
ban /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/ban/file233.wav
ban /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/ban/file96.wav
ban /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/ban/file182.wav
ban /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/ban/file192.wav
ban /content/drive/MyDrive/15 (nh

  n_fft, y.shape[-1]


b /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/b/file395.wav
b /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/b/file337.wav
b /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/b/file316.wav
b /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/b/file305.wav
b /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/b/file399.wav
b /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/b/file298.wav
b /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/b/file393.wav
b /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/b/file321.wav
b /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/b/file370.wav
b /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/b/file322.wav
b /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/b/file391.wav
b /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/b/file

  n_fft, y.shape[-1]


nhay /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/nhay/file353.wav
nhay /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/nhay/file336.wav
nhay /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/nhay/file393.wav
nhay /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/nhay/file383.wav
nhay /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/nhay/file294.wav
nhay /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/nhay/file376.wav
nhay /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/nhay/file281.wav
nhay /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/nhay/file258.wav
nhay /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/nhay/file394.wav
nhay /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/nhay/file365.wav
nhay /content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/train/nhay/file213.wav
nhay /cont

In [30]:
pred_dir = "/content/drive/MyDrive/15 (nhóm có stt 14 trong danh sách)/test/file128.wav"
sound, sr = librosa.load(pred_dir)
pred = predict_hmm(hmm_models, pred_dir)
print(pred)
idsp.Audio(data=sound, rate=sr)

nhay
