In [1]:
import os
import wave
import numpy as np
import scipy.io.wavfile as wav
from python_speech_features import mfcc, delta
from scipy import signal
import re

# 读取已经用 HTK 计算好的 MFCC 特征
RATE = 16000


def extract_MFCC(audio):
    wav_feature = mfcc(audio, samplerate=RATE, numcep=13, winlen=0.025, winstep=0.01, nfilt=26, nfft=512, lowfreq=0,
                       highfreq=None, preemph=0.97)
    d_mfcc_feat = delta(wav_feature, 1)
    d_mfcc_feat2 = delta(wav_feature, 2)
    feature_mfcc = np.hstack((wav_feature, d_mfcc_feat, d_mfcc_feat2))
    return feature_mfcc


# extract_MFCC(myrecording2).shape

In [2]:
from Utils.record import record_once
import sounddevice as sd
from scipy.io.wavfile import write, read

sos = signal.butter(99, [2 * 250, 2 * 3e3], fs=RATE, output='sos', btype='bandpass')


def bandpass(wav_data, order, fre_c):
    filtedData = signal.sosfilt(sos, wav_data)  # data为要过滤的信号
    # return np.reshape(filtedData, (len(filtedData), 1))
    return filtedData

# myrecording = record_once(1, RATE, 3, 'one', 'test_data')

# myrecording2 = bandpass(myrecording, 99, 3e3)
# sd.play(myrecording2, samplerate=RATE)
# sd.wait()


In [38]:
###
# sos = signal.butter(99, [2*250, 2 * 3e3], fs=RATE, output='sos', btype='bandpass')
fs, myrecording = read(
    r"D:\Program\pyProject\speech_signal_processing\ThirdPartyRepos\MFCC\test_data\one_1669015379.wav")
# myrecording2 = signal.sosfilt(sos, myrecording.ravel())  # data为要过滤的信号
myrecording2 = bandpass(myrecording, 99, 3e3)
sd.play(myrecording2, samplerate=RATE)
sd.wait()


In [3]:


def getMFCC(datapath):
    labels = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
    MFCC = []
    files = os.listdir(datapath)  # 得到文件夹下的所有文件名称
    rule = re.compile(r'(.*?)_.*?')
    for i in range(10):
        MFCC_rows = []
        for file in files:  # 遍历文件夹
            label = re.findall(rule, str(file))
            label = ''.join(label)
            if label == labels[i]:
                file_name = os.path.join(datapath, file)
                fs, audio = wav.read(file_name)  # audio: (len, )
                feature = extract_MFCC(bandpass(audio, 99, 3e3))
                MFCC_rows.append(feature)
        MFCC.append(MFCC_rows)
    return MFCC


_datapath = r"D:\Program\pyProject\DSP_SpeechNumberRecognization\dataset_test"
_MFCC = getMFCC(_datapath)


In [4]:

def getMFCC_train(datapath):
    labels = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
    MFCC = []
    for i in labels:
        MFCC_rows = []
        path = os.path.join(datapath, i)
        # print(path)
        for file in os.listdir(path):  # 遍历文件夹
            file_name = os.path.join(datapath, i, file)
            fs, audio = wav.read(file_name)  # audio: (len, )
            feature = extract_MFCC(bandpass(audio, 99, 3e3))
            MFCC_rows.append(feature)
        MFCC.append(MFCC_rows)
    return MFCC


_datapath = r"D:\Program\pyProject\DSP_SpeechNumberRecognization\dataset"
_MFCC = getMFCC_train(_datapath)

In [64]:

# DTW 算法...
def dtw(M1, M2):
    # 初始化数组 大小为 M1 * M2
    M1_len = len(M1)
    M2_len = len(M2)
    cost = [[0 for i in range(M2_len)] for j in range(M1_len)]

    # 初始化 dis 数组
    dis = []
    for i in range(M1_len):
        dis_row = []
        for j in range(M2_len):
            dis_row.append(distance(M1[i], M2[j]))
        dis.append(dis_row)

    # 初始化 cost 的第 0 行和第 0 列
    cost[0][0] = dis[0][0]
    for i in range(1, M1_len):
        cost[i][0] = cost[i - 1][0] + dis[i][0]
    for j in range(1, M2_len):
        cost[0][j] = cost[0][j - 1] + dis[0][j]

    # 开始动态规划
    for i in range(1, M1_len):
        for j in range(1, M2_len):
            cost[i][j] = min(cost[i - 1][j] + dis[i][j] * 1, cost[i - 1][j - 1] + dis[i][j] * 2,
                             cost[i][j - 1] + dis[i][j] * 1)
    return cost[M1_len - 1][M2_len - 1]


# 两个维数相等的向量之间的距离
def distance(x1, x2):
    sum = 0
    for i in range(len(x1)):
        sum = sum + abs(x1[i] - x2[i])
    return sum


def train_model(path):
    # 存储所有语音文件的 MFCC 特征
    # 读取已经用 HTK 计算好的 MFCC 特征
    MFCC = getMFCC(path)
    return MFCC


def speech_recognition(MFCC_models, wave_data):
    MFCC_recorded = extract_MFCC(wave_data)

    # 进行匹配
    flag = 0
    # min_dis = dtw(MFCC_recorded, MFCC_models[0][0])
    min_dis = 0xffffff
    for j in range(0, len(MFCC_models)):
        diss = []
        for mfcc_i in MFCC_models[j][:10]:
            dis = dtw(MFCC_recorded, mfcc_i)
            diss.append(dis)
        dis = np.mean(diss)
        # print(diss)
        if dis < min_dis:
            min_dis = dis
            flag = j
    return flag

In [66]:
import librosa
from pydub import AudioSegment
from pydub.silence import split_on_silence
import sounddevice as sd


def get_nparray(audiosegment):
    samples = audiosegment.get_array_of_samples()
    samples_float = librosa.util.buf_to_float(samples, n_bytes=2, dtype=np.float32)
    if audiosegment.channels == 2:
        sample_left = np.copy(samples_float[::2])
        sample_right = np.copy(samples_float[1::2])
        sample_all = np.array([sample_left, sample_right])
    else:
        sample_all = samples_float

    return sample_all


def get_longest_chunk(path):
    data = AudioSegment.from_wav(path)
    # print(f"db = {data.dBFS}")
    chunks = split_on_silence(data, min_silence_len=50, keep_silence=10, silence_thresh=data.dBFS - 5)
    lid = 0
    llen = 0
    for i, _c in enumerate(chunks):
        if len(_c) > llen:
            llen = len(_c)
            lid = i
    return get_nparray(chunks[lid])


# sd.play(get_nparray(data))

def mfcc_test(datapath):
    labels = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
    MFCC = []
    files = os.listdir(datapath)  # 得到文件夹下的所有文件名称
    for i in range(10):
        for file in files:  # 遍历文件夹
            rule = re.compile(r'(.*?)_.*?')
            label = re.findall(rule, str(file))
            label = ''.join(label)
            if label == labels[i]:
                file_name = os.path.join(datapath, file)
                audio = get_longest_chunk(file_name)
                print(f"{file} <-> {speech_recognition(_MFCC, bandpass(audio, 99, 3e3))}")


mfcc_test(r"D:\Program\pyProject\DSP_SpeechNumberRecognization\dataset_test")
# print(speech_recognition(_MFCC, myrecording2))


zero_79741.wav <-> 4
one_54027.wav <-> 7
one_77038.wav <-> 2
two_39242.wav <-> 4
two_47047.wav <-> 2
three_35168.wav <-> 2
three_45407.wav <-> 9
three_77900.wav <-> 4
four_17868.wav <-> 2
four_59485.wav <-> 9
five_71840.wav <-> 9
five_85869.wav <-> 4
five_86674.wav <-> 2
six_98475.wav <-> 2
six_99035.wav <-> 2
seven_69828.wav <-> 1
seven_77171.wav <-> 2
seven_91891.wav <-> 9
eight_18128.wav <-> 2
eight_80167.wav <-> 9


KeyboardInterrupt: 

# Adaboost

In [6]:
_MFCC_cleaned = []
for digit, digits_mfcc in enumerate(_MFCC):
    _MFCC_cleaned.append([])
    for digit_mfcc in digits_mfcc:
        if digit_mfcc.shape[0] == 99:  # 1s
            _MFCC_cleaned[digit].append(digit_mfcc)

In [73]:
del _MFCC

In [7]:
_MFCC_np = []
_labels = []
for digit, mfccs in enumerate(_MFCC_cleaned):
    mfccs = np.dstack(mfccs).transpose(2, 0, 1)
    l, m, n = mfccs.shape
    mfccs = mfccs.reshape((l, m * n))
    _MFCC_np.append(mfccs)
    _labels.append(np.asarray([digit for i in range(len(mfccs))]))

_MFCC_np = np.concatenate(_MFCC_np)
_labels = np.concatenate(_labels)
np.save("MFCC_np.npy", _MFCC_np)
np.save("labels.npy", _labels)

In [134]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(_MFCC_np, _labels, test_size=0.33, random_state=42)
X_train_len, X_test_len = len(X_train), len(X_test)
ratio = 0.7
X_train = X_train[:int(X_train_len * ratio)]
X_test = X_test[:int(X_test_len * ratio)]
y_train = y_train[:int(X_train_len * ratio)]
y_test = y_test[:int(X_test_len * ratio)]
X_train_len, X_test_len = len(X_train), len(X_test)


In [176]:
np.save("MFCC_np.npy", _MFCC_np)

In [85]:
from sklearn.tree import ExtraTreeClassifier

abc = AdaBoostClassifier(base_estimator=ExtraTreeClassifier(max_depth=13), n_estimators=600, random_state=233)

abc.fit(X_train, y_train)
abc.score(X_test, y_test)

0.4928169893816365

In [88]:
from sklearn.model_selection import GridSearchCV
# Find Best parameter first
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

param_grid = {"base_estimator": [ExtraTreeClassifier(max_depth=i) for i in [10, 50, 100]],
              "n_estimators": [300, 600, 900],
              }
grid_search = GridSearchCV(AdaBoostClassifier(random_state=233), param_grid, cv=5, n_jobs=8)

# Fit Model
grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Train Score: {grid_search.best_score_}")

Best Parameters: {'base_estimator': ExtraTreeClassifier(max_depth=10), 'n_estimators': 900}
Best Train Score: 0.43822599893255293


# SVM

In [86]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train, y_train)
svc.score(X_test, y_test)

0.6574016239850093

In [89]:
param_grid = {"C": [0.1, 1, 10],
              "kernel": ['rbf', 'poly', 'rbf']}
# grid_search = GridSearchCV(SVC(gamma='auto'), param_grid, cv=4, n_jobs=8)
grid_search = GridSearchCV(SVC(gamma='scale'), param_grid, cv=4, n_jobs=8)

# Fit Model
grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Train Score: {grid_search.best_score_}")

score = grid_search.score(X_train, y_train)
print(f"Test Score: {score} ")

Best Parameters: {'C': 10, 'kernel': 'poly'}
Best Train Score: 0.6317087351486381
Test Score: 1.0 


In [90]:
score = grid_search.score(X_test, y_test)
print(f"Test Score: {score} ")

Test Score: 0.6718200983836964 


# MLP

In [132]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class mlp(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Sequential(nn.Linear(99*39, 1000))
        self.layer2 = nn.Sequential(nn.Linear(1000, 1000))
        self.layer3 = nn.Sequential(nn.Linear(1000, 200))
        self.layer4 = nn.Sequential(nn.Linear(200, 200))
        self.layer5 = nn.Sequential(nn.Linear(200, 10))

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x) + x)
        x = torch.relu(self.layer3(x))
        x = torch.relu(self.layer4(x) + x)
        x = self.layer5(x)
        return F.log_softmax(x, dim=1)


_model = mlp().float()#.cuda()


In [139]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
class MfccData(Dataset):
    def __len__(self):
        return len(X_train)
    def __getitem__(self, index):
        return X_train[index], y_train[index]

bs = 1000
batch = 0

_loss_fn = nn.CrossEntropyLoss()
_optimizer = torch.optim.SGD(_model.parameters(), lr=1e-3)

_mfcc_data = DataLoader(MfccData(), batch_size=bs, shuffle=True, num_workers=0)

In [137]:
def eval():
    results = []
    test_bs = 100
    for _i in range(4):
        _index = np.random.choice(range(X_test.shape[0]), replace=False, size=(test_bs,))
        _X = torch.from_numpy(X_test[_index, :]).float()
        _y = (torch.from_numpy(y_test[_index])).long()
        out = _model(_X)
        _,pred = out.max(1)
        num_correct = (pred == _y).sum().item()
        result = num_correct / _y.shape[0]
        # print(result)   #64
        results.append(result)
    # print(f"avg: {np.mean(results)}")
    return np.mean(results)

In [140]:
while True:
    for b, (_X, _y) in enumerate(_mfcc_data):
        _model.train()
        _X = _X.float()#.cuda()
        _y = _y.long()#.cuda()
        pred = _model(_X)
        loss = _loss_fn(pred, _y)

        # BP
        _optimizer.zero_grad()
        loss.backward()
        _optimizer.step()
        batch += 1
        if batch % 50 == 0:
            _model.eval()
            eval_score = eval()
            print(f"{batch}, {loss} <-> {eval_score}")



50, 0.9982843399047852 <-> 0.5475
100, 1.0033979415893555 <-> 0.6125
150, 0.8210480809211731 <-> 0.6325000000000001
200, 0.7832427620887756 <-> 0.6225
250, 0.8252797722816467 <-> 0.6275000000000001
300, 0.8073787689208984 <-> 0.66
350, 0.7030922770500183 <-> 0.6525
400, 0.74298495054245 <-> 0.65
450, 0.6984407305717468 <-> 0.685
500, 0.656196117401123 <-> 0.635
550, 0.621052622795105 <-> 0.6775
600, 0.5951574444770813 <-> 0.6775
650, 0.6119892597198486 <-> 0.6775
700, 0.5497257709503174 <-> 0.705
750, 0.5174782276153564 <-> 0.655
800, 0.5516507625579834 <-> 0.7025
850, 0.4545406997203827 <-> 0.6525
900, 0.4911038279533386 <-> 0.6825000000000001
950, 0.461279034614563 <-> 0.7224999999999999
1000, 0.43532228469848633 <-> 0.675
1050, 0.41458234190940857 <-> 0.68
1100, 0.3481624126434326 <-> 0.6775
1150, 0.4127042293548584 <-> 0.6925
1200, 0.4236237108707428 <-> 0.71
1250, 0.3664504587650299 <-> 0.73
1300, 0.33918970823287964 <-> 0.675
1350, 0.3182563781738281 <-> 0.67
1400, 0.337247133255

KeyboardInterrupt: 

In [175]:

from pydub.silence import detect_nonsilent
def mfcc_test_mlp(datapath):
    labels = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
    MFCC = []
    files = os.listdir(datapath)  # 得到文件夹下的所有文件名称
    for i in range(10):
        for file in files:  # 遍历文件夹
            rule = re.compile(r'(.*?)_.*?')
            label = re.findall(rule, str(file))
            label = ''.join(label)
            if label == labels[i]:
                file_name = os.path.join(datapath, file)
                data = AudioSegment.from_wav(file_name)
                # print(f"db = {data.dBFS}")
                secs = detect_nonsilent(data, min_silence_len=50, silence_thresh=data.dBFS - 6)
                max_len = 0
                max_i = 0
                for i, sec in enumerate(secs):
                    sec_diff = sec[1] - sec[0]
                    if sec_diff > max_len:
                        max_len = sec_diff
                        max_i = i
                # 32000~199(200)~2000
                # 16000~99(100)~1000
                # sec_start, sec_end = int(secs[max_i][0] / 10), int(secs[max_i][0] / 10) + 99
                sec_start, sec_end = int(secs[0][0] / 10), int(secs[0][0] / 10) + 99

                fs, audio = wav.read(file_name)  # audio: (len, )
                feature = extract_MFCC(bandpass(audio, 99, 3e3))[sec_start:sec_end, :]
                import numpy as np

                feature = np.pad(feature, ((0, 99 - feature.shape[0]), (0, 0)), mode='median')
                print(feature.shape)
                feature = feature.ravel().reshape((1, -1))
                _X = torch.from_numpy(feature).float()
                out = _model(_X)
                _, pred = out.max(1)

                print(f"{file} <-> {pred}")
mfcc_test_mlp(r"D:\Program\pyProject\DSP_SpeechNumberRecognization\dataset_test")

(99, 39)
zero_79741.wav <-> tensor([2])
(99, 39)
one_54027.wav <-> tensor([1])
(99, 39)
zero_79741.wav <-> tensor([2])
(99, 39)
two_39242.wav <-> tensor([1])
(99, 39)
zero_79741.wav <-> tensor([2])
(99, 39)
three_35168.wav <-> tensor([2])
(99, 39)
four_17868.wav <-> tensor([1])
(99, 39)
zero_79741.wav <-> tensor([2])
(99, 39)
five_71840.wav <-> tensor([8])
(99, 39)
two_39242.wav <-> tensor([1])
(99, 39)
zero_79741.wav <-> tensor([2])
(99, 39)
six_98475.wav <-> tensor([8])
(99, 39)
zero_79741.wav <-> tensor([2])
(99, 39)
seven_69828.wav <-> tensor([8])
(99, 39)
zero_79741.wav <-> tensor([2])
(99, 39)
eight_18128.wav <-> tensor([2])
(99, 39)
one_54027.wav <-> tensor([1])
(99, 39)
zero_79741.wav <-> tensor([2])
(99, 39)
nine_83327.wav <-> tensor([1])
(99, 39)
zero_79741.wav <-> tensor([2])


In [163]:
from pydub.silence import detect_nonsilent
data = AudioSegment.from_wav(test_file)
# print(f"db = {data.dBFS}")
secs = detect_nonsilent(data, min_silence_len=50, silence_thresh=data.dBFS - 6)
max_len = 0
max_i = 0
for i, sec in enumerate(secs):
    sec_diff = sec[1] - sec[0]
    if sec_diff > max_len:
        max_len = sec_diff
        max_i = i
# 32000~199(200)~2000
# 16000~99(100)~1000
# sec_start, sec_end = int(secs[max_i][0] / 10), int(secs[max_i][0] / 10) + 99
sec_start, sec_end = int(secs[0][0] / 10), int(secs[0][0] / 10) + 99

In [173]:
test_file = r"D:\Program\pyProject\DSP_SpeechNumberRecognization\dataset_test\eight_18128.wav"
fs, audio = wav.read(test_file)  # audio: (len, )
feature = extract_MFCC(bandpass(audio, 99, 3e3))[sec_start:sec_end, :]
import numpy as np
feature = np.pad(feature, ((0, 99-feature.shape[0]), (0, 0)), mode='median')
print(feature.shape)
feature = feature.ravel().reshape((1, -1))
_X = torch.from_numpy(feature).float()
out = _model(_X)
_,pred = out.max(1)
print(out, pred)

(99, 39)
tensor([[-6.5244, -1.3311, -0.6291, -3.0086, -7.1605, -8.0040, -8.8795, -6.5457,
         -2.0606, -3.8243]], grad_fn=<LogSoftmaxBackward>) tensor([2])
