In [1]:
import os
import librosa
import numpy as np
from random import shuffle
from collections import Counter
from keras.preprocessing.text import text_to_word_sequence, one_hot, Tokenizer
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout

Using TensorFlow backend.


In [2]:
# Load the spoken-digit recordings and convert them to MFCC features.
def get_number_speech_mfcc_files(file_path, batch_size=10):
    """Build MFCC features and digit labels for every ``.wav`` file in a directory.

    The directory listing is shuffled in place before processing, so the
    order of the returned samples is random. Entries that do not end in
    ``.wav`` are skipped. The label of a recording is the integer value of
    the first character of its file name.

    Args:
        file_path: Directory containing the recordings. A trailing path
            separator is optional.
        batch_size: Unused; kept so existing callers keep working.

    Returns:
        A ``(mfcc_batch_features, labels)`` tuple of numpy arrays built from
        the per-file feature list and the per-file label list.

    Raises:
        ValueError: If a ``.wav`` file name does not start with a digit.
    """
    mfcc_batch_features = []
    labels = []
    files = os.listdir(file_path)
    print("共加载文件数：", len(files))
    shuffle(files)
    for file in files:
        if not file.endswith(".wav"):
            continue
        labels.append(int(file[0]))

        # os.path.join inserts the separator only when it is missing, so the
        # path is correct whether or not file_path ends with one.
        number_speech_file = os.path.join(file_path, file)

        # Appends the fixed-width MFCC matrix for this file to the list.
        completion_mfcc_data(mfcc_batch_features, number_speech_file)

    mfcc_batch_features = np.array(mfcc_batch_features)
    labels = np.array(labels)
    print('mfcc_batch_features.shape =', mfcc_batch_features.shape)
    print('labels.shape =', labels.shape)
    return mfcc_batch_features, labels

# Bring the MFCC matrix of one recording to a fixed number of columns.
def completion_mfcc_data(mfcc_batch_features, speech_file, max_text_speech_len=80):
    """Compute the MFCCs of one audio file and append them at a fixed width.

    The MFCC matrix is truncated to at most ``max_text_speech_len`` columns
    and then zero-padded on the right up to exactly that many columns, so
    every appended matrix has the same second dimension.

    Args:
        mfcc_batch_features: List that receives the resulting matrix; it is
            mutated in place.
        speech_file: Path of the audio file to load.
        max_text_speech_len: Number of columns of the appended matrix.

    Returns:
        None. The result is appended to ``mfcc_batch_features``.
    """
    waveform, sr = librosa.load(speech_file, mono=True)
    # Keyword arguments work on both old and new librosa releases; recent
    # releases no longer accept the signal and sample rate positionally.
    mfcc = librosa.feature.mfcc(y=waveform, sr=sr)

    # Clips longer than the target would give a negative pad width, which
    # np.pad rejects, so cut the surplus columns first.
    mfcc = mfcc[:, :max_text_speech_len]
    missing_columns = max_text_speech_len - mfcc.shape[1]

    # Pad spec: (0, 0) adds no rows above or below; (0, missing_columns) adds
    # nothing on the left and zero columns on the right up to the target.
    mfcc_pad = np.pad(mfcc, ((0, 0), (0, missing_columns)), mode='constant', constant_values=0)
    mfcc_batch_features.append(mfcc_pad)

In [3]:
def create_model(width=20, height=80, classes=10):
    """Build and compile the LSTM classifier.

    The network is a single 512-unit LSTM, a 0.5 dropout layer, and a
    softmax output layer, compiled with categorical cross-entropy, the Adam
    optimizer and an accuracy metric. The model summary is printed as a
    side effect.

    Args:
        width: First dimension of each input sample.
        height: Second dimension of each input sample.
        classes: Number of output classes (size of the softmax layer).

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    print('建立模型...')
    model = Sequential()
    # NOTE(review): Keras reads input_shape as (timesteps, features). With the
    # (20, 80) arrays produced in this notebook the LSTM therefore steps over
    # the first (coefficient) axis rather than over frames -- confirm that
    # this is intended.
    model.add(LSTM(128 * 4, input_shape=(width, height)))
    model.add(Dropout(0.5))
    model.add(Dense(classes, activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print('建立模型成功...')

    return model

In [5]:
# Train the digit classifier on the MFCC features and save its weights.
file_path = '../../data/numbers_speech/'
NUM_CLASSES = 10  # digits 0-9; matches the 10-unit softmax layer of the model
WEIGHTS_PATH = "../savemodel/speechModel.h5"

print('获取训练数据...')
train_data, labels = get_number_speech_mfcc_files(file_path)
# Pin the one-hot width. Left to inference, it becomes max(label) + 1, which
# no longer matches the model output if the highest digit is absent.
train_one_hot = np_utils.to_categorical(labels, num_classes=NUM_CLASSES)
print('获取训练数据成功...')
print(train_data.shape[1])
print(train_data.shape[2])
print('开始训练...')
model = create_model(train_data.shape[1], train_data.shape[2])
model.fit(train_data, train_one_hot, validation_split=0.2, batch_size=128, epochs=30, verbose=2)
print('训练完成...')
# Create the target directory first so saving works on a fresh checkout.
os.makedirs(os.path.dirname(WEIGHTS_PATH), exist_ok=True)
model.save_weights(WEIGHTS_PATH)
print("保存成功")

获取训练数据...
共加载文件数： 2400
mfcc_batch_features.shape = (2400, 20, 80)
labels.shape = (2400,)
获取训练数据成功...
20
80
开始训练...
建立模型...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 512)               1214464   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                5130      
Total params: 1,219,594
Trainable params: 1,219,594
Non-trainable params: 0
_________________________________________________________________
建立模型成功...
Train on 1920 samples, validate on 480 samples
Epoch 1/30
17s - loss: 2.1935 - acc: 0.1917 - val_loss: 1.9882 - val_acc: 0.2646
Epoch 2/30
15s - loss: 1.8346 - acc: 0.3542 - val_loss: 1.8047 - val_acc: 0.3271
Epoch 3/30
15s - loss: 1.6232 - acc: 0.3948 - val_loss