In [2]:
import os  # 用于获取文件路径
import librosa  # 读取声音并获取mfcc特征
import tqdm  # 用于进度条的生成
import numpy as np  # 用于一些计算
import pandas as pd  # 用于存储数据集
from tensorflow import keras  # tensorflow的高级api，用于生成模型并训练
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPool2D, Dropout  # 创建神经网络时需要一些层
from sklearn.model_selection import train_test_split  # 用于划分数据集

In [3]:
# Walk the dataset folder, turn every clip into one fixed-size feature vector,
# label it from its folder name, and save an index of the dataset to CSV.
SAMPLE_RATE = 44100  # every clip is resampled to this rate when loaded
CLIP_SECONDS = 5     # every clip is cropped / zero-padded to this duration
path = os.path.join(".", "data")  # dataset root (portable; the raw ".\data" only works on Windows)
input_len = SAMPLE_RATE * CLIP_SECONDS  # target clip length in samples
rng = np.random.default_rng(11)  # seeded so the random crop/pad offsets are reproducible
records = []  # one [file path, class index, feature vector] row per clip
for i in sorted(os.listdir(path)):  # sorted: os.listdir order is arbitrary
    class_dir = os.path.join(path, i)
    if not os.path.isdir(class_dir):  # ignore stray files sitting next to the class folders
        continue
    # Characters 1-2 of the folder name are read as a 1-based, two-digit class
    # number; int() accepts the leading zero, so "01" -> class index 0.
    label = int(i[1:3]) - 1
    for j in tqdm.tqdm(sorted(os.listdir(class_dir))):  # every clip of this class
        filepath = os.path.join(class_dir, j)
        # Load the waveform and the sampling rate it was resampled to
        y, sr = librosa.load(filepath, sr=SAMPLE_RATE, res_type='kaiser_fast')

        if len(y) > input_len:  # too long: keep a random window of input_len samples
            offset = rng.integers(len(y) - input_len)
            y = y[offset: input_len + offset]
        elif input_len > len(y):  # too short: zero-pad, splitting the padding randomly
            pad_total = input_len - len(y)
            offset = rng.integers(pad_total)
            y = np.pad(y, (offset, pad_total - offset), "constant")

        # Feature = mel spectrogram averaged over time (one value per mel band).
        # NOTE: the column is still called 'mfcc特征' because the next cell reads
        # it under that name, but these are mel-band energies, not MFCCs.
        mels = np.mean(librosa.feature.melspectrogram(y=y, sr=sr).T, axis=0)
        records.append([filepath, label, mels])
# Collect everything into a DataFrame that the following cells consume
data = pd.DataFrame(data=records, columns=['文件路径', '类别', 'mfcc特征'])
# Persist only the path/label index; the feature vectors stay in memory
data.to_csv("./dataset.csv", columns=['文件路径', '类别'], index=False)


100%|██████████| 39/39 [00:00<00:00, 44.51it/s]
100%|██████████| 39/39 [00:00<00:00, 43.27it/s]
100%|██████████| 39/39 [00:00<00:00, 44.84it/s]
100%|██████████| 39/39 [00:00<00:00, 44.23it/s]
100%|██████████| 39/39 [00:00<00:00, 43.39it/s]
100%|██████████| 39/39 [00:00<00:00, 50.57it/s]
100%|██████████| 39/39 [00:00<00:00, 45.38it/s]
100%|██████████| 39/39 [00:00<00:00, 45.02it/s]
100%|██████████| 39/39 [00:00<00:00, 51.45it/s]
100%|██████████| 39/39 [00:00<00:00, 43.68it/s]


In [4]:
# Turn the per-clip feature vectors into CNN inputs and split train/test.
X = np.vstack(data['mfcc特征'])  # stack the vectors into an (n_clips, n_features) matrix
X = X.reshape(-1, 16, 8, 1)     # view each vector as a 16x8 single-channel "image"
# One-hot encode the labels. The width is pinned to 10 so it always matches the
# 10-unit softmax output layer, even if the highest class is missing from the data
# (to_categorical otherwise infers the width from the largest label it sees).
Y = keras.utils.to_categorical(data['类别'], num_classes=10)
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=11)  # fixed seed -> reproducible split
print('训练集的大小', len(X_train))
print('测试集的大小', len(X_test))


训练集的大小 292
测试集的大小 98


In [5]:
# Define the CNN: two conv/pool stages followed by a dense classifier head.
input_dim = (16, 8, 1)  # shape of one input sample fed to the first layer
model = keras.models.Sequential([
    # Stage 1: 64 3x3 filters, then halve both spatial dimensions
    Conv2D(64, (3, 3), padding="same", activation="tanh", input_shape=input_dim),
    MaxPool2D(pool_size=(2, 2)),
    # Stage 2: 128 3x3 filters, then halve both spatial dimensions again
    Conv2D(128, (3, 3), padding="same", activation="tanh"),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.1),  # drop 10% of activations during training
    Flatten(),     # flatten the feature maps into one vector
    Dense(1024, activation="tanh"),
    Dense(10, activation="softmax"),  # output layer: one probability per class
])


In [6]:
# Compile the model: choose the optimizer, the loss and the reported metric.
model.compile(
    loss='categorical_crossentropy',  # labels are one-hot encoded
    optimizer='adam',
    metrics=['accuracy'],
)


In [7]:
# Train the model; the test split doubles as validation data after each epoch.
fit_options = dict(
    epochs=50,
    batch_size=50,
    validation_data=(X_test, Y_test),
)
model.fit(X_train, Y_train, **fit_options)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2569cb4aac0>

In [7]:
# Save the trained model to the "Model" path.
model.save("Model")


INFO:tensorflow:Assets written to: Model\assets


In [8]:
# Map each class index to its human-readable label (list position = class index).
label_dict = dict(enumerate([
    "Dog bark",
    "Rain",
    "Sea waves",
    "Baby cry",
    "Clock tick",
    "Person sneeze",
    "Helicopter",
    "Chainsaw",
    "Rooster",
    "Fire crackling",
]))


In [9]:
# Evaluate the trained model on the held-out split and export the predictions.
prediction = np.argmax(model.predict(X_test), axis=1)  # predicted class index per clip
init_result = np.argmax(Y_test, axis=1)  # true class index per clip (undo the one-hot encoding)
# Human-readable (actual, predicted) label pairs, one entry per test clip
test_set_result = {
    "实际类别": [label_dict[actual] for actual in init_result],
    "预测类别": [label_dict[guess] for guess in prediction],
}
# Number of clips whose predicted class matches the true class
correct = sum(1 for actual, guess in zip(init_result, prediction) if actual == guess)
result = pd.DataFrame(test_set_result)
print(result.head(5))
result.to_csv("test_set_result.csv", index=False)  # write actual vs. predicted labels to CSV
print("预测结果已保存至./test_set_result.csv文件中，请查看！")
print("测试集准确率为{:.2%}".format(correct/len(init_result)))  # report test-set accuracy


             实际类别            预测类别
0      Clock tick      Clock tick
1   Person sneeze   Person sneeze
2      Clock tick      Helicopter
3  Fire crackling  Fire crackling
4   Person sneeze   Person sneeze
预测结果已保存至./test_set_result.csv文件中，请查看！
测试集准确率为72.45%


In [10]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 16, 8, 64)         640       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 8, 4, 64)          0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 8, 4, 128)         73856     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 4, 2, 128)         0         
_________________________________________________________________
dropout (Dropout)            (None, 4, 2, 128)         0         
_________________________________________________________________
flatten (Flatten)            (None, 1024)              0         
_________________________________________________________________
dense (Dense)                (None, 1024)              1