In [1]:
import wave
import scipy
import pylab
import librosa
import numpy as np
import pandas as pd
import librosa.display
import matplotlib.pyplot as plt
import soundfile
import os
import glob
from sklearn.model_selection import train_test_split
from datetime import datetime
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import sklearn

In [54]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, TensorBoard, ModelCheckpoint
from keras.optimizers import SGD,Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Flatten
from keras.layers.convolutional import MaxPooling2D,AveragePooling2D
from keras.layers.convolutional import SeparableConv2D, Conv2D
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Activation
from keras.layers.core import Dropout
from keras.layers.core import Dense
from keras.models import Sequential
from keras import backend as K
from keras import layers
from keras import models
import tensorflow as tf
from keras.metrics import categorical_accuracy
from tensorboard.backend.event_processing import event_accumulator

In [3]:
# Locate the raw audio files and read the label spreadsheet.
# NOTE(review): these are absolute, machine-specific Windows paths — adjust them
# before running on another machine.
train_files = glob.glob("G:\\GTA_audios\\input\\audio_train\\*.wav")  # training-set audio files
test_files = glob.glob("G:\\GTA_audios\\input\\audio_test\\*.wav")   # test-set audio files
labels = pd.read_excel("G:\\GTA_audios\\dataset.xlsx") # labels for the training set

In [4]:
# Number of samples every clip is cropped or padded to: 3 seconds of audio at a
# 48000 Hz sampling rate. Change either factor to suit the recordings in use.
input_length = 3 * 48000
frame_size = 2048  # length of a single frame
hop_size = 512  # shift between consecutive frames
def load_audio_file(file_path):
    """Read one audio file and turn it into a log-mel feature matrix.

    The signal is fitted to exactly ``input_length`` samples, mean-removed,
    peak-normalised and handed to ``get_mel_spectrogram``.

    Parameters
    ----------
    file_path : str
        Path of the audio file; it is read with ``soundfile.read``.

    Returns
    -------
    numpy.ndarray
        The feature matrix returned by ``get_mel_spectrogram`` for the
        prepared signal.

    Notes
    -----
    * A signal longer than ``input_length`` is cropped at a random offset; a
      shorter one is zero-padded with a random amount of leading silence.
      Both offsets come from ``np.random.randint``, so results are only
      repeatable when NumPy's global seed is fixed beforehand.
    * Multi-channel files are averaged down to one channel first, so that the
      length checks operate on samples rather than on the channel count.
    """
    data, fs = soundfile.read(file=file_path)

    # soundfile returns (frames, channels) for multi-channel audio; collapse
    # the channel axis so len(data) is the number of samples.
    if data.ndim > 1:
        data = data.mean(axis=1)

    n_samples = len(data)
    if n_samples > input_length:
        # Too long: keep a randomly positioned window of input_length samples.
        offset = np.random.randint(n_samples - input_length)
        data = data[offset:offset + input_length]
    elif n_samples < input_length:
        # Too short: zero-pad on both sides, with a random split between them.
        offset = np.random.randint(input_length - n_samples)
        data = np.pad(data, (offset, input_length - n_samples - offset), "constant")

    # Remove the DC offset, then scale by the peak magnitude. The 0.05 added to
    # the peak avoids a division by zero on silent input.
    data = data - np.mean(data)
    peak = np.max(np.abs(data)) + 0.05
    data = data / peak

    data = np.reshape(data, [-1, 1])
    return get_mel_spectrogram(data, fs)

In [5]:
def get_mel_spectrogram(audio, sr, n_fft=2048, n_mels=128):
    """Compute a log10 mel power spectrogram of a mono signal.

    Parameters
    ----------
    audio : array-like
        Audio samples; any shape is accepted and flattened to one dimension.
    sr : int
        Sampling rate of ``audio`` in Hz. The analysis window is 40 ms
        (``int(0.04 * sr)`` samples) and the hop is 20 ms (``int(sr / 50)``).
    n_fft : int, optional
        FFT size used for both the STFT and the mel filter bank (default 2048).
        NOTE(review): librosa expects the window to fit inside ``n_fft``, so
        very high sampling rates may need a larger value — confirm if used.
    n_mels : int, optional
        Number of mel bands (default 128).

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(n_frames, n_mels)`` holding
        ``log10(mel_power + eps)``.
    """
    # scipy.signal.hamming is a deprecated alias that recent SciPy releases no
    # longer provide; the windows submodule is the supported location.
    from scipy.signal.windows import hamming

    eps = 2.220446049250313e-16  # float64 machine epsilon; keeps log10 finite
    signal = np.asarray(audio).reshape(-1)

    win_length = int(0.04 * sr)  # 40 ms analysis window, in samples
    hop_length = int(sr / 50)    # 20 ms hop, in samples
    window = hamming(win_length, sym=False)

    mel_basis = librosa.filters.mel(
        sr=sr,
        n_fft=n_fft,
        n_mels=n_mels,
        htk=False,
        norm=None,
    )
    stft = librosa.stft(
        signal + eps,
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        center=True,
        window=window,
    )

    power_spectrogram = np.abs(stft) ** 2
    # Project onto the mel bands, then transpose so frames run along axis 0.
    mel_spectrogram = np.dot(mel_basis, power_spectrogram).T
    log_mel = np.log10(mel_spectrogram + eps)
    # Always hand back a C-contiguous float64 matrix, whatever precision the
    # installed librosa used internally.
    return np.ascontiguousarray(log_mel, dtype=np.float64)

In [6]:
# Build the lookup tables that connect audio files to their class labels.
# Start with the single-label mapping: full file path -> raw "speed" label.
file_to_label = dict(
    zip(
        ("G:\\GTA_audios\\input\\audio_train\\" + name for name in labels["sample"].values),
        labels["speed"].values,
    )
)
# Distinct labels of the training samples, de-duplicated and sorted.
list_labels = sorted(set(file_to_label.values()))
# Each label gets an integer id 0, 1, 2, ... following the sorted order.
label_to_int = {label: index for index, label in enumerate(list_labels)}
# Inverse table: integer id -> label.
int_to_label = dict(enumerate(list_labels))
# Full file path -> integer class id.
file_to_int = {path: label_to_int[label] for path, label in file_to_label.items()}

In [7]:
# The training inputs are the files listed in the label table (this replaces
# the glob result from the earlier cell); labels follow in the same order.
train_files = [*file_to_label]
train_labels = [label_to_int[label] for label in file_to_label.values()]

In [42]:
# Extract one log-mel feature matrix per training file, preserving file order.
train_features = list(map(load_audio_file, train_files))

In [43]:
# Stack the per-file feature matrices into a single array and add a trailing
# channel axis, giving the 4-D layout the convolutional model takes as input.
train_features = np.asarray(train_features)
if train_features.ndim == 3:
    # Only add the channel axis once, so re-running this cell is harmless
    # instead of silently producing a 5-D array.
    train_features = train_features[:, :, :, np.newaxis]
if train_features.ndim != 4:
    raise ValueError(
        "expected features of shape (samples, frames, mels[, 1]), got shape "
        + str(train_features.shape)
    )
# Labels as a float64 vector, one entry per sample.
train_labels = np.asarray(train_labels, dtype=np.float64)

In [44]:
# Hold out 20% of the samples for evaluation; the fixed random_state makes the
# partition repeatable from run to run.
train_data, test_data, train_label, test_label = train_test_split(
    train_features,
    train_labels,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [45]:
# One-hot encode the labels. The encoder is fitted on the training labels only
# and then reused for the test labels: refitting it on the test split could
# assign different indices (or a different number of columns) whenever the two
# splits do not contain exactly the same set of classes.
labelencoder = LabelEncoder()
labelencoder.fit(train_label)
num_labels = len(labelencoder.classes_)
train_label = to_categorical(labelencoder.transform(train_label), num_classes=num_labels)
# transform() raises ValueError if the test split holds a class never seen in
# training, which is preferable to silently mis-encoding it.
test_label = to_categorical(labelencoder.transform(test_label), num_classes=num_labels)

In [60]:
def gen_model(input_shape=(151, 128, 1), num_classes=5):
    """Build the CNN classifier (uncompiled).

    Architecture:

    1. Four parallel ``Conv2D`` branches over the input, all with
       ``padding='same'`` and kernels (3, 8), (3, 32), (3, 64) and (3, 90),
       producing 48, 32, 16 and 16 feature maps that are concatenated.
    2. BatchNorm -> ReLU -> 5x5 average pooling.
    3. ``Conv2D(224, 5)`` -> BatchNorm -> ReLU -> (6, 4) average pooling.
    4. Flatten -> ``Dense(64)`` -> ``Dense(num_classes, softmax)``.

    Parameters
    ----------
    input_shape : tuple of int, optional
        Shape of one input sample, default ``(151, 128, 1)``. It must be large
        enough to survive both pooling stages and the unpadded 5x5 convolution.
    num_classes : int, optional
        Width of the softmax output layer, default 5.

    Returns
    -------
    keras Model
        The model mapping the input tensor to class probabilities.
    """
    inputs = layers.Input(shape=input_shape)

    # First stage: parallel convolutions with increasingly wide kernels along
    # the second input axis, stacked on the channel axis.
    branch_specs = (
        (48, (3, 8)),
        (32, (3, 32)),
        (16, (3, 64)),
        (16, (3, 90)),
    )
    branches = [
        layers.Conv2D(n_filters, kernel_size, padding='same')(inputs)
        for n_filters, kernel_size in branch_specs
    ]
    x = layers.Concatenate()(branches)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.AveragePooling2D((5, 5))(x)

    # Second stage: a single unpadded 5x5 convolution.
    x = layers.Conv2D(224, 5)(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.AveragePooling2D((6, 4))(x)

    # Classification head.
    # NOTE(review): Dense(64) has no activation, so it is a purely linear
    # projection ahead of the softmax layer — confirm this is intended.
    x = layers.Flatten()(x)
    x = layers.Dense(64)(x)
    outputs = layers.Dense(num_classes, activation='softmax')(x)
    return models.Model(inputs, outputs)

In [61]:
# Instantiate the network defined by gen_model (not compiled yet).
model = gen_model()

In [62]:
# Print the per-layer summary: output shapes, parameter counts and connections.
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 151, 128, 1)  0                                            
__________________________________________________________________________________________________
conv2d_36 (Conv2D)              (None, 151, 128, 48) 1200        input_8[0][0]                    
__________________________________________________________________________________________________
conv2d_37 (Conv2D)              (None, 151, 128, 32) 3104        input_8[0][0]                    
__________________________________________________________________________________________________
conv2d_38 (Conv2D)              (None, 151, 128, 16) 3088        input_8[0][0]                    
____________________________________________________________________________________________

In [63]:
# Adam optimiser. `learning_rate` is the current name of the deprecated `lr`
# alias; `decay` is omitted because 0.0 is its default and newer Keras
# optimisers no longer accept it.
opt = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# Save the weights whenever validation accuracy reaches a new best; the epoch
# number and the accuracy are embedded in the file name. The check runs every
# epoch, which is the callback's default, so the deprecated `period` argument
# is not passed.
filepath = "model_{epoch:02d}-{val_accuracy:.2f}.h5"
checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
    save_weights_only=True,
)
tensorboard = TensorBoard(log_dir='./log_200epoch')
callbacks_list = [tensorboard, checkpoint]

# NOTE(review): the held-out split is used as validation data here, so the
# checkpoint choice is tuned on the same samples that are evaluated later.
hist = model.fit(
    train_data,
    train_label,
    validation_data=(test_data, test_label),
    batch_size=32,
    epochs=200,
    callbacks=callbacks_list,
)

Train on 609 samples, validate on 153 samples
Epoch 1/200

Epoch 00001: val_accuracy improved from -inf to 0.23529, saving model to model_01-0.24.h5
Epoch 2/200

Epoch 00002: val_accuracy improved from 0.23529 to 0.41830, saving model to model_02-0.42.h5
Epoch 3/200

Epoch 00003: val_accuracy did not improve from 0.41830
Epoch 4/200

Epoch 00004: val_accuracy did not improve from 0.41830
Epoch 5/200

Epoch 00005: val_accuracy improved from 0.41830 to 0.45098, saving model to model_05-0.45.h5
Epoch 6/200

Epoch 00006: val_accuracy did not improve from 0.45098
Epoch 7/200

Epoch 00007: val_accuracy did not improve from 0.45098
Epoch 8/200

Epoch 00008: val_accuracy improved from 0.45098 to 0.50980, saving model to model_08-0.51.h5
Epoch 9/200

Epoch 00009: val_accuracy did not improve from 0.50980
Epoch 10/200

Epoch 00010: val_accuracy did not improve from 0.50980
Epoch 11/200

Epoch 00011: val_accuracy did not improve from 0.50980
Epoch 12/200

Epoch 00012: val_accuracy did not improve


Epoch 00040: val_accuracy did not improve from 0.51634
Epoch 41/200

Epoch 00041: val_accuracy did not improve from 0.51634
Epoch 42/200

Epoch 00042: val_accuracy did not improve from 0.51634
Epoch 43/200

Epoch 00043: val_accuracy did not improve from 0.51634
Epoch 44/200

Epoch 00044: val_accuracy did not improve from 0.51634
Epoch 45/200

Epoch 00045: val_accuracy did not improve from 0.51634
Epoch 46/200

Epoch 00046: val_accuracy did not improve from 0.51634
Epoch 47/200

Epoch 00047: val_accuracy did not improve from 0.51634
Epoch 48/200

Epoch 00048: val_accuracy did not improve from 0.51634
Epoch 49/200

Epoch 00049: val_accuracy did not improve from 0.51634
Epoch 50/200

Epoch 00050: val_accuracy did not improve from 0.51634
Epoch 51/200

Epoch 00051: val_accuracy improved from 0.51634 to 0.52941, saving model to model_51-0.53.h5
Epoch 52/200

Epoch 00052: val_accuracy did not improve from 0.52941
Epoch 53/200

Epoch 00053: val_accuracy did not improve from 0.52941
Epoch 54/


Epoch 00121: val_accuracy did not improve from 0.56209
Epoch 122/200

Epoch 00122: val_accuracy did not improve from 0.56209
Epoch 123/200

Epoch 00123: val_accuracy did not improve from 0.56209
Epoch 124/200

Epoch 00124: val_accuracy did not improve from 0.56209
Epoch 125/200

Epoch 00125: val_accuracy did not improve from 0.56209
Epoch 126/200

Epoch 00126: val_accuracy improved from 0.56209 to 0.57516, saving model to model_126-0.58.h5
Epoch 127/200

Epoch 00127: val_accuracy improved from 0.57516 to 0.58824, saving model to model_127-0.59.h5
Epoch 128/200

Epoch 00128: val_accuracy did not improve from 0.58824
Epoch 129/200

Epoch 00129: val_accuracy did not improve from 0.58824
Epoch 130/200

Epoch 00130: val_accuracy did not improve from 0.58824
Epoch 131/200

Epoch 00131: val_accuracy improved from 0.58824 to 0.60131, saving model to model_131-0.60.h5
Epoch 132/200

Epoch 00132: val_accuracy did not improve from 0.60131
Epoch 133/200

Epoch 00133: val_accuracy did not improve 

In [132]:
# Evaluate on the held-out split; with the loss and the single 'accuracy'
# metric given to compile(), the result is [loss, accuracy].
# NOTE(review): this split also served as validation data during fit, and no
# saved checkpoint is reloaded in the visible cells, so this presumably scores
# the weights currently in memory — confirm.
score = model.evaluate(test_data, test_label, batch_size=16)



In [133]:
# Display the [loss, accuracy] pair returned by model.evaluate.
score

[1.1007680738127077, 0.5324675440788269]