In [178]:
import os
import moviepy
import moviepy.editor
import numpy as np
import matplotlib.pyplot as plt
import re
import shutil
import wave
import librosa
import librosa.display
import scipy.signal as signal
import soundfile
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm, metrics
import glob
import sklearn
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from datetime import datetime
import random
import pickle

In [40]:
input_length = 48000*3  ## 表示输入语音的长度，48000表示采样率大小，3表示音频长度大小为3秒，可以根据具体情况进行修改
frame_size = 2048 # 每一帧的长度
hop_size =512 # 帧移大小

def calEnergy(frame):
    sumEnergy = 0
    for i in frame:
        sumEnergy += i**2
    return sumEnergy

def get_feature(pre_empha_wav, sr=48000):
    # 分帧加窗
    winfunc = signal.windows.hamming(frame_size)
    frames = librosa.util.frame(pre_empha_wav, frame_length=frame_size, hop_length=hop_size, axis=0)
    frames = np.array([frame * winfunc for frame in frames])
    # mfcc系数
    mfccs = librosa.feature.mfcc(y=pre_empha_wav, sr=sr, n_mfcc=24, center=False)
    mfccs_scaled_features = np.mean(mfccs.T,axis=0)
    # 短时能量
    energy = []
    for frame in frames:
        energy.append(calEnergy(frame))
    average_energy = np.average(np.array(energy))
    # 短时过零率
    zeroCrossingRate = librosa.feature.zero_crossing_rate(pre_empha_wav, frame_length=frame_size, hop_length=hop_size, center=False)[0]
    average_zcr = np.average(zeroCrossingRate)
    # 均方根能量
    rmse = librosa.feature.rms(pre_empha_wav, frame_length=frame_size, hop_length=hop_size, center=False)[0]
    average_rmse = np.average(rmse)
    # 频谱质心特征
    cent = librosa.feature.spectral_centroid(y=pre_empha_wav, sr=sr, n_fft=frame_size, hop_length=hop_size, center=False)[0] # 计算频谱质心特征
    average_cent = np.average(cent)
    # 计算二阶频谱带宽，主要是表征与频谱质心位置的偏移程度
    spec_bw = librosa.feature.spectral_bandwidth(y=pre_empha_wav, sr=sr, n_fft=frame_size, hop_length=hop_size, center=False)[0]
    average_spec_bw = np.average(spec_bw)
    # 计算声音信号频谱的平整度特征
    flatness = librosa.feature.spectral_flatness(y=pre_empha_wav, n_fft=frame_size, hop_length=hop_size, center=False)[0]
    average_flatness = np.average(flatness)
    # 滚降系数上限
    rolloff = librosa.feature.spectral_rolloff(y=pre_empha_wav, sr=sr, roll_percent=0.99, center=False)[0]
    average_rolloff = np.average(rolloff)
    # 滚降系数下限
    rolloff_min = librosa.feature.spectral_rolloff(y=pre_empha_wav, sr=sr, roll_percent=0.01, center=False)[0]
    average_rolloff_min = np.average(rolloff_min)
    # 计算六维音调质心特征
    y = librosa.effects.harmonic(pre_empha_wav)
    tonnetz = librosa.feature.tonnetz(y=y, sr=sr) 
    average_tonnetz = np.mean(tonnetz.T,axis=0)
    return np.hstack((
        mfccs_scaled_features, 
        average_energy, 
        average_zcr, 
        average_rmse,
        average_cent,
        average_spec_bw,
        average_flatness,
        average_rolloff,
        average_rolloff_min,
        average_tonnetz
    ))

def load_audio_file(file_path, input_length=input_length):
    data = librosa.core.load(file_path, sr=None)[0] #, sr=None，使用原采样率读取wav音频文件，返回值为wav，sr
    if len(data)>input_length:  ## 如果音频的长度较长，则通过随机数的形式确定截取区间的范围        
        max_offset = len(data)-input_length      
        offset = np.random.randint(max_offset)      
        data = data[offset:(input_length+offset)]        
    elif len(data)<input_length: ## 如果音频的长度不足，通过随机选择padding的界限，将音频尽量集中到中间部分
        if input_length > len(data):
            max_offset = input_length - len(data)
            offset = np.random.randint(max_offset)
        else:
            offset = 0      
        data = np.pad(data, (offset, input_length - len(data) - offset), "constant")
    else:
        pass     
    feature = get_feature(data) # 获取log梅尔特征 
    return feature

In [41]:
train_files = glob.glob("G:\\GTA_audios\\input\\audio_train\\*.wav")  # 训练集
test_files = glob.glob("G:\\GTA_audios\\input\\audio_test\\*.wav")   # 测试集
labels = pd.read_excel("G:\\GTA_audios\\dataset.xlsx") # 训练集的标签

In [44]:
# 构建文件标签字典，通过文件名获取对应音频的标签
# 先构造单标签映射的字典
file_to_label = {"G:\\GTA_audios\\input\\audio_train\\"+k:v for k,v in zip(labels["sample"].values, labels["speed"].values)}
list_labels = sorted(list(set(file_to_label.values()))) ## 将所有训练集样本的标签首先构造集合去重，然后排序
label_to_int = {k:v for v,k in enumerate(list_labels)} ## 将每种标签映射到0，1，2……
int_to_label = {v:k for k,v in label_to_int.items()} # 反转
file_to_int = {k:label_to_int[v] for k,v in file_to_label.items()} # 文件名映射到标签值

In [45]:
train_files = list(file_to_label.keys())
train_labels = [label_to_int[x] for x in file_to_label.values()]

In [46]:
train_features = [load_audio_file(x) for x in train_files]

In [47]:
train_features = np.array(train_features)
train_labels = np.array(train_labels)
train_labels = train_labels.astype(np.float64)

In [199]:
train_data,test_data,train_label,test_label =sklearn.model_selection.train_test_split(
    train_features,
    train_labels,
    random_state=10,
    train_size=0.7,
    test_size=0.3
)

# SVM

In [212]:
# C=4.1 gamma=0.0025 训练集准确率能够达到61%
train_score = 0
test_score = 0
classifier = svm.SVC(C=4, kernel='rbf', gamma=0.0025, probability=True)
for i in range(20):
    print("epoch %s:" % (i+1), end='  ')
    train_data,test_data,train_label,test_label =sklearn.model_selection.train_test_split(
        train_features,
        train_labels,
        random_state=i,
        train_size=0.7,
        test_size=0.3
    )
    classifier.fit(train_data, train_label.ravel())
    score1 = classifier.score(train_data,train_label)
    score2 = classifier.score(test_data,test_label)
    print("训练集：%s, 测试集：%s" % (score1, score2))
    if score2 > test_score:
        train_score = score1
        test_score = score2
print("训练集：",train_score)
print("测试集：",test_score)

epoch 1:  训练集：1.0, 测试集：0.4366812227074236
epoch 2:  训练集：1.0, 测试集：0.35807860262008734
epoch 3:  训练集：1.0, 测试集：0.388646288209607
epoch 4:  训练集：1.0, 测试集：0.4672489082969432
epoch 5:  训练集：1.0, 测试集：0.4104803493449782
epoch 6:  训练集：1.0, 测试集：0.4847161572052402
epoch 7:  训练集：1.0, 测试集：0.4759825327510917
epoch 8:  训练集：1.0, 测试集：0.4410480349344978
epoch 9:  训练集：1.0, 测试集：0.388646288209607
epoch 10:  训练集：1.0, 测试集：0.43231441048034935
epoch 11:  训练集：1.0, 测试集：0.4148471615720524
epoch 12:  训练集：1.0, 测试集：0.47161572052401746
epoch 13:  训练集：1.0, 测试集：0.40611353711790393
epoch 14:  训练集：1.0, 测试集：0.4148471615720524
epoch 15:  训练集：1.0, 测试集：0.42358078602620086
epoch 16:  训练集：1.0, 测试集：0.42358078602620086
epoch 17:  训练集：1.0, 测试集：0.4192139737991266
epoch 18:  训练集：1.0, 测试集：0.42358078602620086
epoch 19:  训练集：1.0, 测试集：0.43231441048034935
epoch 20:  训练集：1.0, 测试集：0.4410480349344978
训练集： 1.0
测试集： 0.4847161572052402


In [213]:
#4.计算svc分类器的准确率
print("训练集：",classifier.score(train_data,train_label))
print("测试集：",classifier.score(test_data,test_label))

训练集： 1.0
测试集： 0.4410480349344978


# GBDT

In [186]:
train_score = 0
test_score = 0
gbt = GradientBoostingClassifier(max_depth=6, # 6/14
                                    n_estimators=80,
                                    learning_rate=0.1,
                                    min_samples_leaf=3, # 3
                                    min_samples_split=5, # 5/9
                                    subsample=0.9)
for i in range(20):
    print("epoch %s:" % (i+1), end='  ')
    train_data,test_data,train_label,test_label =sklearn.model_selection.train_test_split(
        train_features,
        train_labels,
        random_state=i,
        train_size=0.7,
        test_size=0.3
    )
    gbt.fit(train_data, train_label)
    score1 = gbt.score(train_data,train_label)
    score2 = gbt.score(test_data,test_label)
    print("训练集：%s, 测试集：%s" % (score1, score2))
    if score2 > test_score:
        train_score = score1
        test_score = score2
print("训练集：",train_score)
print("测试集：",test_score)

epoch 1:  训练集：1.0, 测试集：0.48034934497816595
epoch 2:  训练集：1.0, 测试集：0.45414847161572053
epoch 3:  训练集：1.0, 测试集：0.4890829694323144
epoch 4:  训练集：1.0, 测试集：0.47161572052401746
epoch 5:  训练集：1.0, 测试集：0.5065502183406113
epoch 6:  训练集：1.0, 测试集：0.49344978165938863
epoch 7:  训练集：1.0, 测试集：0.4672489082969432
epoch 8:  训练集：1.0, 测试集：0.43231441048034935
epoch 9:  训练集：1.0, 测试集：0.5021834061135371
epoch 10:  训练集：1.0, 测试集：0.4890829694323144
epoch 11:  训练集：1.0, 测试集：0.5065502183406113
epoch 12:  训练集：1.0, 测试集：0.5283842794759825
epoch 13:  训练集：1.0, 测试集：0.5065502183406113
epoch 14:  训练集：1.0, 测试集：0.519650655021834
epoch 15:  训练集：1.0, 测试集：0.5240174672489083
epoch 16:  训练集：1.0, 测试集：0.5283842794759825
epoch 17:  训练集：1.0, 测试集：0.5152838427947598
epoch 18:  训练集：1.0, 测试集：0.4759825327510917
epoch 19:  训练集：1.0, 测试集：0.462882096069869
epoch 20:  训练集：1.0, 测试集：0.537117903930131
训练集： 1.0
测试集： 0.537117903930131


In [198]:
print("训练集：",gbt.score(train_data,train_label))
print("测试集：",gbt.score(test_data,test_label))

训练集： 0.8649155722326454
测试集： 0.851528384279476


In [188]:
with open('gbt2.pickle', 'wb')as f:
    pickle.dump(gbt, f)

In [214]:
with open('svm.pickle', 'wb')as f:
    pickle.dump(classifier, f)