In [1]:
import librosa
import os
import numpy as np
import sys
from dtw import dtw
from numpy.linalg import norm
from numpy import array
import pyaudio
import wave

import heapq

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [2]:
def initialCorpus(path, dbPath='beatDatabase_mfcc.npy'):
    """Build the MFCC feature database for every ``.wav`` file in a folder.

    Each entry directly inside ``path`` whose name ends with ``.wav`` is read
    with ``librosa.load`` and reduced to 10 MFCC coefficients per frame. The
    features are collected in a dict keyed by ``os.path.join(path, name)``,
    written to ``dbPath`` with ``np.save`` and returned.

    Args:
        path: Directory that contains the corpus recordings.
        dbPath: File the database is written to. The default is the file
            name this function has always used, relative to the current
            working directory.

    Returns:
        dict mapping each audio file path to its MFCC array.
    """
    beat_database = {}

    for fileName in os.listdir(path):
        audioName = os.path.join(path, fileName)
        if not audioName.endswith('.wav'):
            continue

        # Read the recording as a one-dimensional sample sequence
        y, sr = librosa.load(audioName)
        # Extract the MFCC features and store them under the file path
        beat_database[audioName] = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=10)

    # The dict is stored as a pickled object, so readers must pass
    # allow_pickle=True to np.load and unwrap the result with .item()
    np.save(dbPath, beat_database)

    return beat_database

In [3]:
def readCorpus(path):
    """Load the feature database stored in the ``.npy`` file at ``path``.

    The file is expected to hold a single pickled object (the dict written by
    ``np.save``); ``.item()`` unwraps it from the 0-d array ``np.load`` returns.

    Args:
        path: Location of the saved database file.

    Returns:
        The stored object, i.e. the dict of features keyed by audio file path.
    """
    # allow_pickle=True unpickles arbitrary objects: only open trusted files
    return np.load(path, allow_pickle=True).item()

In [4]:
def updateCorpus(path, dbpath):
    """Add MFCC features for corpus files that are not in the database yet.

    The database at ``dbpath`` is loaded, every ``.wav`` file directly inside
    ``path`` that is not already a key gets its 10-coefficient MFCC array
    computed, and the merged dict is written back to ``dbpath``.

    Args:
        path: Directory that contains the corpus recordings.
        dbpath: Existing database file; it is overwritten with the update.
    """
    # Same loading step as readCorpus: the file holds one pickled dict.
    # allow_pickle=True unpickles arbitrary objects: only open trusted files.
    beat_database = np.load(dbpath, allow_pickle=True).item()

    for fileName in os.listdir(path):
        audioName = os.path.join(path, fileName)
        # Skip non-audio entries and recordings that were processed before
        if not audioName.endswith('.wav') or audioName in beat_database:
            continue

        y, sr = librosa.load(audioName)
        # Extract the MFCC features and store them under the file path
        beat_database[audioName] = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=10)

    # Persist the merged database
    np.save(dbpath, beat_database)

In [5]:
def voiceCompare_quick(dbPath, tPath):
    """Print the corpus entry whose stored sequence is closest to a recording.

    The recording at ``tPath`` is reduced to the delta of its beat-frame
    sequence, which is compared by DTW against every sequence stored in the
    database at ``dbPath`` (each flattened to a single column). The key with
    the smallest DTW distance is printed; nothing is returned.

    Args:
        dbPath: Database file holding a pickled dict of sequences.
        tPath: Audio file to identify.
    """
    # Load the corpus database
    corpus = np.load(dbPath, allow_pickle=True).item()

    # Beat-frame deltas of the recording to identify, as a column vector
    signal, rate = librosa.load(tPath)
    _, beats = librosa.beat.beat_track(y=signal, sr=rate)
    beatDelta = librosa.feature.delta(beats, mode='nearest')
    query = array(beatDelta).reshape(-1, 1)

    # DTW distance from the query to every corpus entry
    distances = {
        songID: dtw(query, array(sequence).reshape(-1, 1)).distance
        for songID, sequence in corpus.items()
    }

    closest = min(distances, key=distances.get)
    print("最接近的录音是：", closest)

In [6]:
def normlize(data):
    """Standardize ``data`` to zero mean and unit variance along axis 0.

    Args:
        data: Array-like with at least one dimension.

    Returns:
        ``(data - mean) / std`` with mean and std taken over axis 0. Where the
        standard deviation is zero (constant input) the centred values, which
        are all zero, are returned instead of NaN/inf.
    """
    n_mean = np.mean(data, axis=0)
    n_std = np.std(data, axis=0)

    # Constant input has zero spread; divide by one there so the result stays
    # finite. ones_like keeps the dtype of the computed deviation.
    safe_std = np.where(n_std == 0, np.ones_like(n_std), n_std)

    return np.divide(np.subtract(data, n_mean), safe_std)

In [15]:
from sklearn import preprocessing

def _standardizeFrames(features):
    """Standardize each frame (column) of a (coefficients, frames) array in place."""
    frames = features.T  # transposed view: writes go through to ``features``
    for i in range(len(frames)):
        frames[i] = normlize(frames[i])
    return features


def voiceCompare(dbPath, tPath, aimNum=50, verbose=True):
    """Find the corpus segments most similar to a query recording.

    The query at ``tPath`` is converted to 10 MFCC rows and standardized frame
    by frame. Every corpus entry is scanned with a sliding window that is 1.5
    times the query length; for each window start the DTW distances between
    the query rows and the matching window rows are summed. The ``aimNum``
    smallest sums are kept in a heap.

    Args:
        dbPath: Database file holding a pickled dict of MFCC arrays.
        tPath: Audio file to identify.
        aimNum: Maximum number of candidates to keep.
        verbose: When true, print each corpus length and every candidate that
            enters the heap (the function's original output).

    Returns:
        list: heapq-ordered tuples ``(-distance, start_frame, songID)``. The
        distance is negated so that the heap root is the worst kept candidate.
    """
    # Load the corpus database
    beat_database = np.load(dbPath, allow_pickle=True).item()

    # MFCC features of the recording to identify: (coefficients, frames)
    y, sr = librosa.load(tPath)
    x = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=10)
    coefCount, lenx = x.shape
    _standardizeFrames(x)

    # Each candidate window is the query length plus half of it again
    windowLen = lenx + int(lenx / 2)

    heap = []

    for songID, y in beat_database.items():
        leny = len(y[0])
        if verbose:
            print(leny)

        _standardizeFrames(y)

        # +1 so that the window ending on the last frame is scored as well
        for tp in range(0, leny - windowLen + 1):
            # Speed-up: once the heap is full, its worst kept distance is an
            # upper bound that lets a candidate be abandoned early
            if heap and len(heap) >= aimNum:
                dist_UB = -heap[0][0]
            else:
                dist_UB = float('inf')

            total_dist = 0
            for i in range(coefCount):
                # Only .distance is read, so the alignment backtrack is skipped
                total_dist += dtw(x[i], y[i][tp : tp + windowLen], distance_only=True).distance
                if total_dist > dist_UB:
                    break
            else:
                # Negate the distance so the min-heap behaves as a max-heap
                tupleY = (-total_dist, tp, songID)
                heapq.heappush(heap, tupleY)
                if len(heap) > aimNum:
                    heapq.heappop(heap)

                if verbose:
                    print(tupleY)

    return heap

In [16]:
def getTimePoint_dense(dbPath, tPath, vheap):
    """Print the time span of the best candidates without merging neighbours.

    Up to 20 entries with the largest first element (the negated distance, so
    the closest matches) are taken from ``vheap``. For each one the start
    frame is converted to seconds using the file duration and the number of
    frames stored for it in the database; the end time is the start plus the
    duration of the query recording.

    Args:
        dbPath: Database file holding a pickled dict of MFCC arrays.
        tPath: Audio file that was identified.
        vheap: Iterable of ``(-distance, start_frame, songID)`` tuples.
    """
    res_num = 20  # number of top results reported

    # Load the corpus database
    beat_database = np.load(dbPath, allow_pickle=True).item()

    # Duration of the recording to identify
    # NOTE(review): newer librosa releases may expect `path=` instead of
    # `filename=` here - confirm against the installed version
    tTime = librosa.get_duration(filename=tPath)

    # nlargest returns fewer than res_num entries when vheap is short, so
    # iterate over what it returned instead of indexing up to res_num
    similar_n = heapq.nlargest(res_num, vheap)

    print("开始输出相似片段：")

    durations = {}  # file name -> duration, so each file is measured once
    for entry in similar_n:
        music_name = entry[2]  # recording file name
        if music_name not in durations:
            durations[music_name] = librosa.get_duration(filename=music_name)
        music_time = durations[music_name]  # recording duration

        music_pos = entry[1]  # start frame of the segment
        music_all = len(beat_database[music_name][0])  # total frames stored

        frag_st = music_time / music_all * music_pos  # segment start
        frag_en = frag_st + tTime  # segment end

        print(music_name, ",", '%.2f' % frag_st, "秒,", '%.2f' % frag_en, "秒")


In [17]:
def getTimePoint(dbPath, tPath, vheap):
    """Merge neighbouring candidates per file and print their time spans.

    Candidates are grouped by file and ordered by start frame. A candidate
    whose start frame is within 5 frames of the previously kept one from the
    same file is merged with it, keeping whichever has the larger first
    element (the negated distance, i.e. the smaller distance). Each surviving
    start frame is converted to seconds using the file duration and the
    number of frames stored in the database; the end time is the start plus
    the duration of the query recording.

    Args:
        dbPath: Database file holding a pickled dict of MFCC arrays.
        tPath: Audio file that was identified.
        vheap: Iterable of ``(-distance, start_frame, songID)`` tuples.
    """
    # Load the corpus database
    beat_database = np.load(dbPath, allow_pickle=True).item()

    # Duration of the recording to identify
    # NOTE(review): newer librosa releases may expect `path=` instead of
    # `filename=` here - confirm against the installed version
    tTime = librosa.get_duration(filename=tPath)

    # Group the candidates by file name
    byFile = {}
    for entry in vheap:
        byFile.setdefault(entry[2], []).append(entry)

    # Merge candidates whose start frames are at most 5 apart. File names are
    # visited in sorted order so the report is the same on every run.
    sheap = []
    for name in sorted(byFile):
        for tp in sorted(byFile[name], key=lambda t: t[1]):
            if not sheap or sheap[-1][2] != name or abs(sheap[-1][1] - tp[1]) > 5:
                sheap.append(tp)
            elif sheap[-1][0] < tp[0]:
                sheap[-1] = tp  # keep the entry with the smaller distance

    print("开始输出相似片段：")

    durations = {}  # file name -> duration, so each file is measured once
    for entry in sheap:
        music_name = entry[2]  # recording file name
        if music_name not in durations:
            durations[music_name] = librosa.get_duration(filename=music_name)
        music_time = durations[music_name]  # recording duration

        music_pos = entry[1]  # start frame of the segment
        music_all = len(beat_database[music_name][0])  # total frames stored

        frag_st = music_time / music_all * music_pos  # segment start
        frag_en = frag_st + tTime  # segment end

        print(music_name, ",", '%.2f' % frag_st, "秒,", '%.2f' % frag_en, "秒")


In [21]:
# Directory that holds the corpus recordings
corpus_path = './corpus'

# File in which the extracted feature database is stored
dbPath = './beatDatabase_mfcc.npy'

# Recording to identify (alternative sample: './input/00430105-hou5s.wav')
testPath = './input/00415250-前5s.wav'

In [34]:
# 1 Initialise the corpus feature database
# beatDB = initialCorpus(corpus_path)

# 2 Add features for audio files that are new to the corpus
# updateCorpus(corpus_path, dbPath)

# 3 Load the corpus feature database
# beat_database = readCorpus(dbPath)

# Collect the corpus segments closest to the test recording
vheap = voiceCompare(dbPath, testPath)

3876
(-255.48668165855668, 0, './corpus\\00415250.wav')
(-255.9389961993005, 1, './corpus\\00415250.wav')
(-256.2759751265552, 2, './corpus\\00415250.wav')
(-256.51412080579064, 3, './corpus\\00415250.wav')
(-256.7522664850261, 4, './corpus\\00415250.wav')
(-256.99041216426156, 5, './corpus\\00415250.wav')
(-257.2227352147605, 6, './corpus\\00415250.wav')
(-257.4088640695121, 7, './corpus\\00415250.wav')
(-257.5949929242637, 8, './corpus\\00415250.wav')
(-257.7811217790153, 9, './corpus\\00415250.wav')
(-257.96725063376687, 10, './corpus\\00415250.wav')
(-258.15337948851845, 11, './corpus\\00415250.wav')
(-258.33950834327004, 12, './corpus\\00415250.wav')
(-258.5256371980216, 13, './corpus\\00415250.wav')
(-258.282642338207, 14, './corpus\\00415250.wav')
(-257.17967044912837, 15, './corpus\\00415250.wav')
(-257.6674080675151, 16, './corpus\\00415250.wav')
(-258.8182894399073, 17, './corpus\\00415250.wav')
(-258.82217922784946, 18, './corpus\\00415250.wav')
(-258.6130659764316, 19, './c

(-222.745619001311, 527, './corpus\\00415250.wav')
(-217.63177940596523, 528, './corpus\\00415250.wav')
(-207.39502432544413, 529, './corpus\\00415250.wav')
(-204.82422191251698, 530, './corpus\\00415250.wav')
(-204.9553516742091, 531, './corpus\\00415250.wav')
(-204.15768182205665, 532, './corpus\\00415250.wav')
(-206.51126513185608, 533, './corpus\\00415250.wav')
(-205.9445926875125, 534, './corpus\\00415250.wav')
(-207.32279451817902, 535, './corpus\\00415250.wav')
(-207.4708689215842, 536, './corpus\\00415250.wav')
(-204.30712038532528, 537, './corpus\\00415250.wav')
(-204.44810557153704, 538, './corpus\\00415250.wav')
(-205.1457058458838, 539, './corpus\\00415250.wav')
(-206.81010471421922, 540, './corpus\\00415250.wav')
(-209.55216437250073, 541, './corpus\\00415250.wav')
(-211.959941721034, 542, './corpus\\00415250.wav')
(-214.57959889758075, 543, './corpus\\00415250.wav')
(-216.62249431524123, 544, './corpus\\00415250.wav')
(-214.2177703864656, 545, './corpus\\00415250.wav')
(-

(-168.98527847838596, 2558, './corpus\\00429126.wav')
(-165.3066658904125, 2559, './corpus\\00429126.wav')
(-166.1528913677019, 2560, './corpus\\00429126.wav')
(-168.56795638523073, 2561, './corpus\\00429126.wav')
(-171.916723927511, 2562, './corpus\\00429126.wav')
(-171.22011072098286, 2566, './corpus\\00429126.wav')
(-169.01067940986724, 2567, './corpus\\00429126.wav')
(-168.44803340735854, 2568, './corpus\\00429126.wav')
(-168.48174241653652, 2569, './corpus\\00429126.wav')
(-167.63426786946582, 2570, './corpus\\00429126.wav')
(-168.56939764978813, 2571, './corpus\\00429126.wav')
(-168.41688549617754, 2572, './corpus\\00429126.wav')
(-167.91477878490912, 2573, './corpus\\00429126.wav')
(-169.32154387230037, 2574, './corpus\\00429126.wav')
(-170.46296605966927, 3259, './corpus\\00429126.wav')
(-170.27841879181506, 3260, './corpus\\00429126.wav')
(-169.72683554641844, 3261, './corpus\\00429126.wav')
(-167.81194132306683, 3344, './corpus\\00429126.wav')
(-161.37836995461095, 3345, './c

KeyboardInterrupt: 

In [20]:
# Merge neighbouring hits per file and print their time spans
getTimePoint(dbPath, testPath, vheap)

开始输出相似片段：
./corpus\00429239.wav , 49.97 秒, 52.04 秒
./corpus\00415250.wav , 74.58 秒, 76.65 秒
./corpus\00429126.wav , 78.32 秒, 80.39 秒
./corpus\00430105.wav , 19.20 秒, 21.27 秒
./corpus\00430105.wav , 19.34 秒, 21.41 秒
./corpus\00430105.wav , 23.84 秒, 25.91 秒
./corpus\00430105.wav , 49.87 秒, 51.94 秒
./corpus\00430105.wav , 50.01 秒, 52.08 秒
./corpus\00430105.wav , 61.92 秒, 63.99 秒
./corpus\00430105.wav , 62.62 秒, 64.68 秒
./corpus\00430105.wav , 130.57 秒, 132.64 秒


In [None]:
# Command-line entry point.
#   argv[1]  recording to identify (required)
#   argv[2]  corpus directory (optional, overrides corpus_path)
#   argv[3]  feature database file (optional, overrides dbPath)
# NOTE(review): inside a Jupyter kernel sys.argv presumably carries the
# kernel's own launch arguments - confirm this cell is only run from an
# exported script.
inPara = sys.argv

if len(inPara) >= 2:
    if len(inPara) >= 3:
        print("给定语料库路径为:", inPara[2])
        corpus_path = inPara[2]
    else:
        print("默认语料库路径为：", corpus_path)

    if len(inPara) >= 4:
        print("给定数据表路径为:", inPara[3])
        dbPath = inPara[3]
    else:
        print("默认数据表路径为：", dbPath)

    testPath = inPara[1]
    vheap = voiceCompare(dbPath, testPath)
    getTimePoint(dbPath, testPath, vheap)
else:
    print("请输入待识别录音文件路径！")
  

In [32]:
# Probe a sample recording: print its sample rate, its number of samples
# and its number of MFCC frames, one value per line
tPath = './input/00415250-前5s.wav'

y, sr = librosa.load(tPath)
x = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=10)  # 10 rows, one column per frame

print(sr, len(y), len(x[0]), sep='\n')

22050
110250
216


In [33]:
# Probe a second sample recording: print its sample rate, its number of
# samples and its number of MFCC frames, one value per line
tPath = './input/00415250-前5s-0.wav'

y, sr = librosa.load(tPath)
x = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=10)  # 10 rows, one column per frame

print(sr, len(y), len(x[0]), sep='\n')

22050
122368
240
