In [1]:
import os
import sys
import glob
import librosa
import numpy as np

import default_settings

## Define functions

In [2]:
def euclidDist(a, b):
    '''
    Euclidean (L2) distance between a and b.

    a, b: operands that support `a - b` (e.g. NumPy arrays).
    Returns the square root of the sum of the squared element-wise
    differences, summed over every element of the difference.
    '''
    delta = a - b
    return np.sqrt(np.square(delta).sum())

def load_wav(wav_path, sampling_rate=44100):
    '''
    Load an audio file through librosa.

    wav_path: path handed to `librosa.load`.
    sampling_rate: forwarded as librosa's `sr` argument (default 44100).
    Returns the pair produced by `librosa.load`: (signal, sampling rate).
    '''
    waveform, rate = librosa.load(wav_path, sr=sampling_rate)
    return waveform, rate

def minVal(v1, v2, v3):
    '''
    Pick whichever of three candidates has the smallest element 0.

    v1, v2, v3: indexable values (e.g. (cost, back-pointer) tuples); only
        their element 0 is compared.
    Returns (chosen candidate, position), where position is 0, 1 or 2 for
    v1, v2 or v3. On ties the earlier argument wins.
    '''
    cost1 = firstIndex(v1)
    cost2 = firstIndex(v2)
    cost3 = firstIndex(v3)

    # v1 wins when it is no larger than the smaller of the other two
    if cost1 <= min(cost2, cost3):
        return v1, 0
    # otherwise the winner is decided between v2 and v3
    if cost2 <= cost3:
        return v2, 1
    return v3, 2

def firstIndex(x):
    '''
    Return the element at position 0 of x.

    x: any indexable value (tuple, list, array, ...).
    '''
    head = x[0]
    return head

def secondIndex(x):
    '''
    Return the element at position 1 of x.

    x: any indexable value with at least two elements.
    '''
    second = x[1]
    return second

def calc_dtw_mfcc(first_mfcc, second_mfcc):
    '''
    Build the dynamic-time-warping cost table between two sequences.

    first_mfcc, second_mfcc: sequences indexed by frame; element i of each is
        passed to `euclidDist` as one frame's feature vector.
    Returns a list of lists `table` with len(first_mfcc) rows and
    len(second_mfcc) columns. Each cell is a tuple
    (accumulated cost, (previous_row, previous_col)); the origin cell uses
    (-1, -1) as its back-pointer. The total alignment cost is
    `table[-1][-1][0]`.
    '''
    n_rows, n_cols = len(first_mfcc), len(second_mfcc)

    # Placeholder grid; every cell is overwritten with a (cost, back-pointer) tuple below
    table = [[0] * n_cols for _ in range(n_rows)]
    table[0][0] = (euclidDist(first_mfcc[0], second_mfcc[0]), (-1, -1))

    # First column: the only possible predecessor is the cell directly above
    for r in range(1, n_rows):
        cost_above = table[r - 1][0][0]
        table[r][0] = (cost_above + euclidDist(first_mfcc[r], second_mfcc[0]), (r - 1, 0))

    # First row: the only possible predecessor is the cell directly to the left
    for c in range(1, n_cols):
        cost_left = table[0][c - 1][0]
        table[0][c] = (cost_left + euclidDist(first_mfcc[0], second_mfcc[c]), (0, c - 1))

    # Interior: extend the cheapest of the up / left / diagonal predecessors
    for r in range(1, n_rows):
        for c in range(1, n_cols):
            predecessors = ((r - 1, c), (r, c - 1), (r - 1, c - 1))
            best_cell, which = minVal(table[r - 1][c], table[r][c - 1], table[r - 1][c - 1])
            local_cost = euclidDist(first_mfcc[r], second_mfcc[c])
            table[r][c] = (firstIndex(best_cell) + local_cost, predecessors[which])

    return table

def extract_mfcc(x, deg, fs):
    '''
    Compute MFCCs of a signal with librosa.

    x: audio signal, passed as librosa's `y`.
    deg: number of coefficients, passed as `n_mfcc`.
    fs: sampling rate, passed as `sr`; also used to size the analysis window.
    Returns whatever `librosa.feature.mfcc` returns for those arguments.
    '''
    # Window and hop are fs*16//1000 and fs*8//1000 samples
    # (i.e. 16 ms and 8 ms when fs is in samples per second)
    window_samples = (fs * 16) // 1000
    hop_samples = (fs * 8) // 1000
    return librosa.feature.mfcc(
        y=x,
        sr=fs,
        n_mfcc=deg,
        n_fft=window_samples,
        hop_length=hop_samples,
    )


## Read audio data

In [3]:
# Configured directories for the source and target recordings
jpn_source_digit_dir, jpn_target_digit_dir = (
    default_settings.jpn_source_digit_dir,
    default_settings.jpn_target_digit_dir,
)

# The wav files are read from the sibling path obtained by replacing
# 'digit' with 'digit_trimmed' in each configured directory.
source_wav_paths, target_wav_paths = (
    glob.glob(os.path.join(digit_dir.replace('digit', 'digit_trimmed'), '*.wav'))
    for digit_dir in (jpn_source_digit_dir, jpn_target_digit_dir)
)

In [4]:
# Build one (MFCC matrix, numeric label) pair per source recording.
source_wav_mfccs = []
for i, source_wav_path in enumerate(source_wav_paths):
    # File name without directory or extension
    basename_without_ext = os.path.splitext(os.path.basename(source_wav_path))[0]
    # Strip the 'jpn_' and '_3' markers; what remains must parse as an int
    # (presumably names look like 'jpn_<n>_3.wav' — confirm against the data set)
    file_index = basename_without_ext.replace('jpn_', '').replace('_3', '')

    signal, fs = load_wav(source_wav_path, sampling_rate=16000)
    mfcc = extract_mfcc(signal, 20, fs)

    # mfcc[1:] skips the first row of the MFCC output
    source_wav_mfccs.append((mfcc[1:], int(file_index)))



In [5]:
# Build one (MFCC matrix, numeric label) pair per target recording.
target_wav_mfccs = []
for i, target_wav_path in enumerate(target_wav_paths):
    # File name without directory or extension
    basename_without_ext = os.path.splitext(os.path.basename(target_wav_path))[0]
    # Strip the 'jpn_' and '_3' markers; what remains must parse as an int
    # (presumably names look like 'jpn_<n>_3.wav' — confirm against the data set)
    file_index = basename_without_ext.replace('jpn_', '').replace('_3', '')

    signal, fs = load_wav(target_wav_path, sampling_rate=16000)
    mfcc = extract_mfcc(signal, 20, fs)

    # mfcc[1:] skips the first row of the MFCC output
    target_wav_mfccs.append((mfcc[1:], int(file_index)))

### Distance by MFCC

In [6]:
# Score every source clip against every target clip and report the closest target.
for i, source_wav_mfcc in enumerate(source_wav_mfccs):
    # One row per target: [distance, source label, target label]
    index_list = []
    for j, target_wav_mfcc in enumerate(target_wav_mfccs):
        # Transposed so that the aligner indexes the matrices along their second axis
        dist_mfcc = calc_dtw_mfcc(source_wav_mfcc[0].T, target_wav_mfcc[0].T)
        # The bottom-right cell of the table holds the accumulated cost
        wave_distance = dist_mfcc[-1][-1][0]
        index_list.append([wave_distance, source_wav_mfcc[1], target_wav_mfcc[1]])
        print("source: ", source_wav_mfcc[1], ", target: ", target_wav_mfcc[1], ", wave distance: ", wave_distance)

    # Column-wise argmin; entry 0 is the row with the smallest distance
    index_min = np.argmin(index_list, axis=0)
    closest_row = index_list[index_min[0]]
    print("min comb = [source: ", closest_row[1], ", target: ", closest_row[2], "]")
    print("-----")

source:  9 , target:  9 , wave distance:  13167.26
source:  9 , target:  6 , wave distance:  24000.492
source:  9 , target:  8 , wave distance:  28763.682
source:  9 , target:  1 , wave distance:  26606.074
source:  9 , target:  4 , wave distance:  20111.691
source:  9 , target:  2 , wave distance:  19791.367
source:  9 , target:  5 , wave distance:  25406.58
source:  9 , target:  3 , wave distance:  24351.426
source:  9 , target:  10 , wave distance:  17019.807
source:  9 , target:  7 , wave distance:  27910.424
min comb = [source:  9 , target:  9 ]
-----
source:  6 , target:  9 , wave distance:  23657.152
source:  6 , target:  6 , wave distance:  11863.57
source:  6 , target:  8 , wave distance:  28164.201
source:  6 , target:  1 , wave distance:  28431.22
source:  6 , target:  4 , wave distance:  20038.26
source:  6 , target:  2 , wave distance:  20060.768
source:  6 , target:  5 , wave distance:  23285.172
source:  6 , target:  3 , wave distance:  29318.693
source:  6 , target:  10