In [1]:
import os

import matplotlib.pyplot as plt

import librosa
import scipy
import scipy.signal as sig
import scipy.io.wavfile as scw

import numpy as np

import wave
import cmath as cm
import math

import default_settings


# preparation : define extract_feature.

In [2]:
def levinson(signal, order):
    x = signal
    p = order
    autocorr = np.correlate(x,x,mode='full')
    r = autocorr[len(x)-1:len(x)+p]

    a = np.zeros(p+1)
    k = np.zeros(p)
    a[0] = 1
    a[1] = -r[1] / r[0]
    k[0] = a[1]
    E = r[0] + r[1] * a[1]
    for q in range(1,p):
        k[q] = -np.sum(a[0:q+1] * r[q+1:0:-1]) / E
        U = a[0:q+2]
        V = U[::-1]
        a[0:q+2] = U + k[q] * V
        E *= 1-k[q] * k[q]

    return a, k


def preEmphasis(signal, p):
    """プリエンファシスフィルタ"""
    # 係数 (1.0, -p) のFIRフィルタを作成
    return sig.lfilter([1.0, -p], 1, signal)


def autocorr(x, nlags=None):
    N = len(x)
    if nlags == None: nlags = N
    r = np.zeros(nlags)
    for lag in range(nlags):
        for n in range(N - lag):
            r[lag] += x[n] * x[n + lag]
    return r

In [3]:
def FFT(x):
    N = x.shape[0]
    
    # Recursive processing end confirmation
    if N==1:
        return x[0]
    
    x_even = x[0:N:2]
    x_odd = x[1:N:2]
    
    # Recursive processing
    X_even = FFT(x_even)
    X_odd = FFT(x_odd)
    
    # weight
    W = []
    for t in range(N//2):
        W.append(np.exp(-1j * ((2*np.pi*t) / N)))
    W = np.array(W)
    
    X = np.zeros(N, dtype="complex")
    X[0:N//2] = X_even + W*X_odd
    X[N//2:N] = X_even - W*X_odd
    
    return X


def IFFT(X):
    N = X.shape[0]
    X = X.conjugate()
    x = FFT(X)
    return (1/N) * x.conjugate()


In [4]:
def extract_formant(x, deg, fs=22050):
    a, k = levinson(x,deg)
    
    w, h = sig.freqz(1, a)
    #fig = plt.figure()
    #ax = fig.add_subplot(111)
    #ax.plot(fs * w / 2.0 / np.pi, 20 * np.log10(np.abs(h)))
    #ax.set_xlabel('frequency [Hz]')
    #ax.set_ylabel('$1 / |A(e^{j\omega})|$ [dB]')
    #plt.show()
    
    poles = np.roots(a)
    intns = np.abs(poles)
    ff = np.angle(poles) * fs / 2.0 / np.pi
    formantfreq = ff[(ff > 10) & (ff < fs / 2.0 - 10) & (intns > 0.8)]

    return formantfreq

def extract_cepstrum(x, deg, fs=22050):
    fft_data_ori = FFT(x)
    freq_ori = np.arange(len(fft_data_ori)) * fs / len(fft_data_ori)
    pow_spec = np.log10(np.real(np.abs(fft_data_ori)**2))
    cep = IFFT(pow_spec)
    return cep[:deg]

def extract_mfcc(x, deg, fs=22050):
    if len(x) <= 2048:
        mfcc = librosa.feature.mfcc(y=x, sr=fs ,n_mfcc=deg, n_fft=len(x))
    else:
        mfcc = librosa.feature.mfcc(y=x, sr=fs ,n_mfcc=deg)
    return mfcc


In [5]:
speech_sample_dir = os.path.join(default_settings.audio_corpora_dir, 'vowel')
#output_features_dir = os.path.join(default_settings.audio_corpora_dir, 'features')
wav_files = os.listdir(speech_sample_dir)

# formant data
X = []
# label data
Y = []

for wav_file in wav_files:
    basename_without_ext = os.path.splitext(os.path.basename(wav_file))[0]
    label = basename_without_ext.split('_')[0]
    #fmt_file = os.path.join(output_features_dir, basename_without_ext + '.fmt')
    #cep_file = os.path.join(output_features_dir, basename_without_ext + '.cep')
    #mfcc_file = os.path.join(output_features_dir, basename_without_ext + '.mfcc')
    
    sampling_rate, sample_data = scw.read(os.path.join(speech_sample_dir, wav_file))
    sample_data = sample_data / 32768
    
    return_formant = extract_formant(sample_data, 12, sampling_rate)
    return_cep = extract_cepstrum(sample_data[:512], 20, sampling_rate)
    return_mfcc = extract_mfcc(sample_data, 20, sampling_rate)
    
    #np.save(fmt_file, return_formant)
    #np.save(cep_file, return_cep)
    #np.save(mfcc_file, return_mfcc)
    
    X.append(return_formant[:2])
    Y.append(label)
    

  pow_spec = np.log10(np.real(np.abs(fft_data_ori)**2))
  X[0:N//2] = X_even + W*X_odd
  X[N//2:N] = X_even - W*X_odd
  X[N//2:N] = X_even - W*X_odd


## speaker recognition

In [6]:
for data, label in zip(X, Y):
    print(data,label)

[7184.98181506 1058.80321844] a
[7475.61294175 6203.61750039] u
[7285.47373531 6397.56536644] o
[7532.36078147 6215.77122102] i
[6806.212352    359.04444981] e
[7303.02409173 4253.29764658] o
[7222.72072819 4119.38218267] a
[7483.29297215 6092.69937664] i
[6650.43320778 4102.42478121] e
[ 345.14751642 1058.4689791 ] a
[7477.97526234 6191.62327213] u
[7272.23229027 6192.8058807 ] o
[6665.24056105 4254.54384881] e
[7551.38376716 6244.49938425] o
[7659.02310526 4135.72936777] u
[ 369.85976033 1962.99881615] e
[7552.48229224 6144.36036521] i
[6817.24234034 4082.06368941] u
[ 404.5502661  1014.30755903] a
[332.96092382 690.68339724] o
[7355.28040345  378.37900282] e
[7462.17820894  295.01131441] i
[ 297.72153419 2132.56340916] i
[7342.14779435 6046.1568034 ] u
[7342.41204823 5505.35666833] a


In [7]:
## split feature data (this time, please use Formant) into 20 training data + 5 test data.
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(X,Y,train_size=0.8)
x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)
print(len(x_train), type(x_train), x_train.shape)
print(len(x_test), type(x_test), x_test.shape)
print(len(y_train), type(y_train), y_train.shape)
print(len(y_test), type(y_test), y_test.shape)


20 <class 'numpy.ndarray'> (20, 2)
5 <class 'numpy.ndarray'> (5, 2)
20 <class 'numpy.ndarray'> (20,)
5 <class 'numpy.ndarray'> (5,)


In [8]:
## train SVM or logistic regression using the training data, and evaluate it with the test data.
from sklearn.svm import SVC
model = SVC(gamma='scale')
model.fit(x_train, y_train)

print('correct label = ', y_test)
print('predict label = ', model.predict(x_test))
print('correct answer rate = ', model.score(x_test, y_test))


correct label =  ['o' 'u' 'e' 'a' 'e']
predict label =  ['i' 'o' 'i' 'i' 'i']
correct answer rate =  0.0


In [8]:
## make above 2 steps into one function.
def speaker_recognition(feature_dir, n_train=20, n_test=5):
    return accuracy_train, accuracy_test

In [None]:
## perform cross validation i.e. n_train=24, n_test=1, 
## and take average of accuracies of all 25 combinations.