In [134]:
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
%matplotlib tk
from scipy.fftpack import dct

In [380]:
data, sr = librosa.load('./wav/yonsei.wav', sr = None)
print(f'data length = {len(data)}, sr = {sr}')

data length = 29684, sr = 16000


In [398]:
frame_length = 512
#frame_length = int(20/1000 * sr)
#hop_length = int(10/1000 * sr)
hop_length = 256
num_frame = int(len(data) / hop_length) - 1

print(f'frame length = {frame_length}, hop_length = {hop_length}, num_frame = {num_frame}')

frame length = 512, hop_length = 256, num_frame = 114


In [399]:
def DFT(data, frame_length):
    L = frame_length + 1
    R = int(L / 2)
    N = frame_length
    num_frame = int(len(data) / R) - 1
    hamming = np.hamming(L)
    time_signal = np.zeros((num_frame, L))
    freq_signal = np.zeros((num_frame, 1025))
    for i in range(num_frame):
        time_signal[i] = data[i * R : i * R + L] * hamming
        freq_signal[i] = np.fft.rfft(time_signal[i],2048)
        
    return time_signal, freq_signal


In [400]:
time_signal, freq_signal = DFT(data, frame_length)
print(f'freq_signal.shape = {freq_signal.shape}')

freq_signal.shape = (114, 1025)


  freq_signal[i] = np.fft.rfft(time_signal[i],2048)


In [401]:
#freq_signal = freq_signal[:,:int(freq_signal.shape[1]/2) + 1]
print(f'freq_siganl.shape : {freq_signal.shape}')
spectrogram = np.abs(freq_signal)
print(f'spectrogram.shape : {spectrogram.shape}')


freq_siganl.shape : (114, 1025)
spectrogram.shape : (114, 1025)


In [402]:
# Mel filterbank형성
mel_fb = librosa.filters.mel(sr = sr, n_fft = 2048,n_mels =80)
# return 값 : M = np.ndarray [shape=(n_mels, 1 + n_fft/2)] Mel transform matrix
print(f'Mel filterbank shape : {mel_fb.shape}')

Mel filterbank shape : (80, 1025)


In [404]:
# Test용 Mel filtergram plotting하기
fig, ax = plt.subplots()
'''subplots()에선 두개의 값을 받을 수 있는데 figure 와 axes 값을 받을 수 있다.
여기서 변수명은 상관없다. 순서가 중요하다
fig란 figure로써 - 전체 subplot을 말한다. ex) 서브플로안에 몇개의 그래프가 있던지 상관없이  그걸 담는 하나.
 전체 사이즈를 말한다.
ax는 axe로써 - 전체 중 낱낱개를 말한다 ex) 서브플롯 안에 2개(a1,a2)의 그래프가 있다면 a1, a2 를 일컬음'''
img = librosa.display.specshow(mel_fb, x_axis='linear', ax=ax)
ax.set(ylabel='Mel filter', title='Mel filter bank')
fig.colorbar(img, ax=ax)

<matplotlib.colorbar.Colorbar at 0x2567783a7f0>

In [405]:
plt.figure(figsize=(15,4))
for n in range(mel_fb.shape[0]):
    plt.plot(mel_fb[n])
plt.xlabel('Frequency [Hz]')
plt.ylabel('Mel-filterbank coefficients')
plt.title('Mel-filterbank function')
plt.xlim(0,mel_fb.shape[1])

(0.0, 1025.0)

In [406]:
mel_spectrogram = np.dot(spectrogram, mel_fb.T)
db_spectrogram = 20 * np.log10(abs(mel_spectrogram))
time1 = np.linspace(0,len(data)/ sr,sr)

cmap = plt.cm.get_cmap("jet")
print(f'db_spectrogram.shape = {db_spectrogram.shape}')

plt.figure(figsize=(14,8))

plt.imshow(db_spectrogram.T, origin = 'lower', aspect = 'auto', cmap = cmap, extent=[0,len(data)/sr,0,8000])
plt.xlabel('Time[s]')
plt.ylabel('Frequency[Hz]')
plt.colorbar(format = "%+2.0f dB")

db_spectrogram.shape = (114, 80)


<matplotlib.colorbar.Colorbar at 0x256796719a0>

In [144]:
# Version 1 : power spectrogram을 먼저구한다.
# power spectrogram (power periodogram)
power = np.zeros((num_frame, freq_signal.shape[1]))
for i in range(num_frame):
    power[i] = np.array(abs(freq_signal[i]) ** 2) / (freq_signal.shape[1]) # normalize
print(power.shape)

#power_spectrogram = power[:,:int(frame_length/2)+ 1]
print(f'power_spectrogram : {power.shape}')

# mel filter와 power 간에 행렬곱을 실시한다. => np.dot

(184, 1025)
power_spectrogram : (184, 1025)


In [407]:
 # Version 2 # power spectrogram을 나중에 구한다
def Melfilter(freq_sigal,mel_fb,n_filter,num_frame):
    R = n_filter
    A = np.zeros(R)
    mel = np.zeros((num_frame, R))
    for r in range (R):
        summation = 0
        for k in range(mel_fb.shape[1]):
           summation += abs(mel_fb[r][k]) ** 2
        A[r] = summation
     # normalize the mf
    for m in range(freq_signal.shape[0]):
        for r in range (R):
            summation = 0
            for k in range(mel_fb.shape[1]):
                summation += abs(mel_fb[r][k] * freq_signal[m][k]) ** 2
            mel[m][r] = (1/A[r]) * summation
    return mel

mel = Melfilter(spectrogram, mel_fb, 80, num_frame)
print(f'mel.shape : {mel.shape}')

mel.shape : (114, 80)


In [408]:
time1 = np.linspace(0,len(data)/ sr,sr)

cmap = plt.cm.get_cmap("jet")

plt.figure(figsize=(8,3))

plt.imshow(20 * np.log10(mel.T + 1e-12), origin = 'lower', aspect = 'auto', cmap = cmap, extent=[0,len(data)/sr,0,8000])
plt.xlabel('Time[s]')
plt.title('Melspectrogram')
plt.ylabel('Frequency[Hz]')
plt.rc('font', size=13)
plt.colorbar(format = "%+2.0f dB")
plt.show()

In [409]:
# Calculate MFCC
def cal_MFCC(mel,num_frame):
    N = 12 # num of mfcc coefficients
    R = 80
    mfcc = np.zeros((num_frame, N))
    for i in range(num_frame):
        for n in range(N):
            summation = 0
            for r in range(R):
                summation += np.log10(mel[i][r])* np.cos(((np.pi) / R) * (r + 1/2) * n)
            mfcc[i][n] =  summation
    return mfcc

In [410]:
mfcc = cal_MFCC(mel,num_frame)
print(f'mfcc shape : {mfcc.shape}')

mfcc shape : (114, 12)


In [411]:
# Plot mfcc

time1 = np.linspace(0,len(data)/ sr,sr)

cmap = plt.cm.get_cmap("jet")

plt.figure(figsize=(8,3))

plt.imshow(mfcc.T, origin = 'lower', aspect = 'auto', cmap = cmap,interpolation = None, 
            extent=[0,len(data)/sr,0,mfcc.shape[1]],vmin = -550, vmax = 180)
plt.xlabel('Time[s]')
plt.title('MFCC')
plt.rc('font', size=13)
plt.ylabel('Cepstrum coefficient')
plt.colorbar()

<matplotlib.colorbar.Colorbar at 0x25680b31f10>

In [334]:
mfccs = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=12, n_fft = 2048, n_mels = 80, hop_length = 160, win_length = 320)
plt.figure(figsize=(8,3))
plt.imshow(mfccs, origin = 'lower', aspect = 'auto', cmap = cmap, interpolation = None,extent=[0,len(data)/sr,0,12])
plt.xlabel('Time[s]')
plt.ylabel('Cepstrum coefficient')
plt.title('MFCC')
plt.rc('font', size=13)
plt.colorbar()

<matplotlib.colorbar.Colorbar at 0x2566aa91af0>

In [325]:
# plot the melscale
mel_scale = np.zeros(10000)
for i in range(10000):
    mel_scale[i] = 2585 * np.log10(1+ (i/700))

plt.figure(figsize = (8,8))
plt.plot(mel_scale)
plt.xlabel('Frequency [Hz]')
plt.ylabel('Mel scale')
plt.title('Mel Function')
plt.grid(True)
plt.xlim(0,10000)
plt.rc('font', size=13)
plt.ylim(0,mel_scale[9999])


(0.0, 3061.2737063000363)