In [145]:
import numpy as np
import librosa
import scipy
import matplotlib.pyplot as plt
import IPython.display as ipd
from scipy.linalg import solve_toeplitz, toeplitz
%matplotlib tk

In [146]:
num = 3
temp = np.zeros((num,3))
k = [1,2,3]
for i in range(num):
    temp[i] = k
print(temp)



[[1. 2. 3.]
 [1. 2. 3.]
 [1. 2. 3.]]


In [147]:
p = [1,2,3,4,5]
print(p[0:3])

[1, 2, 3]


In [148]:
## autocorrelation 함수 설정

def autocorrelation(data, k):
    if k == 0:
        return np.sum(data ** 2)
    else:
        return np.sum(data[k:] * data[:-k])    

In [149]:
def LPF(data, cutoff_freq, sr):
    from scipy import signal
    low_filter = signal.firwin(101, cutoff = cutoff_freq, fs = sr, pass_zero = 'lowpass')
    # signal.firwin(order, cutoff, fs = sr, pass_zero = '필터종류')
    # order must be odd if a passband includes the Nyquist frequency
    # pass_zero = {True, False, 'lowpass', 'bandstop', 'highpass'}
    
    filtered_signal = signal.lfilter(low_filter,[1.0],data)  
    return filtered_signal
     

In [150]:
def Center_clipping(frame_signal):

    CL = max(abs(frame_signal)) * 0.68  
    # 일반적으로 center clipping 기준 : 0.68

    for i in range(len(frame_signal)):  
        if abs(frame_signal[i]) <= CL:
            frame_signal[i] = 0
        elif frame_signal[i] > CL:
            frame_signal[i] -= CL
        elif frame_signal[i] < -CL:
            frame_signal[i] += CL

    return frame_signal

In [151]:
def Calculate_pitch(signal, sr, window, cutoff_freq, win_length, thr):
    # hyperparameter 설정

    w = win_length
    h = hop_length = int(win_length / 2) # overlap 50%
    
    # window 설정
    # hamming or ones
    window = window
    
    # * zero-padding
    if len(signal) % h != 0:
        pad = np.zeros(h - (len(signal) % h))
        new_data = np.append(signal, pad)
    else:
        new_data = signal
        
    nF = num_frame = int(len(new_data) / h) - 1 # number of frame
    
    print(f'1. win_length : {w}, hop_length : {h}, nF : {nF}')
    
    # data를 frame화 시키기 (Short-time)
    win_data = np.zeros((nF,w))
    for n in range(nF):
        win_data[n] = new_data[n * h : n * h + w] * window
        
    print(f'2. win_data.shape : {win_data.shape}')
    
    # pitch_contour 생성
    # pitch_contour : 각 frame에서 가장 높은 값을 가지는 index 입력
    pitch_contour = np.zeros(nF)
    
    for i in range(nF):
        
        # 1. Lowpass filtering
        filtered_signal = LPF(win_data[i], cutoff_freq, sr)
        
        # 2.  Center clipping
        filtered_signal = Center_clipping(filtered_signal)   
        
        # 3. autocorrelation을 통해 pitch 측정
        auto = np.zeros(w)
        for n in range(w):
            #autocorrelation 진행
            auto[n] = np.sum(filtered_signal[n:] * filtered_signal[:w-n])
        # 50Hz ~ 500Hz 사이에서 일반적으로 pitch 측정되므로 거기 사이에서 찾기
        src = int(0.002 * sr)
        end = int(0.02 * sr)
        
        max_idx = auto[src:end].argmax()
        #print(f'3. max_index of nF {i} : {max_idx}')
        threshold = auto[0] * thr
        #print(f'4. threshold : {threshold}')

        if auto[src + max_idx] >= threshold:
            # if the maximum exceeds 0.55 of the autocorrelation value at 0 delay => voiced
            pitch = np.ceil(sr/(max_idx+src))
            # time * sr = frame_length
            # => time = frame_length / sr
            # => freq = 1/time = sr/ frame_length = sr / index
            # pitch는 주파수
            pitch_contour[i] = pitch
        else:
            # => unvoiced
            pitch_contour[i] = 0
    
    return pitch_contour

In [152]:
def median_filter(signal,kernel_size):
    import math
    
    k = kernel_size
    k_down = math.trunc(k / 2)
    new_signal = np.pad(signal, (k_down,k_down)) # 양 옆 제로패딩
    pitch_median = []
    for i in range(len(signal)):
        value =  np.median(new_signal[i : i + k])
        pitch_median.append(value)
    
    return pitch_median

In [153]:
def autocorrelation(signal, k):
    if k == 0:
        return np.sum(signal ** 2)
    else:
        return np.sum(signal[k:] * signal[:-k])  

In [154]:
def Levinson_Durbin(frame_signal, p):
    # 각 frame별로 levinson_durbin을 구해보자
    # 즉, 각 frame별로 LPC를 구해보자
    # signal : sampling된 data
    # p : pth order
    # The Toeplitz matrix has constant diagonals, with c as its first column and r as its first row.
    # Right-hand side in T x = b.
    column = row = [autocorrelation(frame_signal,i) for i in range(p)]
    b = [autocorrelation(frame_signal,i) for i in range(1,p+1)] # (1 <=i <= p)이므로
    # Reuturn x
    # x : coefficients a[1],a[2],a[3],...,a[k] 즉 lpc(linear prediction coefficients)
    x = solve_toeplitz((column, row), b)
    return x


In [155]:
def STE(data, win_length, window):
    
    w = win_length
    h = hop_length = int(win_length / 2) # overlap 50%
    
    # window 설정
    # hamming or ones
    window = window
    print(f'1. data.shape ; {data.shape}')
    # * zero-padding
    if len(data) % h != 0:
        pad = np.zeros(h - (len(data) % h))  # * 부족한 만큼 0으로 이루어진 배열을 생성
        pad_data = np.append(data, pad)
        
    else:
        pad_data = data
    print(f'2. new_data.shape ; {pad_data.shape}')
        
    nF = num_frame = int(len(pad_data) / h) - 1 # number of frame
    
    print(f'3. win_length : {w}, hop_length : {h}, nF : {nF}')

    
    energy = np.zeros((nF))

    for i in range(nF):
        energy[i] = np.sum(pad_data[i * h : i * h + w] ** 2) # 배열합 => np.sum
    return energy, nF


In [156]:
def V_flag(signal, sr, window, cutoff_freq, win_length,thr):

    energy, nF = STE(signal, win_length, window)
    length = len(window)
    # window = np.hamming(512) why? win_length = 512
    print(f'1. energy.shape : {energy.shape}, nF : {nF}')
    dB_energy = 10*np.log10(energy)
    
    flag = np.zeros((nF))
    for idx in range(nF):
        if dB_energy[idx] >= 8:
            flag[idx] = 1
    return flag

In [157]:
## Load yonsei file
data, sr = librosa.load('./wav/yonsei.wav', sr = None)
data = data / max(abs(data)) * 0.7
print(f'len data : {len(data)}, sampling rate : {sr}')

len data : 29684, sampling rate : 16000


In [158]:
## Plot raw waveform of yonsei
time_axis = np.linspace(0,len(data) / sr, len(data))

plt.figure(figsize = (14,5))
plt.plot(time_axis, data)
plt.xlabel('Time [s]')
plt.ylabel('Amplitude')
plt.title('Waveform of "yonsei.wav"')
plt.grid(True)
plt.xlim(0,len(data)/sr)
plt.ylim(-2,3.5)
plt.rc('font', size=15)        # 기본 폰트 크기

In [159]:
filtered_data = LPF(data,900,sr)
print(f'len data : {len(filtered_data)}, sampling rate : {sr}')
time_axis = np.linspace(0,len(filtered_data) / sr, len(filtered_data))
plt.figure(figsize = (14,5))
plt.plot(time_axis, filtered_data)
plt.xlabel('Time [s]')
plt.ylabel('Amplitude')
plt.title('Waveform of "yonsei.wav"')
plt.grid(True)
plt.xlim(0,len(filtered_data)/sr)
plt.ylim(-1,1)
plt.rc('font', size=15) 

len data : 29684, sampling rate : 16000


In [160]:
pitch = median_filter(Calculate_pitch(data,sr, np.ones(512), 900, 512, 0.33),5)
x_axis = np.linspace(0,len(data)/ sr, len(pitch))
print(f'pitch.shape : {len(pitch)}')
plt.figure(figsize = (4,1))
plt.plot(x_axis,pitch)
plt.title('Pitch contour')
plt.xlabel('Time [s]')
plt.ylabel('Pitch [Hz]')
plt.grid(True)
plt.xlim(0,len(data)/ sr)
plt.rc('font', size=15)


1. win_length : 512, hop_length : 256, nF : 115
2. win_data.shape : (115, 512)
pitch.shape : 115


In [161]:
energy, nF = STE(data, 512, np.ones(512))
print(energy.shape)
dB_energy = 10*np.log10(energy)
x_dims = np.arange(nF)
print(f'pitch.shape : {len(dB_energy)}')
time = np.linspace(0, len(data) / sr, nF)
plt.figure(figsize = (14,5))
plt.plot(time, dB_energy)
plt.title('Short Time Energy')
plt.xlabel('Time [s]')
plt.ylabel('Energy [J]')
plt.xlim(0,len(data) / sr)
plt.grid(True)
plt.show()



1. data.shape ; (29684,)
2. new_data.shape ; (29696,)
3. win_length : 512, hop_length : 256, nF : 115
(115,)
pitch.shape : 115


In [162]:
Flag = V_flag(data, sr, np.ones(512), 900, 512, 0.33)
time = np.linspace(0, len(data) / sr, len(Flag))
plt.figure(figsize = (14,5))
plt.plot(time,Flag)
plt.title('V/UV Flag')
plt.xlabel('Time [s]')
plt.ylabel('Flag')
plt.xlim(0,len(data) / sr)
plt.grid(True)
plt.rc('font', size=15)        # 기본 폰트 크기
plt.show()


1. data.shape ; (29684,)
2. new_data.shape ; (29696,)
3. win_length : 512, hop_length : 256, nF : 115
1. energy.shape : (115,), nF : 115


In [163]:
def synthesis(signal, sr, window, p, cutoff_freq, win_length, thr, kernel):
    # signal : data(yonsei)
    # sr : 16000
    # window : hamming(512)
    # p : linear prediction order
    # overlap : 50% (=256)
    # c : 0.68 => center clipping threshold
    # th = 0.3 => threshold
    
    length = win_length
    # length => 512
    
    # padding.
    hop = int(length / 2)
    if len(signal) % hop != 0:                    
        pad = np.zeros(hop - (len(signal) % hop))  
        new_signal = np.append(signal, pad)                  
    else:
        new_signal = signal
    # new_signal : padding된 data
    
    ## pitch detection--------------
    pitch_contour = median_filter(Calculate_pitch(signal, sr, np.ones(length), cutoff_freq, length, thr),kernel)
    # pitch_contour를 뽑아냄
    # pitch contour의 경우, 각 프레임별 피치주파수를 담고있다
    # 즉, 프레임별로 피치가 다를 수 있다. 이는 피치 period 역시 다르다는 것을 암시한다.
    
    ## STE => voiced, unvoiced 구분----------------------
    energy, nF = STE(data, win_length, np.hamming(512))
    # 각 프레임마다 에너지를 구한다.
    
    dB_energy = 10 * np.log10(energy)
    
    v_flag = [hop * i for i in range(len(dB_energy)) if dB_energy[i] >= 8]
    # STE로 voiced_idx를 구하자
    
    # impulse train뽑기
    syn_signal = np.zeros((len(new_signal))) # synthesis할 signal 저장하는 array
    syn_signal_OLA = np.zeros((len(new_signal)))
    
    index = [i * hop for i in range(len(new_signal) // hop-1)]
    
    # index = num_frame * shift
    Flag = 0
    
    for idx in index:
        
        windowing = new_signal[idx:idx+len(window)]*window
        
        if idx in v_flag:
            # voiced_idx와 같은 프레임일 경우
            # 피치칸투어는 각 frame에서의 피치주파수를 가지고 있다.
            F0 = pitch_contour[int(idx / hop)]
            # F0 : pitch
            # 각 프레임은 512개의 sample
            # period를 구하자
            pitch_period = np.ceil(sr / F0)
            # 각 프레임별 피치 period index
            # 문제는 프레임 내에서 어디가 voiced 시작인지 모름
            excitation = np.zeros(length)
            # 이전에 unvoiced였으면 frame의 첫번째 샘플부터 탐색시작
            # 일단 처음엔 flag = 0
            if not Flag:
                found = False
                for k in range(length):
                    if k % pitch_period == 0:
                        excitation[k] = 1 # impulse train
                        if not found and k > hop: #hop_length보다 더 클 경우,
                            save = k - hop # 겹치는거 방지
                            found = True
                Flag = 1 # voiced이므로
                        
            # 이전에 voiced 였으면
            else:
                found = False
                for k in range(save,length):
                    if (k-save) % pitch_period ==0:
                        excitation[k] = 1
                        if not found and k > hop:
                            save = k - hop
                            found = True
        else: #Unvoiced: white noise
            sig_std = np.std(windowing)
            # np.std : 표준편차
            excitation = np.random.normal(loc = 0.0, scale = sig_std, size = windowing.shape)
            Flag = 0
            
        Freq_excitation = np.fft.fft(excitation, length)
        
        # Calculate LPC
        LPC = Levinson_Durbin(windowing , p)
        # LPC 계산
        LPC_negative = np.insert(-LPC, 0, 1)
        # print(f'LPC.shape : {LPC.shape}')
        # Gain 계산
        G_square = np.array(autocorrelation(windowing,0)) - np.sum(LPC * [autocorrelation(windowing , i) for i in range(1,p+1)]) # Voiced Gain
        G = G_square ** (1/2)
        _, transfer = scipy.signal.freqz(G, LPC_negative, worN = length, whole = True, fs = sr)
        
        F_result = Freq_excitation * transfer
        result = np.fft.ifft(F_result, length)
        syn_signal[idx:idx+length] += np.real(result)
        
        # OLA
        syn_signal_OLA[idx:idx+length] += np.real(result)*np.hanning(length)
    
    return syn_signal, syn_signal_OLA

In [164]:
# sound 출력하기
p= 32
sound, sound_OLA = synthesis(data, sr, np.hamming(512), p, 900, 512, 0.3, 5)
# data : data
# sr : sampling rate
# window = np.hamming
# p : order
# thr = 0.3
# kernel size = 5


1. win_length : 512, hop_length : 256, nF : 115
2. win_data.shape : (115, 512)
1. data.shape ; (29684,)
2. new_data.shape ; (29696,)
3. win_length : 512, hop_length : 256, nF : 115


  pitch_period = np.ceil(sr / F0)


In [165]:
## synthesized sound
t_axis = np.linspace(0,len(sound)/sr,len(sound))

plt.figure(figsize = (14,5))
plt.grid(True)
plt.xlabel('Time [s]')
plt.title('Synthesized Yonsei sound, LPC order : {}'.format(p))
plt.ylabel('Amplitude')
plt.xlim(0,len(sound)/sr)
plt.ylim(-2,3.5)
plt.plot(t_axis, sound)
plt.rc('font', size=15)

ipd.Audio(sound, rate=16000, autoplay=True)

In [166]:
## synthesized sound_OLA
t_axis = np.linspace(0,len(sound)/sr,len(sound))
    
plt.figure(figsize = (14,3))
plt.grid(True)
plt.xlabel('Time [s]')
plt.title('Synthesized Yonsei sound, LPC order : {}'.format(p))
plt.ylabel('Amplitude')
plt.ylim(-1,1.2)
plt.xlim(0,len(sound)/sr)
plt.plot(t_axis, sound_OLA)
plt.rc('font', size=15)

ipd.Audio(sound_OLA, rate=16000)

In [167]:
# 원본데이터

ipd.Audio(data, rate = 16000)