In [1]:
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from scipy.signal import get_window
import scipy
%matplotlib tk
from numpy.fft import fft, fftshift
from matplotlib import colors as mcolors
import colorsys

In [2]:
data1, sr1 = librosa.load('./wav/s.wav', sr = None) # /s/ unvoiced 추출
data2, sr2 = librosa.load('./wav/a.wav', sr = None) # /a/ voiced 추출

length = [64,128,256,512]
frame_length = length[0]
hop_length = int(frame_length / 2)

In [3]:
def hamming_window(data, frame_length):
    # Apply hhaming window
    hamming = np.hamming(frame_length)
    signal = data[frame_length:frame_length + frame_length] * hamming
    fft = np.fft.fft(signal,2048)
    # fft하면 polar form형태가 되므로 magnitude와 phase로 구분할 수 있다.
    magnitude = np.abs(fft)
    fft_shift = magnitude[:int(len(magnitude) / 2)]
    log_magnitude = 20 * np.log10(fft_shift)
    return log_magnitude


In [4]:
def rectangular_windw(data, frame_length):
    # apply hamming window
    signal = data[frame_length:frame_length + frame_length]
    fft = np.fft.fft(signal,2048)
    # fft하면 polar form 형태가 되므로 magnitude와 phase로 구분할 수 있다.
    magnitude = np.abs(fft)
    fft_shift = magnitude[:int(len(magnitude) / 2)]
    log_magnitude = 20 * np.log10(fft_shift)
    return log_magnitude

In [5]:
hamming_magnitude = hamming_window(data2, frame_length)
hamming_magnitude.shape
rectangular_magnitude = rectangular_windw(data2, frame_length)

In [6]:
freq_axis = np.linspace(0,8000,len(hamming_magnitude))
plt.figure(figsize = (14,5))
plt.plot( freq_axis,hamming_magnitude, color = 'g', label = 'hamming')
plt.xlabel('Frequency [Hz]')
plt.ylabel('Magnitude [dB]')
plt.title('Window size {}'.format(frame_length))
plt.grid(True)
plt.xlim(0,8000)
plt.plot(freq_axis,rectangular_magnitude, color = 'r', label = 'rectangular')
plt.xlabel('Frequency [Hz]')
plt.ylabel('Magnitude [dB]')
plt.rc('font', size=15)
plt.legend()


<matplotlib.legend.Legend at 0x1be2d0ee5e0>

In [22]:
signal, rate = librosa.load('D:/yonsei/alchemist/220824/RJH_1_enhanced_cut.wav', sr = None)

print(f'data length : {len(signal)}, rate = {rate}')


data length : 129967, rate = 16000


In [23]:
length = [64,128,256,512]
frame_length = length[3]
hop_length = int(frame_length / 2)

In [24]:
# Plot the spectrogram (magnitude of the STFT) of the whole 연세대학교 signal in various conditions and discuss the results.
# STFT 함수
def STFT(data, frame_length, hop_length):
    L = int(frame_length)
    R = int(hop_length)
    hamming = np.hamming(int(L))
    num_frame = int(len(data) / R) - 1
    N = [128,256,512,1024,2048] # Number of uniformly space frequencies across the interval 0<=w<= 2pi
    x = np.zeros((num_frame, L))
    fft = np.zeros((num_frame,N[4]))
    # 일반적으로 dft number는 window length보다 같거나 길어야한다!
    # 길면 길수록 frequency resolution이 더 좋아지고 짧으면 time-resolution이 더 좋아지는 trade-off한 관계성을 가진다.
    for i in range(num_frame): # windowing을 합시다
        # Form the sequence xn[m] = x[n-m]w[m] for m = 0,1,...,L-1
        x[i] = data[i * R : i * R + L] * hamming
        #x[i] = data[i * R + L: i * R :-1] * hamming
        # Compute the complex conjugate of the N-point DFT of the sequence xn[m]
        # (This can be done efficiently with an N-point FFT algorithm)
        fft[i] = np.fft.fft(x[i],N[4])
        # 여기서 fft란? window씌운 time domaing sequence를 frequency domain에서 몇개의 주파수로 sampling할 것인가(나타낼 것인가)
        # 이때 fft는 2pi의 주기성을 가지는 polar form(즉 exponential한 주파수 분석결과로 나오기에 magnitude와 phase에 대한 정보가 담긴다)
        # 그럼 왜 short time fourier transform을 하느냐?
        # 그것은 우리의 음성특징에 기인한다. 일반적으로 우리의 음성은 quasi-stationary한데, 즉 주기성을 갖긴 갖지만 변동성을 가질 수도 있기에
        # 준주기적이라고 한다. 따라서 짧은 시간 동안에 분석을 할 때 음성의 특징이 변하지 않는다고 가정하기에 short time으로 주파수분석을
        # 하게 된다.
    return x ,fft




In [25]:
x, fft = STFT(signal, frame_length, hop_length)
print(f'x shape : {x.shape}, fft shape : {fft.shape}')
# x: time domain windoing 결과
# fft : frequency domain dft결과

x shape : (506, 512), fft shape : (506, 2048)


  fft[i] = np.fft.fft(x[i],N[4])


In [26]:
time = np.linspace(0,len(signal)/rate, len(signal))
plt.figure(figsize = (10,3))
plt.plot(time,signal)
plt.xlabel('time(s)')
plt.ylabel('Amplitude')
plt.title('Original waveform of yonsei university')
plt.grid(True)
plt.xlim(0,len(signal)/rate)
plt.ylim(-0.8,1)

(-0.8, 1.0)

In [27]:
spectrogram = np.abs(fft)
fft_shift = spectrogram[:,:int(spectrogram.shape[1]/2)]
log_spectrogram = 20*np.log10(fft_shift)
print("fft shape : ", fft.shape)
print("spectrogram shape : ", log_spectrogram.shape)

fft shape :  (506, 2048)
spectrogram shape :  (506, 1024)


In [28]:
time1 = np.linspace(0,len(signal)/ rate,rate)

cmap = plt.cm.get_cmap("inferno")

plt.figure(figsize=(8,3))

plt.imshow(log_spectrogram.T, origin = 'lower', aspect = 'auto', cmap = cmap, extent=[0,len(signal)/rate,0,8000],vmin=-80,vmax=20)
plt.xlabel('Time[s]')
plt.ylabel('Frequency[Hz]')
plt.title('Spectrogram with window size {}'.format(frame_length))
plt.colorbar(format = "%+2.0f dB")

<matplotlib.colorbar.Colorbar at 0x1be30f57910>

In [18]:
hop_length =512
n_fft = 1024

hop_length_duration = float(hop_length) / rate
n_fft_duration = float(n_fft) / rate

stft = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)

magnitude = np.abs(stft)

log_spectrogram = librosa.amplitude_to_db(magnitude)

plt.figure()
librosa.display.specshow(log_spectrogram, sr=rate, hop_length=hop_length, cmap = cmap, vmin=-80,vmax=20)
plt.xlabel("Time")
plt.ylabel("Frequency")
plt.colorbar(format="%+2.0f dB")
plt.title("Spectrogram (dB)")

Text(0.5, 1.0, 'Spectrogram (dB)')

In [None]:
temp = [0,1,2,3,4,5,6,7,8,9]
a = temp[8:0:-1]
print(a)

[8, 7, 6, 5, 4, 3, 2, 1]
