In [None]:
import torch
import torchaudio
from torchaudio.transforms import MFCC
from torchaudio.utils import download_asset

In [None]:
# Load the sample speech recording (download_asset fetches and caches it).
SAMPLE_SPEECH = download_asset("tutorial-assets/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav")
waveform, sample_rate = torchaudio.load(SAMPLE_SPEECH)

# Duration in seconds. The waveform is laid out as (channels, samples), so use
# the per-channel sample count; numel() would multiply the duration by the
# number of channels for multi-channel audio.
waveform_duration = waveform.shape[-1] / sample_rate

print("Waveform duration:", waveform_duration, "seconds")

# Analysis window length and hop between windows, in seconds.
frame_length = 0.025 # 25 milliseconds
frame_shift = 0.01   # 10 milliseconds

# Sequence length and number of MFCC coefficients.
sequence_length = 40
num_mfcc = 40

# Features are computed on audio resampled to this rate, so the MFCC transform
# must be configured with it (not with the file's original rate).
target_sample_rate = 16000
win_length = round(frame_length * target_sample_rate)  # 400 samples at 16 kHz
hop_length = round(frame_shift * target_sample_rate)   # 160 samples at 16 kHz

# Build the MFCC transform; window and hop are expressed in samples at the
# target rate so that frame_length / frame_shift actually take effect.
mfcc_transform = MFCC(
    sample_rate=target_sample_rate,
    n_mfcc=num_mfcc,
    melkwargs={
        'n_fft': win_length,
        'win_length': win_length,
        'hop_length': hop_length,
    },
)

# Resample to the target rate before feature extraction.
frames = torchaudio.transforms.Resample(sample_rate, target_sample_rate)(waveform)

# NOTE: despite the label, frames.shape[1] is the number of audio samples per
# channel after resampling, not the number of MFCC frames.
print('number of frames',frames.shape[1])
print (frames.shape)

# MFCC features laid out as (channel, num_mfcc, time_frames).
mfcc = mfcc_transform(frames)

# Rearrange the MFCC features into the `sequences` tensor.
# NOTE(review): this unfolds the coefficient axis (dim 1), whose size equals
# sequence_length, so exactly one window is produced regardless of the step and
# the result is (channel, 1, time_frames, num_mfcc) -- effectively a transpose.
# Real sliding windows over time would unfold dim 2 with a step counted in
# frames; that is left unchanged here to keep the shape of `sequences` stable
# for any later cells -- confirm the intended layout.
sequences = mfcc.unfold(1, sequence_length, hop_length)

# Zero-pad the time axis (dim 2) when the clip has fewer than sequence_length
# frames. The padding tensor must have the same rank as `sequences` and be
# concatenated along dim 2; new_zeros keeps dtype and device consistent.
num_sequences = sequences.shape[2]
if num_sequences < sequence_length:
    pad_frames = sequences.new_zeros(
        sequences.shape[0],
        sequences.shape[1],
        sequence_length - num_sequences,
        sequences.shape[3],
    )
    sequences = torch.cat([sequences, pad_frames], dim=2)

# Report the extracted feature and sequence shapes.
print("MFCC shape:", mfcc.shape)
print("Sequences shape:", sequences.shape)

100%|██████████| 106k/106k [00:00<00:00, 9.70MB/s]


Waveform duration: 3.4 seconds
number of frames 54400
torch.Size([1, 54400])
MFCC shape: torch.Size([1, 40, 341])
Sequences shape: torch.Size([1, 1, 341, 40])


