#### **오디오 데이터 추출**

##### **사전 준비**

라이브러리 import

In [78]:
import os
import torch
import torchaudio
import numpy as np
from transformers import ASTFeatureExtractor, ASTModel

사전 학습 모델 로드

In [79]:
# Hub checkpoint id shared by the preprocessing config and the encoder weights.
pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"

# Waveform -> spectrogram preprocessing that matches the checkpoint.
feature_extractor = ASTFeatureExtractor.from_pretrained(
    pretrained_model_name_or_path=pretrained_model
)

# Audio Spectrogram Transformer encoder loaded from the same checkpoint.
model = ASTModel.from_pretrained(
    pretrained_model_name_or_path=pretrained_model
)

GPU 사용 확인 및 모델 설정

In [80]:
# Report whether a CUDA device is visible before anything is moved to the GPU.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # get_device_name() raises when no CUDA device exists, so only query the
    # name once availability has been confirmed.
    print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 3070 Ti Laptop GPU


In [81]:
# Put the model in inference mode, then move its weights to the GPU.
# Both calls return the module itself, so the chained expression is still the
# cell's last value and its repr is displayed as before.
model.eval().cuda()

ASTModel(
  (embeddings): ASTEmbeddings(
    (patch_embeddings): ASTPatchEmbeddings(
      (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ASTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ASTLayer(
        (attention): ASTAttention(
          (attention): ASTSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
          )
          (output): ASTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ASTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): ASTOutput(
          (d

##### **1. .wav 파일 로드**

In [82]:
import soundfile as sf

def load_audio(path):
    """Read an audio file and return it as a mono, 1-D float32 tensor.

    Args:
        path: Location of the audio file; passed straight to ``sf.read``.

    Returns:
        tuple: ``(waveform, sr)`` where ``waveform`` is a ``torch.Tensor`` of
        shape ``[T]`` and ``sr`` is the sample rate reported by ``sf.read``.
    """
    # always_2d=True yields a (frames, channels) array for mono and
    # multi-channel files alike, so one code path handles both.
    samples, sr = sf.read(path, dtype='float32', always_2d=True)
    # Downmix by averaging the channel axis. A bare squeeze() would leave a
    # stereo file as [T, 2] and would collapse a one-sample mono file to 0-dim.
    waveform = torch.from_numpy(samples).mean(dim=1)  # [T]
    return waveform, sr

##### **2. Spectrogram 변환**

In [83]:
def extract_spectrogram(waveform, sr=16000):
    """Convert a raw waveform into model inputs that live on the GPU.

    Args:
        waveform: Audio samples, handed unchanged to ``feature_extractor``.
        sr: Sampling rate reported to the extractor (defaults to 16000).

    Returns:
        dict: Every entry the extractor produced, keyed as it returned them,
        with each value moved to the GPU via ``.cuda()``.
    """
    features = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt")

    # Move each returned tensor to the GPU one by one.
    gpu_inputs = {}
    for name, tensor in features.items():
        gpu_inputs[name] = tensor.cuda()
    return gpu_inputs

##### **3. ASTModel에 입력 & 4. [CLS] 임베딩 추출**

In [84]:
@torch.no_grad()
def forward_ast(inputs):
    """Run the AST model and return the hidden state at sequence position 0.

    Gradient tracking is disabled for the whole call by the decorator.

    Args:
        inputs: Mapping of keyword arguments forwarded to ``model``.

    Returns:
        The slice ``last_hidden_state[:, 0, :]`` of the model output, i.e. the
        [CLS] embedding (``[1, 768]`` for the single-clip case used here).
    """
    hidden_states = model(**inputs).last_hidden_state
    # Index 0 along the token axis is the [CLS] position.
    return hidden_states[:, 0, :]

##### **5. 128-dim Linear Projection & 6. .npy 저장**

In [92]:
# Fixed seed: the projection is never trained, so its random weights ARE the
# feature space. Without a fixed seed every kernel restart (or re-run of this
# cell) would produce .npy files that are not comparable with earlier ones.
PROJECTOR_SEED = 0


def build_projector(in_dim=768, out_dim=128, seed=PROJECTOR_SEED):
    """Create the linear projection with weights fully determined by ``seed``.

    Args:
        in_dim: Size of the incoming embedding (768 for the [CLS] vector).
        out_dim: Size of the projected feature.
        seed: Seed for the private generator used to draw the weights.

    Returns:
        torch.nn.Linear: The layer, in eval mode, on the CPU.
    """
    layer = torch.nn.Linear(in_dim, out_dim)
    # A private generator keeps the global RNG state untouched.
    rng = torch.Generator().manual_seed(seed)
    # Same U(-1/sqrt(in_dim), 1/sqrt(in_dim)) range as nn.Linear's default init.
    bound = in_dim ** -0.5
    with torch.no_grad():
        layer.weight.uniform_(-bound, bound, generator=rng)
        layer.bias.uniform_(-bound, bound, generator=rng)
    return layer.eval()


# Use the GPU when present; fall back to the CPU so the cell still runs elsewhere.
projector = build_projector().to("cuda" if torch.cuda.is_available() else "cpu")


def save_feature(cls_embedding, save_path):
    """Project a [CLS] embedding to 128 dims and store it as a .npy file.

    Args:
        cls_embedding: Tensor of shape ``[1, 768]``.
        save_path: Destination handed to ``np.save``.

    Returns:
        None. Writes the array to ``save_path`` and prints a confirmation.
    """
    # Inference only: skip building an autograd graph for the projection.
    with torch.no_grad():
        on_device = cls_embedding.to(projector.weight.device)
        feature_128 = projector(on_device)  # shape: [1, 128]
    feature_128 = feature_128.squeeze(0).cpu().numpy()  # shape: [128]
    np.save(save_path, feature_128)
    print(f"Saved: {save_path}")

##### **테스트**

In [86]:
# Step 1: decode one sample clip and inspect the resulting waveform.
audio_path = r"D:\Audio\training\barking\18frames\getty-dog-barks-video-id513564656_7.wav"

waveform, sr = load_audio(audio_path)
# Show the tensor, its shape and its length, one per line.
print(waveform, waveform.shape, len(waveform), sep="\n")

tensor([0.0847, 0.1185, 0.0897,  ..., 0.0000, 0.0000, 0.0000])
torch.Size([48298])
48298


In [87]:
# Step 2: build the spectrogram input.
# Pass the sample rate that came back from load_audio instead of relying on
# the 16000 default, so a mismatch with the rate the extractor expects
# surfaces here rather than being masked.
inputs = extract_spectrogram(waveform, sr)
print(inputs)
inputs["input_values"].shape

{'input_values': tensor([[[-0.3234, -0.5673, -0.1904,  ...,  0.0113, -0.0695, -0.0433],
         [-0.4913, -0.4756, -0.0988,  ...,  0.0318,  0.0552,  0.0092],
         [-0.4133, -0.4458, -0.0690,  ...,  0.0063,  0.1096,  0.1186],
         ...,
         [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
         [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
         [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670]]],
       device='cuda:0')}


torch.Size([1, 1024, 128])

In [88]:
# Steps 3-4: run the encoder and inspect the [CLS] embedding and its shape.
cls_emb = forward_ast(inputs)
print(cls_emb, cls_emb.shape, sep="\n")

tensor([[-2.3068e-01,  1.2297e+00,  3.3951e-01,  1.5516e+00, -1.9303e+00,
          2.0346e+00, -1.5030e+00,  6.1230e-01, -3.3396e-01,  5.5388e-02,
         -1.0066e+00, -3.6524e-01,  1.0403e+00,  3.6856e-01, -1.0250e+00,
         -1.7470e-01,  1.3676e+00,  1.4315e+00, -7.1780e-01,  4.3001e-01,
         -1.4533e+00,  1.0657e+00,  2.6284e+00,  8.9863e-01,  6.4372e-01,
         -4.6883e-01, -5.6326e-01, -1.0934e+00, -8.3526e-01, -6.6752e-01,
         -1.1396e+00, -5.7907e-01,  1.2624e+00, -1.2031e+00,  9.7954e-01,
          4.2562e-01, -3.0455e-02, -2.4263e+00,  1.4617e+00, -1.0295e+00,
          5.1161e-01,  8.1699e-01,  4.8264e-01, -1.0658e+00,  2.9702e+00,
         -1.9600e+00,  1.6487e+00, -6.0417e-01, -2.6044e+00, -1.5063e+00,
          2.6559e-01, -2.5305e-01, -3.4200e-02, -9.7498e-01,  3.6108e-02,
          1.0103e+00,  3.3310e+00, -9.0862e-01, -2.0811e-01, -2.3498e-01,
          5.0105e-01,  1.3976e+00, -4.8020e-01,  1.1565e+00,  1.9068e+00,
         -1.7736e+00,  7.2407e-01, -2.

In [93]:
# Steps 5-6: project the embedding to 128 dims and write it to disk.
save_path = r"D:\테스트\test.npy"
save_feature(cls_embedding=cls_emb, save_path=save_path)

Saved: D:\테스트\test.npy


In [94]:
# Read back the file just written. Reusing save_path (rather than re-typing
# the location as a second literal) keeps the write and the read in sync.
loaded = np.load(save_path)
assert loaded.shape == (128,), f"expected a 128-dim vector, got {loaded.shape}"
print(loaded.shape)    # (128,)
print(loaded[:10])

(128,)
[-1.0215124e+00 -2.5498366e-01  1.1035247e+00  9.7031170e-01
  4.6447691e-02 -9.1116971e-01 -2.5522530e-02  8.0757588e-04
 -3.6133379e-01 -3.5749629e-01]
