In [None]:
from typing import Any, Callable, Dict, List, Optional, Union
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from einops import repeat
from diffusers import (
    AudioLDMPipeline,
    AutoencoderKL,
    UNet2DConditionModel,
    DDIMScheduler,
)
from transformers import (
    ClapTextModelWithProjection,
    RobertaTokenizerFast,
    SpeechT5HifiGan,
)

class AudioLDM(nn.Module):
    """Holds a pretrained AudioLDM pipeline and exposes its parts as attributes.

    After construction the instance carries ``vae``, ``tokenizer``,
    ``text_encoder``, ``unet``, ``vocoder`` and ``scheduler`` attributes taken
    from the loaded pipeline, plus a few derived settings
    (``audio_duration``, ``original_waveform_length``, ``vae_scale_factor``).

    Args:
        device: Anything accepted by ``torch.device``; the VAE, text encoder,
            UNet and vocoder are moved there.
        repo_id: Checkpoint identifier passed to
            ``AudioLDMPipeline.from_pretrained``.
        config: Optional mapping; when truthy, its ``'duration'`` entry
            replaces the default clip duration of 10.24.
    """

    def __init__(self, device='cuda', repo_id="cvssp/audioldm", config=None):
        super().__init__()
        self.device = torch.device(device)

        loaded_pipe = AudioLDMPipeline.from_pretrained(repo_id, use_safetensors=False)
        self.pipe = loaded_pipe

        # name -> (object taken from the pipeline, class it is expected to be)
        self.components = {
            'vae': (loaded_pipe.vae, AutoencoderKL),
            'tokenizer': (loaded_pipe.tokenizer, RobertaTokenizerFast),
            'text_encoder': (loaded_pipe.text_encoder, ClapTextModelWithProjection),
            'unet': (loaded_pipe.unet, UNet2DConditionModel),
            'vocoder': (loaded_pipe.vocoder, SpeechT5HifiGan),
            'scheduler': (loaded_pipe.scheduler, DDIMScheduler)
        }

        # Only these entries are moved to the target device; the tokenizer and
        # scheduler are attached as they are.
        movable = ('vae', 'text_encoder', 'unet', 'vocoder')
        for part_name, part_spec in self.components.items():
            part, required_cls = part_spec
            if part_name in movable:
                part = part.to(self.device)
            assert isinstance(part, required_cls), f"{part_name} type mismatch: {type(part)}"
            setattr(self, part_name, part)

        self.evalmode = True
        self.checkpoint_path = repo_id

        if config:
            self.audio_duration = config['duration']
        else:
            self.audio_duration = 10.24

        # Clip length in samples (original note: 10.24 * 16000 = 163840).
        sampling_rate = self.vocoder.config.sampling_rate
        self.original_waveform_length = int(self.audio_duration * sampling_rate)

        # One factor of two per VAE block after the first (original note: 4).
        n_vae_blocks = len(self.vae.config.block_out_channels)
        self.vae_scale_factor = 2 ** (n_vae_blocks - 1)

        print('[INFO] audioldm.py: loaded AudioLDM!')

In [11]:
from diffusers import AudioLDMPipeline
import torch

# Load the pretrained AudioLDM pipeline in float32 and move it to the GPU.
# The checkpoint id is defined once and reused, so the variable and the
# loaded model cannot drift apart.
repo_id = "cvssp/audioldm"
pipe = AudioLDMPipeline.from_pretrained(repo_id, use_safetensors=False, torch_dtype=torch.float32)
pipe = pipe.to("cuda")

# Generate a 60-second clip from the text prompt and keep the first waveform
# of the returned batch.
prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
audio = pipe(prompt, num_inference_steps=999, audio_length_in_s=60).audios[0]



Loading pipeline components...: 100%|██████████| 6/6 [00:01<00:00,  3.94it/s]
100%|██████████| 999/999 [04:08<00:00,  4.03it/s]


In [12]:
# Inspect the shape of the generated waveform array.
print(audio.shape)

(960000,)


In [13]:
from IPython.display import Audio

# Render an inline audio player for the generated clip, played back at 16000 Hz.
Audio(audio, rate=16000)

In [1]:
from src.data_processing.audio_processing import AudioDataProcessor
import torch

# Build the project's audio processor on the GPU, configured from the AudioLDM YAML file.
processor = AudioDataProcessor(device='cuda', config_path='configs/audioldm.yaml')
# Load a sample clip and pass it through the processor's waveform preparation step.
wav = processor.read_wav_file('data/samples/A_cat_meowing.wav')
wav = processor.prepare_wav(wav)
print(wav.shape)
# Round trip: waveform -> STFT -> waveform. `_wav` is the reconstruction that
# later cells compare against `wav`.
stft, stft_complex = processor.wav_to_stft(wav)
_wav = processor.inverse_stft(stft, stft_complex)

print(_wav.shape)

start assertions
torch.Size([1, 163840])
torch.Size([163840])
(1, 163840)


In [2]:
# Inspect the processor's `spec_length` attribute.
print(processor.spec_length)

1025


In [2]:
import numpy as np
import torch

# Bring both signals to NumPy so the metrics work whether the earlier cells
# produced tensors or arrays. `.detach()` keeps this safe for tensors that
# track gradients.
wav = wav.detach().cpu().numpy() if isinstance(wav, torch.Tensor) else wav
_wav = _wav.detach().cpu().numpy() if isinstance(_wav, torch.Tensor) else _wav

# 1. Mean Absolute Error (MAE)
mae = np.mean(np.abs(wav - _wav))

# 2. Pearson Correlation Coefficient (PCC)
# Flatten first: np.corrcoef treats each row as a variable, so [0, 1] only
# compares `wav` with `_wav` when each input contributes exactly one row.
pcc = np.corrcoef(np.ravel(wav), np.ravel(_wav))[0, 1]

# 3. Normalized Root Mean Squared Error (NRMSE)
# Normalise by the dynamic range of the reference signal only; the previous
# form mixed max(wav) with min(_wav).
rmse = np.sqrt(np.mean((wav - _wav) ** 2))
nrmse = rmse / (np.max(wav) - np.min(wav) + 1e-8)  # epsilon avoids division by zero

print(f'MAE: {mae:.4f}, PCC: {pcc:.4f}, NRMSE: {nrmse:.4f}')

MAE: 0.0008, PCC: 1.0000, NRMSE: 0.0014


In [5]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display as ld

def plot_wav_mel(wav_arrays, sr=16000, save_path="./test/mel_compares/waveform_mel.png",
                 clip_duration=10.24, hop_length=512, n_mels=128):
    """Plot each waveform above its mel spectrogram and save the figure.

    The figure has two rows and one column per waveform: the top row shows
    amplitude over time, the bottom row the mel spectrogram in dB.

    Args:
        wav_arrays: Sequence of waveform arrays. Arrays with more than one
            dimension are squeezed; assumes the result is 1-D (mono) - TODO
            confirm for multi-channel input.
        sr: Sample rate used for the time axis and the mel computation.
        save_path: Where the figure is written. For path-like values the
            parent directory is created if it does not exist.
        clip_duration: Waveforms longer than this many seconds are cut to
            their first ``clip_duration`` seconds.
        hop_length: Hop length passed to the mel spectrogram and its display.
        n_mels: Number of mel bands.

    Raises:
        ValueError: If ``wav_arrays`` is empty.
    """
    import os  # local import keeps this cell runnable on its own

    if len(wav_arrays) == 0:
        raise ValueError("wav_arrays must contain at least one waveform")

    # squeeze=False keeps `axes` two-dimensional even for a single waveform,
    # so the axes[row, col] indexing below always works.
    fig, axes = plt.subplots(
        2, len(wav_arrays), figsize=(4 * len(wav_arrays), 6), squeeze=False
    )

    for i, wav in enumerate(wav_arrays):
        # Drop singleton dimensions such as a leading batch axis.
        if len(wav.shape) > 1:
            wav = wav.squeeze()

        # Keep only the first `clip_duration` seconds of longer signals.
        duration = len(wav) / sr
        if duration > clip_duration:
            wav = wav[: int(clip_duration * sr)]

        # Sample k sits at k / sr seconds.
        time = np.arange(len(wav)) / sr

        # Waveform panel
        axes[0, i].plot(time, wav, lw=0.5)
        axes[0, i].set_title(f"Waveform {i+1}")
        axes[0, i].set_xlabel("Time (s)")
        axes[0, i].set_ylabel("Amplitude")

        # Mel spectrogram panel, in dB relative to the spectrogram maximum.
        mel_spec = librosa.feature.melspectrogram(y=wav, sr=sr, n_mels=n_mels, hop_length=hop_length)
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

        ld.specshow(mel_spec_db, sr=sr, hop_length=hop_length, x_axis="time", y_axis="mel", ax=axes[1, i])

        axes[1, i].set_title(f"Mel Spectrogram {i+1}")

    fig.tight_layout()

    # Create the output directory so the default path works in a fresh checkout.
    if isinstance(save_path, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(save_path))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    # Close this specific figure rather than whatever pyplot considers current.
    plt.close(fig)
# The original and reconstructed signals must have identical shapes before
# they are plotted side by side.
assert wav.shape == _wav.shape, f"wav shape {wav.shape} and _wav shape {_wav.shape} must be same"
print(f"wav shape: {wav.shape}", f"_wav shape: {_wav.shape}", sep="\n")

# Despite the name, this list holds the waveform arrays themselves, not file paths.
wav_paths = [wav, _wav]
plot_wav_mel(wav_paths)

wav shape: (1, 163840)
_wav shape: (1, 163840)


In [5]:
from src.utils import calculate_sdr, calculate_sisdr

# Score the reconstruction `_wav` against the original `wav`.
# NOTE(review): SDR is computed on the first 160000 samples only, while SI-SDR
# uses the full signals - confirm the truncation is intentional.
sdr = calculate_sdr(wav[:, :160000], _wav[:, :160000])
sisdr = calculate_sisdr(wav, _wav)
print(f'SDR: {sdr:.4f}, SI-SDR: {sisdr:.4f}')

SDR: 34.4974, SI-SDR: 49.7118


In [4]:
# Sanity check: score the reconstruction against itself to see the values the
# two metric helpers report for identical inputs.
# Note that this overwrites the `sdr` / `sisdr` values from the wav-vs-_wav comparison.
sdr = calculate_sdr(_wav, _wav)
sisdr = calculate_sisdr(_wav, _wav)
print(f'SDR: {sdr:.4f}, SI-SDR: {sisdr:.4f}')

SDR: 77.1523, SI-SDR: 98.5334


In [10]:
import numpy as np
from scipy.signal import correlate

def align_wav_signals(wav1, wav2):
    """Time-align ``wav2`` to ``wav1`` using the peak of their cross-correlation.

    Both inputs are flattened to 1-D. The lag at which the full
    cross-correlation is largest is taken as the offset between the signals,
    and ``wav2`` is shifted by that lag with zero padding so its length is
    unchanged.

    Args:
        wav1: Reference signal (array with ``shape`` and ``flatten``).
        wav2: Signal to align; must have the same shape as ``wav1``.

    Returns:
        Tuple ``(aligned_wav2, shift)``. ``aligned_wav2`` is a 1-D array of
        the same length as the flattened input. ``shift`` is the lag in
        samples: positive means ``wav1`` is delayed relative to ``wav2``,
        so ``wav2`` was delayed by ``shift`` samples; negative means
        ``wav2`` was advanced by ``-shift`` samples.

    Raises:
        AssertionError: If the two inputs do not have the same shape.
    """
    assert wav1.shape == wav2.shape, "두 신호의 길이가 같아야 합니다."

    # Work on 1-D views of both signals.
    wav1, wav2 = wav1.flatten(), wav2.flatten()

    # Index k of the full cross-correlation corresponds to lag k - (N - 1),
    # and the value at lag L is sum_n wav1[n + L] * wav2[n].
    correlation = correlate(wav1, wav2, mode="full")
    shift = int(correlation.argmax()) - (len(wav1) - 1)

    if shift > 0:
        # wav1[n + shift] matches wav2[n]: delay wav2 by `shift` samples.
        aligned_wav2 = np.pad(wav2[:-shift], (shift, 0), mode="constant")
    elif shift < 0:
        # wav1[n] matches wav2[n - shift]: advance wav2 by `-shift` samples.
        aligned_wav2 = np.pad(wav2[-shift:], (0, -shift), mode="constant")
    else:
        aligned_wav2 = wav2  # already aligned

    return aligned_wav2, shift

# Align the reconstruction to the original, then recompute both metrics to
# check whether a time offset explains the gap between them.
# NOTE(review): align_wav_signals flattens its inputs, so `aligned_wav2` is 1-D
# while `wav` keeps its original shape - confirm the metric helpers handle that.
aligned_wav2, applied_shift = align_wav_signals(wav, _wav)
sdr = calculate_sdr(wav, aligned_wav2)
sisdr = calculate_sisdr(wav, aligned_wav2)
print(f"적용된 시간 shift: {applied_shift} samples")
print(f'SDR: {sdr:.4f}, SI-SDR: {sisdr:.4f}')


적용된 시간 shift: 0 samples
SDR: 34.4945, SI-SDR: 49.7118
