In [1]:
# default
import time
import os
import glob
import uuid
import concurrent
import concurrent.futures
import requests
import json
import datetime
import soundfile as sf
import io
# optional
from pathlib import Path
import numpy as np
import torch
from einops import repeat
from torch import Tensor
from torch.nn import functional as F
from transformers import AutoTokenizer
from vocos import get_voco
from model.module import AudioBoxModule
from torchode.interface import solve_ivp
import torchaudio
import librosa

  warn(


In [2]:
# model functions
class Infer:
    """Inference wrapper around an ``AudioBoxModule`` checkpoint.

    Public entry points:

    * ``generate``           - text-conditioned generation starting from noise.
    * ``variation``          - re-synthesis of reference audio guided by text.
    * ``variation_negative`` - like ``variation`` with an extra negative prompt.

    Each one integrates the model's classifier-free-guidance field with
    ``solve_ivp`` over ``self.steps`` time points and decodes the last state
    with the vocoder (``self.voco``).
    """

    # Latent sequences shorter than one of these lengths are right-padded up to
    # the next one; the padded frames are marked ``False`` in the audio mask.
    _LATENT_BUCKETS = (192, 384, 768, 1536)

    # Residual noise scale used when blending reference latents with noise.
    _SIGMA = 1e-3

    def __init__(self, path: Path):
        """Load the checkpoint at ``path``, its vocoder and the T5 tokenizer."""
        self.device = (
            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        )
        self.model = AudioBoxModule.load_from_checkpoint(path).to(self.device)
        self.model.eval()
        self.voco = get_voco(self.model.voco_type).to(self.device)

        self.tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
        self.tokenizer.padding_side = "right"

        # Number of time points handed to the ODE solver.
        self.steps = 64
        # Not read by any method of this class; kept for external readers.
        self.alpha = 3.0

    @torch.no_grad()
    @torch.autocast(device_type="cuda", enabled=False)
    def encode_text(self, texts: list[str]) -> tuple[Tensor, Tensor]:
        """Embed ``texts`` with ``self.model.t5`` (autocast disabled).

        Every text gets the tokenizer's EOS token appended and is
        padded/truncated to 127 tokens.

        Returns:
            ``(last_hidden_state, attention_mask)`` where the mask is boolean.
        """
        batch_encoding = self.tokenizer(
            [text + self.tokenizer.eos_token for text in texts],
            add_special_tokens=False,
            return_tensors="pt",
            max_length=127,
            truncation="longest_first",
            padding="max_length",
        )
        phoneme = batch_encoding.input_ids.to(self.device)
        phoneme_mask = batch_encoding.attention_mask.to(self.device) > 0
        phoneme_emb = self.model.t5(
            input_ids=phoneme, attention_mask=phoneme_mask
        ).last_hidden_state

        return phoneme_emb, phoneme_mask

    # ------------------------------------------------------------------ #
    # Private helpers shared by the sampling entry points.               #
    # ------------------------------------------------------------------ #

    @classmethod
    def _bucket_padding(cls, latent_len: int) -> int:
        """Return how many frames to append to reach the next bucket (0 if none)."""
        for bucket in cls._LATENT_BUCKETS:
            if latent_len <= bucket:
                return bucket - latent_len
        # Longer than the largest bucket: left unpadded.
        return 0

    @staticmethod
    def _pad_frames(tensor: Tensor, pad: int) -> Tensor:
        """Right-pad dim 1 of a ``(b, t)`` or ``(b, t, d)`` tensor with zeros/False."""
        if pad == 0:
            return tensor
        if tensor.dim() == 2:
            return F.pad(tensor, (0, pad))
        return F.pad(tensor, (0, 0, 0, pad))

    @staticmethod
    def _to_unit_float(audio: np.ndarray) -> np.ndarray:
        """Scale integer PCM by its dtype maximum; pass float arrays through.

        ``np.iinfo`` raises on floating dtypes, so float input (assumed to be
        already in [-1, 1]) is returned unchanged instead of crashing.
        """
        if np.issubdtype(audio.dtype, np.integer):
            return audio / np.iinfo(audio.dtype).max
        return audio

    def _blank_context(self, batch_size: int, latent_len: int) -> tuple[Tensor, Tensor, int]:
        """Build the all-True mask and all-zero context, padded to a bucket.

        Returns:
            ``(audio_mask, audio_context, pad)`` where ``pad`` is the number of
            frames that were appended.
        """
        pad = self._bucket_padding(latent_len)
        audio_mask = torch.ones(
            batch_size, latent_len, dtype=torch.bool, device=self.device
        )
        audio_context = torch.zeros(
            batch_size, latent_len, self.voco.latent_dim, device=self.device
        )
        return self._pad_frames(audio_mask, pad), self._pad_frames(audio_context, pad), pad

    def _encode_reference(self, audios, sr: list[int]) -> tuple[Tensor, int]:
        """Normalise, resample and vocoder-encode the reference clips.

        ``audios`` is iterated as ``(time, channels)`` arrays that can be
        stacked. Only ``sr[0]`` is used: the whole batch is resampled as if
        every clip shared the first clip's rate.

        Returns:
            ``(audio_enc, latent_len)``.
        """
        audios = [self._to_unit_float(audio) for audio in audios]
        audio_tensor = torch.from_numpy(np.stack(audios, axis=0)).to(self.device)
        audio_tensor = audio_tensor.float()

        # Resample along the last axis, so move time last. The transpose yields
        # a non-contiguous view; resample flattens its input with ``view``,
        # which fails on such a view for multi-channel audio.
        audio_tensor = audio_tensor.transpose(1, 2).contiguous()
        audio_tensor = torchaudio.functional.resample(
            audio_tensor, orig_freq=sr[0], new_freq=self.voco.sampling_rate
        )
        audio_tensor = audio_tensor.transpose(1, 2)

        # Force exactly two channels: duplicate mono, drop channels beyond two.
        if audio_tensor.shape[2] == 1:
            audio_tensor = audio_tensor.repeat(1, 1, 2)
        elif audio_tensor.shape[2] > 2:
            audio_tensor = audio_tensor[:, :, :2]

        target_len = audio_tensor.shape[1]
        latent_len = self.voco.encode_length(target_len)
        audio_enc = self.voco.encode(audio_tensor)
        return audio_enc, latent_len

    def _integrate(self, field, y0: Tensor, t: Tensor) -> Tensor:
        """Integrate ``field`` from ``y0`` over ``t`` and return ``sol.ys[-1]``."""
        sol = solve_ivp(
            field,
            y0,
            t,
            method_class=self.model.method,
        )
        return sol.ys[-1]

    def _decode(self, latents: Tensor, target_len: int) -> list[np.ndarray]:
        """Decode latents, trim to ``target_len`` samples and tame clipping.

        The peak divisor is clamped to at least 1, so clips are only scaled
        down when their peak exceeds full scale; quieter clips are untouched.
        """
        sample = self.voco.decode(latents)
        sample = sample[:, :target_len]
        sample = sample / sample.abs().amax(dim=1, keepdim=True).clamp_min(1)
        sample = sample.detach().cpu().numpy().astype(np.float32)
        return list(sample)

    def _vary(self, audios, sr, dur, corrupt, batch_size, cfg_fn, cfg_kwargs) -> list[np.ndarray]:
        """Shared body of ``variation`` and ``variation_negative``.

        The reference latents are blended with Gaussian noise
        (``corrupt=0`` keeps the reference, ``corrupt=1`` is almost pure noise)
        and integrated from ``t = 1 - corrupt`` to ``t = 1`` using ``cfg_fn``,
        which receives ``cfg_kwargs`` on top of the state/context/mask.
        """
        audio_enc, latent_len = self._encode_reference(audios, sr)
        audio_mask, audio_context, pad = self._blank_context(batch_size, latent_len)
        audio_enc = self._pad_frames(audio_enc, pad)

        c = 1.0 - corrupt
        noised_enc = (audio_enc * c) + torch.randn_like(audio_enc) * (
            1 - (1 - self._SIGMA) * c
        )
        corrupt_tensor = torch.tensor(1 - corrupt).to(self.device)

        def forward(t: Tensor, y: Tensor):
            return cfg_fn(
                w=y,
                context=audio_context,
                times=t,
                mask=audio_mask,
                **cfg_kwargs,
            )

        # linspace(0, corrupt) shifted by (1 - corrupt) spans [1 - corrupt, 1].
        t = torch.linspace(0, corrupt, self.steps, device=self.device)
        t = repeat(t, "n -> b n", b=batch_size)
        sampled_audio = self._integrate(forward, noised_enc, t + corrupt_tensor)

        new_target_len = round(self.model.sampling_rate * dur)
        return self._decode(sampled_audio, new_target_len)

    # ------------------------------------------------------------------ #
    # Public sampling API.                                               #
    # ------------------------------------------------------------------ #

    # CLAP-based re-ranking of the samples is not wired into this class.

    @torch.no_grad()
    @torch.autocast(device_type="cuda")
    def generate(
        self, texts: list[str], dur: float, cfg=3.0
    ) -> list[np.ndarray]:
        """Generate one clip per entry of ``texts``.

        Args:
            texts: prompts; the batch size equals ``len(texts)``.
            dur: clip length in seconds (converted with ``model.sampling_rate``).
            cfg: guidance strength passed to the model as ``alpha``.

        Returns:
            A list of float32 arrays, one per prompt, trimmed to the requested
            number of samples.
        """
        phoneme_emb, phoneme_mask = self.encode_text(texts)
        batch_size = phoneme_emb.shape[0]

        target_len = round(self.model.sampling_rate * dur)
        latent_len = self.voco.encode_length(target_len)
        audio_mask, audio_context, _ = self._blank_context(batch_size, latent_len)

        def fn(t: Tensor, y: Tensor):
            return self.model.audiobox.cfg(
                w=y,
                context=audio_context,
                times=t,
                alpha=cfg,
                mask=audio_mask,
                phoneme_emb=phoneme_emb,
                phoneme_mask=phoneme_mask,
            )

        y0 = torch.randn_like(audio_context)
        t = torch.linspace(0, 1, self.steps, device=self.device)
        t = repeat(t, "n -> b n", b=batch_size)

        sampled_audio = self._integrate(fn, y0, t)
        return self._decode(sampled_audio, target_len)

    @torch.no_grad()
    @torch.autocast(device_type="cuda")
    def variation(
        self, audios: list[np.ndarray], texts: list[str], dur: float, corrupt: float, sr: list[int], cfg_score: float = 3.0
    ) -> list[np.ndarray]:
        """Create text-guided variations of reference audio.

        Args:
            audios: reference clips, each ``(time, channels)``; integer PCM is
                scaled by its dtype maximum, float input is used as is.
            texts: prompts; the batch size equals ``len(texts)``.
            dur: output length in seconds.
            corrupt: noise amount in [0, 1]; larger values drift further from
                the reference.
            sr: sample rates of ``audios``; only ``sr[0]`` is used.
            cfg_score: guidance strength passed to the model as ``alpha``.

        Returns:
            A list of float32 arrays, one per batch item.
        """
        phoneme_emb, phoneme_mask = self.encode_text(texts)
        return self._vary(
            audios,
            sr,
            dur,
            corrupt,
            batch_size=phoneme_emb.shape[0],
            cfg_fn=self.model.audiobox.cfg,
            cfg_kwargs=dict(
                alpha=cfg_score,
                phoneme_emb=phoneme_emb,
                phoneme_mask=phoneme_mask,
            ),
        )

    @torch.no_grad()
    @torch.autocast(device_type="cuda")
    def variation_negative(
        self, audios: list[np.ndarray], texts: list[str], negative_texts:list[str], dur: float, corrupt: float, sr: list[int], cfg_score: float = 3.0, nalpha: float = 1.0
    ) -> list[np.ndarray]:
        """Like ``variation`` but also conditions on ``negative_texts``.

        ``negative_texts`` are embedded the same way as ``texts`` and passed,
        together with ``nalpha``, to ``self.model.audiobox.cfg_negative``.
        All other arguments behave as in ``variation``.
        """
        phoneme_emb, phoneme_mask = self.encode_text(texts)
        negative_phoneme_emb, negative_phoneme_mask = self.encode_text(negative_texts)
        return self._vary(
            audios,
            sr,
            dur,
            corrupt,
            batch_size=phoneme_emb.shape[0],
            cfg_fn=self.model.audiobox.cfg_negative,
            cfg_kwargs=dict(
                alpha=cfg_score,
                phoneme_emb=phoneme_emb,
                phoneme_mask=phoneme_mask,
                negative_phoneme_emb=negative_phoneme_emb,
                negative_phoneme_mask=negative_phoneme_mask,
                nalpha=nalpha,
            ),
        )

def download_file(url, filename, timeout=30):
    """Download ``url`` and write the response body to ``filename``.

    Best effort: when the server does not answer with status 200 nothing is
    written and a message is printed instead of raising.

    Args:
        url: address to fetch with a GET request.
        filename: destination path, opened in binary write mode.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to download file. Status code: {response.status_code}")
        return

    with open(filename, 'wb') as f:
        f.write(response.content)

def remove_non_ascii(s):
    """Return ``s`` without the characters whose code point is 128 or above."""
    ascii_chars = [ch for ch in s if ord(ch) < 128]
    return ''.join(ascii_chars)

def delete_audio_files():
    """Delete every entry matched by ``audiobox/temp_audio_folder/*``.

    Failures (for example an entry that is a directory) are reported with a
    printed message and do not stop the remaining deletions.
    """
    pattern = os.path.join("audiobox/temp_audio_folder/", '*')
    for file in glob.glob(pattern):
        try:
            os.remove(file)  # delete the file
        except Exception as e:
            print(f"Failed to delete {file}: {e}")

def download_audio_as_array(url, timeout=30):
    """Download an audio file from ``url`` and decode it in memory.

    Args:
        url: address of the audio file.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        ``(data, samplerate)`` as produced by ``soundfile.read`` with
        ``always_2d=True``.

    Raises:
        requests.HTTPError: when the server answers with an error status.
    """
    # Fetch the raw bytes; fail loudly on an HTTP error status.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Wrap the payload in BytesIO so soundfile can read it like a file.
    data, samplerate = sf.read(io.BytesIO(response.content), always_2d=True)
    return data, samplerate

def convert_to_int16(audio_array):
    """Convert a float audio array to int16 PCM.

    Values are first limited to the range -1.0 .. 1.0, then scaled by 32767
    and cast to ``np.int16``.
    """
    limited = np.clip(audio_array, -1.0, 1.0)
    return np.multiply(limited, 32767).astype(np.int16)

def process_audio_urls(url_list):
    """Download every URL and merge the clips into one int16 batch.

    Each clip is converted to int16 and zero-padded at the end of its time
    axis up to the longest clip.

    Returns:
        ``(result, samplerates)``: ``result`` is the stacked
        ``[n, length, channels]`` array and ``samplerates`` holds one sample
        rate per URL, in input order.
    """
    samplerates = []
    audio_arrays = []

    # Download and convert one URL at a time, in input order.
    for url in url_list:
        data, samplerate = download_audio_as_array(url)
        samplerates.append(samplerate)
        audio_arrays.append(convert_to_int16(data))

    max_length = max((clip.shape[0] for clip in audio_arrays), default=0)

    # Pad only the time axis so all clips share the same length.
    padded_audios = [
        np.pad(
            clip,
            ((0, max_length - clip.shape[0]), (0, 0)),
            mode='constant',
            constant_values=0,
        )
        for clip in audio_arrays
    ]

    return np.stack(padded_audios, axis=0), samplerates

#runpod handler
def handler(event):
    """RunPod entry point: generate five clips and return their upload URLs.

    Reads from ``event['input']``:

    * ``descriptions`` - list of prompts; only the first one is used.
    * ``duration`` - clip length passed on to the model.
    * ``original_download_urls`` - optional list of reference audio URLs.
      When missing, ``None`` or empty, plain text-to-audio generation runs.
    * ``temperature`` - required only when reference URLs are given; passed
      to ``Infer.variation`` as its ``corrupt`` argument.

    Returns:
        A JSON string ``{"output_download_urls": [...]}``. An upload that
        fails is reported on stdout and leaves ``None`` at its position.
    """
    num_samples = 5
    it = time.time()

    # handle input data
    input_data = event['input']
    texts = input_data['descriptions'][0]
    duration = input_data['duration']
    file_paths = input_data.get('original_download_urls')

    # generate
    if not file_paths:
        output_audios = infer.generate([texts] * num_samples, duration)
    else:
        temperature = input_data['temperature']
        merged_audios, sr = process_audio_urls(file_paths)
        # NOTE(review): the text batch has `num_samples` entries while the
        # audio batch has one entry per URL - confirm these are meant to match.
        output_audios = infer.variation(
            merged_audios, [texts] * num_samples, duration, temperature, sr
        )
    mt = time.time()

    # Upload the clips in parallel, keeping the URLs in generation order.
    # `upload_audio` is not defined in this notebook and must be provided
    # before this function is called.
    output_urls = [None] * len(output_audios)
    with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
        upload_futures = {
            executor.submit(upload_audio, audio): idx
            for idx, audio in enumerate(output_audios)
        }
        for future in concurrent.futures.as_completed(upload_futures):
            idx = upload_futures[future]
            try:
                output_urls[idx] = future.result()
            except Exception as error:
                print('error:', error)

    # delete audios saved in the folder
    delete_audio_files()

    # prepare the API response.
    response_data = {
        "output_download_urls": output_urls
    }
    ft = time.time()
    print("until generation time: ", mt-it)
    print("total time: ",ft-it)
    return json.dumps(response_data)

# Load the checkpoint once; `handler` and the experiment cells below all use
# this module-level `infer` instance.
infer = Infer(Path("./new-stage-2.ckpt"))

voco type  :  oobleck


In [8]:
from safetensors.torch import save_file

# Export only the `audiobox` sub-module's weights to a standalone file.
save_file(infer.model.audiobox.state_dict(), "audiobox0404.safetensors")  # <- genuine safetensors format

In [None]:
from einops import rearrange
from audiotools import AudioSignal
import time
import torchaudio

# Time one batch of five generations (duration 10.0, guidance 3.0) and print
# the elapsed wall-clock seconds.
st = time.time()
output_audios = infer.generate([
    "samurai man short and impactful shouting"
]*5, 10.0, 3.0)
print(time.time() - st)

In [None]:
from einops import rearrange
from audiotools import AudioSignal
import time
import torchaudio

# Benchmark: wall-clock seconds for a batch of five generations at each clip
# duration. `output_audios` ends up holding the last (longest) batch.
for dur_seconds in (3.0, 10.0, 20.0, 30.0, 45.0, 60.0):
    st = time.time()
    output_audios = infer.generate(
        ["samurai man short and impactful shouting"] * 5, dur_seconds, 3.0
    )
    print(time.time() - st)

# for oa in output_audios:
#     audio = rearrange(oa, "n c -> c n")
#     AudioSignal(audio, sample_rate=44100).widget()

In [None]:
# Smoke test: download a single uploaded clip and inspect the batch shape.
output, sr = process_audio_urls(["https://hpxjdveijpuehyuykkos.supabase.co/storage/v1/object/public/user_uploads/2025-04-01/788d9cf9-e013-4b39-8034-edae36b268f9/c106633b-9878-4855-8179-ef8445d5e6c1_epidemic-audios_misc-musical_Bl8QyruCWW.wav"])
output.shape

In [None]:
# Time one batch of five 3-second generations for a different prompt.
st = time.time()
output_audios = infer.generate([
    "Sound of scifi weapon charging and shooting."
]*5, 3.0, 3.0) # author's shape note: (batch size, 44100*duration, stereo)
print(time.time() - st)

In [None]:
# Preview the second generated clip; the two axes are swapped before it is
# handed to AudioSignal.
AudioSignal(rearrange(output_audios[1], 't s -> s t'), sample_rate=44100).widget()

In [None]:
from msclap import CLAP

# Load the 'clapcap' (captioning) version of MS-CLAP without CUDA.
clap_model = CLAP(version = 'clapcap', use_cuda=False)

In [None]:
# Caption one local wav file with the model loaded above.
# NOTE(review): absolute, machine-specific path.
captions = clap_model.generate_caption(["/workspace/2f613a75-02b1-43b7-b2ff-e715e62ecbdf.wav"])

In [None]:
captions

In [None]:
from einops import rearrange
from audiotools import AudioSignal

# Load the reference clip at 44.1 kHz without down-mixing.
# NOTE(review): absolute, machine-specific path.
audio, sr = librosa.load("/workspace/2f613a75-02b1-43b7-b2ff-e715e62ecbdf.wav", sr=44100, mono=False)
print(audio.shape)
# Take only `audio[0]` and reshape it to (1, t, 1) int16 for `infer.variation`.
merged_audios = convert_to_int16(np.array(rearrange(audio[0], 't -> () t ()')))

duration = 1.5
temperature = 0.5
texts = 'A gun is being fired several times.'
AudioSignal(audio, sample_rate=44100).widget()

# Three text-guided variations with guidance strength 3.0.
for _ in range(3):
    gen = infer.variation(merged_audios, [texts] * 1, duration, temperature, [sr], cfg_score=3.0)
    AudioSignal(rearrange(gen[0], 't s -> s t'), sample_rate=44100).widget()

print('\n=======\n')
# Three variations with an empty prompt and guidance strength 0.0 for comparison.
for _ in range(3):
    gen = infer.variation(merged_audios, [''] * 1, duration, temperature, [sr], cfg_score=0.0)
    AudioSignal(rearrange(gen[0], 't s -> s t'), sample_rate=44100).widget()

In [None]:
from audiotools import AudioSignal
from einops import rearrange

voice_sets = [
    {
        "ap": "./voice_samples/wtf2.wav",
        'prompt': "Sound of dog barking.",
        'duration': 3.5
    },
    {
        "ap": "./voice_samples/charging.m4a",
        'prompt': "Sound of scifi weapon charging and shooting.",
        'duration': 4.4
    },
    {
        "ap": "./voice_samples/wings.m4a",
        'prompt': "Sound of huge eagle flapping wings.",
        'duration': 4.9
    },
]

# For every voice sample, run three chained rounds of negative-prompt
# variation: round 1 starts from the recorded file, each later round starts
# from the previous round's output.
NEGATIVE_PROMPT = 'Sound of male voice.'
N_ROUNDS = 3
# `Infer` resamples its input to the vocoder's rate, so generated audio is
# fed back (and displayed) at that rate rather than at the source file's rate.
gen_sr = infer.voco.sampling_rate

for d in voice_sets:
    ap, prompt, duration = d["ap"], d["prompt"], d["duration"]
    audio, sr = torchaudio.load(ap)
    print(audio.shape)

    temperature = 0.6
    texts = prompt

    # Show the original recording at its own sample rate.
    AudioSignal(audio, sample_rate=sr).widget()

    source, source_sr = audio, sr
    for _round in range(N_ROUNDS):
        # Use the first channel only, shaped (1, t, 1) int16.
        merged_audios = convert_to_int16(np.array(rearrange(source[0], 't -> () t ()')))
        print(merged_audios.shape)
        gen = infer.variation_negative(
            merged_audios,
            [texts] * 1,
            [NEGATIVE_PROMPT] * 1,
            duration,
            temperature,
            [source_sr],
            cfg_score=3.0,
            nalpha=0.0,
        )
        gen = rearrange(gen[0], 't s -> s t')
        print(gen.shape)
        AudioSignal(gen, sample_rate=gen_sr).widget()
        source, source_sr = gen, gen_sr
    print("\n\n---\n\n")

In [None]:
from einops import rearrange
from audiotools import AudioSignal
import time

# Generate eight clips per prompt from surface variants of the same text and
# save them as ./outputs/{prompt}_{idx}.wav.
# NOTE(review): `prompts` is defined in a later cell; run that cell first.
os.makedirs('./outputs', exist_ok=True)

for prompt in prompts[20: 50]:
    genp = [prompt]*8
    # Slots 1-2: toggle the trailing period.
    if prompt[-1] == ".":
        genp[1] = genp[1].strip(".")
        genp[2] = genp[2].strip(".")
    else:
        genp[1] = genp[1] + "."
        genp[2] = genp[2] + "."
    # Slots 3-4: all lower case.
    genp[3] = genp[3].lower()
    genp[4] = genp[4].lower()

    # Slots 5-6: first character upper-cased. Slots 0 and 7 stay verbatim.
    genp[5] = genp[5][0].upper() + genp[5][1:]
    genp[6] = genp[6][0].upper() + genp[6][1:]

    output_audios = infer.generate(genp, 3.0, 3.0)
    # Stack first: building a tensor straight from a list of ndarrays takes
    # PyTorch's slow element-wise path.
    oa = torch.from_numpy(np.stack(output_audios))
    oa = rearrange(oa, "b n c -> b c n")
    for idx, audio in enumerate(oa):
        torchaudio.save(f'./outputs/{prompt}_{idx}.wav', audio, sample_rate=44100)
    print("\n\n----\n\n")

In [None]:
from msclap import CLAP
import torch
# pip install git+https://github.com/microsoft/CLAP.git
# Load the "2023" MS-CLAP version, using CUDA only when it is available.
clap = CLAP(version="2023", use_cuda=torch.cuda.is_available())

In [None]:
@torch.no_grad()
@torch.autocast(device_type="cuda", enabled=False)
def clap_rank(audios: Tensor, texts: list[str]) -> tuple[Tensor, Tensor]:
    """Rank clips by CLAP text/audio cosine similarity (autocast disabled).

    Args:
        audios: 3-D tensor; only index 0 of dim 1 and the first 44100*10
            entries of the last dim are used.
        texts: one prompt per clip, embedded with the global ``clap`` model.

    Returns:
        ``(args, similarity)``: ``args`` are the clip indices sorted by
        ascending similarity (least similar first) and ``similarity`` holds
        the per-clip scores in the original order.
    """
    audios = audios[:, 0, :44100*10]
    print(audios.shape)
    audios = audios.float()
    text_embed = clap.get_text_embeddings(texts)
    audio_embed = clap.clap.audio_encoder(audios)[0]

    similarity = F.cosine_similarity(text_embed, audio_embed)
    args = torch.argsort(similarity, dim=0, descending=False)
    return args, similarity

In [None]:
from einops import rearrange
from audiotools import AudioSignal
import time

# Generate eight clips per prompt from surface variants of the same text,
# rank them with `clap_rank`, and preview the two lowest- and two
# highest-scoring clips.
# NOTE(review): `prompts` is defined in a later cell; run that cell first.
for prompt in prompts[40:50]:
    genp = [prompt]*8
    # Slots 1-2: toggle the trailing period.
    if prompt[-1] == ".":
        genp[1] = genp[1].strip(".")
        genp[2] = genp[2].strip(".")
    else:
        genp[1] = genp[1] + "."
        genp[2] = genp[2] + "."
    # Slots 3-4: all lower case.
    genp[3] = genp[3].lower()
    genp[4] = genp[4].lower()

    # Slots 5-6: first character upper-cased. Slots 0 and 7 stay verbatim.
    genp[5] = genp[5][0].upper() + genp[5][1:]
    genp[6] = genp[6][0].upper() + genp[6][1:]

    output_audios = infer.generate(genp, 3.0, 3.0)

    # Stack first: building a tensor straight from a list of ndarrays takes
    # PyTorch's slow element-wise path.
    oa = torch.from_numpy(np.stack(output_audios))
    oa = rearrange(oa, "b n c -> b c n")

    # Follow the device chosen by `Infer` instead of assuming CUDA; the CLAP
    # model above is also loaded on CUDA only when it is available.
    args, sims = clap_rank(oa.to(infer.device), [prompt]*8)
    print("prompt : ", prompt)
    print("All similarities : ", sims)

    # `args` is sorted ascending: 0/1 are the worst matches, 6/7 the best.
    for idx in [0, 1, 6, 7]:
        print(f"idx - {idx}, similarity : ", sims[args[idx]].item())
        AudioSignal(oa[args[idx]], sample_rate=44100).widget()
    print("\n\n----\n\n")

In [None]:
import numpy as np
import librosa
import torch
import CLAP.src.laion_clap as laion_clap

def int16_to_float32(x):
    """Scale int16 PCM values by 1/32767 and return them as float32."""
    scaled = np.divide(x, 32767.0)
    return scaled.astype(np.float32)

def float32_to_int16(x):
    """Limit values to -1.0 .. 1.0, scale by 32767 and cast to int16."""
    limited = np.clip(x, -1.0, 1.0)
    return np.multiply(limited, 32767.0).astype(np.int16)

# Earlier setup kept for reference (pretrained checkpoint via `load_ckpt`):
# model = laion_clap.CLAP_Module(enable_fusion=False)
# model.to('cuda')
# model.load_ckpt(model_id=1) # best model_id depends on what kind of data we use. 1 is good for short sound (not music, speech)

# Build the HTSAT-base CLAP module on CUDA; its weights are loaded in the next cell.
clmodel = laion_clap.CLAP_Module(enable_fusion=False, amodel= 'HTSAT-base', device='cuda')

In [None]:
# Read the fine-tuned CLAP weights from disk.
# NOTE(review): absolute, machine-specific path; `torch.load` unpickles the
# file, so only load checkpoints from a trusted source.
state_dict = torch.load("/workspace/clap_final_0520_augmentation_epoch_best_29.pth")

# Drop the "_orig_mod." marker from keys that start with it; other keys are
# kept unchanged.
new_state_dict = {
    (k.replace("_orig_mod.", "") if k.startswith("_orig_mod.") else k): v
    for k, v in state_dict.items()
}

# Load the cleaned weights into the model and switch to eval mode.
clmodel.load_state_dict(new_state_dict)
clmodel.eval()
print("_")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import torchaudio
from tqdm import tqdm
from audiotools import AudioSignal

def cosine_sim(vector1, vector2):
    """Return the cosine similarity of two vectors as a scalar.

    Both inputs are flattened to a single row before being compared.
    """
    row_a = vector1.reshape(1, -1)
    row_b = vector2.reshape(1, -1)
    return cosine_similarity(row_a, row_b)[0, 0]

# Score the eight saved clips of each prompt against the prompt text with the
# fine-tuned CLAP model, then preview the two best and two worst matches.
for prompt in prompts[30:50]:
    text_embed = clmodel.get_text_embedding([prompt])

    # First channel of every saved clip for this prompt.
    waveforms = [
        torchaudio.load(f'/workspace/alignment-v3/audiobox/outputs/{prompt}_{idx}.wav')[0][0]
        for idx in range(8)
    ]

    audios = []
    for idx, audio_array in tqdm(enumerate(waveforms)):
        audio_embed = clmodel.get_audio_embedding_from_data(x = [np.array(audio_array)])
        audios.append([audio_array, cosine_sim(audio_embed, text_embed[0])])

    # Higher is better.
    print("prompt : ", prompt)
    print("total scores - ", [a[1] for a in audios])

    # Sort once by score (ascending): indices 7 and 6 are the best matches,
    # 1 and 0 the worst.
    ranked = sorted(audios, key=lambda x: x[1])
    for ch in [7, 6, 1, 0]:
        print("score - ", ranked[ch][1])
        AudioSignal(ranked[ch][0], sample_rate=44100).widget()
    print("\n\n---\n\n")

In [None]:
# Quick check: five 2-second clips with the default guidance strength.
output_audios = infer.generate(["a dog barking"] * 5, 2.0)

# Environment note: the original setup used numpy==2.2.3

In [None]:
from einops import rearrange
from audiotools import AudioSignal

# Inspect the first generated clip's shape, then swap its two axes before
# handing it to AudioSignal for playback.
print(output_audios[0].shape)

audio = rearrange(output_audios[0], "n c -> c n")

AudioSignal(audio, sample_rate=44100).widget()

In [None]:
# Sanity check: print the time grid written directly as
# linspace(1 - corrupt, 1) and as linspace(0, corrupt) + (1 - corrupt), the
# form used inside `Infer.variation`, so the two can be compared.
corrupt_tensor = 0.7

print(torch.linspace(1-corrupt_tensor, 1, 64))
print("\n\n")
print(torch.linspace(0, corrupt_tensor, 64) + 1 - corrupt_tensor)

In [None]:
# Evaluation prompts for the generation/ranking experiments.
# NOTE(review): earlier cells slice `prompts`, so this cell has to run before
# them on a fresh kernel. A few entries appear more than once.
prompts = [
    'Chirping sounds of a young bird, expressing its early calls.',
     'The sound of keys jingling softly.',
     'Sailors cheer and shout encouragement to one another during a game.',
     "The British LNER Steam Locomotive 'Flying Scotsman' whistles and gradually leaves a train station, pulling its coaches.",
     'At first the sound is calm, then Sharp electric surge on impact fast current discharge',
     'quiet calm man gagged by kidnapper',
     'Sounds of machinery and robotic movements in a futuristic facility.',
     'Sounds of a tight grip on a wrist with accompanying fabric rustling and a subtle gripping noise.',
     'The sound of an epic fiery flame strike exploding on the ground in a 2d strategy cartoony game',
     'quiet calm man gagged by kidnapper',
     'Sound of an explosion echoing in the distance.',
     'Cinematic soundtrack reflecting a calm revelation in the movie.',
     'the sound of a rogue fading into stealth mode in a 2d strategy cartoony game, smooth, shadow',
     'Various cat sounds.',
     'Whoosh of a sharp and thick sword slicing through the air, with magical spell',
     'The sound of the last drop of milk cream being squeezed out of the box',
     'Electrical ANOMALY smooth  passing impulse fly on circle',
     'Nervous breathing of a man accompanied by visible bodily tremors.',
     'At first the sound is calm, then Sharp electric surge on impact fast current discharge',
     'Sounds from the Large Hadron Collider, capturing the essence of a futuristic game.',
     '16-bit explosion sound effect suitable for gaming.',
     'Realistic sound of a cardboard box softly thudding onto a surface, accompanied by subtle rustling and a dry, papery resonance.',
     'Sound of a spaceship door opening.',
     'Loud blast of a single-tone British diesel horn playing an E flat note.',
     'Sound of the Void Eye shattering: a mix of breaking glass and releasing energy.',
     'Subtle clicking sound of a wheel, designed for user interface interactions.',
     'Sound of a bomb being thrown, creating a sizzling effect.',
     'Sound of a magical portal opening, suitable for a game setting.',
     'Sounds of liquid flow and bleeding effects.',
     'The activation of an old analog switch produces a sharp mechanical click, followed by a high-pitched electronic beep, as a CRT screen flickers to life with soft static and a faint digital hum.',
     'Subtle metallic click sounds from a quiet menu button in a user interface.',
     'one',
     'underwater word game ui button click',
     'Sci-fi sound of a clunky laptop being flipped open.',
     'The sound of leaves rustling in the wind.',
     'Sounds of a mechanical device: gears and pins rotating and unlocking smoothly.',
     'Sound of paper rustling as notes are read, suitable for a game setting.',
     'The sound of a door closing vigorously with a loud bang',
     'Heavy metal pin dragging and scraping sounds.',
     'Horror soundscape of larvae emerging from the ground. A deep sound from beneath the ground',
     'Sound of a rhino in its natural habitat, highlighting its movements and presence.',
     'British Diesel Locomotive Class 47 idling sounds.',
     'Sound of a heavy rock being tossed.',
     'light interface denied sound'
]

In [None]:
infer.model.max_audio_len

In [None]:
import torch
# Load the raw checkpoint on CPU to inspect what it contains.
ckpt = torch.load("new-stage-2.ckpt", map_location="cpu")

print(ckpt.keys())
# Example output: dict_keys(['state_dict', 'hyper_parameters', 'optimizer_states', 'lr_schedulers', ...])

print(ckpt['hyper_parameters'])  # the config may be stored here

In [None]:
def download_audio_as_array(url, timeout=30):
    """Download an audio file from ``url`` and decode it in memory.

    NOTE(review): this redefines the function of the same name declared in
    the model/handler cell; keep the two in sync or remove one of them.

    Args:
        url: address of the audio file.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        ``(data, samplerate)`` as produced by ``soundfile.read`` with
        ``always_2d=True``.

    Raises:
        requests.HTTPError: when the server answers with an error status.
    """
    # Fetch the raw bytes; fail loudly on an HTTP error status.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Wrap the payload in BytesIO so soundfile can read it like a file.
    data, samplerate = sf.read(io.BytesIO(response.content), always_2d=True)
    return data, samplerate

In [None]:
# NOTE(review): called without the required `url` argument, so this cell
# raises TypeError when run; pass a URL or remove the cell.
download_audio_as_array()

In [None]:
# Candidate test clips; only the last `url` assignment takes effect.
# url = "https://hpxjdveijpuehyuykkos.supabase.co/storage/v1/object/public/user_uploads/2025-03-20/28c56a23-6c85-4b08-937a-7f8ef074abce/CC-DS%20Body%20Fall%20Concrete%20Soft%2002-glued.wav"
url = 'https://hpxjdveijpuehyuykkos.supabase.co/storage/v1/object/public/user_uploads/2025-03-31/788d9cf9-e013-4b39-8034-edae36b268f9/9e7d33cd-e33c-4e57-a81c-624cec6b4da3_pond_pond_0_low-8-bit-lo-fi-sound-effect-218319919_nw_prev.wav'
url = "https://hpxjdveijpuehyuykkos.supabase.co/storage/v1/object/public/user_uploads/2025-04-01/788d9cf9-e013-4b39-8034-edae36b268f9/c106633b-9878-4855-8179-ef8445d5e6c1_epidemic-audios_misc-musical_Bl8QyruCWW.wav"

response = requests.get(url)
response.raise_for_status()  # raise an exception if the request failed

In [None]:
# Wrap the downloaded bytes in a file-like object for soundfile.
ad = io.BytesIO(response.content)

In [None]:
# Decode the in-memory file. Re-running this cell without recreating `ad`
# reads from wherever the previous read left the stream position.
data, samplerate = sf.read(ad, always_2d=True)

In [None]:
def process_audio_urls(url_list):
    """Download every URL and merge the clips into one int16 batch.

    Each clip is converted to int16 and zero-padded at the end of its time
    axis up to the longest clip.

    Returns:
        ``(result, samplerates)``: ``result`` is the stacked
        ``[n, length, channels]`` array and ``samplerates`` holds one sample
        rate per URL, in input order.
    """
    samplerates = []
    audio_arrays = []

    # Download and convert one URL at a time, in input order.
    for url in url_list:
        data, samplerate = download_audio_as_array(url)
        samplerates.append(samplerate)
        audio_arrays.append(convert_to_int16(data))

    max_length = max((clip.shape[0] for clip in audio_arrays), default=0)

    # Pad only the time axis so all clips share the same length.
    padded_audios = [
        np.pad(
            clip,
            ((0, max_length - clip.shape[0]), (0, 0)),
            mode='constant',
            constant_values=0,
        )
        for clip in audio_arrays
    ]

    return np.stack(padded_audios, axis=0), samplerates

In [None]:
# Download the same clip twice to exercise batching; `outputs` is the
# (stacked audio, sample rates) pair.
outputs = process_audio_urls([url, url])

In [None]:
outputs[0].shape

In [None]:
# Stacked int16 batch used by the preprocessing scratch cell below.
audios = outputs[0]

In [None]:
outputs[1]

In [None]:
# Scratch replica of the preprocessing inside `Infer.variation`, used to
# debug it outside the class. Differences from the method: the device and the
# sample rates (96000 -> 44100) are hard-coded here, and the transposed
# tensor is made contiguous before resampling.
# NOTE(review): re-running this cell fails because `audios` is rebound to
# float arrays, for which `np.iinfo` raises; re-run the cell above first.
audios = [audio / np.iinfo(audio.dtype).max for audio in audios]
audio_tensor = torch.from_numpy(np.stack(audios, axis=0)).to('cuda')
audio_tensor = audio_tensor.float()
##
# Move time to the last axis for resampling, then move it back.
audio_tensor = audio_tensor.transpose(1, 2)
audio_tensor = torchaudio.functional.resample(audio_tensor.contiguous(), orig_freq=96000, new_freq=44100)
audio_tensor = audio_tensor.transpose(1, 2)
# Force exactly two channels: duplicate mono, drop channels beyond two.
if audio_tensor.shape[2] == 1:
    audio_tensor = audio_tensor.repeat(1, 1, 2)
elif audio_tensor.shape[2] > 2:
    audio_tensor = audio_tensor[:, :, :2]
target_len = audio_tensor.shape[1]