In [None]:
import time
import os
import glob
import uuid
import concurrent
import concurrent.futures
import requests
import json
import datetime
import soundfile as sf
import io
from pathlib import Path
import numpy as np
import torch
from einops import repeat
from torch import Tensor
from torch.nn import functional as F
from transformers import AutoTokenizer
from vocos import get_voco
from model.module import AudioBoxModule
from torchode.interface import solve_ivp
import torchaudio
import re
from tqdm import tqdm
import pandas as pd
from einops import rearrange

In [None]:
# Prefer the GPU when one is visible to torch; otherwise run everything on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
path = "./new-stage-2.ckpt"

# Restore the module from the checkpoint, move it to `device`, and switch it to eval mode.
model = AudioBoxModule.load_from_checkpoint(path)
model = model.to(device)
model.eval()
print("-")

In [None]:
# Build the 'oobleck' vocoder through get_voco and place it on the same device as the model.
voco = get_voco('oobleck').to(device)

In [None]:
# Tokenizer used for the text prompt, loaded from the "google/flan-t5-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

In [None]:
import torchaudio
import torch
import librosa

# Load the reference clip at 44.1 kHz; mono=False asks librosa not to down-mix the channels.
audio, sr = librosa.load('CC-DS Body Fall Concrete Soft 02-glued.wav', sr=44100, mono=False)

In [None]:
# Ask the vocoder how many latent frames 3 seconds of 44.1 kHz audio (3 * 44100 samples) maps to,
# then show that length alongside the vocoder's latent dimensionality.
latent_len = voco.encode_length(3 * 44100)
print(latent_len)
print(voco.latent_dim)

In [None]:
text = 'body fall concrete soft'

# Special tokens are disabled, so the EOS token is appended by hand; the single-item batch is
# padded / truncated to exactly 127 tokens and returned as PyTorch tensors.
text_output = tokenizer(
    [text + tokenizer.eos_token],
    padding="max_length",
    truncation="longest_first",
    max_length=127,
    add_special_tokens=False,
    return_tensors="pt",
)

# Move both tensors to the working device; the attention mask is additionally cast to bool.
input_ids = text_output['input_ids'].to(device)
attention_mask = text_output['attention_mask'].to(device).bool()

print("input_ids : ", input_ids.shape)
print("attention_mask : ", attention_mask.shape)

In [None]:
# Run the module's T5 encoder on the tokenized prompt and keep its final hidden states.
t5_output = model.t5(input_ids=input_ids, attention_mask=attention_mask)
text_embed = t5_output.last_hidden_state
print(text_embed.shape)

In [None]:
from einops import rearrange

# Wrap the NumPy waveform in a float tensor and add a leading batch axis.
na = torch.from_numpy(audio).float()
na = na.unsqueeze(0)
print(na.shape)

# Swap the last two axes: (batch, channel, time) -> (batch, time, channel).
na = rearrange(na, 'b c t -> b t c')
print(na.shape)

In [None]:
# Encode the waveform into latents. The input is moved to `device` (the device the vocoder
# was placed on) instead of a hard-coded 'cuda', so this cell also runs on CPU-only machines.
latent = voco.encode(na.to(device))
latent_len = latent.shape[1]

# Boolean mask over a fixed window of 400 positions: True for the first `latent_len`
# positions, False for the remainder. A leading batch axis is added afterwards.
max_latent_len = 400
latent_mask = torch.arange(max_latent_len) < latent_len
latent_mask = latent_mask.unsqueeze(dim=0)

print(latent.shape)
print(latent_mask.shape)

In [None]:
# x1 is the encoded audio latent; x0 is a random sample with the same shape, dtype and device.
x1 = latent
x0 = torch.randn_like(latent) # randn draws from a normal distribution; rand would be uniform
x0.shape

In [None]:
# Draw a single random time value, matching the latent's dtype and device.
times = torch.rand((1,), dtype=latent.dtype, device=latent.device)
print(times)

# Append two singleton axes so the time value broadcasts against the 3-D latent.
time_step = rearrange(times, "b -> b () ()") # equivalent to calling unsqueeze twice

In [None]:
sigma = 1e-5

# Interpolate between the noise sample x0 and the data latent x1 at time `time_step`:
# the noise weight shrinks from 1 (t = 0) to sigma (t = 1) while the data weight grows with t.
noise_weight = 1 - (1 - sigma) * time_step
xt = noise_weight * x0 + time_step * x1

In [None]:
from utils.mask import min_span_mask, prob_mask_like

# Obtain a span mask for the latent sequence from the module's own helper.
# NOTE(review): latent_mask was built on the CPU while cond_drop_mask below is created on
# model.device; presumably get_span_mask returns a tensor on model.device — confirm.
span_mask = model.get_span_mask(latent_mask)
print(span_mask.shape)

cond_drop_mask = prob_mask_like((1, 1), model.drop_prob, model.device) # True with probability drop_prob
# Combine both masks: a position is flagged when either the span mask or the drop mask is set.
audio_context_mask = span_mask | cond_drop_mask

print(audio_context_mask.shape)

# Inspect the first 30 positions of each per-position mask, plus the drop mask.
print(latent_mask[0][:30])
print(cond_drop_mask)
print(span_mask[0][:30])
print(audio_context_mask[0][:30])

In [None]:
# Trim the context mask to the actual latent length and add a trailing feature axis so it
# broadcasts over the latent channels.
context_hidden = rearrange(audio_context_mask[:, :latent_len], "b l -> b l ()")

# Zero out every flagged position of x1; the remaining positions are passed through unchanged.
audio_context = torch.where(context_hidden, 0, x1)

print(context_hidden.shape)
print(audio_context.shape)

In [None]:
# One drop decision for the single batch item, generated with the module's drop_prob.
text_drop_mask = prob_mask_like((1,), model.drop_prob, model.device)
print(text_drop_mask)

# Broadcast the decision over sequence and feature axes; a dropped item gets an all-zero embedding.
drop_text = rearrange(text_drop_mask, "b -> b () ()")
text_emb = torch.where(drop_text, 0, text_embed)
print(text_emb.shape)

In [None]:
# Print the shape of every tensor that is handed to the network below.
print("w : ", xt.shape)
print("time_step : ", time_step.shape)
print("audio_mask : ", latent_mask.shape)
print("audio_context : ", audio_context.shape)
print("text_embed : ", text_embed.shape)
print("attention_mask : ", attention_mask.shape)

# Run the audiobox network on the noisy latent xt with the audio context and text conditioning.
# NOTE(review): the un-dropped `text_embed` is passed here, so the `text_emb` computed with
# text_drop_mask is never used — confirm whether that is intended.
# NOTE(review): xt has the encoded latent's length while latent_mask spans 400 positions —
# confirm the network accepts this combination.
pred = model.audiobox(
    w=xt,
    times=time_step[0],
    audio_mask=latent_mask.to(model.device),
    context=audio_context,
    phoneme_emb=text_embed,
    phoneme_mask=attention_mask
)

In [None]:
# Regression target for flow matching: the time derivative of the interpolation
#     xt = (1 - (1 - sigma) * t) * x0 + t * x1
# is x1 - (1 - sigma) * x0. The plain difference x1 - x0 omits the (1 - sigma) factor and
# therefore does not match the xt that the network is evaluated on.
target_flow = x1 - (1 - sigma) * x0

print(target_flow.shape)
print(latent.shape)

In [None]:
max_latent_len = 215

# Zero-pad the first batch item along the time axis up to max_latent_len.
# The padding is done in torch rather than through np.pad: converting to NumPy forces a
# device -> host -> device copy and raises if the latent still requires grad.
pad_len = max_latent_len - latent_len
if pad_len < 0:
    raise ValueError(
        f"latent has {latent_len} frames, which exceeds max_latent_len={max_latent_len}"
    )
# F.pad lists pad widths from the last axis backwards: (feat_left, feat_right, time_left, time_right).
# detach() keeps the result free of autograd history, as the NumPy round trip did.
padded_latent = F.pad(latent[0].detach(), (0, 0, 0, pad_len))
print(padded_latent.shape)

padded_latent = padded_latent.to(model.device)
latent_mask = latent_mask.to(model.device)

In [None]:
# Display the shape of the padded latent as the cell output.
padded_latent.shape

In [None]:
# Override the module's max_audio_len attribute with 215, the length the latent was padded to.
model.max_audio_len = 215

In [None]:
# Call the full module on the single padded example: a batch axis is added to the latent and
# the 400-wide mask is cut to its first 215 positions so both share the padded length.
model(
    audio_enc=padded_latent.unsqueeze(dim=0),
    audio_mask=latent_mask[:, :215],
    phoneme=input_ids,
    phoneme_mask=attention_mask,
)

In [None]:

# Build a tiny fixture: the same record repeated ten times, written out as test.csv
# (the DataFrame index is written as the first column).
sample_row = {
    'audio_path': '/workspace/alignment-v3/audiobox/latent.npy',
    'desc': "test for cat",
    'duration': 1.5,
}
datas = [sample_row] * 10
fixture_frame = pd.DataFrame(datas)
fixture_frame.to_csv("test.csv")

In [None]:
import pandas as pd
from data.dataset_0402 import AudioDataset

# Dataset over the CSV fixture, configured with a 400-position audio limit, 44.1 kHz sampling
# rate, 127-token text limit and 2 channels.
datasets = AudioDataset(
    dataset_path='./test.csv',
    max_audio_len=400,
    sampling_rate=44100,
    max_txt_len=127,
    channel=2,
)

In [None]:
from torch.utils.data import DataLoader, random_split

# Shuffled loader producing full batches of 4 (incomplete trailing batches are dropped),
# collated by the dataset's own `collate`, with 4 worker processes each prefetching 4 batches
# into pinned memory.
dl = DataLoader(
    datasets,
    batch_size=4,
    shuffle=True,
    drop_last=True,
    collate_fn=datasets.collate,
    num_workers=4,
    prefetch_factor=4,
    pin_memory=True,
)

In [None]:
# Pull a single batch from a fresh iterator over the loader.
batch = next(iter(dl))

In [None]:
# Unpack the collated batch into its four components.
# NOTE: this rebinds `latent` and `latent_mask`, replacing the single-example tensors from earlier cells.
latent, latent_mask, text_input_ids, text_attention_mask = batch

In [None]:
# Call the full module on the loader batch, moving every tensor to the module's device first.
model(
    audio_enc=latent.to(model.device),
    audio_mask=latent_mask.to(model.device),
    phoneme=text_input_ids.to(model.device),
    phoneme_mask=text_attention_mask.to(model.device),
)

In [None]:
import os 

# Names of all entries in ./outputs (os.listdir returns every entry, in arbitrary order).
afs = os.listdir('./outputs')

In [None]:
import librosa
import numpy as np
import random
import librosa
import numpy as np
from tqdm import tqdm
from audiotools import AudioSignal

def analyze_spectrum_characteristics(file_path: str):
    """Summarise the spectral shape of an audio file.

    The file is loaded with librosa's default settings, and three frame-wise
    spectral features are each averaged over all frames.

    Args:
        file_path: Path of the audio file to analyse.

    Returns:
        A dict with the keys ``"bandwidth_mean"``, ``"rolloff_mean"`` (computed
        at a 95% roll-off) and ``"flatness_mean"``.
    """
    samples, rate = librosa.load(file_path)

    def _first_row_mean(feature):
        # Only the first row of each feature matrix is averaged.
        return np.mean(feature[0])

    return {
        # Spectral bandwidth.
        "bandwidth_mean": _first_row_mean(
            librosa.feature.spectral_bandwidth(y=samples, sr=rate)
        ),
        # Spectral roll-off at 95%.
        "rolloff_mean": _first_row_mean(
            librosa.feature.spectral_rolloff(y=samples, sr=rate, roll_percent=0.95)
        ),
        # Spectral flatness.
        "flatness_mean": _first_row_mean(
            librosa.feature.spectral_flatness(y=samples)
        ),
    }

def is_bright_sound(file_path: str, threshold: float = 3000.0) -> bool | None:
    """
    Spectral centroid를 이용해 소리가 '밝은'지 판단합니다.
    
    Args:
        file_path (str): 오디오 파일 경로 (wav, mp3 등)
        threshold (float): 밝음 판단 기준 (Hz). 보통 3000Hz 이상이면 밝은 소리로 판단.
        
    Returns:
        True: 밝은 소리
        False: 어두운 소리
        None: 소리가 너무 짧거나 판단이 어려운 경우
    """
    try:
        y, sr = librosa.load(file_path)
        
        if len(y) < sr * 0.2:  # 0.2초보다 짧은 경우 판단 보류
            return None

        spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        centroid_mean = np.mean(spectral_centroids)

        if np.isnan(centroid_mean) or centroid_mean < 100:
            return None  # 무의미한 값이면 판단하지 않음

        return centroid_mean >= threshold

    except Exception as e:
        print(f"Error processing file: {e}")
        return None

# Sample 20 random files from ./outputs, accumulate their mean spectral bandwidth, and
# render an audio player widget (first 10 seconds) for each one.
total_bm = 0
for i in tqdm(range(20)):
    # randrange yields 0 .. len(afs) - 1. randint(1, len(afs)) is inclusive on both ends,
    # so it never selected the first file and raised IndexError when it returned len(afs).
    idx = random.randrange(len(afs))
    clip_path = './outputs/' + afs[idx]
    bm = analyze_spectrum_characteristics(clip_path)['bandwidth_mean']
    total_bm += bm
    print(bm)
    AudioSignal(clip_path, duration=10.0).widget()

In [None]:
# Average bandwidth over the sampled clips: the loop that fills total_bm runs 20 iterations,
# so the sum is divided by 20 (dividing by 2000 under-reported the mean by a factor of 100).
total_bm / 20