In [1]:
import os
import json
import math
import sys
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

import wandb

from accelerate import Accelerator

from config import Config
from audiomodel import AudioProcessing
from audiodataset import AudioDataset, TestDataset

def make_dir(path):
    """Create directory ``path`` (including missing parents) if it does not exist.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` already exists but is not a directory.
    """
    # exist_ok=True makes this a single atomic-enough call: there is no window
    # between an "exists?" check and the creation in which another process
    # could create the directory and make os.makedirs fail.
    os.makedirs(path, exist_ok=True)

def build_model(cfg):
    """Load the pretrained AudioGen compression model and language model.

    Both models are loaded from the 'facebook/audiogen-medium' checkpoint via
    audiocraft's loader functions and placed on ``cfg.device``.

    Args:
        cfg: Configuration object; only ``cfg.device`` is read here.

    Returns:
        tuple: ``(compression_model, lm)`` as returned by
        ``load_compression_model`` and ``load_lm_model`` respectively.
    """
    # Function-scope import: audiocraft is only imported when this runs.
    from audiocraft.models.loaders import load_compression_model, load_lm_model

    checkpoint_name = 'facebook/audiogen-medium'
    compression_model = load_compression_model(checkpoint_name, device=cfg.device)
    lm = load_lm_model(checkpoint_name, device=cfg.device)
    return compression_model, lm
    
def main(prompts=["beep"], n=5):
    """Generate audio for each prompt with the fine-tuned model, save and play it.

    Each prompt is repeated ``n`` times, the checkpoint ``best.pth`` under
    ``cfg.output_dir`` is loaded into the model, and on the main process every
    generated clip is written through ``save_audio`` (with ``cfg.save_path``
    set to ``./test``) and displayed inline as an audio widget.

    Args:
        prompts: List of text prompts. The default list is never mutated here.
        n: Number of clips to generate per prompt; also the range of the
            ``_<k>`` suffix (1..n) used in the output file names.
    """
    # Imported once up front; `display` is only a builtin inside IPython
    # kernels, so import it explicitly instead of relying on that.
    from IPython.display import Audio, display

    cfg = Config()
    # Repeat every prompt n times so the dataset yields n items per prompt.
    cfg.update(prompts=[p for p in prompts for _ in range(n)])

    accelerator = Accelerator(gradient_accumulation_steps=cfg.gradient_accumulation_steps)
    save_path = "./test"
    make_dir(save_path)
    cfg.update(save_path=save_path)

    compression_model, lm = build_model(cfg)
    model = AudioProcessing(cfg, lm)

    test_dataset = TestDataset(cfg)
    test_dataloader = DataLoader(test_dataset, batch_size=1)

    model, compression_model = accelerator.prepare(model, compression_model)
    model_path = os.path.join(cfg.output_dir, "best.pth")
    # Load onto CPU first; load_state_dict copies the tensors onto whatever
    # device the model parameters live on. This avoids failing when the
    # checkpoint was saved from a device that is not available here.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))

    model.eval()
    compression_model.eval()
    if accelerator.is_main_process:
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_vae = accelerator.unwrap_model(compression_model)
        audio_num = 1
        for batch in test_dataloader:
            # Pure inference: no autograd graph is needed.
            with torch.no_grad():
                _, gen_audio = unwrapped_model.inference(batch, unwrapped_vae)
            # NOTE(review): assumes batch[0] is the prompt string for this
            # item — depends on TestDataset's collation; confirm.
            prompt = batch[0]
            audio_filename = f"{prompt}_{audio_num}.wav"
            unwrapped_model.save_audio(gen_audio, audio_filename, cfg)
            display(Audio(data=gen_audio[0].detach().cpu().numpy(), rate=cfg.sample_rate))
            # Cycle the suffix over 1..n so it restarts with each new prompt
            # (previously hard-coded to 5, which misnumbered clips when n != 5
            # and overwrote files when n > 5).
            audio_num += 1
            if audio_num > n:
                audio_num = 1

    PyTorch 2.1.0+cu121 with CUDA 1201 (you have 2.1.0+cu118)
    Python  3.10.13 (you have 3.10.12)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


In [10]:
import pandas as pd

# Read the sound-effect prompt table from disk.
file_path = 'csv_files/sound_effect_prompts.csv'
df = pd.read_csv(file_path)

# Keep the first ten entries of the 'Prompt' column as a plain Python list.
prompts = df['Prompt'].head(10).tolist()
# Alternative (disabled): build each prompt as "<Prompt> <Category>" per row.

In [11]:
# Show the selected prompts as the cell output.
prompts

['The sound of a fast object passing by, creating a sharp whoosh.',
 'The whoosh sound of a sword swinging rapidly through the air.',
 'The whoosh sound of a whip cracking sharply in the open air.',
 'The whoosh sound of a strong gust of wind whooshing through trees.',
 'The whoosh sound of a race car speeding past on a track.',
 'The whoosh sound of a bullet rapidly passing by overhead.',
 'The whoosh sound of a paper airplane gliding smoothly in the air.',
 'The whoosh sound of a boomerang spinning and returning.',
 'The whoosh sound of an arrow swiftly flying towards a target.',
 'The whoosh sound of a superhero flying at high speed.']

In [12]:
from accelerate import notebook_launcher
# Positional arguments forwarded to main(): the prompt list and n=2 clips per prompt.
args = (prompts, 2)
notebook_launcher(main, args=args, num_processes=1)

Launching training on one GPU.




In [2]:
from IPython.display import Audio
# Render an inline player for a WAV file from the ./test directory.
Audio("./test/the sound of whoosh_1.wav")

In [7]:
from audiotools import AudioSignal
# Load a clip into an AudioSignal, then move it to CPU, detach it, and write
# it back out as "test.wav". The value of the last call is the cell output.
signal = AudioSignal("./generated_audios_finetune2/epoch_6_4.wav")
signal.cpu().detach().write("test.wav")

<audiotools.core.audio_signal.AudioSignal at 0x7fab36b55c90>

In [10]:
# Render an inline player for the re-written "test.wav".
Audio("test.wav")

In [16]:
# Render an inline player for the original file, for comparison by ear.
Audio("./generated_audios_finetune2/epoch_6_4.wav")

In [1]:
import librosa
import soundfile as sf

def normalize_and_save_audio(input_path, output_path):
    """Peak-normalize an audio file and write the result to ``output_path``.

    The samples are divided by their maximum absolute value so the loudest
    sample has magnitude 1. Silent (all-zero) or empty audio is written
    unchanged instead of being divided by zero.

    Args:
        input_path: Path of the audio file to read.
        output_path: Path the normalized audio is written to.
    """
    # sr=None keeps the file's original sampling rate instead of resampling.
    audio, sr = librosa.load(input_path, sr=None)

    # Vectorized peak over all samples (also correct for multi-channel arrays).
    peak = abs(audio).max() if audio.size else 0

    # Only scale when there is a non-zero peak; dividing silence by zero
    # would fill the output with NaNs.
    if peak > 0:
        audio_normalized = audio / peak
    else:
        audio_normalized = audio

    # Save the normalized audio at the original sampling rate.
    sf.write(output_path, audio_normalized, sr)

# Usage example
input_audio_path = "./test/the sound of whoosh_1.wav"  # path of the original audio file
output_audio_path = "./test/the sound of whoosh_1_normalized.wav"  # where the normalized audio is saved
normalize_and_save_audio(input_path=input_audio_path, output_path=output_audio_path)