In [2]:
import os
import json
import math
import sys
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from accelerate import Accelerator

from config import Config
from audiomodel import AudioProcessing
from audiodataset_seperation import SeperationDataset

def make_dir(path):
    """Create the directory ``path`` (and any missing parents) if it is absent.

    Calling this on a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True folds the "already there?" check into the creation call,
    # so there is no window between checking and creating in which another
    # process could create the directory and make makedirs() fail.
    os.makedirs(path, exist_ok=True)

def build_model(cfg, model_name='facebook/audiogen-medium'):
    """Load the pretrained compression model and language model.

    Args:
        cfg: Configuration object; only ``cfg.device`` is read here and it is
            forwarded to both audiocraft loaders.
        model_name: Checkpoint identifier handed to the audiocraft loaders.
            Defaults to ``'facebook/audiogen-medium'``.

    Returns:
        A ``(compression_model, lm)`` tuple, in that order.
    """
    # Imported lazily so that audiocraft is only required when a model is
    # actually built.
    from audiocraft.models.loaders import load_compression_model, load_lm_model

    compression_model = load_compression_model(model_name, device=cfg.device)
    lm = load_lm_model(model_name, device=cfg.device)
    return compression_model, lm

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
    PyTorch 2.1.0+cu121 with CUDA 1201 (you have 2.3.0.dev20240125+cu118)
    Python  3.10.13 (you have 3.10.12)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [3]:
# --- Evaluation setup: configuration, dataset, model and checkpoint ---
cfg = Config()

# Checkpoint whose weights are loaded into the model below.
model_path = "./output_dir_finetune/best.pth"

base_path = "./csv_files/"
# os.path.join avoids the doubled separator that f"{base_path}/..." produced
# because base_path already ends with a slash.
train_data_path = os.path.join(base_path, "train_dataset_epidemic_sub.csv")
eval_data_path = os.path.join(base_path, "eval_dataset_epidemic_sub.csv")

cfg.update(train_data_path=train_data_path, eval_data_path=eval_data_path, batch_size=4)

save_path = "./test"
make_dir(save_path)

cfg.update(save_path=save_path)

# Evaluation split, iterated one item at a time in a fixed order.
eval_dataset = SeperationDataset(cfg, train=False)
eval_dataloader = DataLoader(eval_dataset, batch_size=1, shuffle=False, num_workers=8)

compression_model, lm = build_model(cfg)
model = AudioProcessing(cfg, lm)

# map_location places the checkpoint tensors on the configured device, so a
# checkpoint saved on a different device (e.g. another GPU index) still loads.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a
# trusted source (or pass weights_only=True if the file is a plain state dict).
state_dict = torch.load(model_path, map_location=cfg.device)
model.load_state_dict(state_dict)

# This notebook only runs inference: switch off training-time behaviour
# (dropout etc.) in the wrapper and all of its submodules.
# NOTE(review): assumes AudioProcessing is an nn.Module - confirm.
model.eval()

torch.cuda.empty_cache()

# Number of trainable parameters in the text ("description") conditioner.
num_params = sum(p.numel() for p in lm.condition_provider.conditioners.description.parameters() if p.requires_grad)
print("params : ", num_params)



params :  1574400


In [4]:
# Imported once, outside the loop; `display` is imported explicitly so the
# cell does not rely on the name being injected by the notebook front end.
from IPython.display import Audio, display

prompts = ["The sound of cow roaring"]

NUM_SAMPLES = 5      # number of independent generations to listen to
SAMPLE_RATE = 16000  # playback rate (Hz) passed to the audio widget

for _ in range(NUM_SAMPLES):
    gen_tokens, gen_audio = model.inference(prompts, compression_model)

    # Original note: compare the mixture, the ground truth and the generated
    # sound. This cell only plays the generated audio for the first prompt.
    display(Audio(data=gen_audio[0].cpu().numpy(), rate=SAMPLE_RATE))

In [9]:
# Total element count over the LM transformer's parameters that have
# requires_grad set.
num_params = sum(
    weight.numel()
    for weight in lm.transformer.parameters()
    if weight.requires_grad
)
print(num_params)


1812381696


In [7]:
# Single import at the top of the cell instead of once per loop iteration;
# `display` is imported explicitly rather than relying on the notebook to
# provide it.
from IPython.display import Audio, display

prompts = ["The sound of cow roaring"]

NUM_SAMPLES = 5      # how many times the prompt is sampled
SAMPLE_RATE = 16000  # playback rate (Hz) passed to the audio widget

for _ in range(NUM_SAMPLES):
    gen_tokens, gen_audio = model.inference(prompts, compression_model)

    # Original note: compare the mixture, the ground truth and the generated
    # sound. Only the generated audio for the first prompt is played here.
    display(Audio(data=gen_audio[0].cpu().numpy(), rate=SAMPLE_RATE))