In [None]:
import os
import json
import math
import sys
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from config import Config
from audiomodel import AudioProcessing
from audiotools import AudioSignal

def make_dir(path):
    """Create directory ``path``, including missing parents, if it is absent.

    Uses ``exist_ok=True`` rather than a separate existence check, so the call
    is safe when another process creates the same directory concurrently.

    Args:
        path: Filesystem path of the directory to create.

    Raises:
        FileExistsError: If ``path`` already exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

def build_model(cfg):
    """Load the pretrained AudioGen compression model and language model.

    Args:
        cfg: Configuration object. Only ``cfg.device`` is read here; it is
            forwarded as the ``device`` argument of both audiocraft loaders.

    Returns:
        tuple: ``(compression_model, lm)`` as returned by audiocraft's
        ``load_compression_model`` and ``load_lm_model`` for the
        ``'facebook/audiogen-medium'`` checkpoint.
    """
    # Function-scope import: audiocraft is only needed once models are built.
    from audiocraft.models.loaders import load_compression_model, load_lm_model

    compression_model = load_compression_model('facebook/audiogen-medium', device=cfg.device)
    lm = load_lm_model('facebook/audiogen-medium', device=cfg.device)
    return compression_model, lm

In [None]:
# Build the pretrained AudioGen components, wrap the LM in the project model,
# and restore the checkpoint stored at ./weight/best.pth.
cfg = Config()
print(cfg.sample_rate)

compression_model, lm = build_model(cfg)
compression_model.eval()

model = AudioProcessing(cfg, lm)

# map_location="cpu" lets a checkpoint saved on a GPU be read on any machine;
# load_state_dict then copies the values into the model's existing parameters.
state_dict = torch.load("./weight/best.pth", map_location="cpu")

# strict=False does not raise on missing/unexpected keys, so print the result
# to make any mismatch visible instead of silently ignoring it.
load_result = model.load_state_dict(state_dict, strict=False)
print(load_result)

model.eval()
print("load")

In [None]:
from audiotools import AudioSignal
from IPython.display import Audio

# Generate two 3-second samples per prompt: first the prompt using the
# "(term)number" annotation, then the same words without it.
# NOTE(review): the number presumably acts as a per-term weight — confirm
# against AudioProcessing.inference.
for prompt in ("(gun)0.6 is (reloaded)1.5", "gun is reloaded"):
    for _ in range(2):
        tok, gen_audio = model.inference([prompt], compression_model, duration=3.0)
        waveform = gen_audio[0][0].cpu().detach()
        display(Audio(waveform, rate=cfg.sample_rate))

# [4, 5, 1536]

In [None]:
from audiotools import AudioSignal
from IPython.display import Audio

# Each entry: (label printed before every sample, prompt, number of samples).
# The Korean labels read roughly "small weight", "plain weight" and "without";
# the last run prints no label.
comparison_runs = (
    ("작은 가중치", "metal impact with (reverb, echo)0.3", 10),
    ("그냥 가중치", "metal impact with (reverb, echo)1.5", 10),
    ("없는", "metal impact", 10),
    (None, "metal impact with reverb, echo", 5),
)

for label, prompt, repeats in comparison_runs:
    for _ in range(repeats):
        if label is not None:
            print(label)
        tok, gen_audio = model.inference([prompt], compression_model, duration=3.0)
        # Wrap the generated waveform in an AudioSignal and call its widget().
        AudioSignal(gen_audio[0][0].cpu().detach(), sample_rate=cfg.sample_rate).widget()

# [4, 5, 1536]

In [None]:
# Generate and play two 6-second samples for a prompt naming two sound sources.
for _ in range(2):
    tok, gen_audio = model.inference(
        ["The sound of dog and cat"],
        compression_model,
        duration=6.0,
    )
    waveform = gen_audio[0][0].cpu().detach()
    display(Audio(waveform, rate=cfg.sample_rate))

In [None]:
# Generate and play two 3-second samples for a plain natural-language prompt.
for _ in range(2):
    tok, gen_audio = model.inference(
        ["The sound of reloading a gun"],
        compression_model,
        duration=3.0,
    )
    waveform = gen_audio[0][0].cpu().detach()
    display(Audio(waveform, rate=cfg.sample_rate))