In [1]:
!pip install -q -U audiocraft wandb

Collecting audiocraft
  Downloading audiocraft-0.0.2.tar.gz (423 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.9/423.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wandb
  Downloading wandb-0.15.9-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting av (from audiocraft)
  Downloading av-10.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.0/31.0 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops (from audiocraft)
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flashy>=0.0.1 (from audiocraft)
  Downloading flashy-0.0.2.tar.gz (72 kB)
[2K 

In [1]:
import os
import random
from tempfile import TemporaryDirectory

import torch
import torchaudio
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write

import wandb
from tqdm.auto import tqdm

In [2]:
# @title ## MusicGen Configs

# @markdown WandB Project Name
project_name = "audiocraft" # @param {type:"string"}

# Start the W&B run first so every value assigned to `config` below is recorded
# as part of the run's configuration.
wandb.init(project=project_name, job_type="musicgen/inference")

config = wandb.config

# @markdown Select the MusicGen variant
config.model_name = "small" # @param ["small", "medium", "large", "melody"]

# @markdown ## Generation Parameters
# @markdown Use sampling if True, else do argmax decoding
config.use_sampling = True # @param {type:"boolean"}

# @markdown `top_k` used for sampling; limits us to `k` number of the top tokens to consider.
config.top_k = 250 # @param {type:"slider", min:0, max:1000, step:1}

# @markdown `top_p` used for sampling; limits us to the top tokens within a probability mass `p`
config.top_p = 0.0 # @param {type:"slider", min:0, max:1.0, step:0.01}

# @markdown Softmax temperature parameter
config.temperature = 1.0 # @param {type:"slider", min:0, max:1.0, step:0.01}

# @markdown Duration of the generated waveform
config.duration = 30 # @param {type:"slider", min:1, max:30, step:1}

# @markdown Coefficient used for classifier free guidance
config.cfg_coef = 3 # @param {type:"slider", min:1, max:100, step:1}

# @markdown Whether to perform 2 forward for Classifier Free Guidance instead of batching together the two. This has some impact on how things are padded but seems to have little impact in practice.
config.two_step_cfg = False # @param {type:"boolean"}

# @markdown When doing extended generation (i.e. more than 30 seconds), by how much should we extend the audio each time. Larger values will mean less context is preserved, and shorter value will require extra computations.
config.extend_stride = 18 # @param {type:"slider", min:1, max:30, step:1}

# @markdown ---
# @markdown ## Conditional Generation Configs

# @markdown The prompt for generating audio. You can give multiple prompts separated by `|` in the input. You can also leave it blank for unconditional generation.
config.prompts = "happy rock | energetic EDM | sad jazz" # @param {type:"string"}

# Split on `|` and drop segments that are empty after stripping, so inputs such
# as "a || b" or a trailing `|` do not turn into empty-string prompts.
descriptions = [
    prompt.strip() for prompt in config.prompts.split("|") if prompt.strip()
]
# No usable prompt left (blank input, or only separators/whitespace) means
# unconditional generation.
config.is_unconditional = len(descriptions) == 0

# @markdown Number of audio samples generated, this is relevant only for unconditional generation, i.e, if `config.prompts` is left blank.
config.num_samples = 4 # @param {type:"slider", min:1, max:10, step:1}

# @markdown Specify the random seed
seed = None # @param {type:"raw"}

# Normalise the seed: anything that is not an int (e.g. the default `None`) is
# replaced by a random draw, negative values are mirrored to positive, and the
# result is wrapped into the range [0, max_seed).
max_seed = int(1024 * 1024 * 1024)
if not isinstance(seed, int):
    seed = random.randint(1, max_seed)
seed = abs(seed) % max_seed
config.seed = seed

[34m[1mwandb[0m: Currently logged in as: [33mgeekyrakshit[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Load the pretrained MusicGen variant chosen in the config cell and apply the
# generation parameters stored on `config`.
model = MusicGen.get_pretrained(config.model_name)
model.set_generation_params(
    use_sampling=config.use_sampling,
    top_k=config.top_k,
    top_p=config.top_p,
    temperature=config.temperature,
    duration=config.duration,
    cfg_coef=config.cfg_coef,
    two_step_cfg=config.two_step_cfg,
    extend_stride=config.extend_stride,
)

# Apply the seed that the config cell computed and logged to W&B. Without this
# call the seed is only recorded and never influences sampling, so a logged
# run could not be reproduced from its seed.
torch.manual_seed(config.seed)

# Blank prompts -> unconditional samples; otherwise one waveform per description.
if config.is_unconditional:
    generated_wav = model.generate_unconditional(
        num_samples=config.num_samples, progress=True
    )
else:
    generated_wav = model.generate(descriptions, progress=True)



In [4]:
# Write every generated waveform to a temporary directory, log each one to W&B
# individually and as a row of a summary table, then close the run.
temp_dir = TemporaryDirectory()
wandb_table = wandb.Table(columns=["Prompt", "Audio", "Seed"])

# Using the TemporaryDirectory as a context manager removes the directory even
# if writing or logging raises part-way through the loop.
with temp_dir:
    for idx, wav in enumerate(generated_wav):
        # `audio_write` receives the path without extension; the file is read
        # back below from the same path with ".wav" appended.
        file_name = os.path.join(temp_dir.name, str(idx))
        audio_write(
            file_name,
            wav.cpu(),
            model.sample_rate,
            strategy="loudness",
            loudness_compressor=True,
        )
        wandb_audio = wandb.Audio(file_name + ".wav")
        wandb.log({"Generated-Audio": wandb_audio})

        # With several prompts each waveform is labelled by its own prompt;
        # otherwise (single prompt or unconditional) the raw prompt string is used.
        prompt_label = descriptions[idx] if len(descriptions) > 1 else config.prompts
        wandb_table.add_data(prompt_label, wandb_audio, config.seed)

    wandb.log({"Generated-Audio-Table": wandb_table})

    # Same order as before: finish the run while the audio files still exist,
    # and only then let the directory be removed.
    wandb.finish()

CLIPPING /tmp/tmpoxdv7uq6/0 happening with proba (a bit of clipping is okay): 0.0012854166561737657 maximum scale:  1.5367754697799683
CLIPPING /tmp/tmpoxdv7uq6/1 happening with proba (a bit of clipping is okay): 0.00140729162376374 maximum scale:  1.9805430173873901
CLIPPING /tmp/tmpoxdv7uq6/2 happening with proba (a bit of clipping is okay): 0.0027749999426305294 maximum scale:  1.5332883596420288
