# Import

In [None]:
from audiocraft.models import musicgen
from audiocraft.utils.notebook import display_audio
import torch
import torchaudio
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import glob, os, sys
import pandas as pd
import numpy as np
from IPython.display import Audio

Note: according to https://huggingface.co/facebook/musicgen-style, the model weights are licensed for non-commercial use only:
"License: Code is released under MIT, model weights are released under CC-BY-NC 4.0."


MusicGen-Style usage documentation: https://github.com/facebookresearch/audiocraft/blob/main/docs/MUSICGEN_STYLE.md

In [None]:
# Load the pretrained style-conditioned MusicGen checkpoint by name.
STYLE_CHECKPOINT = 'facebook/musicgen-style'
model = MusicGen.get_pretrained(STYLE_CHECKPOINT)

In [None]:
# Sampling configuration passed to the generator.
generation_params = {
    # Length of each generated clip in seconds; can go up to 30.
    "duration": 30,
    "use_sampling": True,
    "top_k": 250,
    # Classifier Free Guidance coefficient.
    "cfg_coef": 3.0,
    # Beta in the double CFG formula, between 1 and 9. Double CFG is necessary
    # for text-and-style conditioning. When set to 1 it is equivalent to
    # normal CFG; increasing it pushes the text condition. See the bottom of
    # https://musicgenstyle.github.io/ to better understand the effects of
    # the double CFG coefficients.
    "cfg_coef_beta": 6,
}
model.set_generation_params(**generation_params)

# Configuration of the audio (style) conditioner.
style_params = {
    # Integer between 1 and 6: the level of quantization that passes through
    # the conditioner. When low, the model adheres less to the audio
    # conditioning.
    "eval_q": 1,
    # Length in seconds that the model takes from the provided excerpt. Can be
    # between 1.5 and 4.5 seconds (3.0 was also tried), but must not exceed
    # the length of the provided conditioning audio.
    "excerpt_length": 4.5,
}
model.set_style_conditioner_params(**style_params)

In [None]:
# Directory holding the processed source recordings.
ASH_AUDIO_PATH = "/mnt/disks/audio-ai-research-speech-data/avi_ash_PROCESSED"

# Recording used as the conditioning audio.
# ("15_EDOM__005.wav" was tried as well, but may be too chaotic.)
ASH_TEST_AUDIO_PATH = os.path.join(
    ASH_AUDIO_PATH,
    "12_Always On The Run__003.wav",
)

# Every entry found directly inside the recordings directory.
all_ash_paths = glob.glob(os.path.join(ASH_AUDIO_PATH, "*"))

# Waveform and its sample rate, as returned by torchaudio.
melody, sr = torchaudio.load(ASH_TEST_AUDIO_PATH)

In [None]:
# ?model.generate_with_chroma
# Window of the loaded recording, in seconds, kept as the conditioning excerpt.
START_TIME = 37
END_TIME = 43
# Convert the window from seconds to sample indices.
start_index, end_index = START_TIME * sr, END_TIME * sr
print(melody.shape)
_excerpt = melody[:, start_index:end_index]
# Slicing past the end of a tensor silently yields an empty result. That
# happens when this cell is re-run (``melody`` has already been replaced by
# its own excerpt) or when the window lies beyond the end of the recording, so
# fail here with a clear message instead of deep inside generation.
if _excerpt.shape[-1] == 0:
    raise ValueError(
        f"Excerpt [{START_TIME}s, {END_TIME}s) is empty for audio with "
        f"{melody.shape[-1]} samples at {sr} Hz; reload the audio before "
        "re-running this cell or choose a window inside the recording."
    )
melody = _excerpt

In [None]:
# One text prompt per generated sample; the same audio excerpt conditions all
# of them.
descriptions = [
    'Instrumental, lofi beats',
    'Instrumental, classical, piano, instrumental, violin',
    "Instrumental, edm, synth, bass, euphoric, energetic",
    # 'Instrumental'
    # "hardcore metal riffs",
]
wav = model.generate_with_chroma(
    descriptions=descriptions,
    # Add a leading batch dimension and broadcast the excerpt so there is one
    # conditioning waveform per description.
    melody_wavs=melody[None].expand(len(descriptions), -1, -1),
    melody_sample_rate=sr,
    progress=True,
    return_tokens=False,
)  # generates len(descriptions) samples.

for idx, one_wav in enumerate(wav):
    # Will save under {idx}.wav, with loudness normalization at -14 db LUFS.
    # audio_write appends the format extension itself, so only the stem is
    # passed; including ".wav" here would produce "{idx}.wav.wav".
    stem = os.path.join('./musicgen_style', str(idx))
    audio_write(stem, one_wav.cpu(), model.sample_rate, strategy="loudness", loudness_compressor=True)

In [None]:
# Shape of the conditioning batch: the excerpt with a leading batch dimension,
# broadcast to one entry per description.
melody.unsqueeze(0).expand(len(descriptions), -1, -1).shape

In [None]:
# display(Audio(ASH_TEST_AUDIO_PATH))
# Play the conditioning excerpt first, then each generated clip next to the
# prompt that produced it.
display(Audio(data=melody, rate=sr))
for _idx, _description in enumerate(descriptions):
    # Clips are written under their description index, so look each one up by
    # that index. Zipping against glob's listing is unreliable because its
    # order depends on the file system.
    _matches = glob.glob(os.path.join('./musicgen_style', f'{_idx}.*'))
    print('===== ===== ===== ===== =====')
    print(_description)
    if not _matches:
        print(f'No generated audio found for index {_idx}')
        continue
    # If several files share the index (e.g. left over from an earlier run),
    # play the most recently written one.
    _audio_fp = max(_matches, key=os.path.getmtime)
    print(_audio_fp)
    display(Audio(_audio_fp))