# Demonstration of how to use MusicGen

In [None]:
# Restrict this process to GPU 0. The variable is set before the cells below
# import torch and move the model to CUDA.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# load model
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import torch
# Checkpoint name shared by the processor and the model, so the two can never
# end up pointing at different checkpoints.
# Use "facebook/musicgen-small" if GPU memory is limited.
checkpoint = "facebook/musicgen-medium"
processor = AutoProcessor.from_pretrained(checkpoint)
model = MusicgenForConditionalGeneration.from_pretrained(checkpoint)
# Move the model to the GPU; the inputs built in later cells are sent to "cuda" as well.
model = model.cuda()

## Example Usage

In [None]:
import json

# Two example prompts, tokenized together as a single batch.
text = ["country guitar"]

# From the Stanford captioning dataset (wording kept as-is). A caption like
# this, used on its own, is very likely to produce terrible music.
# Adjacent string literals are concatenated by the parser, so none of the
# source indentation leaks into the prompt (a backslash continuation inside
# one literal would embed the leading spaces of every continued line).
text.append(
    "A large building with bars on the windows in front of it. "
    "There is people walking in front of the building. "
    "There is a street in front of the building with many cars on it."
)

# padding=True pads the prompts to a common length so they can be batched.
inputs = processor(
    text=text,
    padding=True,
    return_tensors="pt",
).to("cuda")


In [None]:
# max_new_tokens = 256 / 512 / 1024 gives roughly 5 / 10 / 20 seconds of audio.
audio_values = model.generate(
    **inputs,
    do_sample=True,
    guidance_scale=3,
    max_new_tokens=256,
)

You can either play the generated audio directly in the notebook or save it to a file.

In [None]:
import IPython
# Sampling rate reported by the model's audio encoder config; passed to the player below.
sampling_rate = model.config.audio_encoder.sampling_rate

# Clip generated for the first prompt. The cell ends on the Audio(...)
# expression so the notebook renders the player.
first_clip = audio_values[0].cpu().numpy()
IPython.display.Audio(first_clip, rate=sampling_rate)



In [None]:
# Clip generated for the second prompt; the trailing expression renders the player.
second_clip = audio_values[1].cpu().numpy()
IPython.display.Audio(second_clip, rate=sampling_rate)

## Generate on dataset

In [None]:
import json
# Path to the JSON file generated by InstructBLIP; change it to point at your own file.
file_path = '../../InstructBLIP_demo.json'

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which would mis-read non-ASCII captions on some systems.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Parallel lists: text_list[k] is the prompt for the entry whose id is id_list[k].
text_list = []
id_list = []

for item in data:
    # Prompt layout: comma-separated tags, a period, then the free-form paragraph.
    text_string = f"{item['time']}, {item['music-era']}, {item['emotion']}, {item['music-style']}. {item['paragraph']}"
    print(text_string)
    text_list.append(text_string)
    id_list.append(item['url'])
    # Demo only: stop after the first entry. Remove this `break` to build
    # prompts for the whole dataset.
    break


In [None]:
from tqdm import tqdm
n = len(text_list)
batch_size = 2  # Set your desired batch size
# One generated tensor per prompt, in the same order as text_list / id_list.
audio_values = []

for i in tqdm(range(0, n, batch_size)):
    # The last batch may be shorter than batch_size; slicing handles that.
    batch_texts = text_list[i:i+batch_size]
    inputs = processor(
        text=batch_texts,
        padding=True,
        return_tensors="pt",
    ).to("cuda")

    batch_audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=512)
    # Move the results off the GPU before storing them; otherwise every
    # generated clip stays in GPU memory for the whole run.
    # Extending with a tensor appends one entry per item along its first dimension.
    audio_values.extend(batch_audio_values.cpu())


In [None]:
# Save the audio files
import os
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.io.wavfile` is loaded on every SciPy version.
import scipy.io.wavfile

output_dir = "./audio_output"
os.makedirs(output_dir, exist_ok=True)
sampling_rate = model.config.audio_encoder.sampling_rate

# NOTE(review): the ids come from item['url']; if they can contain path
# separators, the write below will fail because the parent directory does not
# exist -- confirm the id format or sanitize it.
for clip_id, clip in zip(id_list, audio_values):
    wav_path = os.path.join(output_dir, f"{clip_id}.wav")
    # clip[0] takes the first entry along the leading axis
    # (presumably the channel axis -- TODO confirm).
    scipy.io.wavfile.write(wav_path, rate=sampling_rate, data=clip[0].cpu().numpy())