### 加载模型

In [1]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

model_name = "facebook/musicgen-medium"  # 可选：small, medium, large
# 初次使用记得去掉local_files_only=True
processor = AutoProcessor.from_pretrained(model_name, local_files_only=True)
base_model = MusicgenForConditionalGeneration.from_pretrained(model_name, local_files_only=True).half().to(device)
# model.half()解决精度问题报错

from peft import PeftModel

lora_path = "./outputs/musicgen-lora/checkpoint-1600"
# lora_path = "./outputs/musicgen-lora/initial_lora"
model = PeftModel.from_pretrained(base_model, lora_path)

  from .autonotebook import tqdm as notebook_tqdm


device: cuda


### 生成

In [2]:
inputs = processor(
    text=[
        # "80s pop track with bassy drums and synth", 
        # "90s rock song with loud guitars and heavy drums", 
        "This slow jazz song features male voices singing the main melody in harmony. This is accompanied by percussion playing a simple beat. The double bass plays the root notes of the chords. Trumpets play a fill in harmony in between lines. A piano plays an arpeggiated chord at the end of the first line. This song has a romantic mood. This song can be played in a classic romantic movie."],
    padding=True,
    return_tensors="pt",
).to(device)

inputs

{'input_ids': tensor([[  100,  2684,  9948,  2324,   753,  5069, 13256,  8782,     8,   711,
         27832,    16, 18362,     5,   100,    19,     3, 10102,    57,     3,
         19984,  1556,     3,     9,   650,  3853,     5,    37,  1486,  7981,
          4805,     8,  5465,  3358,    13,     8, 20513,     7,     5,  2523,
            15,    17,     7,   577,     3,     9,    14,    16, 18362,    16,
           344,  2356,     5,    71,  8355,  4805,    46,  1584,   855, 15406,
           920, 20513,    44,     8,   414,    13,     8,   166,   689,     5,
           100,  2324,    65,     3,     9,  7966,  6526,     5,   100,  2324,
            54,    36,  1944,    16,     3,     9,  2431,  7966,  1974,     5,
             1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [3]:
audio_values = model.generate(
    **inputs, 
    max_new_tokens=256, 
    )
audio_values



tensor([[[0.0118, 0.0123, 0.0162,  ..., 0.0257, 0.0273, 0.0271]]],
       device='cuda:0', dtype=torch.float16)

In [4]:
from IPython.display import Audio

sampling_rate = model.config.audio_encoder.sampling_rate
print("Sampling rate:", sampling_rate)

Sampling rate: 32000


### 试听

In [5]:
Audio(audio_values[0].cpu().numpy(), rate=sampling_rate)

In [6]:
Audio(audio_values[1].cpu().numpy(), rate=sampling_rate)

IndexError: index 1 is out of bounds for dimension 0 with size 1

In [None]:
Audio(audio_values[2].cpu().numpy(), rate=sampling_rate)

### 保存到文件

In [None]:
import scipy
import numpy as np

sampling_rate = model.config.audio_encoder.sampling_rate
print("Sampling rate:", sampling_rate)
for i in range(len(audio_values)):
    scipy.io.wavfile.write(f"outputs/musicgen_out_{i}.wav", rate=sampling_rate, data=np.asarray(audio_values[0, 0].cpu(), dtype=np.float32))