### 加载模型

In [1]:
import torch
from peft import PeftModel
from transformers import AutoProcessor, MusicgenForConditionalGeneration

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

# Base checkpoint to load. Available sizes: small, medium, large.
model_name = "facebook/musicgen-medium"
# Directory holding the LoRA adapter that is applied on top of the base model.
lora_path = "./outputs/musicgen-lora/initial_lora"

# On first use, remember to remove local_files_only=True from both calls below
# (presumably so the checkpoint files can be fetched before being reused locally).
processor = AutoProcessor.from_pretrained(model_name, local_files_only=True)
base_model = MusicgenForConditionalGeneration.from_pretrained(
    model_name,
    local_files_only=True,
)

# Casting to half precision works around a precision-related error.
# NOTE(review): the cast is applied on CPU as well — confirm fp16 is acceptable there.
base_model = base_model.half()
base_model = base_model.to(device)

# Wrap the base model with the LoRA adapter weights stored at lora_path.
model = PeftModel.from_pretrained(base_model, lora_path)

  from .autonotebook import tqdm as notebook_tqdm


device: cuda




### 生成

In [2]:
# One text prompt per clip to generate.
prompts = [
    "80s pop track with bassy drums and synth",
    "90s rock song with loud guitars and heavy drums",
    "a football song for world cup",
]

# Tokenize the prompts as one padded batch of PyTorch tensors; shorter prompts
# are padded and the attention mask marks the padded positions.
inputs = processor(text=prompts, padding=True, return_tensors="pt")
# Move the batch onto the same device as the model.
inputs = inputs.to(device)

inputs

{'input_ids': tensor([[ 2775,     7,  2783,  1463,    28,  7981,    63,  5253,     7,    11,
         13353,     1,     0],
        [ 2777,     7,  2480,  2324,    28,  8002,  5507,     7,    11,  2437,
          5253,     7,     1],
        [    3,     9,  3370,  2324,    21,   296,  4119,     1,     0,     0,
             0,     0,     0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]], device='cuda:0')}

In [4]:
# Upper bound on the number of new tokens generated for each prompt.
max_new_tokens = 512

# Generate one waveform per prompt. The recorded output shows a float16 tensor
# on the GPU with one row per prompt and a singleton middle axis.
audio_values = model.generate(**inputs, max_new_tokens=max_new_tokens)
audio_values

tensor([[[-0.1700, -0.1838, -0.1541,  ...,  0.0773,  0.0613,  0.0759]],

        [[ 0.0229,  0.0168,  0.0433,  ...,  0.0583,  0.0578,  0.0651]],

        [[ 0.1501,  0.1494,  0.1805,  ...,  0.0798,  0.0764,  0.0793]]],
       device='cuda:0', dtype=torch.float16)

In [5]:
from IPython.display import Audio

# Read the playback sample rate from the model's audio-encoder configuration.
audio_encoder_config = model.config.audio_encoder
sampling_rate = audio_encoder_config.sampling_rate
print(f"Sampling rate: {sampling_rate}")

Sampling rate: 32000


### 试听

In [6]:
# Play the clip generated for the first prompt (batch index 0).
first_clip = audio_values[0].cpu().numpy()
Audio(first_clip, rate=sampling_rate)

In [7]:
# Play the clip generated for the second prompt (batch index 1).
second_clip = audio_values[1].cpu().numpy()
Audio(second_clip, rate=sampling_rate)

In [8]:
# Play the clip generated for the third prompt (batch index 2).
third_clip = audio_values[2].cpu().numpy()
Audio(third_clip, rate=sampling_rate)

### 保存到文件

In [9]:
import os

import numpy as np
import scipy
from scipy.io import wavfile  # import the submodule explicitly instead of relying on `import scipy`

# Directory that receives the generated wav files; create it if it is missing
# so the write below cannot fail on a fresh checkout.
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)

sampling_rate = model.config.audio_encoder.sampling_rate
print("Sampling rate:", sampling_rate)

# Write one wav file per generated clip.
for i, clip in enumerate(audio_values):
    # Use the i-th clip. The previous code always wrote audio_values[0, 0],
    # so every output file contained the audio for the first prompt.
    # clip[0] selects the single row of the clip; the float16 samples are
    # converted to float32 before being written.
    waveform = np.asarray(clip[0].cpu(), dtype=np.float32)
    wavfile.write(f"{output_dir}/musicgen_out_{i}.wav", rate=sampling_rate, data=waveform)

Sampling rate: 32000


: 