### 加载模型

In [1]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

model_name = "facebook/musicgen-medium"  # 可选：small, medium, large
# 初次使用记得去掉local_files_only=True
processor = AutoProcessor.from_pretrained(model_name, local_files_only=True)
base_model = MusicgenForConditionalGeneration.from_pretrained(model_name, local_files_only=True).half().to(device)
# model.half()解决精度问题报错

from peft import PeftModel

# lora_path = "./outputs/musicgen-lora/checkpoint-1600"
lora_path = "./outputs/musicgen-lora/initial_lora"
model = PeftModel.from_pretrained(base_model, lora_path)

  from .autonotebook import tqdm as notebook_tqdm


device: cuda


### 生成

In [7]:
inputs = processor(
    text=[
        # "80s pop track with bassy drums and synth", 
        # "90s rock song with loud guitars and heavy drums", 
        "This music is an intense instrumental.The tempo is fast with vigorous violin harmony that slows down to the accompaniment of a grim Piano harmony. The music is a Ritardando and has a grim, dark, intense,serious,bleak, dreary, and dangerous vibe to it. The chord sequence is Em7b5/D, D, Dm. The beat counts to 2. The tempo of this song is 169.0 beats per minute. The key is D minor."],
    padding=True,
    return_tensors="pt",
).to(device)

inputs

{'input_ids': tensor([[  100,   723,    19,    46,  6258, 15205,     5,   634,     3, 13089,
            19,  1006,    28, 25681, 24325, 18362,    24,  2684,     7,   323,
            12,     8, 31662,    13,     3,     9, 20425, 25304, 18362,     5,
            37,   723,    19,     3,     9, 11671,   986,   232,    32,    11,
            65,     3,     9, 20425,     6,  2164,     6,  6258,     6,     7,
            49,  2936,     6,  2296,  1639,     6,     3,    26,    60,  1208,
             6,    11,  5107, 15269,    12,    34,     5,    37, 20513,  5932,
            19,  3967,   940,   115,   755,    87,   308,     6,   309,     6,
           309,    51,     5,    37,  3853, 12052,    12,  1682,    37,     3,
         13089,    13,    48,  2324,    19,     3, 27096,     5,   632,  3853,
             7,   399,  1962,     5,    37,   843,    19,   309,  4012,     5,
             1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [8]:
audio_values = model.generate(
    **inputs, 
    max_new_tokens=256, 
    )
audio_values

tensor([[[0.0152, 0.0171, 0.0202,  ..., 0.0160, 0.0176, 0.0170]]],
       device='cuda:0', dtype=torch.float16)

In [9]:
from IPython.display import Audio

sampling_rate = model.config.audio_encoder.sampling_rate
print("Sampling rate:", sampling_rate)

Sampling rate: 32000


### 试听

In [10]:
Audio(audio_values[0].cpu().numpy(), rate=sampling_rate)

In [11]:
Audio(audio_values[1].cpu().numpy(), rate=sampling_rate)

IndexError: index 1 is out of bounds for dimension 0 with size 1

In [None]:
Audio(audio_values[2].cpu().numpy(), rate=sampling_rate)

### 保存到文件

In [None]:
import scipy
import numpy as np

sampling_rate = model.config.audio_encoder.sampling_rate
print("Sampling rate:", sampling_rate)
for i in range(len(audio_values)):
    scipy.io.wavfile.write(f"outputs/musicgen_out_{i}.wav", rate=sampling_rate, data=np.asarray(audio_values[0, 0].cpu(), dtype=np.float32))