### 加载模型

In [1]:
from transformers import AutoTokenizer, MusicgenForConditionalGeneration, AutoProcessor, EncodecModel
import torch

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("device:", device)

# Checkpoint that will be fine-tuned (options: small, medium, large).
model_name = "facebook/musicgen-medium"

# On first use, remember to remove local_files_only=True
# (presumably so the checkpoint can be downloaded first -- confirm).
tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
processor = AutoProcessor.from_pretrained(model_name, local_files_only=True)
model = MusicgenForConditionalGeneration.from_pretrained(model_name, local_files_only=True)
# Cast to half precision, then move to the device. Per the original author's
# note, the .half() call works around a precision-related error.
model = model.half().to(device)

# Copy the start token id from the generation config onto the decoder config.
model.config.decoder.decoder_start_token_id = model.generation_config.decoder_start_token_id
print("decoder_start_token_id:", model.config.decoder.decoder_start_token_id)
print(model.config)

# Second model: used to turn audio into tokens. Put it in eval mode on the device.
encodec_model = EncodecModel.from_pretrained("facebook/encodec_32khz", local_files_only=True)
encodec_model = encodec_model.to(device).eval()
print(encodec_model.config)

model

  from .autonotebook import tqdm as notebook_tqdm


device: cuda
decoder_start_token_id: 2048
MusicgenConfig {
  "architectures": [
    "MusicgenForConditionalGeneration"
  ],
  "audio_encoder": {
    "_name_or_path": "facebook/encodec_32khz",
    "architectures": [
      "EncodecModel"
    ],
    "audio_channels": 1,
    "chunk_length_s": null,
    "codebook_dim": 128,
    "codebook_size": 2048,
    "compress": 2,
    "dilation_growth_rate": 2,
    "hidden_size": 128,
    "kernel_size": 7,
    "last_kernel_size": 7,
    "model_type": "encodec",
    "norm_type": "weight_norm",
    "normalize": false,
    "num_filters": 64,
    "num_lstm_layers": 2,
    "num_residual_layers": 1,
    "overlap": null,
    "pad_mode": "reflect",
    "residual_kernel_size": 3,
    "sampling_rate": 32000,
    "target_bandwidths": [
      2.2
    ],
    "torch_dtype": "float32",
    "trim_right_ratio": 1.0,
    "upsampling_ratios": [
      8,
      5,
      4,
      4
    ],
    "use_causal_conv": false,
    "use_conv_shortcut": false
  },
  "decoder": {
    "

MusicgenForConditionalGeneration(
  (text_encoder): T5EncoderModel(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): L

### 加载数据集

In [2]:
import librosa
from datasets import Dataset
import os
import numpy as np

def process_data(batch):
    """Turn one dataset example into text token ids plus audio-code labels.

    Reads ``batch["location"]`` (audio path, joined onto ``./data/datashare/``)
    and ``batch["main_caption"]`` (the text prompt).

    Returns a dict with:
      * ``input_ids`` / ``attention_mask``: tokenizer output with the leading
        axis squeezed away,
      * ``labels``: the EnCodec codes for the audio clip.
    """
    audio_path = os.path.join("./data/datashare/", batch["location"])
    # Load the clip with sr=32000; the returned sample rate is not needed.
    waveform, _ = librosa.load(audio_path, sr=32000)

    # Encode the waveform into codebook tokens without tracking gradients.
    # The two unsqueeze(0) calls prepend two leading axes to the waveform.
    with torch.no_grad():
        wav_tensor = torch.tensor(waveform, device=device)
        wav_tensor = wav_tensor.unsqueeze(0).unsqueeze(0)
        encoded_frames = encodec_model.encode(wav_tensor)

    # Per the original author's note, each entry of audio_codes is
    # [batch, num_codebooks, frames]; after dropping the two leading axes and
    # transposing, the result is [frames, num_codebooks] -- TODO confirm.
    stacked_codes = torch.stack([codes.cpu() for codes in encoded_frames.audio_codes])
    stacked_codes = stacked_codes.squeeze(0).squeeze(0)
    audio_tokens = stacked_codes.transpose(0, 1)

    # Tokenize the caption; squeeze(dim=0) removes the leading axis that
    # return_tensors="pt" adds.
    text_encoding = tokenizer(batch["main_caption"], return_tensors="pt")
    example = {
        key: text_encoding[key].squeeze(dim=0)
        for key in ("input_ids", "attention_mask")
    }
    example["labels"] = audio_tokens
    return example

def my_collator(batch):
    """Pad a list of processed examples into a single training batch.

    Args:
        batch: list of dicts, each holding ``"input_ids"`` (1-D sequence of
            text token ids) and ``"labels"`` (2-D sequence of audio codes,
            first axis = time). Values may be lists or tensors.

    Returns:
        dict with
          * ``input_ids``: text ids right-padded with 0 to the longest example,
          * ``attention_mask``: boolean mask, True where ``input_ids != 0``,
          * ``labels``: audio codes right-padded along the first axis with
            -100 so that padded frames are ignored by the loss.
    """
    input_ids = torch.nn.utils.rnn.pad_sequence(
        [torch.as_tensor(example["input_ids"]) for example in batch],
        batch_first=True,
    )
    # pad_sequence pads with 0 by default, so non-zero positions are real
    # tokens. NOTE(review): assumes id 0 never occurs as a real text token.
    mask = (input_ids != 0).bool()

    # Pad with -100 (the ignore index) instead of the default 0: 0 is a valid
    # codebook index, so zero-padding would make the model learn to predict
    # code 0 on every padded frame of shorter clips.
    labels = torch.nn.utils.rnn.pad_sequence(
        [torch.as_tensor(example["labels"]) for example in batch],
        batch_first=True,
        padding_value=-100,
    )
    return {
        "input_ids": input_ids,
        "attention_mask": mask,
        "labels": labels
    }

In [3]:
from datasets import load_dataset
# Load the "amaai-lab/MusicBench" dataset via datasets.load_dataset.
dataset = load_dataset("amaai-lab/MusicBench")

# Run process_data over the dataset one example at a time (batched=False).
# This re-binds `dataset`, so the un-mapped version is no longer reachable
# under this name afterwards.
dataset = dataset.map(process_data, batched=False)
# Disabled alternatives that would build a custom split with train_test_split:
# dataset = dataset["train"].train_test_split(test_size=0.2)
# dataset = dataset.train_test_split(0.2)

Map:   9%|▉         | 4876/52768 [03:58<38:16, 20.85 examples/s]  

### 设置 LoRA 参数

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA configuration.
lora_config = LoraConfig(
    r=8,                  # LoRA rank
    lora_alpha=32,        # scaling factor
    target_modules=["q_proj", "v_proj"],  # modules to adapt (MusicGen attention layers, per the original note)
    lora_dropout=0.05,    # dropout rate
    bias="none",          # do not train biases
    task_type="CAUSAL_LM", # causal language-model task
)

# Apply LoRA.
# When this cell is re-run in the same kernel, first undo what the previous run
# left behind so the old lora_model does not interfere with the new one.
try:
    del model.peft_config
    lora_model.unload()
    print("Old lora_model deleted.")
except (AttributeError, NameError):
    # Expected on the first run: `model` has no peft_config attribute yet
    # (AttributeError) and/or `lora_model` is not defined (NameError).
    # Any other exception is a real problem and is allowed to propagate
    # instead of being silently swallowed by a bare `except:`.
    print("Initializing lora_model...")
lora_model = get_peft_model(model, lora_config)
lora_model.print_trainable_parameters()  # trainable count should be far smaller than the full model
lora_model.save_pretrained("./outputs/musicgen-lora/initial_lora")

### 设置训练参数

In [None]:

from transformers import TrainingArguments, Trainer

# Training hyper-parameters; checkpoints and logs go under output_dir.
training_args = TrainingArguments(
    # where to write
    output_dir="./outputs/musicgen-lora",
    # optimisation
    learning_rate=1e-4,  # author's note: LoRA needs a higher learning rate
    num_train_epochs=5,
    per_device_train_batch_size=2,
    fp16=True,           # mixed-precision training
    # bookkeeping cadence (in steps)
    logging_steps=100,
    save_steps=500,
)

# Wire the LoRA-wrapped model, the processed splits and the custom collator
# into a Trainer.
trainer = Trainer(
    model=lora_model,
    args=training_args,
    data_collator=my_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

In [None]:
# Display the first example of the "train" split as a sanity check.
dataset["train"][0]

### 训练

In [None]:
# Start training with the Trainer configured in the training-arguments cell.
trainer.train()