# PEFT 库 LoRA 实战 - OpenAI Whisper-large-v2

In [1]:
import torch  # 导入 PyTorch 库

# 检查是否有可用的 GPU
if torch.cuda.is_available():
    print(f"GPU 可用，当前 GPU 数量: {torch.cuda.device_count()}")  # 输出可用 GPU 的数量
    print(f"当前使用的 GPU: {torch.cuda.get_device_name(0)}")      # 输出当前使用的 GPU 名称
else:
    print("未检测到可用的 GPU")  # 没有检测到可用的 GPU 时输出

GPU 可用，当前 GPU 数量: 1
当前使用的 GPU: NVIDIA GeForce RTX 3080


In [2]:
import torch
print(torch.__version__)

2.7.1+cu126


In [3]:
import torch
print(torch.backends.cudnn.enabled)  # True 表示 cuDNN 可用

True


In [4]:
model_name_or_path = "openai/whisper-large-v2"
model_dir = "models/whisper-large-v2-asr-int8-ja"

language = "Japanese"
language_abbr = "ja"    
task = "transcribe"
dataset_name = "mozilla-foundation/common_voice_13_0"

batch_size=8

In [5]:
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

In [6]:
from datasets import load_dataset, DatasetDict

common_voice = DatasetDict()

common_voice["train"] = load_dataset(
    "parquet",
    data_files="/mnt/e/aistudy_workspace/week02/ja_dataset/train/*.parquet",
    split="train"
)

common_voice["validation"] = load_dataset(
    "parquet",
    data_files="/mnt/e/aistudy_workspace/week02/ja_dataset/validation/*.parquet",
    split="train"  # 这里用 train 标记，后续通过 DatasetDict 区分
)

# 验证加载结果
print("训练集大小：", len(common_voice["train"]))
print("训练集第一条数据：", common_voice["train"][0])
print("验证集大小：", len(common_voice["validation"]))

训练集大小： 7071
训练集第一条数据： {'client_id': '397d4d8526a74d582e018bfa9adb687127eefc5a039c14c4a409cfbefa682e99e6466418aa155a6c1ebe1a811a29cf75fe5d9e9c56faff87bdcf6002b0bc8668', 'path': 'ja_train_0/common_voice_ja_35210899.mp3', 'audio': <datasets.features._torchcodec.AudioDecoder object at 0x7f8d36cbf590>, 'sentence': '山口県防府市', 'up_votes': 2, 'down_votes': 0, 'age': 'twenties', 'gender': 'other', 'accent': '', 'locale': 'ja', 'segment': '', 'variant': ''}
验证集大小： 4961


In [7]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())  # 检查是否支持 GPU


2.7.1+cu126
True


In [8]:
common_voice

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 7071
    })
    validation: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 4961
    })
})

# 预处理训练数据集

In [9]:
from transformers import AutoFeatureExtractor, AutoTokenizer, AutoProcessor

# 从预训练模型加载特征提取器
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)

# 从预训练模型加载分词器，可以指定语言和任务以获得最适合特定需求的分词器配置
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, language=language, task=task)

# 从预训练模型加载处理器，处理器通常结合了特征提取器和分词器，为特定任务提供一站式的数据预处理
processor = AutoProcessor.from_pretrained(model_name_or_path, language=language, task=task)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
common_voice = common_voice.remove_columns(
    ["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]
)

In [11]:
common_voice["train"][0]

{'audio': <datasets.features._torchcodec.AudioDecoder at 0x7f8e8013c950>,
 'sentence': '山口県防府市',
 'variant': ''}

## 降采样音频数据

In [12]:
from datasets import Audio

common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [13]:
# sampling_rate 从 48KHZ 降为 16KHZ
common_voice["train"][0]

{'audio': <datasets.features._torchcodec.AudioDecoder at 0x7f8d020b71d0>,
 'sentence': '山口県防府市',
 'variant': ''}

## 整合以上数据处理为一个函数

In [14]:
def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

## 数据抽样（演示需要）

In [15]:
small_common_voice = DatasetDict()

small_common_voice["train"] = common_voice["train"].shuffle(seed=16).select(range(100))
small_common_voice["validation"] = common_voice["validation"].shuffle(seed=16).select(range(50))

In [16]:
small_common_voice

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'variant'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['audio', 'sentence', 'variant'],
        num_rows: 50
    })
})

## 如果全量训练，则使用完整数据代替抽样

In [17]:
# 抽样数据处理
tokenized_common_voice = small_common_voice.map(prepare_dataset)

# 完整数据训练，尝试开启 `num_proc=16` 参数多进程并行处理（如阻塞无法运行，则不使用此参数）
# tokenized_common_voice = common_voice.map(prepare_dataset, num_proc=16)

In [18]:
tokenized_common_voice

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence', 'variant', 'input_features', 'labels'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['audio', 'sentence', 'variant', 'input_features', 'labels'],
        num_rows: 50
    })
})

# 自定义语音数据整理器

In [19]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

# 定义一个针对语音到文本任务的数据整理器类
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any  # 处理器结合了特征提取器和分词器

    # 整理器函数，将特征列表处理成一个批次
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # 从特征列表中提取输入特征，并填充以使它们具有相同的形状
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # 从特征列表中提取标签特征（文本令牌），并进行填充
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # 使用-100替换标签中的填充区域，-100通常用于在损失计算中忽略填充令牌
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # 如果批次中的所有序列都以句子开始令牌开头，则移除它
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        # 将处理过的标签添加到批次中
        batch["labels"] = labels

        return batch  # 返回最终的批次，准备好进行训练或评估

In [20]:
# 用给定的处理器实例化数据整理器
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# 模型准备
### 加载预训练模型（int8 精度）
### 使用 int8  精度加载预训练模型，进一步降低显存需求。

In [21]:
from transformers import AutoModelForSpeechSeq2Seq

model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_or_path, load_in_8bit=False, device_map="auto")

In [22]:
import torch
print(torch.__version__)

2.7.1+cu126


In [23]:
# 设置模型配置中的forced_decoder_ids属性为None
model.config.forced_decoder_ids = None  # 这通常用于指定在解码（生成文本）过程中必须使用的特定token的ID，设置为None表示没有这样的强制要求

# 设置模型配置中的suppress_tokens列表为空
model.config.suppress_tokens = []  # 这用于指定在生成过程中应被抑制（不生成）的token的列表，设置为空列表表示没有要抑制的token

# PEFT 微调前的模型处理

In [24]:
from peft import prepare_model_for_int8_training

model = prepare_model_for_int8_training(model)



# LoRA Adapter 配置

In [25]:
from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model

# 创建一个LoraConfig对象，用于设置LoRA（Low-Rank Adaptation）的配置参数
config = LoraConfig(
    r=4,  # LoRA的秩，影响LoRA矩阵的大小
    lora_alpha=64,  # LoRA适应的比例因子
    # 指定将LoRA应用到的模型模块，通常是attention和全连接层的投影。
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,  # 在LoRA模块中使用的dropout率
    bias="none",  # 设置bias的使用方式，这里没有使用bias
)

### 使用get_peft_model函数和给定的配置来获取一个PEFT模型

In [26]:
peft_model = get_peft_model(model, config)

In [27]:
# 打印 LoRA 微调训练的模型参数
peft_model.print_trainable_parameters()

trainable params: 1,966,080 || all params: 1,545,271,040 || trainable%: 0.12723204856023188


# 模型训练
## Seq2SeqTrainingArguments 训练参数
关于设置训练步数和评估步数

基于 epochs 设置：

    num_train_epochs=3,  # 训练的总轮数
    evaluation_strategy="epoch",  # 设置评估策略，这里是在每个epoch结束时进行评估
    warmup_steps=50,  # 在训练初期增加学习率的步数，有助于稳定训练
基于 steps 设置：

    max_steps=100, # 训练总步数
    evaluation_strategy="steps", 
    eval_steps=25, # 评估步数

In [28]:
from transformers import Seq2SeqTrainingArguments

# 设置序列到序列模型训练的参数
training_args = Seq2SeqTrainingArguments(
    output_dir=model_dir,  # 指定模型输出和保存的目录
    per_device_train_batch_size=batch_size,  # 每个设备上的训练批量大小
    learning_rate=1e-3,  # 学习率
    num_train_epochs=1,  # 训练的总轮数
    evaluation_strategy="epoch",  # 设置评估策略，这里是在每个epoch结束时进行评估
    # warmup_steps=50,  # 在训练初期增加学习率的步数，有助于稳定训练
    # fp16=True,  # 启用混合精度训练，可以提高训练速度，同时减少内存使用
    per_device_eval_batch_size=batch_size,  # 每个设备上的评估批量大小
    generation_max_length=32,  # 生成任务的最大长度
    logging_steps=10,  # 指定日志记录的步骤，用于跟踪训练进度
    remove_unused_columns=False,  # 是否删除不使用的列，以减少数据处理开销
    label_names=["labels"],  # 指定标签列的名称，用于训练过程中
    # evaluation_strategy="steps",
    # eval_steps=25,
    fp16=True,  
    fp16_opt_level="O1", 
)



## 实例化 Seq2SeqTrainer 训练器

In [29]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=peft_model,
    train_dataset=tokenized_common_voice["train"],
    eval_dataset=tokenized_common_voice["validation"],
    data_collator=data_collator,
    tokenizer=processor.feature_extractor, 
)
peft_model.config.use_cache = False

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [30]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.345,0.983045


TrainOutput(global_step=13, training_loss=2.0008517045241137, metrics={'train_runtime': 2308.7898, 'train_samples_per_second': 0.043, 'train_steps_per_second': 0.006, 'total_flos': 2.126002176e+17, 'train_loss': 2.0008517045241137, 'epoch': 1.0})

# 保存 LoRA 模型(Adapter)

In [32]:
trainer.save_model(model_dir)

In [33]:
peft_model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): WhisperForConditionalGeneration(
      (model): WhisperModel(
        (encoder): WhisperEncoder(
          (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
          (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
          (embed_positions): Embedding(1500, 1280)
          (layers): ModuleList(
            (0-31): 32 x WhisperEncoderLayer(
              (self_attn): WhisperSdpaAttention(
                (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=1280, out_features=1280, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1280, out_features=4, bias=False)
                  )
                  (lora_B): Mo

# 模型推理（可能需要重启 Notebook）

In [34]:
model_dir = "models/whisper-large-v2-asr-int8-fr"

language = "French"          
language_abbr = "fr"      
language_decode = "french"
task = "transcribe"

## 使用 PeftModel 加载 LoRA 微调后 Whisper 模型
### 使用 PeftConfig 加载 LoRA Adapter 配置参数，使用 PeftModel 加载微调后 Whisper 模型

In [35]:
from transformers import AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoProcessor
from peft import PeftConfig, PeftModel

peft_config = PeftConfig.from_pretrained(model_dir)

base_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=False, device_map="auto"
)

peft_model = PeftModel.from_pretrained(base_model, model_dir)



In [36]:
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = AutoProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### 使用 Pipeline API 部署微调后 Whisper 实现中文语音识别任务

In [37]:
test_audio = "ja_test.mp3"

In [38]:
from transformers import AutomaticSpeechRecognitionPipeline

pipeline = AutomaticSpeechRecognitionPipeline(model=peft_model, tokenizer=tokenizer, feature_extractor=feature_extractor)

forced_decoder_ids = processor.get_decoder_prompt_ids(language=language_decode, task=task)

In [39]:
import torch

with torch.cuda.amp.autocast():
    text = pipeline(test_audio, max_new_tokens=255)["text"]

  with torch.cuda.amp.autocast():
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


In [40]:
text

'2008時半後でロシアのカムチャツカ半島付近で起きたマグニチュード8.8の大きな地震。震源地に近い地域では大きな揺れが観測されました。ロイター通信はロシアの地元当局の話として幼稚園の建物に被害が出ていると伝えています。また、空港などで数人が軽傷を負ったということです。AP通信は2011年3月に起きた東日本大震災以来、世界最大規模と見られる。世界的にこれより強い地震はこれまで数回しか観測されていないと報じています。'

# 上面内容的中文为：
### 2008年中旬过后，俄罗斯堪察加半岛附近发生了一场里氏8.8级的大地震。震源地附近区域观测到了强烈的震动。路透社援引俄罗斯当地当局的消息称，幼儿园建筑出现了受损情况。此外，机场等地有几人受了轻伤。美联社报道称，这是自2011年3月东日本大地震以来，被认为是世界最大规模的地震。从全球范围来看，迄今为止观测到的比这场地震更强的地震仅有几次。

## Homework
### 使用完整的数据集训练，对比 Train Loss 和 Validation Loss 变化。训练完成后，使用测试集进行模型评估.
### [Optional]使用其他语种（如：德语、法语等）的数据集进行微调训练，并进行模型评估模型评估。

# 使用测试集进行完整的模型评估

In [1]:
language = "Japanese"          
language_abbr = "ja"      
language_decode = "japanese"
task = "transcribe"

In [2]:
from transformers import AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoProcessor
from peft import PeftConfig, PeftModel
import torch 

model_dir = "models/whisper-large-v2-asr-int8-fr"

peft_config = PeftConfig.from_pretrained(model_dir)

base_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    peft_config.base_model_name_or_path, load_in_8bit=False, device_map="auto"
)

peft_model = PeftModel.from_pretrained(base_model, model_dir)

In [3]:
# --------------- 加载并预处理测试集 ---------------
from datasets import load_dataset, Audio
import evaluate
import numpy as np

# 1. 加载测试集（使用Common Voice的测试集拆分）
common_voice_test = load_dataset(
    "parquet",
    data_files="/mnt/e/aistudy_workspace/week02/ja_dataset/test/*.parquet", 
    split="train"
)

# 随机打乱测试集并仅选择前5条样本
common_voice_test = common_voice_test.shuffle(seed=42).select(range(10))

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# 2. 预处理测试集（与训练集保持一致）
# 移除无关列
common_voice_test = common_voice_test.remove_columns(
    ["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"]
)

In [5]:
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = AutoProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# 音频降采样到16kHz
common_voice_test = common_voice_test.cast_column("audio", Audio(sampling_rate=16000))
# 应用数据预处理函数（与训练时相同）
def prepare_test_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    return batch
test_dataset = common_voice_test.map(prepare_test_dataset)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [7]:
# 替换原来的评估指标加载部分
from transformers import WhisperProcessor
import jiwer 
import torch

def quick_evaluate(model, test_dataset, processor, batch_size=2):
    """兼容所有jiwer版本的评估函数，不使用任何关键字参数"""
    all_predictions = []
    all_references = []
    
    # 批量处理样本
    for i in range(0, len(test_dataset), batch_size):
        batch = test_dataset[i:i+batch_size]
        inputs = {"input_features": batch["input_features"]}
        
        with torch.no_grad():
            generated_ids = model.generate(
                input_features=torch.tensor(inputs["input_features"]).to(model.device),
                max_new_tokens=255
            )
        
        # 解码预测结果和参考文本
        predictions = processor.batch_decode(generated_ids, skip_special_tokens=True)
        references = [sentence for sentence in batch["sentence"]]
        
        all_predictions.extend(predictions)
        all_references.extend(references)
        print(f"已处理 {min(i+batch_size, len(test_dataset))}/{len(test_dataset)} 条样本")
    
    # 关键修正：完全按位置传递参数（旧版jiwer仅支持这种方式）
    # 第一个参数：真实标签（all_references）
    # 第二个参数：预测结果（all_predictions）
    wer = jiwer.wer(all_references, all_predictions)
    cer = jiwer.cer(all_references, all_predictions)
    
    return {
        "wer": wer,
        "cer": cer,
        "predictions": all_predictions,
        "references": all_references
    }
    

In [8]:
# --------------- 执行评估并输出结果 ---------------
peft_model = peft_model.to("cuda" if torch.cuda.is_available() else "cpu")
peft_model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): WhisperForConditionalGeneration(
      (model): WhisperModel(
        (encoder): WhisperEncoder(
          (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
          (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
          (embed_positions): Embedding(1500, 1280)
          (layers): ModuleList(
            (0-31): 32 x WhisperEncoderLayer(
              (self_attn): WhisperSdpaAttention(
                (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=1280, out_features=1280, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1280, out_features=4, bias=False)
                  )
                  (lora_B): Mo

In [9]:
# 执行评估
evaluation_results = quick_evaluate(
    model=peft_model,
    test_dataset=test_dataset,
    processor=processor,
    batch_size=8
)

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


已处理 8/10 条样本
已处理 10/10 条样本


In [10]:
# 输出关键指标
print(f"\n测试集评估结果：")
print(f"词错误率（WER）：{evaluation_results['wer']:.4f}")  # 越低越好，0表示完全匹配
print(f"字符错误率（CER）：{evaluation_results['cer']:.4f}")  # 越低越好


测试集评估结果：
词错误率（WER）：2.6000
字符错误率（CER）：1.5376


In [11]:
# 随机打印5个样本的预测与真实结果对比
sample_indices = np.random.choice(len(evaluation_results["predictions"]), 5, replace=False)
print("\n随机样本对比：")
for idx in sample_indices:
    print(f"\n真实文本：{evaluation_results['references'][idx]}")
    print(f"预测文本：{evaluation_results['predictions'][idx]}")


随机样本对比：

真实文本：而して斯く我々が何処までも
预测文本：しかしてかく、われわれがどこまでも。

真实文本：部屋に箱が六つ置いてあります。
预测文本：へやにはこがむつおえてあります。

真实文本：今晩友達がうちに泊まります。
预测文本：こんばんトモタチがうちにとまります。

真实文本：本能による適応は直接的である。
预测文本：本能による適応は直接的である。

真实文本：そっと階段をのぼった。
预测文本：そっと階段を登った。
