In [1]:
import torch
# torch.cuda.empty_cache() only hands cached-but-unused blocks back to the
# driver; it never frees live tensors. Allocated memory therefore cannot change
# across the call, so reserved memory is reported next to it, and both readings
# are labelled so the "after" numbers are not mistaken for the "before" ones.
print(
    "Before clearing cache:\n"
    f"allocated {torch.cuda.memory_allocated() / (1024 ** 3):.2f} GB, "
    f"reserved {torch.cuda.memory_reserved() / (1024 ** 3):.2f} GB"
)
torch.cuda.empty_cache()
print(
    "After clearing cache:\n"
    f"allocated {torch.cuda.memory_allocated() / (1024 ** 3):.2f} GB, "
    f"reserved {torch.cuda.memory_reserved() / (1024 ** 3):.2f} GB\n"
)

Before clearing cache:
0.00 GB
0.00 GB



In [3]:
# Local Whisper checkpoint handed to the feature-extractor / tokenizer /
# processor loaders further down.
model_name_or_path = "/mnt/data/whisper-large-v2"

# Language and task forwarded to the tokenizer and processor.
language, language_abbr = "Chinese (China)", "zh-CN"
task = "transcribe"

# Hub dataset id, kept for reference only; the splits are read from local folders.
# dataset_name = "mozilla-foundation/common_voice_11_0"

# Per-device batch size used for both training and evaluation arguments.
batch_size = 32

In [5]:
from datasets import load_dataset, DatasetDict
import pandas as pd
from tqdm import tqdm

def load_common_voice_11_0_dataset(audiofolder, metafile, split):
    """Load one split from a local audio folder and attach its transcripts.

    Args:
        audiofolder: Directory passed to the ``audiofolder`` loader as ``data_dir``.
        metafile: Tab-separated transcript file with ``path`` and ``sentence``
            columns; ``path`` holds the bare audio file name.
        split: Split name forwarded to ``load_dataset``.

    Returns:
        The loaded dataset with an extra ``sentence`` column, aligned row by row
        with the audio files.

    Raises:
        KeyError: If an audio file has no matching ``path`` row in ``metafile``.
    """
    dataset = load_dataset("audiofolder", data_dir=audiofolder, split=split)
    meta = pd.read_table(metafile)

    # Build the file-name -> sentence lookup once instead of filtering the whole
    # frame for every audio row. keep="first" mirrors the previous behaviour of
    # taking the first matching transcript when a file name is listed twice.
    unique_meta = meta.drop_duplicates(subset="path", keep="first")
    sentence_by_file = dict(zip(unique_meta["path"], unique_meta["sentence"]))

    sentences = []
    for i in tqdm(range(len(dataset))):
        # The dataset stores a full path; the transcript file lists bare names.
        file_name = dataset[i]["audio"]["path"].split("/")[-1]
        try:
            sentences.append(sentence_by_file[file_name])
        except KeyError:
            raise KeyError(
                f"no transcript for audio file {file_name!r} in {metafile}"
            ) from None

    return dataset.add_column("sentence", sentences)

# Build both splits from the local audio folders and their transcript files.
# "test" is loaded before "train", so the progress bars appear in that order.
common_voice = DatasetDict(
    {
        split: load_common_voice_11_0_dataset(audio_dir, transcript_tsv, split)
        for split, audio_dir, transcript_tsv in (
            (
                "test",
                "/mnt/workspace/common_voice_11_0/audio/zh-CN/test",
                "/mnt/workspace/common_voice_11_0/transcript/zh-CN/test.tsv",
            ),
            (
                "train",
                "/mnt/workspace/common_voice_11_0/audio/zh-CN/train",
                "/mnt/workspace/common_voice_11_0/transcript/zh-CN/train.tsv",
            ),
        )
    }
)

# Hub-based alternative, kept for reference:
# common_voice["train"] = load_dataset(dataset_name, language_abbr, split="train+validation")
# common_voice["test"] = load_dataset(dataset_name, language_abbr, split="test")

100%|██████████| 10581/10581 [01:16<00:00, 138.63it/s]
100%|██████████| 29056/29056 [03:31<00:00, 137.09it/s]


In [6]:
from transformers import AutoFeatureExtractor, AutoTokenizer, AutoProcessor

# All three components come from the same checkpoint; the tokenizer and the
# processor additionally receive the configured language and task.
processor = AutoProcessor.from_pretrained(model_name_or_path, language=language, task=task)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, language=language, task=task)
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)

In [5]:
# Show the column names of every split.
common_voice.column_names

{'test': ['audio', 'sentence'], 'train': ['audio', 'sentence']}

In [6]:
# Inspect the first training example (audio entry plus its sentence).
common_voice["train"][0]

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/616afb8263d97d1bc72ed4c5cfd3814f88761dc484aac6018582fec4883786e2/zh-CN_train_0/common_voice_zh-CN_18531536.mp3',
  'array': array([ 0.00000000e+00,  9.86623760e-13,  1.28757139e-15, ...,
          2.35939888e-06, -9.32492367e-06, -6.35876040e-06]),
  'sampling_rate': 48000},
 'sentence': '汉元鼎六年，武帝平定南越国，南越之地重新划郡，番禺仍为南海郡治。'}

In [7]:
from datasets import Audio

# Cast the "audio" column of every split to an Audio feature with a 16 kHz
# sampling rate.
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [8]:
def prepare_dataset(batch):
    """Convert one example into model inputs.

    Stores under ``input_features`` the first entry the feature extractor
    returns for the example's audio array (called with the example's own
    sampling rate), and under ``labels`` the token ids of the ``sentence``
    text. The same ``batch`` mapping is updated in place and returned.
    """
    clip = batch["audio"]

    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

In [9]:
# Apply prepare_dataset to every split and drop the columns listed for the
# "train" split, leaving only what prepare_dataset adds.
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"])

In [10]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate prepared examples into one padded batch.

    Audio features and label ids are padded separately, through the
    processor's feature extractor and tokenizer respectively. Label positions
    whose attention mask is not 1 are replaced by -100. If every label row
    starts with the tokenizer's ``bos_token_id``, that first column is removed.
    """

    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        tokenizer = self.processor.tokenizer

        audio_inputs = [{"input_features": item["input_features"]} for item in features]
        token_inputs = [{"input_ids": item["labels"]} for item in features]

        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")
        padded_labels = tokenizer.pad(token_inputs, return_tensors="pt")

        # Padding positions become -100.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Strip the leading column only when *every* row starts with BOS.
        starts_with_bos = labels[:, 0] == tokenizer.bos_token_id
        if bool(starts_with_bos.all()):
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Collator instance shared by the trainer and the evaluation DataLoader.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [11]:
from transformers import AutoModelForSpeechSeq2Seq

# Load the speech seq2seq model in 8-bit with automatic device placement.
# NOTE(review): this loads from the checkpoint-500 directory rather than
# `model_name_or_path` — confirm that starting from that checkpoint is intended.
model = AutoModelForSpeechSeq2Seq.from_pretrained("/mnt/workspace/model2/whisper-large-v2-asr-int8/checkpoint-500", load_in_8bit=True, device_map="auto")

In [12]:
# Clear the decoder prompt and token-suppression settings stored on the config.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

# `prepare_model_for_int8_training` is deprecated and no longer shipped by
# recent PEFT releases; `prepare_model_for_kbit_training` is its replacement.
# Prefer the new helper and fall back to the old one on older installations so
# this cell imports cleanly on both.
try:
    from peft import prepare_model_for_kbit_training as _prepare_quantized_model
except ImportError:  # older PEFT: only the int8-specific helper exists
    from peft import prepare_model_for_int8_training as _prepare_quantized_model

# NOTE(review): the upstream PEFT Whisper example also registers a forward hook
# on the encoder's first conv layer so its outputs require grad — confirm
# whether that is needed with the installed versions.
model = _prepare_quantized_model(model)



In [13]:
from peft import LoraConfig, PeftModel, LoraModel, LoraConfig, get_peft_model

# LoRA settings: adapters are attached to the modules named q_proj and v_proj.
config = LoraConfig(
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
)

In [14]:
# Wrap the model with the LoRA configuration, then print how many parameters
# are trainable.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 3,932,160 || all params: 1,547,237,120 || trainable%: 0.25414074863974306


In [15]:
from transformers import Seq2SeqTrainingArguments

# Arguments for the sequence-to-sequence trainer.
training_args = Seq2SeqTrainingArguments(
    # Output location and checkpoint retention.
    output_dir="model2/whisper-large-v2-asr-int8",  # where outputs and checkpoints are written
    save_total_limit=5,
    # Batching.
    per_device_train_batch_size=batch_size,  # training batch size on each device
    per_device_eval_batch_size=batch_size,  # evaluation batch size on each device
    gradient_accumulation_steps=1,  # update steps accumulated before each optimizer step
    # Optimisation schedule.
    learning_rate=1e-3,
    warmup_steps=50,  # steps over which the learning rate is ramped up at the start
    num_train_epochs=3,  # total number of training epochs
    # max_steps=100,  # alternative: total number of training steps
    fp16=False,  # mixed-precision (fp16) training is switched off here
    # Evaluation and logging.
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    generation_max_length=128,  # maximum length for generation
    logging_steps=25,  # log every 25 steps to track progress
    # Data handling.
    remove_unused_columns=False,  # do not drop dataset columns
    label_names=["labels"],  # name of the label field used during training
)

In [16]:
import os
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
from transformers import Seq2SeqTrainer, TrainerCallback, Seq2SeqTrainingArguments, TrainerState, TrainerControl

class SavePeftModelCallback(TrainerCallback):
    """Trainer callback that runs whenever a checkpoint is saved.

    It calls ``save_pretrained`` on the model into an ``adapter_model``
    sub-folder of the current checkpoint directory, then deletes
    ``pytorch_model.bin`` from that directory if the file exists.
    """

    def on_save(
        self,
        args: Seq2SeqTrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        # Checkpoint directory for the current global step.
        step_dir = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}")

        kwargs["model"].save_pretrained(os.path.join(step_dir, "adapter_model"))

        full_weights = os.path.join(step_dir, "pytorch_model.bin")
        if not os.path.exists(full_weights):
            return control

        os.remove(full_weights)
        return control

In [17]:
def compute_metrics(eval_pred, wer_metric=None):
    """Compute word error rate (in percent) for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` may be token
            ids of shape (batch, seq), logits of shape (batch, seq, vocab), or
            a tuple whose first element is one of those. ``labels`` are token
            ids in which -100 marks padding.
        wer_metric: Optional object with
            ``compute(predictions=..., references=...)``. When omitted, the WER
            metric is loaded from the same local path the notebook uses.

    Returns:
        ``{"wer": <percentage>}``, a dict as the trainer expects from
        ``compute_metrics``.

    NOTE(review): WER splits on whitespace; for unsegmented Chinese text a
    character-level metric may be more informative — confirm the metric choice.
    """
    # Local import: the notebook only imports numpy in a later cell, so relying
    # on a global `np` fails when this runs during training.
    import numpy as np

    predictions, labels = eval_pred

    # Model outputs can arrive as a tuple; the first element holds the logits.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    if predictions.ndim == 3:
        # Logits -> most likely token id at each position.
        predictions = predictions.argmax(axis=-1)
    labels = np.asarray(labels)

    # -100 is not a valid token id; swap it for the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # The metric compares text, not token ids.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    if wer_metric is None:
        import evaluate

        wer_metric = evaluate.load("/mnt/workspace/evaluate/metrics/wer")

    score = wer_metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"wer": 100 * score}

In [18]:
# Assemble the trainer from the PEFT-wrapped model, the prepared splits, the
# padding collator, the metric function and the checkpoint callback. The
# feature extractor is passed in the `tokenizer` slot.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
    callbacks=[SavePeftModelCallback],
)

# Turn off the decoder cache flag on the model config for training.
model.config.use_cache = False

Detected kernel version 4.19.24, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [19]:
# Run training, then save the model under a separate output folder.
trainer.train()
model.save_pretrained("model2/whisper-large-v2-asr-int8-model")



Epoch,Training Loss,Validation Loss




OutOfMemoryError: CUDA out of memory. Tried to allocate 938.00 MiB. GPU 0 has a total capacty of 15.78 GiB of which 477.75 MiB is free. Process 24816 has 7.04 GiB memory in use. Process 2481 has 8.27 GiB memory in use. Of the allocated memory 4.76 GiB is allocated by PyTorch, and 2.15 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [1]:
# Audio file handed to the speech-recognition pipeline below (relative path).
test_audio = "data/learn-english.flac"

In [2]:
import torch
# torch.cuda.empty_cache() only hands cached-but-unused blocks back to the
# driver; it never frees live tensors. Allocated memory therefore cannot change
# across the call, so reserved memory is reported next to it, and both readings
# are labelled so the "after" numbers are not mistaken for the "before" ones.
print(
    "Before clearing cache:\n"
    f"allocated {torch.cuda.memory_allocated() / (1024 ** 3):.2f} GB, "
    f"reserved {torch.cuda.memory_reserved() / (1024 ** 3):.2f} GB"
)
torch.cuda.empty_cache()
print(
    "After clearing cache:\n"
    f"allocated {torch.cuda.memory_allocated() / (1024 ** 3):.2f} GB, "
    f"reserved {torch.cuda.memory_reserved() / (1024 ** 3):.2f} GB\n"
)

Before clearing cache:
0.00 GB
0.00 GB



In [12]:
from transformers import AutomaticSpeechRecognitionPipeline
from transformers import AutoModelForSpeechSeq2Seq

# Earlier checkpoint location, kept for reference:
# model = "/mnt/workspace/models/whisper-large-v2-asr-int8/checkpoint-500"

# Load the checkpoint in 8-bit with automatic device placement.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "/mnt/workspace/model2/whisper-large-v2-asr-int8/checkpoint-500",
    load_in_8bit=True,
    device_map="auto",
)

# Speech-recognition pipeline built from the model plus the tokenizer and
# feature extractor.
pipeline = AutomaticSpeechRecognitionPipeline(
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)

# Decoder prompt ids for Chinese with the configured task.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="chinese", task=task)

In [13]:
# Transcribe the test audio under CUDA autocast, passing the forced decoder
# prompt ids to generation, and keep only the "text" field of the result.
with torch.cuda.amp.autocast():
    text = pipeline(test_audio, generate_kwargs={"forced_decoder_ids": forced_decoder_ids}, max_new_tokens=255)["text"]



In [14]:
# Display the transcription produced by the pipeline.
text

"!Hello mom!Hello Carl!Hello mommy!Okay let's review what's the title of the story?What does Dina have?! Dinah has a sister."

In [15]:
import evaluate

# Word error rate (WER) is a commonly used metric for evaluating ASR models.
# Load the WER metric through Evaluate (from a local copy of the metric script).
metric = evaluate.load("/mnt/workspace/evaluate/metrics/wer")

In [16]:
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import gc

# Batch the test split in groups of 8, collated with the padding collator.
eval_dataloader = DataLoader(common_voice["test"], batch_size=8, collate_fn=data_collator)

# Switch the model to evaluation mode before generating.
model.eval()

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear8bitLt(in_features=1280, out_features=1280, bias=False)
            (v_proj): lora.Linear8bitLt(
              (base_layer): Linear8bitLt(in_features=1280, out_features=1280, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=1280, out_features=8, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=8, out_features=1280, bias=False)
              )

In [17]:
# Generate a transcription for every evaluation batch and feed the decoded
# predictions and references to the metric.
for step, batch in enumerate(tqdm(eval_dataloader)):
    with torch.cuda.amp.autocast(), torch.no_grad():
        # The first four label tokens are used as the decoder prompt.
        generated_tokens = model.generate(
            input_features=batch["input_features"].to("cuda"),
            decoder_input_ids=batch["labels"][:, :4].to("cuda"),
            max_new_tokens=255,
        )
        generated_tokens = generated_tokens.cpu().numpy()

        # -100 marks padded label positions; swap it for the pad token so the
        # labels can be decoded.
        labels = batch["labels"].cpu().numpy()
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Drop per-batch objects and collect garbage before the next batch.
    del generated_tokens, labels, batch
    gc.collect()

100%|██████████| 1323/1323 [3:46:53<00:00, 10.29s/it]  


In [18]:
# Combine everything added with add_batch and express the result in percent.
wer = 100 * metric.compute()
print(f"{wer=}")

wer=61.76526176526177
