### 在此使用的是huggingface上的Wav2Vec模型和MInDS-14数据

In [1]:
#laod data
from datasets import load_dataset, Audio

minds = load_dataset("PolyAI/minds14", name="en-US", split="train")

Found cached dataset minds14 (C:/Users/Administrator/.cache/huggingface/datasets/PolyAI___minds14/en-US/1.0.0/aa40414f15e0f919231d617440192034af844835dc1e6a697f4b552e0551fd26)


In [2]:
minds

Dataset({
    features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
    num_rows: 563
})

In [3]:
minds = minds.train_test_split(test_size=0.2)
minds

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 450
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 113
    })
})

对于此音频分类任务，仅使用audio(音频信息)和intent_class(意图分类)两列数据

In [4]:
minds = minds.remove_columns(["path" , "transcription" , "english_transcription" , "lang_id"])
minds

DatasetDict({
    train: Dataset({
        features: ['audio', 'intent_class'],
        num_rows: 450
    })
    test: Dataset({
        features: ['audio', 'intent_class'],
        num_rows: 113
    })
})

In [5]:
minds["train"][0]

{'audio': {'path': 'C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\downloads\\extracted\\e85efa4c6e33b346cbc88c38eb027c564e5070936f614259ce2c8e03006bb5b8\\en-US~APP_ERROR\\602b9b87963e11ccd901cbe8.wav',
  'array': array([ 0.        , -0.00024414,  0.        , ...,  0.00146484,
          0.00024414, -0.00024414], dtype=float32),
  'sampling_rate': 8000},
 'intent_class': 2}

audio: 语音信号的一维数组，必须调用其来加载和重新采样音频文件 <br>
intent_class: speaker的意图对应的id

#### 准备id转换字典

In [6]:
labels = minds["train"].features["intent_class"].names
labels

['abroad',
 'address',
 'app_error',
 'atm_limit',
 'balance',
 'business_loan',
 'card_issues',
 'cash_deposit',
 'direct_debit',
 'freeze',
 'high_value_payment',
 'joint_account',
 'latest_transactions',
 'pay_bill']

In [7]:
label2id , id2label = dict() , dict()
for i , label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label
id2label[str(3)]

'atm_limit'

#### 数据预处理

In [8]:
from transformers import AutoFeatureExtractor
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
feature_extractor



Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

MInDS-14数据集的采样率为8000khz，这意味着需要将数据集重新采样至16000kHz，以使用预训练的Wav2Vec2模型：

In [9]:
minds = minds.cast_column("audio" , Audio(sampling_rate=16_000)) # 转换列数据
minds["train"][0] # 从结果可以看到audio中array和sampling_rate都有变化

{'audio': {'path': 'C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\downloads\\extracted\\e85efa4c6e33b346cbc88c38eb027c564e5070936f614259ce2c8e03006bb5b8\\en-US~APP_ERROR\\602b9b87963e11ccd901cbe8.wav',
  'array': array([-2.8438628e-05, -1.7430878e-04, -2.1596169e-04, ...,
         -1.8745610e-04, -2.7283988e-04, -1.1294879e-04], dtype=float32),
  'sampling_rate': 16000},
 'intent_class': 2}

In [10]:
# 数据预处理，使用特征提取，类似tokenizer
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate = feature_extractor.sampling_rate,
        max_length = feature_extractor.sampling_rate,
        truncation = True
    )
    return inputs

使用map函数快速转码minds中的数据

In [11]:
encoded_minds = minds.map(
    preprocess_function,
    remove_columns="audio",
    batched = True
)
encoded_minds = encoded_minds.rename_column("intent_class" , "label").with_format("torch")
encoded_minds

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 450
    })
    test: Dataset({
        features: ['label', 'input_values'],
        num_rows: 113
    })
})

#### 包装dataLoader

In [12]:
train_data = encoded_minds["train"]
test_data = encoded_minds["test"]

In [13]:
train_data.features["input_values"]

Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)

In [14]:
import torch 
from torch.utils.data import DataLoader

# 传进来的是一个batch_size的数据 list
def coffate_fn(examples):
    # print(len(examples)) 32
    inputs , targets = [] , []
    for idx in range(len(examples)):
        label = examples[idx]["label"]
        input_values = examples[idx]["input_values"]
        inputs.append(input_values) # 这两个东西本来是tensor的类型,加进去是list包torch的类型
        targets.append(label)

    return torch.stack(inputs) , torch.stack(targets)
    
trainLoader = DataLoader(
    train_data,
    batch_size=32,
    shuffle=True,
    collate_fn=coffate_fn
)
testLoader = DataLoader(test_data, batch_size=32, collate_fn=coffate_fn)

trainLoader


<torch.utils.data.dataloader.DataLoader at 0x13eb8c09040>

#### 开始训练

准备模型

In [15]:
from transformers import AutoModelForAudioClassification , TrainingArguments , Trainer

num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels = num_labels,
    label2id = label2id,
    id2label = id2label
)
model

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['project_q.bias', 'quantizer.weight_proj.bias', 'project_q.weight', 'project_hid.bias', 'project_hid.weight', 'quantizer.codevectors', 'quantizer.weight_proj.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'projector.bias', 'classifier.w

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (4): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), strid

准备优化器等

In [16]:
from transformers import AdamW
import torch.nn as nn
optimizer = AdamW(model.parameters(), lr=3e-5)
CE_loss = nn.CrossEntropyLoss()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model=model.to(device)

import time
#记录当前训练时间，用于记录日志和存储
timestamp = time.strftime("%m_%d_%H_%M" , time.localtime())



准备训练函数

In [17]:
from tqdm import tqdm
import os

num_epoch = 8  # 训练轮次
check_step = 4  # 用以训练中途对模型进行检验：每check_step个epoch进行一次测试和保存模型

def save_pretrained(model , path):
    # 保存模型，先利用os模块创建文件夹，后利用torch.save()写入模型文件
    os.makedirs(path , exist_ok=True)
    torch.save(model , os.path.join(path , 'wav2vec2.pth')) #这个保存着太大了

def train_and_value(discribe):
    for epoch in range(1 , num_epoch+1):
        # 记录当前epoch总Loss
        total_loss = 0

        print("-----------Train-----------")
        #训练过程
        for batch in tqdm(trainLoader , desc=f"Training Epoch {epoch}"):
            inputs, targets = [x.to(device) for x in batch]
            optimizer.zero_grad()
            # print(inputs)
            pred_output = model(inputs)["logits"]
            # print(pred_output.size()) #torch.Size([32, 14])
            loss = CE_loss(pred_output , targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print("-----------Test-----------")
        #测试过程
        acc = 0
        for batch in tqdm(testLoader , desc=f"Testing"):
            inputs, targets = [x.to(device) for x in batch]
            with torch.no_grad():
                pred_output = model(inputs)["logits"]
                acc += (pred_output.argmax(dim=1) == targets).sum().item()

        # 输出在测试集上的准确率() 测试数据一共有113条
        print(f"Acc: {acc / len(testLoader):.2f}")

        if epoch % check_step == 0:
            # 保存模型
            checkpoints_dirname = "wav2vec2_classification" + timestamp
            os.makedirs(checkpoints_dirname, exist_ok=True)
            save_pretrained(model, checkpoints_dirname + f'/{discribe}-checkpoints-{epoch}_of_{num_epoch}')


#### 数据量太少了，结果没什么看的

In [18]:
train_and_value("")

-----------Train-----------


Training Epoch 1: 100%|██████████| 15/15 [00:10<00:00,  1.39it/s]


-----------Test-----------


Testing: 100%|██████████| 4/4 [00:00<00:00,  8.00it/s]


Acc: 2.00
-----------Train-----------


Training Epoch 2: 100%|██████████| 15/15 [00:07<00:00,  2.04it/s]


-----------Test-----------


Testing: 100%|██████████| 4/4 [00:00<00:00,  7.46it/s]


Acc: 4.25
-----------Train-----------


Training Epoch 3: 100%|██████████| 15/15 [00:07<00:00,  1.98it/s]


-----------Test-----------


Testing: 100%|██████████| 4/4 [00:00<00:00,  7.38it/s]


Acc: 3.25
-----------Train-----------


Training Epoch 4: 100%|██████████| 15/15 [00:07<00:00,  2.04it/s]


-----------Test-----------


Testing: 100%|██████████| 4/4 [00:00<00:00,  7.84it/s]


Acc: 1.75
-----------Train-----------


Training Epoch 5: 100%|██████████| 15/15 [00:07<00:00,  2.06it/s]


-----------Test-----------


Testing: 100%|██████████| 4/4 [00:00<00:00,  7.84it/s]


Acc: 2.50
-----------Train-----------


Training Epoch 6: 100%|██████████| 15/15 [00:07<00:00,  2.06it/s]


-----------Test-----------


Testing: 100%|██████████| 4/4 [00:00<00:00,  7.74it/s]


Acc: 2.25
-----------Train-----------


Training Epoch 7: 100%|██████████| 15/15 [00:07<00:00,  2.05it/s]


-----------Test-----------


Testing: 100%|██████████| 4/4 [00:00<00:00,  7.80it/s]


Acc: 1.75
-----------Train-----------


Training Epoch 8: 100%|██████████| 15/15 [00:07<00:00,  1.98it/s]


-----------Test-----------


Testing: 100%|██████████| 4/4 [00:00<00:00,  7.52it/s]


Acc: 1.50


#### 试一下用一用

In [2]:
from datasets import load_dataset, Audio

dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
dataset

Found cached dataset minds14 (C:/Users/Administrator/.cache/huggingface/datasets/PolyAI___minds14/en-US/1.0.0/aa40414f15e0f919231d617440192034af844835dc1e6a697f4b552e0551fd26)


Dataset({
    features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
    num_rows: 563
})

In [3]:
sampling_rate = dataset.features["audio"].sampling_rate
audio_file = dataset[0]["audio"]["path"]
audio_file

'C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\downloads\\extracted\\e85efa4c6e33b346cbc88c38eb027c564e5070936f614259ce2c8e03006bb5b8\\en-US~JOINT_ACCOUNT\\602ba55abb1e6d0fbce92065.wav'

In [25]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
inputs = inputs.to(device)



{'input_values': tensor([[ 0.0006,  0.0027,  0.0026,  ...,  0.0007,  0.0001, -0.0003]],
       device='cuda:0')}

In [26]:
from transformers import AutoModelForAudioClassification

model = AutoModelForAudioClassification.from_pretrained("facebook/wav2vec2-base") # 创建一个拥有model骨架的model
model = torch.load("wav2vec2_classification01_26_23_54\-checkpoints-8_of_8\wav2vec2.pth")
model = model.to(device)
with torch.no_grad():
    logits = model(**inputs).logits

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['project_q.bias', 'quantizer.weight_proj.bias', 'project_q.weight', 'project_hid.bias', 'project_hid.weight', 'quantizer.codevectors', 'quantizer.weight_proj.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'projector.bias', 'classifier.w

In [29]:
import torch

predicted_class_ids = torch.argmax(logits).item()
predicted_label = id2label[str(predicted_class_ids)]
predicted_label

'cash_deposit'

In [31]:
audio_file

'C:\\Users\\Administrator\\.cache\\huggingface\\datasets\\downloads\\extracted\\e85efa4c6e33b346cbc88c38eb027c564e5070936f614259ce2c8e03006bb5b8\\en-US~JOINT_ACCOUNT\\602ba55abb1e6d0fbce92065.wav'

上面那个路径好像是没有访问C盘的权限

In [5]:
from playsound import playsound
from pathlib import Path

# path = Path(audio_file)
path = Path('demo_audio/602ba55abb1e6d0fbce92065.wav') # 把选中的那个demoaudio文件放过来了试试
playsound(path)