In [1]:
# To install from source instead of the last release, comment the command above and uncomment the following one.
# !pip install git+https://github.com/huggingface/transformers.git

# !pip install datasets
# !pip install playsound
# !pip install sounddevice
# !pip install pydub
# !pip install pyaudio
# !pip install librosa
# !pip install numpy==1.23.5
# !pip install -U accelerate

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Audio classification

## Load MInDS-14 dataset

Start by loading the MInDS-14 dataset from the 🤗 Datasets library:

In [3]:
from datasets import load_dataset, Audio

minds = load_dataset("PolyAI/minds14", name="en-US", split="train")

Split the dataset's `train` split into a smaller train and test set with the [train_test_split](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.train_test_split) method. This'll give you a chance to experiment and make sure everything works before spending more time on the full dataset.

In [4]:
minds = minds.train_test_split(test_size=0.2)

labels = minds["train"].features["intent_class"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

print(label2id), print(id2label)

{'abroad': '0', 'address': '1', 'app_error': '2', 'atm_limit': '3', 'balance': '4', 'business_loan': '5', 'card_issues': '6', 'cash_deposit': '7', 'direct_debit': '8', 'freeze': '9', 'high_value_payment': '10', 'joint_account': '11', 'latest_transactions': '12', 'pay_bill': '13'}
{'0': 'abroad', '1': 'address', '2': 'app_error', '3': 'atm_limit', '4': 'balance', '5': 'business_loan', '6': 'card_issues', '7': 'cash_deposit', '8': 'direct_debit', '9': 'freeze', '10': 'high_value_payment', '11': 'joint_account', '12': 'latest_transactions', '13': 'pay_bill'}


(None, None)

Then take a look at the dataset:

In [5]:
minds
minds['train'][0]


{'path': 'C:\\Users\\laxmi\\.cache\\huggingface\\datasets\\downloads\\extracted\\9a5b6584a374e2f41f362b9d172ecdcbfee4e4855881d932add51b4972094b15\\en-US~APP_ERROR\\602b9a5fbb1e6d0fbce91f52.wav',
 'audio': {'path': 'C:\\Users\\laxmi\\.cache\\huggingface\\datasets\\downloads\\extracted\\9a5b6584a374e2f41f362b9d172ecdcbfee4e4855881d932add51b4972094b15\\en-US~APP_ERROR\\602b9a5fbb1e6d0fbce91f52.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00048828,
         -0.00024414, -0.00024414]),
  'sampling_rate': 8000},
 'transcription': "hi I'm just calling to talk to technical support because the app is not working on my phone it will not load My Account Details",
 'english_transcription': "hi I'm just calling to talk to technical support because the app is not working on my phone it will not load My Account Details",
 'intent_class': 2,
 'lang_id': 4}

In [6]:
# play from from audio array
import sounddevice as sd
import numpy as np

array = minds['train'][0]['audio']['array']
sd.play(array, 8000)


In [7]:
from playsound import playsound
import librosa

sound_path = 'audio/balance.wav'
# playsound(sound_path)

# Load the audio file
audio_array, sr = librosa.load(sound_path, sr=None)  # sr=None to preserve the original sample rate

audio_array, sr

(array([ 0.00024414, -0.00024414,  0.        , ...,  0.01037598,
         0.0098877 ,  0.0098877 ], dtype=float32),
 8000)

While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `intent_class` in this guide. Remove the other columns with the [remove_columns](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.remove_columns) method:

In [8]:
minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"])

Take a look at an example now:

In [9]:
minds["train"][0]

{'audio': {'path': 'C:\\Users\\laxmi\\.cache\\huggingface\\datasets\\downloads\\extracted\\9a5b6584a374e2f41f362b9d172ecdcbfee4e4855881d932add51b4972094b15\\en-US~APP_ERROR\\602b9a5fbb1e6d0fbce91f52.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00048828,
         -0.00024414, -0.00024414]),
  'sampling_rate': 8000},
 'intent_class': 2}

## Preprocess

The next step is to load a Wav2Vec2 feature extractor to process the audio signal:

In [10]:
from transformers import AutoFeatureExtractor
model_ckpt = 'superb/wav2vec2-base-superb-sid'
feature_extractor = AutoFeatureExtractor.from_pretrained(model_ckpt)

The MInDS-14 dataset has a sampling rate of 8000khz (you can find this information in it's [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model:

In [11]:
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
minds["train"][0]

{'audio': {'path': 'C:\\Users\\laxmi\\.cache\\huggingface\\datasets\\downloads\\extracted\\9a5b6584a374e2f41f362b9d172ecdcbfee4e4855881d932add51b4972094b15\\en-US~APP_ERROR\\602b9a5fbb1e6d0fbce91f52.wav',
  'array': array([ 1.84403034e-05,  4.65712219e-05, -1.91250001e-05, ...,
         -2.81877990e-04, -2.33736471e-04, -1.16626805e-04]),
  'sampling_rate': 16000},
 'intent_class': 2}

Now create a preprocessing function that:

1. Calls the `audio` column to load, and if necessary, resample the audio file.
2. Checks if the sampling rate of the audio file matches the sampling rate of the audio data a model was pretrained with. You can find this information in the Wav2Vec2 [model card](https://huggingface.co/facebook/wav2vec2-base).
3. Set a maximum input length to batch longer inputs without truncating them.

In [12]:
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True
    )
    return inputs

To apply the preprocessing function over the entire dataset, use 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once. Remove the columns you don't need, and rename `intent_class` to `label` because that's the name the model expects:

In [13]:
encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True)
encoded_minds = encoded_minds.rename_column("intent_class", "label")

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

## Evaluate

Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):

In [14]:
# !pip install evaluate

import evaluate

accuracy = evaluate.load("accuracy")

Then create a function that passes your predictions and labels to [compute](https://huggingface.co/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.compute) to calculate the accuracy:

In [15]:
import numpy as np


def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)

Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training.

## Train

<Tip>

If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the basic tutorial [here](https://huggingface.co/docs/transformers/main/en/tasks/../training#train-with-pytorch-trainer)!

</Tip>

You're ready to start training your model now! Load Wav2Vec2 with [AutoModelForAudioClassification](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForAudioClassification) along with the number of expected labels, and the label mappings:

In [17]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(model_ckpt, num_labels=num_labels, label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at superb/wav2vec2-base-superb-sid and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1251, 256]) in the checkpoint and torch.Size([14, 256]) in the model instantiated
- classifier.bias: found shape torch.Size([1251]) in the checkpoint and torch.Size([14]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# model

In [19]:
training_args = TrainingArguments(
    output_dir="audio_classification",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/30 [00:00<?, ?it/s]

RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Int'

## Inference

Great, now that you've finetuned a model, you can use it for inference!

Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to!

In [None]:
from datasets import load_dataset, Audio

dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
sampling_rate = dataset.features["audio"].sampling_rate
audio_file = dataset[0]["audio"]["path"]

The simplest way to try out your finetuned model for inference is to use it in a [pipeline()](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline). Instantiate a `pipeline` for audio classification with your model, and pass your audio file to it:

In [None]:
audio_file

'/root/.cache/huggingface/datasets/downloads/extracted/28aa727f91fee90575c34956bab09d1716cfaf460c6afcba86a10f04a7d58b83/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav'

In [None]:
from transformers import pipeline

classifier = pipeline("audio-classification", model="audio_classification")
classifier(audio_file)

config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at stevhliu/my_awesome_minds_model were not used when initializing Wav2Vec2ForSequenceClassification: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at stevhliu/my_awesome_minds_model and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

[{'score': 0.09766869246959686, 'label': 'cash_deposit'},
 {'score': 0.07998877018690109, 'label': 'app_error'},
 {'score': 0.0781070664525032, 'label': 'joint_account'},
 {'score': 0.07667109370231628, 'label': 'pay_bill'},
 {'score': 0.0755252093076706, 'label': 'balance'}]

You can also manually replicate the results of the `pipeline` if you'd like:

Load a feature extractor to preprocess the audio file and return the `input` as PyTorch tensors: