# Audio Classification - Fine Tuning

## Login

In [1]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub up front: the training run further down
# is configured with push_to_hub=True and uploads the fine-tuned model.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Import Libraries

In [2]:
%%capture
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install accelerate
!pip install gradio

In [3]:
import inspect

from datasets import load_dataset
from datasets import Audio

from transformers import pipeline
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification
from transformers import TrainingArguments
from transformers import Trainer

import evaluate

import numpy as np
import gradio as gr
from IPython.display import Audio as IAudio

## Load Data

In [4]:
# The `datasets` library warns that trust_remote_code=True will be mandatory for
# this dataset from its next major release, so opt in explicitly.
# Be aware that this allows code shipped in the "marsyas/gtzan" repository to run locally.
gtzan = load_dataset("marsyas/gtzan", "all", trust_remote_code=True)
gtzan

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 999
    })
})

Split the data into train and test data:

In [5]:
# Hold out 10% of the clips for evaluation; the fixed seed keeps the split reproducible.
gtzan = gtzan["train"].train_test_split(
    test_size=0.1,
    shuffle=True,
    seed=42,
)
gtzan

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 899
    })
    test: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 100
    })
})

Get a sample datapoint:

In [6]:
# Inspect one training row: file path, decoded audio (path, array, sampling_rate) and integer genre id.
gtzan["train"][0]

{'file': '/root/.cache/huggingface/datasets/downloads/extracted/5022b0984afa7334ff9a3c60566280b08b5179d4ac96a628052bada7d8940244/genres/pop/pop.00098.wav',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/5022b0984afa7334ff9a3c60566280b08b5179d4ac96a628052bada7d8940244/genres/pop/pop.00098.wav',
  'array': array([ 0.10720825,  0.16122437,  0.28585815, ..., -0.22924805,
         -0.20629883, -0.11334229]),
  'sampling_rate': 22050},
 'genre': 7}

Check the targets:

In [7]:
# int2str method of the "genre" feature; used below to turn an integer class id into its genre name.
id2label_fn = gtzan["train"].features["genre"].int2str
id2label_fn

Listen to a few audio samples:

In [8]:
def generate_audio():
    """Pick one random training clip for playback.

    Returns:
        A ``((sampling_rate, waveform), genre_name)`` tuple. The inner pair is
        what the demo below hands to ``gr.Audio``; ``genre_name`` is the
        class id translated through ``id2label_fn``.
    """
    train_split = gtzan["train"]
    # Draw a single random row rather than shuffling the whole split just to read row 0.
    row_index = int(np.random.randint(len(train_split)))
    example = train_split[row_index]
    audio = example["audio"]
    return (
        audio["sampling_rate"],
        audio["array"],
    ), id2label_fn(example["genre"])


# Gradio layout: one column holding four audio players, each filled with a
# randomly drawn training clip and captioned with its genre name.
with gr.Blocks() as demo:
    with gr.Column():
        for _ in range(4):
            audio, label = generate_audio()
            output = gr.Audio(audio, label=label)

# Launching is currently disabled; uncomment the line below to serve the demo.
#demo.launch(debug=True)



## Preprocessing

Using the AutoFeatureExtractor class from HuggingFace to normalize the data:

In [9]:
# Checkpoint that is fine-tuned throughout this notebook
model_id = "ntu-spml/distilhubert"

# Load the checkpoint's feature extractor with normalization switched on and
# attention masks included in its output.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id,
    do_normalize=True,
    return_attention_mask=True,
)

The audio in the dataset must be resampled to the sampling rate the model expects, which is:

In [10]:
# Sampling rate the loaded feature extractor is configured for; the dataset is cast to it next.
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

Change the sampling rate of the dataset to that of the model:

In [11]:
# Re-declare the audio column at the model's sampling rate, then confirm that a
# decoded clip now reports that rate.
model_rate_audio = Audio(sampling_rate=sampling_rate)
gtzan = gtzan.cast_column("audio", model_rate_audio)
gtzan["train"][0]["audio"]["sampling_rate"]

16000

Check if the feature extractor worked:

In [12]:
# Baseline: mean and variance of the waveform before it goes through the feature extractor.
sample = gtzan["train"][0]["audio"]

print(f"Mean: {np.mean(sample['array']):.3}, Variance: {np.var(sample['array']):.3}")

Mean: 0.000185, Variance: 0.0493


In [13]:
# Run the same clip through the feature extractor and print the statistics of
# its output for comparison with the raw waveform above.
inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])

print(f"inputs keys: {list(inputs.keys())}")
print(f"Mean: {np.mean(inputs['input_values']):.3}, Variance: {np.var(inputs['input_values']):.3}")

inputs keys: ['input_values', 'attention_mask']
Mean: -7.45e-09, Variance: 1.0


Now creating a function for the preprocessing that can be applied to each data sample:

In [14]:
# Clips are capped at 30 seconds of audio
max_duration = 30.0


def preprocess_function(examples):
    """Run the feature extractor over one batch of dataset rows.

    Args:
        examples: A batch whose ``"audio"`` entry is a list of decoded clips,
            each exposing its waveform under ``"array"``.

    Returns:
        The feature extractor's output for the batch, with every clip
        truncated to at most ``max_duration`` seconds and attention masks
        requested.
    """
    target_rate = feature_extractor.sampling_rate

    # Collect the raw waveform of every clip in the batch
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])

    # Longest allowed input, expressed in samples rather than seconds
    sample_limit = int(target_rate * max_duration)

    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=sample_limit,
        truncation=True,
        return_attention_mask=True,
    )

In [15]:
# Encode both splits in batches of 100 rows using a single process, and drop
# the columns that are no longer needed once the model inputs exist.
gtzan_encoded = gtzan.map(
    preprocess_function,
    batched=True,
    batch_size=100,
    num_proc=1,
    remove_columns=["audio", "file"],
)

gtzan_encoded

DatasetDict({
    train: Dataset({
        features: ['genre', 'input_values', 'attention_mask'],
        num_rows: 899
    })
    test: Dataset({
        features: ['genre', 'input_values', 'attention_mask'],
        num_rows: 100
    })
})

Rename the target column so the Trainer can use the data:

In [16]:
# Expose the target under the name "label"; the label mapping built next reads features["label"].
gtzan_encoded = gtzan_encoded.rename_column("genre", "label")

Get the label-idx mapping:

In [17]:
# Build both directions of the class mapping; ids are kept as strings.
class_names = gtzan_encoded["train"].features["label"].names

id2label = {}
for class_idx in range(len(class_names)):
    id2label[str(class_idx)] = id2label_fn(class_idx)

label2id = {name: class_id for class_id, name in id2label.items()}

Load the pretrained model with a new classification head:

In [18]:
# One output class per entry in the label mapping
num_labels = len(id2label)

# Load the pretrained checkpoint for audio classification, sized to the number
# of genres and carrying the id <-> name mappings in its config.
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    id2label=id2label,
    label2id=label2id,
    num_labels=num_labels,
)

Some weights of the model checkpoint at ntu-spml/distilhubert were not used when initializing HubertForSequenceClassification: ['encoder.pos_conv_embed.conv.weight_v', 'encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'classifier.bias', 'projector.weight', 'encoder.pos_conv_embe

Create the Training Arguments:

In [19]:
# Output directory / Hub repo name is derived from the checkpoint name
model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. transformers is installed
# unpinned in this notebook, so use whichever name the installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    f"{model_name}-finetuned-gtzan",
    # Evaluate and checkpoint once per epoch; the two schedules are set to the
    # same value alongside load_best_model_at_end below.
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    # Reload the checkpoint with the best "accuracy" (as reported by compute_metrics) after training
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    push_to_hub=True,
    **{_eval_strategy_key: "epoch"},
)

In [20]:
# Accuracy metric from the `evaluate` library; compute_metrics below reports it after each evaluation.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the reference class ids).

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids
        against the references.
    """
    class_scores = eval_pred.predictions
    references = eval_pred.label_ids
    # The predicted class is the highest-scoring column of each row
    predicted_ids = np.argmax(class_scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

In [21]:
# Newer transformers releases take the feature extractor as `processing_class`,
# older ones as `tokenizer`. transformers is installed unpinned in this
# notebook, so use whichever name the installed Trainer accepts.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_processor_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=gtzan_encoded["train"],
    eval_dataset=gtzan_encoded["test"],
    compute_metrics=compute_metrics,
    **{_processor_key: feature_extractor},
)

# Kept as the last expression so the cell displays the returned TrainOutput
trainer.train()

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/timothy-geiger/distilhubert-finetuned-gtzan into local empty directory.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.8025,1.838877,0.45
2,1.2633,1.321123,0.63
3,1.0307,0.987754,0.69
4,0.7356,0.78932,0.78
5,0.4767,0.657015,0.8
6,0.3492,0.600977,0.77
7,0.2595,0.541176,0.86
8,0.294,0.51429,0.83
9,0.2106,0.53681,0.83
10,0.188,0.540814,0.83


TrainOutput(global_step=1130, training_loss=0.7796511444370303, metrics={'train_runtime': 5675.0742, 'train_samples_per_second': 1.584, 'train_steps_per_second': 0.199, 'total_flos': 6.133988274624e+17, 'train_loss': 0.7796511444370303, 'epoch': 10.0})

In [21]:
# Metadata handed to push_to_hub together with the trained model
kwargs = dict(
    dataset_tags="marsyas/gtzan",
    dataset="GTZAN",
    model_name=f"{model_name}-finetuned-gtzan",
    finetuned_from=model_id,
    tasks="audio-classification",
)

trainer.push_to_hub(**kwargs)