# Audio Classification - Fine Tuning

## Login

In [3]:
# Log in to the Hugging Face Hub. Later cells push the fine-tuned model there
# (push_to_hub=True / trainer.push_to_hub), which needs an authenticated session.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Import Libraries

In [4]:
%%capture
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install accelerate
!pip install gradio

In [5]:
from datasets import load_dataset
from datasets import Audio

from transformers import pipeline
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoConfig

import evaluate

import numpy as np
import gradio as gr
from IPython.display import Audio as IAudio

## Load Data

In [6]:
# Download the GTZAN genre dataset ("all" configuration) from the Hub.
# The dataset is built by a loading script, so `trust_remote_code=True` is passed
# explicitly, as requested by the `datasets` warning for script-based datasets.
gtzan = load_dataset("marsyas/gtzan", "all", trust_remote_code=True)
gtzan

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/3.35k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.42k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.23G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 999
    })
})

Split the data into train and test data:

In [7]:
# Hold out 10% of the examples as a test set; the fixed seed keeps the split reproducible.
# NOTE: this rebinds `gtzan`, so re-running the cell splits the already-split
# train set again instead of the full dataset.
gtzan = gtzan["train"].train_test_split(seed=42, shuffle=True, test_size=0.1)
gtzan

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 899
    })
    test: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 100
    })
})

Get a sample datapoint:

In [8]:
# Inspect the first training example: file path, decoded audio and integer genre id.
gtzan["train"][0]

{'file': '/root/.cache/huggingface/datasets/downloads/extracted/5022b0984afa7334ff9a3c60566280b08b5179d4ac96a628052bada7d8940244/genres/pop/pop.00098.wav',
 'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/5022b0984afa7334ff9a3c60566280b08b5179d4ac96a628052bada7d8940244/genres/pop/pop.00098.wav',
  'array': array([ 0.10720825,  0.16122437,  0.28585815, ..., -0.22924805,
         -0.20629883, -0.11334229]),
  'sampling_rate': 22050},
 'genre': 7}

Check the targets:

In [9]:
# Callable that maps an integer genre id to its genre name.
id2label_fn = gtzan["train"].features["genre"].int2str
id2label_fn

In [10]:
# Sanity check: look up the name of genre id 2.
id2label_fn(2)

'country'

Listen to a few audio samples:

In [11]:
def generate_audio():
    """Draw one random training clip for playback.

    Returns:
        A pair ``((sampling_rate, waveform), genre_name)``: the first element is
        the audio tuple handed to ``gr.Audio``, the second is the genre name
        looked up through ``id2label_fn``.
    """
    # Shuffling without a seed and taking the first row yields a random example.
    picked = gtzan["train"].shuffle()[0]
    clip = picked["audio"]
    playable = (clip["sampling_rate"], clip["array"])
    genre_name = id2label_fn(picked["genre"])
    return playable, genre_name


# Build a small Gradio UI showing four randomly drawn clips, each labelled with its genre.
with gr.Blocks() as demo:
    with gr.Column():
        for _ in range(4):
            audio, label = generate_audio()
            output = gr.Audio(audio, label=label)

# Launching is left disabled; uncomment to serve the demo from the notebook.
#demo.launch(debug=True)



## Preprocessing

Using the AutoFeatureExtractor class from HuggingFace to normalize the data:

In [26]:
# Checkpoint that is fine-tuned throughout this notebook.
model_id = "ntu-spml/distilhubert"

# Load the checkpoint's feature extractor with input normalization enabled and
# attention masks returned alongside the input values.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True, return_attention_mask=True
)

The sampling rate of the dataset needs to be changed to the sampling rate of the model which is:

In [27]:
# Sampling rate the feature extractor is configured for.
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

Changing the sampling rate of the dataset to that of the model:

In [28]:
# Re-cast the audio column so clips are delivered at the model's sampling rate.
gtzan = gtzan.cast_column("audio", Audio(sampling_rate=sampling_rate))
# Confirm the new rate on one example.
gtzan['train'][0]['audio']['sampling_rate']

16000

Check if the feature extractor worked:

In [29]:
# Statistics of the waveform before feature extraction, as a baseline for comparison.
sample = gtzan["train"][0]["audio"]

print(f"Mean: {np.mean(sample['array']):.3}, Variance: {np.var(sample['array']):.3}")

Mean: 0.000185, Variance: 0.0493


In [30]:
# Run the feature extractor on the same clip; with normalization enabled the
# resulting values should have roughly zero mean and unit variance.
inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])

print(f"inputs keys: {list(inputs.keys())}")
print(f"Mean: {np.mean(inputs['input_values']):.3}, Variance: {np.var(inputs['input_values']):.3}")

inputs keys: ['input_values', 'attention_mask']
Mean: -7.45e-09, Variance: 1.0


Now creating a function for the preprocessing that can be applied to each data sample:

In [45]:
# Maximum clip length, in seconds, kept per example; longer clips are truncated.
max_duration = 15.0


def preprocess_function(examples):
    """Turn a batch of decoded audio clips into model inputs.

    Args:
        examples: A batch from ``Dataset.map(batched=True)``; ``examples["audio"]``
            is a list of dicts, each holding the raw waveform under ``"array"``.

    Returns:
        The feature extractor's output for the batch, with each clip truncated
        to at most ``max_duration`` seconds and attention masks included.
    """
    # Raw waveforms (one array per clip) of the batch.
    waveforms = [clip["array"] for clip in examples["audio"]]

    target_rate = feature_extractor.sampling_rate
    # Convert the duration limit from seconds to a number of samples.
    max_samples = int(target_rate * max_duration)

    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=max_samples,
        truncation=True,
        return_attention_mask=True,
    )

In [46]:
# Apply the preprocessing to both splits in batches of 100, dropping the raw
# "audio" and "file" columns so only the model inputs and the genre id remain.
gtzan_encoded = gtzan.map(
    preprocess_function,
    remove_columns=["audio", "file"],
    batched=True,
    batch_size=100,
    num_proc=1,
)

gtzan_encoded

Map:   0%|          | 0/899 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['genre', 'input_values', 'attention_mask'],
        num_rows: 899
    })
    test: Dataset({
        features: ['genre', 'input_values', 'attention_mask'],
        num_rows: 100
    })
})

Rename the target column so the Trainer can use the data:

In [47]:
# Rename the target column from "genre" to "label".
# NOTE: this rebinds `gtzan_encoded`; on a second run of this cell the "genre"
# column no longer exists.
gtzan_encoded = gtzan_encoded.rename_column("genre", "label")

Get the label-idx mapping:

In [48]:
# Map each class index (as a string key) to its genre name.
id2label = dict(
    (str(class_idx), id2label_fn(class_idx))
    for class_idx in range(len(gtzan_encoded["train"].features["label"].names))
)

# Inverse mapping: genre name -> class index (still in its string form).
label2id = dict(zip(id2label.values(), id2label.keys()))

Create the model (the Trainer itself is created further below):

In [49]:
# Number of target classes.
num_labels = len(id2label)

# Load the pretrained checkpoint for audio classification, sized for the genre
# classes and given both label mappings.
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Create the Training Arguments:

In [50]:
# The output directory name is derived from the checkpoint name.
model_name = model_id.split("/")[-1]
# Hyperparameters a reader may want to tune.
batch_size = 2
gradient_accumulation_steps = 1
num_train_epochs = 15

# Evaluate and checkpoint once per epoch, and reload the checkpoint with the
# best accuracy when training finishes.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    f"{model_name}-finetuned-gtzan",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    # Presumably matches the key returned by compute_metrics — confirm.
    metric_for_best_model="accuracy",
    fp16=True,
    # Pushing relies on the Hub login performed at the top of the notebook.
    push_to_hub=True,
)

In [51]:
# Accuracy metric used to score each evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the accuracy metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference class ids).

    Returns:
        Whatever ``metric.compute`` returns for the predicted vs. reference ids.
    """
    # The predicted class is the index of the highest score along axis 1.
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    reference_ids = eval_pred.label_ids
    return metric.compute(predictions=predicted_ids, references=reference_ids)

In [52]:
# Free unused cached GPU memory before training starts.
# NOTE(review): this import sits outside the notebook's main import cell.
import torch
torch.cuda.empty_cache()

In [53]:
# Wire the model, training arguments, encoded splits and metric function together.
# The feature extractor is passed through the `tokenizer` argument
# (presumably so it is saved alongside the checkpoints — confirm).
trainer = Trainer(
    model,
    training_args,
    train_dataset=gtzan_encoded["train"],
    eval_dataset=gtzan_encoded["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop; the returned value summarizes the training run.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.7495,1.716787,0.52
2,1.1633,1.051472,0.66
3,0.3792,0.731247,0.73
4,0.5365,0.970741,0.75
5,0.0234,1.112398,0.75
6,0.0039,0.971746,0.82
7,0.1781,1.049078,0.82
8,0.0009,1.194575,0.83
9,0.0007,1.11159,0.84
10,0.0004,1.081387,0.85


Checkpoint destination directory distilhubert-finetuned-gtzan/checkpoint-450 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilhubert-finetuned-gtzan/checkpoint-900 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilhubert-finetuned-gtzan/checkpoint-1350 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilhubert-finetuned-gtzan/checkpoint-1800 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilhubert-finetuned-gtzan/checkpoint-2250 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory distilhubert-finetuned-gtzan/checkpoint-2700 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint d

TrainOutput(global_step=6750, training_loss=0.3887180247924946, metrics={'train_runtime': 4391.894, 'train_samples_per_second': 3.07, 'train_steps_per_second': 1.537, 'total_flos': 4.600491205968e+17, 'train_loss': 0.3887180247924946, 'epoch': 15.0})

In [54]:
# Metadata handed to push_to_hub (presumably used for the generated model card — confirm).
kwargs = {
    "dataset_tags": "marsyas/gtzan",
    "dataset": "GTZAN",
    "model_name": f"{model_name}-finetuned-gtzan",
    "finetuned_from": model_id,
    "tasks": "audio-classification",
}

# Upload the trained model to the Hub.
trainer.push_to_hub(**kwargs)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/94.8M [00:00<?, ?B/s]

events.out.tfevents.1709849546.3009a6145419.440.3:   0%|          | 0.00/296k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/timothy-geiger/distilhubert-finetuned-gtzan/commit/ab0d96954c36c6d8d254db2b814a2c84d158a49e', commit_message='End of training', commit_description='', oid='ab0d96954c36c6d8d254db2b814a2c84d158a49e', pr_url=None, pr_revision=None, pr_num=None)