In this experiment, I fine-tuned a pre-trained transformer model, `DistilHuBERT`, for music genre classification using the `GTZAN` dataset, a collection of 1,000 30-second music clips across 10 genres. (I followed a tutorial in the Hugging Face Audio course, but the code here was my own. I learned the general data preprocessing and training approach and wrote my own scripts.)

With the feature extractor that came with the model, I resampled the audio to the appropriate frequency, and normalized audio data to zero mean and unit variance. The model was then fine-tuned to predict music genres from raw audio waveforms. 

For demonstration purpose, the model was trained for 10 epochs and achieved about 80% accuracy on a testing set.

In [1]:
# helper
import numpy as np
import torch

normal_repr = torch.Tensor.__repr__ 
torch.Tensor.__repr__ = lambda self: f"{self.shape}_{normal_repr(self)}"  

def info(obj, name=None):
    """Inspects an object and prints its details."""

    print("\n" + "*" * 20)
    print("Variable name: ", "<unknown>" if not name else name)
    #   print("Object name:", obj.__name__ if hasattr(obj, '__name__') else str(obj))
    print("Object type:", type(obj))

    if hasattr(obj, 'keys'):
        print("Number of keys:", len(obj.keys()))
        print("Keys:", list(obj.keys()))
    if hasattr(obj, '__len__'):
        print("Length:", len(obj))
    if isinstance(obj, (np.ndarray, torch.Tensor)):
        print("Shape:", obj.shape)
    else:
        # print("Other object type:", obj)
        pass
    print("*" * 20 + "\n")

# Example usage
# my_list = [1, 2, 3, 4]
# my_dict = {'a': 10, 'b': [1, 2, 3], 'c': np.array([1, 2, 3])}
# my_tensor = torch.randn(3, 4)

# info(my_list)
# info(my_dict)
# info(my_tensor)

In [4]:
import torch
from datasets import load_dataset

model_checkpoint = "ntu-spml/distilhubert"
dataset_id = "marsyas/gtzan"


# Loading gtzan dataset

In [94]:
dataset_gtzan_original = load_dataset(dataset_id)
dataset_gtzan_original

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 999
    })
})

In [95]:
id2label_fn = dataset_gtzan_original["train"].features["genre"].int2str

In [14]:
dataset_gtzan_original["train"][0]

{'file': '/home/nguyenthuan49/.cache/huggingface/datasets/downloads/extracted/c96fd9bab3c30c67977bca7a5c5f8bba015190c35962bb0ab4780546891dd836/genres/blues/blues.00000.wav',
 'audio': {'path': '/home/nguyenthuan49/.cache/huggingface/datasets/downloads/extracted/c96fd9bab3c30c67977bca7a5c5f8bba015190c35962bb0ab4780546891dd836/genres/blues/blues.00000.wav',
  'array': array([ 0.00732422,  0.01660156,  0.00762939, ..., -0.05560303,
         -0.06106567, -0.06417847]),
  'sampling_rate': 22050},
 'genre': 0}

In [96]:
from datasets import DatasetDict, Dataset

dataset_gtzan_original_shuffled: Dataset = dataset_gtzan_original["train"].shuffle()   # .select(range(50))

dataset_gtzan_original_split = dataset_gtzan_original_shuffled.train_test_split(test_size=0.2, seed=42, )

dataset_gtzan_mini = dataset_gtzan_original_split

dataset_gtzan_mini

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 799
    })
    test: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 200
    })
})

## Some demo

In [None]:
import gradio as gr

def generate_audio(idx):
    example = dataset_gtzan_mini["train"][idx]
    audio = example["audio"]
    return (
        audio["sampling_rate"],
        audio["array"],
    ), id2label_fn(example["genre"])
    
with gr.Blocks() as demo:
    with gr.Column():
        for i in range(0, 20):
            audio, label = generate_audio(i)
            output = gr.Audio(audio, label=label)
            
demo.launch(debug=True, share=True)        

# Data preprocessing

In [97]:
from transformers import AutoFeatureExtractor
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint,
                                                         do_normalize=True,
                                                         return_attention_mask=True)
feature_extractor


Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

In [98]:
dataset_gtzan_train = dataset_gtzan_mini["train"]
dataset_gtzan_test = dataset_gtzan_mini["test"]

## Some testing

In [None]:
# a = (Audio(sampling_rate=feature_extractor.sampling_rate))
# info(a)

# a.__call__(dataset_gtzan_mini["train"][0]["audio"])

In [47]:
from datasets import Audio
dataset_gtzan_mini = dataset_gtzan_mini.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))

In [59]:
feature_extractor([dataset_gtzan_train["audio"][i]["array"] for i in range(2)], sampling_rate=feature_extractor.sampling_rate, truncation=True, max_length=4)

{'input_values': [array([-1.3018837 , -0.39399064,  0.25152016,  1.4443543 ], dtype=float32), array([ 0.91363186, -0.85418344, -1.1328534 ,  1.073405  ], dtype=float32)], 'attention_mask': [array([1, 1, 1, 1], dtype=int32), array([1, 1, 1, 1], dtype=int32)]}

## Preprocessing function

In [99]:
max_duration = 20

def preprocess_function(batch):
    audio_arrays = [audio_sample["array"] for audio_sample in batch["audio"]]
    encoded_batch = feature_extractor(audio_arrays,
                      truncation=True,
                      sampling_rate=feature_extractor.sampling_rate,
                      max_length=max_duration * feature_extractor.sampling_rate)
    return encoded_batch

In [100]:
dataset_gtzan_train_encoded = dataset_gtzan_train.map(preprocess_function,
                                                      batched=True,
                                                      batch_size=50,
                                                      remove_columns=["audio", "file"])
dataset_gtzan_train_encoded

Map:   0%|          | 0/799 [00:00<?, ? examples/s]

Dataset({
    features: ['genre', 'input_values', 'attention_mask'],
    num_rows: 799
})

In [101]:
dataset_gtzan_train_encoded = dataset_gtzan_train_encoded.rename_column(original_column_name="genre", new_column_name="label")


In [102]:

dataset_gtzan_train_encoded


Dataset({
    features: ['label', 'input_values', 'attention_mask'],
    num_rows: 799
})

In [103]:
dataset_gtzan_test_encoded = dataset_gtzan_test.map(preprocess_function,
                                                    batched=True,
                                                    batch_size=50,
                                                    remove_columns=["audio", "file"])

dataset_gtzan_test_encoded = dataset_gtzan_test_encoded.rename_column("genre", "label")
dataset_gtzan_test_encoded

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'input_values', 'attention_mask'],
    num_rows: 200
})

# Training

## Creating id2label and label2id, needed by the model

In [104]:
num_labels = len(dataset_gtzan_original["train"].features["genre"].names)

id2label = {
    str(i): id2label_fn(i) for i in range(num_labels)
}

label2id = {
    v: k for k, v in id2label.items()
}

id2label, label2id

({'0': 'blues',
  '1': 'classical',
  '2': 'country',
  '3': 'disco',
  '4': 'hiphop',
  '5': 'jazz',
  '6': 'metal',
  '7': 'pop',
  '8': 'reggae',
  '9': 'rock'},
 {'blues': '0',
  'classical': '1',
  'country': '2',
  'disco': '3',
  'hiphop': '4',
  'jazz': '5',
  'metal': '6',
  'pop': '7',
  'reggae': '8',
  'rock': '9'})

## Set up the model

In [105]:
from transformers import AutoModelForAudioClassification

model = AutoModelForAudioClassification.from_pretrained(model_checkpoint,
                                                        num_labels=num_labels,
                                                        id2label=id2label,
                                                        label2id=label2id)



Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at ntu-spml/distilhubert and are newly initialized: ['classifier.bias', 'classifier.weight', 'encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [106]:
from transformers import TrainingArguments

batch_size = 8
gradient_accumulation_batches = 1
num_train_epochs = 10

training_args = TrainingArguments(
    "hubert-finetuned-gtzan",
    evaluation_strategy="epoch",
    # eval_steps=10,
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_batches,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=100,
    metric_for_best_model="accuracy",
    fp16=True,
    push_to_hub=False,
    save_total_limit=1,
)



In [107]:
import evaluate
metric = evaluate.load("accuracy")
import numpy as np 

def compute_metrics(eval_outputs):
    predictions = np.argmax(eval_outputs.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_outputs.label_ids)

In [108]:
from transformers import Trainer
trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=dataset_gtzan_train_encoded, 
    eval_dataset=dataset_gtzan_test_encoded, 
    tokenizer=feature_extractor, 
    compute_metrics=compute_metrics
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [109]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,2.2196,1.958643,0.43
2,1.6851,1.379236,0.675
3,1.2526,1.063894,0.76
4,0.9481,0.888793,0.765
5,0.7072,0.812161,0.765
6,0.5167,0.702959,0.775
7,0.372,0.676287,0.795
8,0.2851,0.675458,0.785
9,0.2078,0.618748,0.795
10,0.1706,0.642744,0.79


TrainOutput(global_step=1000, training_loss=0.8364749298095703, metrics={'train_runtime': 3656.0458, 'train_samples_per_second': 2.185, 'train_steps_per_second': 0.274, 'total_flos': 3.634450598016e+17, 'train_loss': 0.8364749298095703, 'epoch': 10.0})