In [70]:
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
from datasets import Audio, get_dataset_split_names, ClassLabel, Sequence
import numpy as np
import evaluate
import torch
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

In [71]:
# https://huggingface.co/docs/datasets/audio_load
# metadata.csv, file1.wav, file2.wav....
audio_dataset = load_dataset(
    "audiofolder",
    data_dir="./data/test/",
)
audio_dataset

Resolving data files: 100%|██████████| 8733/8733 [00:00<00:00, 14114.78it/s]
Downloading data files: 100%|██████████| 8733/8733 [00:00<00:00, 16406.04it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
Generating train split: 8732 examples [00:02, 4119.02 examples/s]


DatasetDict({
    train: Dataset({
        features: ['audio', 'class', 'classID'],
        num_rows: 8732
    })
})

In [85]:
# make convert function
# You should make classlabel yourselve to use int2str
# because label feature is manually added to csv column
label_names = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music"
    ]
# label_names
# audio_dataset = audio_dataset.cast_column("label", Sequence(ClassLabel(names=label_names)))
# # # https://github.com/huggingface/datasets/issues/5262
# id2label_fn = audio_dataset["train"].features["label"].int2str()

In [73]:
# audio_dataset["train"][0]["label"]
audio_dataset["train"][0]["audio"]

{'path': 'C:/GitHub/SED_sandbox/DLAppRealTime/pytorch/data/test/100032-3-0-0.wav',
 'array': array([-0.00454712, -0.00483704, -0.00460815, ..., -0.00065613,
        -0.00048828,  0.        ]),
 'sampling_rate': 44100}

In [74]:
# split dataset to train and test
audio_dataset = audio_dataset["train"].train_test_split(seed=42, shuffle=True, test_size=0.1)
audio_dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'class', 'classID'],
        num_rows: 7858
    })
    test: Dataset({
        features: ['audio', 'class', 'classID'],
        num_rows: 874
    })
})

In [102]:
# import model
model_id = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True, return_attention_mask=False)
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

In [83]:
# convert sampling rate of dataset to 16k
audio_dataset = audio_dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
audio_dataset["train"][0]["audio"]

{'path': 'C:/GitHub/SED_sandbox/DLAppRealTime/pytorch/data/test/105425-9-0-14.wav',
 'array': array([-0.01866602, -0.0220116 , -0.02827694, ..., -0.10153409,
        -0.10287194, -0.10586938]),
 'sampling_rate': 16000}

In [84]:
# preprocess dataset for adapting model

# according to: https://github.com/karolpiczak/ESC-50
max_duration = 5

def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
        return_attention_mask=False,
    )
    return inputs


audio_dataset_encoded = audio_dataset.map(
    preprocess_function,
    remove_columns=["audio"],
    batched=True,
    batch_size=100,
    num_proc=1,
)

Map: 100%|██████████| 7858/7858 [01:21<00:00, 96.67 examples/s] 
Map: 100%|██████████| 874/874 [00:09<00:00, 94.27 examples/s] 


In [86]:
len(audio_dataset_encoded["train"][0]["input_values"])

1024

In [99]:
# define label and model

label_names = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music",
]
device = "cuda" if torch.cuda.is_available() else "cpu"

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

num_labels = len(label_names)
# id2label[0]
model = AutoModelForAudioClassification.from_pretrained(
    model_id, num_labels=num_labels, label2id=label2id, id2label=id2label, ignore_mismatched_sizes=True
).to(device)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [100]:
# define training arguments

model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10
training_args = TrainingArguments(
    f"{model_name}-finetuned-gtzan",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    hub_token="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",
    push_to_hub=True,
)

metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

In [101]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=audio_dataset_encoded["train"].with_format("torch"),
    eval_dataset=audio_dataset_encoded["test"].with_format("torch"),
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)
trainer.train()

  0%|          | 0/9830 [00:00<?, ?it/s]

TypeError: ASTForAudioClassification.forward() got an unexpected keyword argument 'attention_mask'