In [176]:
import os

import evaluate
import numpy as np
import torch
from datasets import (
    Audio,
    ClassLabel,
    Dataset,
    Sequence,
    features,
    get_dataset_split_names,
    load_dataset,
)
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    Trainer,
    TrainingArguments,
    pipeline,
)

In [184]:
# Load the clips with the generic "audiofolder" builder.
# Expected directory layout: metadata.csv, file1.wav, file2.wav, ...
# Reference: https://huggingface.co/docs/datasets/audio_load
# Alternative sources tried earlier:
#   data_dir="./data/UrbanSound8K/audio"
#   load_dataset("marsyas/gtzan", "all")
audio_dataset = load_dataset(path="audiofolder", data_dir="./data/test/")

# Show the first decoded example (file path, waveform array, sampling rate).
audio_dataset["train"][0]["audio"]

Resolving data files: 100%|██████████| 8732/8732 [00:00<00:00, 18461.92it/s]


{'path': 'C:/GitHub/SED_sandbox/DLAppRealTime/pytorch/data/test/100032-3-0-0.wav',
 'array': array([-0.00454712, -0.00483704, -0.00460815, ..., -0.00065613,
        -0.00048828,  0.        ]),
 'sampling_rate': 44100}

In [185]:
# Hold out 10% of the examples as the "test" split. The fixed seed makes the
# shuffled split reproducible.
audio_dataset = audio_dataset["train"].train_test_split(
    test_size=0.1,
    shuffle=True,
    seed=42,
)

In [188]:
# Class names; a name's position in this list is the integer id that
# ClassLabel assigns to it.
label_names = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music",
]

# Cast the "class" column to a ClassLabel feature (the number of classes is
# derived from the list of names).
class_label = ClassLabel(names=label_names)
audio_dataset = audio_dataset.cast_column("class", class_label)

# Display the resulting DatasetDict.
audio_dataset

Casting the dataset: 100%|██████████| 7857/7857 [00:00<00:00, 19068.98 examples/s]
Casting the dataset: 100%|██████████| 874/874 [00:00<00:00, 2446.81 examples/s]


DatasetDict({
    train: Dataset({
        features: ['audio', 'class', 'classID'],
        num_rows: 7857
    })
    test: Dataset({
        features: ['audio', 'class', 'classID'],
        num_rows: 874
    })
})

In [191]:
# Callable that turns an integer class id back into its label string, taken
# from the ClassLabel feature of the "class" column.
class_feature = audio_dataset["train"].features["class"]
id2label_fn = class_feature.int2str

In [192]:
# Checkpoint to fine-tune, plus the feature extractor published with it.
model_id = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)

# Sampling rate the extractor is configured for; shown as the cell output.
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

In [193]:
# Re-declare the "audio" column with the extractor's sampling rate so clips
# are resampled when they are decoded.
resampled_audio_feature = Audio(sampling_rate=sampling_rate)
audio_dataset = audio_dataset.cast_column("audio", resampled_audio_feature)

# Inspect one example to confirm the new sampling rate.
audio_dataset["train"][0]["audio"]

{'path': 'C:/GitHub/SED_sandbox/DLAppRealTime/pytorch/data/test/105425-9-0-14.wav',
 'array': array([-0.01866602, -0.0220116 , -0.02827694, ..., -0.10153409,
        -0.10287194, -0.10586938]),
 'sampling_rate': 16000}

In [194]:
# Turn raw waveforms into the input features the model expects.
# Reference cited by the original author for the clip length:
# https://github.com/karolpiczak/ESC-50
max_duration = 5  # seconds


def preprocess_function(examples):
    """Run the feature extractor over one batch of decoded audio examples.

    Args:
        examples: Batch mapping whose "audio" entry is a sequence of dicts,
            each holding the decoded waveform under the "array" key.

    Returns:
        Whatever ``feature_extractor`` returns for the batch.
    """
    target_rate = feature_extractor.sampling_rate
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])

    # Limit each clip to `max_duration` seconds' worth of samples.
    # NOTE(review): confirm that this extractor's __call__ actually honours
    # `max_length` / `truncation`; some extractors silently ignore them.
    sample_limit = int(target_rate * max_duration)
    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=sample_limit,
        truncation=True,
    )

# Columns that are no longer needed once features are extracted; "class" is
# kept because it is renamed to "label" right after.
columns_to_drop = ["audio", "classID"]

audio_dataset_encoded = audio_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=100,
    num_proc=1,
    remove_columns=columns_to_drop,
).rename_column("class", "label")

# Display the encoded DatasetDict.
audio_dataset_encoded

Map: 100%|██████████| 7857/7857 [03:14<00:00, 40.47 examples/s]
Map: 100%|██████████| 874/874 [00:21<00:00, 41.20 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 7857
    })
    test: Dataset({
        features: ['label', 'input_values'],
        num_rows: 874
    })
})

In [195]:
# define label and model

# Run on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the label <-> id tables from the ClassLabel feature of the encoded
# dataset itself, so this cell does not rely on a helper defined in another
# cell. Ids are plain ints: keying id2label by str(i) would make label2id map
# every name to a *string* id ("0", "1", ...) instead of an integer.
class_names = audio_dataset_encoded["train"].features["label"].names
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}
num_labels = len(id2label)
# Load the pretrained checkpoint with a classification head sized for our
# labels. ignore_mismatched_sizes=True allows loading even though the
# checkpoint's head has a different shape; the mismatched weights are newly
# initialised.
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    id2label=id2label,
    label2id=label2id,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)
model = model.to(device)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [196]:
# define training arguments

model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 5

# SECURITY: never hard-code access tokens in a notebook. The token is read
# from the HF_TOKEN environment variable; when it is unset, None is passed and
# the credentials cached by `huggingface-cli login` are used instead.
# The token that used to be embedded here must be considered leaked and
# should be revoked.
hub_token = os.environ.get("HF_TOKEN")

training_args = TrainingArguments(
    f"{model_name}-finetuned-us8k",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    # Evaluation and checkpointing both run per epoch, as required for
    # restoring the checkpoint with the best accuracy at the end.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU instead of
    # failing when the arguments are validated.
    fp16=torch.cuda.is_available(),
    hub_token=hub_token,
    push_to_hub=True,
)

# Accuracy metric used by compute_metrics.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``metric``.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the reference class ids).

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

# Both splits are handed to the Trainer in torch format.
train_split = audio_dataset_encoded["train"].with_format("torch")
eval_split = audio_dataset_encoded["test"].with_format("torch")

# The feature extractor is passed through the `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
)

  0%|          | 1/565 [1:22:33<776:01:50, 4953.39s/it]


In [197]:
# start training
# Runs the fine-tuning loop with the settings held by `trainer`. As the last
# expression in the cell, the value returned by train() is displayed when the
# run finishes.
trainer.train()

  0%|          | 5/4915 [00:45<12:02:28,  8.83s/it]

{'loss': 2.2615, 'learning_rate': 4.0650406504065046e-07, 'epoch': 0.01}


  0%|          | 10/4915 [01:28<11:58:17,  8.79s/it]

{'loss': 2.4298, 'learning_rate': 9.146341463414634e-07, 'epoch': 0.01}


  0%|          | 15/4915 [02:12<11:57:32,  8.79s/it]

{'loss': 2.1259, 'learning_rate': 1.4227642276422764e-06, 'epoch': 0.02}


  0%|          | 20/4915 [02:56<11:58:04,  8.80s/it]

{'loss': 2.4506, 'learning_rate': 1.9308943089430896e-06, 'epoch': 0.02}


  1%|          | 25/4915 [03:40<11:57:52,  8.81s/it]

{'loss': 2.2919, 'learning_rate': 2.4390243902439027e-06, 'epoch': 0.03}


  1%|          | 30/4915 [04:24<11:55:44,  8.79s/it]

{'loss': 1.926, 'learning_rate': 2.9471544715447155e-06, 'epoch': 0.03}


  1%|          | 35/4915 [05:08<11:56:24,  8.81s/it]

{'loss': 1.9259, 'learning_rate': 3.4552845528455287e-06, 'epoch': 0.04}


  1%|          | 40/4915 [05:53<11:57:51,  8.84s/it]

{'loss': 1.9721, 'learning_rate': 3.9634146341463414e-06, 'epoch': 0.04}


  1%|          | 45/4915 [06:37<11:56:45,  8.83s/it]

{'loss': 1.8533, 'learning_rate': 4.471544715447155e-06, 'epoch': 0.05}


  1%|          | 50/4915 [07:21<11:58:40,  8.86s/it]

{'loss': 1.6908, 'learning_rate': 4.979674796747968e-06, 'epoch': 0.05}


  1%|          | 55/4915 [08:05<11:53:16,  8.81s/it]

{'loss': 1.5733, 'learning_rate': 5.487804878048781e-06, 'epoch': 0.06}


  1%|          | 60/4915 [08:49<11:56:20,  8.85s/it]

{'loss': 1.3703, 'learning_rate': 5.995934959349594e-06, 'epoch': 0.06}


  1%|▏         | 65/4915 [09:33<11:51:59,  8.81s/it]

{'loss': 1.2646, 'learning_rate': 6.504065040650407e-06, 'epoch': 0.07}


  1%|▏         | 70/4915 [10:17<11:50:25,  8.80s/it]

{'loss': 0.9302, 'learning_rate': 7.0121951219512205e-06, 'epoch': 0.07}


  2%|▏         | 75/4915 [11:01<11:51:05,  8.82s/it]

{'loss': 1.0857, 'learning_rate': 7.520325203252034e-06, 'epoch': 0.08}


  2%|▏         | 80/4915 [11:46<11:50:53,  8.82s/it]

{'loss': 0.9369, 'learning_rate': 8.028455284552846e-06, 'epoch': 0.08}


  2%|▏         | 85/4915 [12:30<11:49:21,  8.81s/it]

{'loss': 0.8973, 'learning_rate': 8.53658536585366e-06, 'epoch': 0.09}


  2%|▏         | 90/4915 [13:14<11:51:32,  8.85s/it]

{'loss': 0.7728, 'learning_rate': 9.044715447154472e-06, 'epoch': 0.09}


  2%|▏         | 95/4915 [13:58<11:52:50,  8.87s/it]

{'loss': 0.7987, 'learning_rate': 9.552845528455286e-06, 'epoch': 0.1}


  2%|▏         | 100/4915 [14:42<11:48:44,  8.83s/it]

{'loss': 0.4793, 'learning_rate': 1.0060975609756099e-05, 'epoch': 0.1}


  2%|▏         | 105/4915 [15:26<11:46:19,  8.81s/it]

{'loss': 0.5771, 'learning_rate': 1.0569105691056911e-05, 'epoch': 0.11}


  2%|▏         | 110/4915 [16:10<11:45:06,  8.80s/it]

{'loss': 0.4794, 'learning_rate': 1.1077235772357725e-05, 'epoch': 0.11}


  2%|▏         | 115/4915 [16:54<11:44:45,  8.81s/it]

{'loss': 0.6265, 'learning_rate': 1.1585365853658537e-05, 'epoch': 0.12}


  2%|▏         | 120/4915 [17:38<11:42:34,  8.79s/it]

{'loss': 0.4991, 'learning_rate': 1.2093495934959351e-05, 'epoch': 0.12}


  3%|▎         | 125/4915 [18:22<11:43:22,  8.81s/it]

{'loss': 0.5267, 'learning_rate': 1.2601626016260162e-05, 'epoch': 0.13}


  3%|▎         | 130/4915 [19:06<11:41:55,  8.80s/it]

{'loss': 0.5886, 'learning_rate': 1.3109756097560976e-05, 'epoch': 0.13}


  3%|▎         | 135/4915 [19:51<11:43:18,  8.83s/it]

{'loss': 0.4831, 'learning_rate': 1.3617886178861788e-05, 'epoch': 0.14}


  3%|▎         | 140/4915 [20:35<11:41:52,  8.82s/it]

{'loss': 0.3437, 'learning_rate': 1.4126016260162602e-05, 'epoch': 0.14}


  3%|▎         | 145/4915 [21:19<11:41:37,  8.83s/it]

{'loss': 0.2988, 'learning_rate': 1.4634146341463415e-05, 'epoch': 0.15}


  3%|▎         | 150/4915 [22:03<11:37:58,  8.79s/it]

{'loss': 0.4174, 'learning_rate': 1.5142276422764229e-05, 'epoch': 0.15}


  3%|▎         | 155/4915 [22:47<11:37:00,  8.79s/it]

{'loss': 0.4651, 'learning_rate': 1.565040650406504e-05, 'epoch': 0.16}


  3%|▎         | 160/4915 [23:31<11:36:26,  8.79s/it]

{'loss': 0.4635, 'learning_rate': 1.6158536585365855e-05, 'epoch': 0.16}


  3%|▎         | 165/4915 [24:15<11:34:48,  8.78s/it]

{'loss': 0.3428, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.17}


  3%|▎         | 170/4915 [24:59<11:35:18,  8.79s/it]

{'loss': 0.3466, 'learning_rate': 1.717479674796748e-05, 'epoch': 0.17}


  4%|▎         | 175/4915 [25:43<11:35:30,  8.80s/it]

{'loss': 0.2597, 'learning_rate': 1.7682926829268292e-05, 'epoch': 0.18}


In [None]:
# Build the id-to-label conversion function.
# You have to create the ClassLabel yourself in order to use int2str,
# because the label feature was added manually as a CSV column.
# label_names = [
#     "air_conditioner",
#     "car_horn",
#     "children_playing",
#     "dog_bark",
#     "drilling",
#     "engine_idling",
#     "gun_shot",
#     "jackhammer",
#     "siren",
#     "street_music"
#     ]
# label_names
# audio_dataset = audio_dataset.cast_column("label", Sequence(ClassLabel(names=label_names)))
# # # https://github.com/huggingface/datasets/issues/5262
# id2label_fn = audio_dataset["train"].features["label"].int2str()