In [1]:
import os

import evaluate
import numpy as np
import torch
from datasets import (
    Audio,
    get_dataset_split_names,
    ClassLabel,
    Sequence,
    Dataset,
    features,
    load_dataset
)
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the local clips with the "audiofolder" builder
# (expected layout: metadata.csv next to file1.wav, file2.wav, ...).
# Docs: https://huggingface.co/docs/datasets/audio_load
# Other sources tried earlier: data_dir="./data/UrbanSound8K/audio",
# or the hub dataset load_dataset("marsyas/gtzan", "all").
audio_dataset = load_dataset("audiofolder", data_dir="./data/test/")
audio_dataset

Resolving data files: 100%|██████████| 120/120 [00:00<00:00, 97240.43it/s]
Downloading data files: 100%|██████████| 120/120 [00:00<00:00, 21801.81it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
Generating train split: 120 examples [00:00, 7997.66 examples/s]


DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 120
    })
})

In [3]:
# Spot-check one raw example (index 100): shows the file path, waveform, sampling rate and label.
audio_dataset["train"][100]

{'audio': {'path': 'C:/GitHub/SED_sandbox/DLAppRealTime/pytorch/data/test/2_clapping/3-177083-A-22.wav',
  'array': array([ 0.02758789, -0.02697754, -0.0774231 , ...,  0.        ,
          0.        ,  0.        ]),
  'sampling_rate': 44100},
 'label': 2}

In [4]:
# Hold out 10% of the clips for evaluation (fixed seed => reproducible split).
# `audio_dataset` is rebound to the split result, so the guard keeps this cell
# idempotent: without it, re-running the cell would split the already-reduced
# "train" split again and silently shrink the training set.
if "test" not in audio_dataset:
    audio_dataset = audio_dataset["train"].train_test_split(
        test_size=0.1,
        shuffle=True,
        seed=42,
    )

In [5]:
# Inspect one held-out example from the "test" split created above.
audio_dataset["test"][4]

{'audio': {'path': 'C:/GitHub/SED_sandbox/DLAppRealTime/pytorch/data/test/0_pingpong/ball_racket-48.wav',
  'array': array([0.00512695, 0.01071167, 0.00949097, ..., 0.00671387, 0.00579834,
         0.00598145]),
  'sampling_rate': 16000},
 'label': 0}

In [6]:

# Human-readable class names; the list position is the integer label id stored
# in the dataset (e.g. 0 -> pingpong, 2 -> clapping, matching the
# `0_pingpong/` and `2_clapping/` folder prefixes of the audio files).
label_names = [
    "pingpong",
    "coughing",  # was misspelled "caughing"; this string ends up in the model config
    "clapping",
    "footsteps",
]
# `names` alone is enough: ClassLabel derives the number of classes from it.
class_label = ClassLabel(names=label_names)
# Turn the plain integer "label" column into a ClassLabel feature so that
# int2str / str2int become available on it.
audio_dataset = audio_dataset.cast_column("label", class_label)
audio_dataset


Casting the dataset: 100%|██████████| 108/108 [00:00<00:00, 5420.55 examples/s]
Casting the dataset: 100%|██████████| 12/12 [00:00<00:00, 641.52 examples/s]


DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 108
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 12
    })
})

In [7]:
# Function mapping an integer class id to its label name, taken from the
# ClassLabel feature of the "label" column (the column is named "label" here, not "class").
id2label_fn = audio_dataset["train"].features["label"].int2str

In [8]:
# Checkpoint to fine-tune, and the feature extractor that belongs to it.
model_id = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)
# Sampling rate the extractor expects; the audio column is resampled to it later.
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

In [9]:
# Resample the audio column to the feature extractor's sampling rate
# (16 kHz for this checkpoint), then show one example to confirm the new rate.
audio_dataset = audio_dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
audio_dataset["train"][0]["audio"]

{'path': 'C:/GitHub/SED_sandbox/DLAppRealTime/pytorch/data/test/2_clapping/2-25292-A-22.wav',
 'array': array([-5.48794342e-05, -4.16119758e-04, -3.39965554e-05, ...,
        -8.52083862e-01, -6.75716877e-01,  1.30227864e-01]),
 'sampling_rate': 16000}

In [10]:
# Preprocess the dataset into the input format the model expects.
# Reference: https://github.com/karolpiczak/ESC-50
# Length of audio (in seconds) requested per training example.
max_duration = 1


def preprocess_function(examples):
    """Convert a batch of decoded audio examples into model inputs.

    Args:
        examples: Batch dict whose "audio" entry is a list of decoded clips,
            each exposing its waveform under the "array" key.

    Returns:
        Whatever the module-level ``feature_extractor`` returns for the batch.
    """
    target_rate = feature_extractor.sampling_rate
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])
    # NOTE(review): max_length is expressed in samples (rate * seconds). Some
    # extractors fix their length at construction time and may ignore
    # max_length/truncation passed here - confirm for this checkpoint.
    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=int(target_rate * max_duration),
        truncation=True,
    )

# Run the feature extractor over every split in batches of 100 and drop the
# raw "audio" column, leaving only the label and the extracted model inputs.
audio_dataset_encoded = audio_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=100,
    num_proc=1,
    remove_columns=["audio"],
)
audio_dataset_encoded

Map: 100%|██████████| 108/108 [00:00<00:00, 112.90 examples/s]
Map: 100%|██████████| 12/12 [00:00<00:00, 135.13 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 108
    })
    test: Dataset({
        features: ['label', 'input_values'],
        num_rows: 12
    })
})

In [11]:
# Build the id <-> name lookup tables and load the checkpoint with a
# classification head sized for our classes.
num_labels = len(audio_dataset_encoded["train"].features["label"].names)
# Keys are stringified ids ("0", "1", ...); label2id is the exact inverse.
id2label = {str(class_id): id2label_fn(class_id) for class_id in range(num_labels)}
label2id = {name: class_id for class_id, name in id2label.items()}

# Prefer the GPU when one is visible to torch.
device = "cuda" if torch.cuda.is_available() else "cpu"

# ignore_mismatched_sizes lets the checkpoint's original classifier head be
# replaced by a freshly initialised one with `num_labels` outputs.
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    id2label=id2label,
    label2id=label2id,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)
model = model.to(device)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Training configuration.
model_name = model_id.split("/")[-1]  # checkpoint name without the organisation prefix
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 3
training_args = TrainingArguments(
    f"./model/pingpong-{model_name}-finetuned",
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the notebook also runs on the CPU fallback.
    fp16=torch.cuda.is_available(),
    # SECURITY: never hard-code the Hub token in the notebook. It is read from
    # the HF_TOKEN environment variable; when unset (None) the token cached by
    # `huggingface-cli login` is used instead. The token that was previously
    # committed here must be treated as compromised and revoked.
    hub_token=os.environ.get("HF_TOKEN"),
    push_to_hub=True,
)

# Accuracy metric loaded through `evaluate`; compute_metrics applies it to each evaluation pass.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level accuracy ``metric``.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (reference class ids).

    Returns:
        The result of ``metric.compute`` for the arg-max class of every row.
    """
    scores = eval_pred.predictions
    references = eval_pred.label_ids
    # Highest-scoring class along axis 1 is the predicted label.
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

# Wire the model, configuration, data and metric into a Trainer.
# Both splits are exposed as torch tensors via with_format("torch").
# NOTE(review): the feature extractor is passed in the `tokenizer` slot,
# presumably so it is saved together with the model - confirm.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
    train_dataset=audio_dataset_encoded["train"].with_format("torch"),
    eval_dataset=audio_dataset_encoded["test"].with_format("torch"),
)

In [13]:
# Start fine-tuning; the returned TrainOutput (global step, training loss, metrics) is displayed.
trainer.train()

 12%|█▏        | 5/42 [00:42<05:15,  8.52s/it]

{'loss': 2.1059, 'learning_rate': 3e-05, 'epoch': 0.36}


 24%|██▍       | 10/42 [01:23<04:24,  8.25s/it]

{'loss': 0.2732, 'learning_rate': 4.594594594594595e-05, 'epoch': 0.71}


                                               
 33%|███▎      | 14/42 [01:57<03:14,  6.94s/it]

{'eval_loss': 0.0007936358451843262, 'eval_accuracy': 1.0, 'eval_runtime': 5.1857, 'eval_samples_per_second': 2.314, 'eval_steps_per_second': 0.386, 'epoch': 1.0}


 36%|███▌      | 15/42 [02:08<04:19,  9.60s/it]

{'loss': 0.0021, 'learning_rate': 3.918918918918919e-05, 'epoch': 1.07}


 48%|████▊     | 20/42 [02:50<03:10,  8.64s/it]

{'loss': 0.0005, 'learning_rate': 3.2432432432432436e-05, 'epoch': 1.43}


 60%|█████▉    | 25/42 [03:32<02:24,  8.53s/it]

{'loss': 0.0002, 'learning_rate': 2.5675675675675675e-05, 'epoch': 1.79}


                                               
 67%|██████▋   | 28/42 [03:58<01:39,  7.07s/it]

{'eval_loss': 0.00015245874237734824, 'eval_accuracy': 1.0, 'eval_runtime': 5.0986, 'eval_samples_per_second': 2.354, 'eval_steps_per_second': 0.392, 'epoch': 2.0}


 71%|███████▏  | 30/42 [04:16<01:47,  8.97s/it]

{'loss': 0.0002, 'learning_rate': 1.891891891891892e-05, 'epoch': 2.14}


 83%|████████▎ | 35/42 [04:58<01:00,  8.58s/it]

{'loss': 0.0001, 'learning_rate': 1.2162162162162164e-05, 'epoch': 2.5}


 95%|█████████▌| 40/42 [05:40<00:16,  8.39s/it]

{'loss': 0.0001, 'learning_rate': 5.405405405405406e-06, 'epoch': 2.86}


                                               
100%|██████████| 42/42 [05:58<00:00,  7.19s/it]

{'eval_loss': 0.00010193387424806133, 'eval_accuracy': 1.0, 'eval_runtime': 5.1247, 'eval_samples_per_second': 2.342, 'eval_steps_per_second': 0.39, 'epoch': 3.0}


100%|██████████| 42/42 [05:59<00:00,  8.57s/it]

{'train_runtime': 359.9354, 'train_samples_per_second': 0.9, 'train_steps_per_second': 0.117, 'train_loss': 0.2836073509284428, 'epoch': 3.0}





TrainOutput(global_step=42, training_loss=0.2836073509284428, metrics={'train_runtime': 359.9354, 'train_samples_per_second': 0.9, 'train_steps_per_second': 0.117, 'train_loss': 0.2836073509284428, 'epoch': 3.0})

In [14]:
# make convert function
# You have to build the ClassLabel yourself to be able to use int2str,
# because the label column was added manually to the metadata CSV.
# label_names = [
#     "air_conditioner",
#     "car_horn",
#     "children_playing",
#     "dog_bark",
#     "drilling",
#     "engine_idling",
#     "gun_shot",
#     "jackhammer",
#     "siren",
#     "street_music"
#     ]
# label_names
# audio_dataset = audio_dataset.cast_column("label", Sequence(ClassLabel(names=label_names)))
# # # https://github.com/huggingface/datasets/issues/5262
# id2label_fn = audio_dataset["train"].features["label"].int2str()