In [2]:
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)
from datasets import (
    Audio,
    get_dataset_split_names,
    ClassLabel,
    Sequence,
    Dataset,
    features,
    load_dataset
)
import numpy as np
import evaluate
import torch
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    TrainingArguments,
    Trainer,
)
import numpy as np
import evaluate

In [3]:

# Human-readable class names. They are attached to the integer `label` column
# later in the notebook via ClassLabel(names=label_names), so the position in
# this list must equal the integer label id.
# NOTE(review): "caughing" looks like a misspelling of "coughing". It is left
# unchanged because the string is a runtime label name — confirm nothing
# depends on the current spelling before renaming it.
label_names = ["pingpong", "caughing", "clapping", "silence"]

# Load a local folder of audio files with the "audiofolder" builder.
# Docs: https://huggingface.co/docs/datasets/audio_load
# (layout described there: metadata.csv, file1.wav, file2.wav, ...)
audio_dataset = load_dataset("audiofolder", data_dir="./data/test/")

# Last expression of the cell: shows the loaded DatasetDict.
audio_dataset



Resolving data files: 100%|██████████| 160/160 [00:00<?, ?it/s]
Downloading data files: 100%|██████████| 160/160 [00:00<00:00, 22179.62it/s]
Downloading data files: 0it [00:00, ?it/s]
Extracting data files: 0it [00:00, ?it/s]
Generating train split: 160 examples [00:00, 5525.86 examples/s]


DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 160
    })
})

In [4]:
# Inspect a single example (index 100) of the loaded "train" split.
loaded_train = audio_dataset["train"]
loaded_train[100]

{'audio': {'path': 'C:/GitHub/SED_sandbox/DLAppRealTime/data/test/2_clapping/3-177083-A-22.wav',
  'array': array([ 0.02758789, -0.02697754, -0.0774231 , ...,  0.        ,
          0.        ,  0.        ]),
  'sampling_rate': 44100},
 'label': 2}

In [5]:
# Hold out 10% of the examples as a "test" split. The rows are shuffled with a
# fixed seed, and `audio_dataset` is rebound to the resulting train/test pair.
unsplit_examples = audio_dataset["train"]
audio_dataset = unsplit_examples.train_test_split(
    test_size=0.1,
    shuffle=True,
    seed=42,
)

In [6]:
# Spot-check a single example (index 4) of the held-out "test" split.
held_out = audio_dataset["test"]
held_out[4]

{'audio': {'path': 'C:/GitHub/SED_sandbox/DLAppRealTime/data/test/3_silence/silence-24.wav',
  'array': array([-1.04370117e-02, -1.08032227e-02, -1.09863281e-02, ...,
         -1.15966797e-03, -7.93457031e-04, -6.10351562e-05]),
  'sampling_rate': 48000},
 'label': 3}

In [7]:

# Describe the `label` column as a ClassLabel feature carrying the
# human-readable names from `label_names`.
n_classes = len(label_names)
class_label = ClassLabel(num_classes=n_classes, names=label_names)

# Apply the new feature type to every split.
audio_dataset = audio_dataset.cast_column("label", class_label)

# Evaluated but not displayed: a notebook cell only shows its last expression.
audio_dataset["train"].features

# Cell output: the DatasetDict after the cast.
audio_dataset


Casting the dataset: 100%|██████████| 144/144 [00:00<00:00, 7241.44 examples/s]
Casting the dataset: 100%|██████████| 16/16 [00:00<00:00, 819.73 examples/s]


DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 144
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 16
    })
})

In [8]:
# Keep a handle on the `label` feature's int2str method so later cells can
# turn an integer class id into its class name.
label_feature = audio_dataset["train"].features["label"]
id2label_fn = label_feature.int2str

In [9]:
# Checkpoint to fine-tune, and the feature extractor that belongs to it.
model_id = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)

# Sampling rate reported by the extractor; reused when casting the audio column.
sampling_rate = feature_extractor.sampling_rate

# Cell output: the sampling rate.
sampling_rate

16000

In [10]:
# Cast the `audio` column to the feature extractor's sampling rate so that
# every clip is delivered at the rate the model expects.
target_audio_feature = Audio(sampling_rate=sampling_rate)
audio_dataset = audio_dataset.cast_column("audio", target_audio_feature)

# Cell output: the first training clip, to verify its `sampling_rate`.
audio_dataset["train"][0]["audio"]

{'path': 'C:/GitHub/SED_sandbox/DLAppRealTime/data/test/0_pingpong/ball_racket-32.wav',
 'array': array([ 0.00305176,  0.00405884,  0.00286865, ..., -0.00485229,
        -0.0043335 , -0.00323486]),
 'sampling_rate': 16000}

In [11]:
# preprocess dataset for adapting model
# according to: https://github.com/karolpiczak/ESC-50
# Length of the audio data used for training, in seconds: preprocess_function
# multiplies it by the feature extractor's sampling rate to get `max_length`.
# NOTE(review): the AST feature extractor's __call__ appears to take
# `max_length`/`truncation` only through **kwargs and to pad/truncate to its
# own configured max_length instead — confirm this value has any effect.
max_duration = 1

def preprocess_function(examples):
    """Turn a batch of decoded audio examples into model inputs.

    Args:
        examples: A batch in `datasets` batched-map layout; `examples["audio"]`
            is a list of dicts, each holding the waveform under "array".

    Returns:
        Whatever the module-level `feature_extractor` returns for the batch.

    The extractor is called with its own sampling rate, a `max_length` of
    `sampling_rate * max_duration` (truncated to int) and `truncation=True`.
    NOTE(review): for the AST feature extractor these two keyword arguments
    appear to be swallowed by **kwargs and not applied — confirm.
    """
    waveforms = [clip["array"] for clip in examples["audio"]]
    rate = feature_extractor.sampling_rate
    sample_budget = int(rate * max_duration)
    return feature_extractor(
        waveforms,
        sampling_rate=rate,
        max_length=sample_budget,
        truncation=True,
    )

# Run the feature extractor over every split in batches of 100 examples using
# a single process, and drop the raw `audio` column from the result.
map_options = dict(
    batched=True,
    batch_size=100,
    num_proc=1,
    remove_columns=["audio"],
)
audio_dataset_encoded = audio_dataset.map(preprocess_function, **map_options)

# Cell output: the encoded DatasetDict.
audio_dataset_encoded

Map: 100%|██████████| 144/144 [00:02<00:00, 63.36 examples/s]
Map: 100%|██████████| 16/16 [00:00<00:00, 71.25 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 144
    })
    test: Dataset({
        features: ['label', 'input_values'],
        num_rows: 16
    })
})

In [12]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the id <-> name lookup tables handed to the model configuration.
# Ids are stored as strings on both sides, exactly as produced here.
class_names = audio_dataset_encoded["train"].features["label"].names
id2label = {str(class_id): id2label_fn(class_id) for class_id in range(len(class_names))}
label2id = {name: class_id for class_id, name in id2label.items()}
num_labels = len(id2label)

# Load the pretrained checkpoint with a classification head sized for our
# labels. ignore_mismatched_sizes=True lets loading proceed even though the
# checkpoint's classifier has a different shape (see the warning in the output).
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)
model = model.to(device)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# define training arguments
import os  # cell-local import: only needed here, to read the Hub token

model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 3

# Security: never hard-code the Hugging Face Hub token in the notebook. It is
# read from the environment instead; any token that was ever committed to the
# notebook must be considered leaked and should be revoked.
# If neither variable is set, hub_token is None and is passed through as such.
# NOTE(review): with hub_token=None the Hub client is expected to fall back to
# credentials cached by `huggingface-cli login` — confirm for the installed version.
hub_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

# Request fp16 mixed precision only when CUDA is available. The model cell
# falls back to "cpu" without a GPU, and an unconditional fp16=True would make
# that fallback fail (NOTE(review): exact behaviour is version-dependent).
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    f"./model/pingpong-{model_name}-finetuned",  # output directory
    evaluation_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # checkpoint once per epoch
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=use_fp16,
    hub_token=hub_token,
    push_to_hub=True,
)

# Load the "accuracy" metric via the `evaluate` library; compute_metrics calls
# its compute() method.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Score one set of evaluation predictions with the module-level `metric`.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (the reference class ids).

    Returns:
        The result of `metric.compute` for the arg-max class of each row
        compared against the reference labels.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    # The predicted class is the index of the highest score along axis 1.
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

# Both splits are handed to the Trainer in "torch" format.
train_split = audio_dataset_encoded["train"].with_format("torch")
eval_split = audio_dataset_encoded["test"].with_format("torch")

# Wire model, arguments, data and metric together. The feature extractor is
# passed through the `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

In [14]:
# Run fine-tuning; the value returned by train() is shown as the cell output.
train_output = trainer.train()
train_output

  9%|▉         | 5/54 [00:50<07:22,  9.02s/it]

{'loss': 1.5481, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.28}


 19%|█▊        | 10/54 [01:32<06:11,  8.44s/it]

{'loss': 0.2194, 'learning_rate': 4.6875e-05, 'epoch': 0.56}


 28%|██▊       | 15/54 [02:14<05:31,  8.49s/it]

{'loss': 0.0051, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.83}


                                               
 33%|███▎      | 18/54 [02:47<05:11,  8.64s/it]Checkpoint destination directory ./model/pingpong-ast-finetuned-audioset-10-10-0.4593-finetuned\checkpoint-18 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 0.0005850121378898621, 'eval_accuracy': 1.0, 'eval_runtime': 6.9162, 'eval_samples_per_second': 2.313, 'eval_steps_per_second': 0.289, 'epoch': 1.0}


 37%|███▋      | 20/54 [03:05<05:46, 10.19s/it]

{'loss': 0.0006, 'learning_rate': 3.6458333333333336e-05, 'epoch': 1.11}


 46%|████▋     | 25/54 [03:46<04:08,  8.56s/it]

{'loss': 0.0003, 'learning_rate': 3.125e-05, 'epoch': 1.39}


 56%|█████▌    | 30/54 [04:28<03:21,  8.39s/it]

{'loss': 0.0001, 'learning_rate': 2.604166666666667e-05, 'epoch': 1.67}


 65%|██████▍   | 35/54 [05:10<02:38,  8.32s/it]

{'loss': 0.0001, 'learning_rate': 2.0833333333333336e-05, 'epoch': 1.94}


                                               
 67%|██████▋   | 36/54 [05:24<02:29,  8.28s/it]Checkpoint destination directory ./model/pingpong-ast-finetuned-audioset-10-10-0.4593-finetuned\checkpoint-36 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'eval_loss': 0.00011798739433288574, 'eval_accuracy': 1.0, 'eval_runtime': 6.8236, 'eval_samples_per_second': 2.345, 'eval_steps_per_second': 0.293, 'epoch': 2.0}


 74%|███████▍  | 40/54 [05:59<02:08,  9.16s/it]

{'loss': 0.0001, 'learning_rate': 1.5625e-05, 'epoch': 2.22}


 83%|████████▎ | 45/54 [06:42<01:17,  8.58s/it]

{'loss': 0.0001, 'learning_rate': 1.0416666666666668e-05, 'epoch': 2.5}


 93%|█████████▎| 50/54 [07:28<00:36,  9.09s/it]

{'loss': 0.0001, 'learning_rate': 5.208333333333334e-06, 'epoch': 2.78}


                                               
100%|██████████| 54/54 [08:11<00:00,  8.81s/it]

{'eval_loss': 0.00010009109973907471, 'eval_accuracy': 1.0, 'eval_runtime': 8.1021, 'eval_samples_per_second': 1.975, 'eval_steps_per_second': 0.247, 'epoch': 3.0}


100%|██████████| 54/54 [08:13<00:00,  9.15s/it]


{'train_runtime': 493.8597, 'train_samples_per_second': 0.875, 'train_steps_per_second': 0.109, 'train_loss': 0.1642560550460109, 'epoch': 3.0}


TrainOutput(global_step=54, training_loss=0.1642560550460109, metrics={'train_runtime': 493.8597, 'train_samples_per_second': 0.875, 'train_steps_per_second': 0.109, 'train_loss': 0.1642560550460109, 'epoch': 3.0})