In [4]:
import os

import evaluate
import numpy as np
import torch
from datasets import (
    Audio,
    ClassLabel,
    Dataset,
    Sequence,
    features,
    get_dataset_split_names,
    load_dataset,
)
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    Trainer,
    TrainingArguments,
    pipeline,
)

In [5]:
# Load the clips with the "audiofolder" builder: the directory is expected to
# hold a metadata.csv next to the .wav files (file1.wav, file2.wav, ...).
# Docs: https://huggingface.co/docs/datasets/audio_load
# Alternative location kept for reference: ./data/UrbanSound8K/audio
# Alternative dataset kept for reference: load_dataset("marsyas/gtzan", "all")
audio_dataset = load_dataset("audiofolder", data_dir="./data/test/")

# Show the decoded audio of the first example (path, samples, sampling rate).
audio_dataset["train"][0]["audio"]

Resolving data files: 100%|██████████| 8732/8732 [00:00<00:00, 24803.04it/s]


{'path': 'C:/GitHub/SED_sandbox/DLAppRealTime/pytorch/data/test/100032-3-0-0.wav',
 'array': array([-0.00454712, -0.00483704, -0.00460815, ..., -0.00065613,
        -0.00048828,  0.        ]),
 'sampling_rate': 44100}

In [6]:
# Hold out 10% of the clips as a test split; the fixed seed keeps the
# shuffled split reproducible.
# NOTE: this rebinds audio_dataset, so re-running the cell splits the
# already-reduced "train" split again.
audio_dataset = audio_dataset["train"].train_test_split(
    test_size=0.1,
    shuffle=True,
    seed=42,
)

In [16]:
# Inspect one example of the held-out split.
# NOTE(review): this cell's execution count (16) is higher than that of the
# cells that follow it, so the saved output shows the dataset after the label
# cast and the 16 kHz resampling rather than its state at this point.
audio_dataset["test"][4]

{'audio': {'path': 'C:/GitHub/SED_sandbox/DLAppRealTime/pytorch/data/test/189981-0-0-10.wav',
  'array': array([0.02873513, 0.06427633, 0.05929738, ..., 0.06126376, 0.05624457,
         0.04361588]),
  'sampling_rate': 16000},
 'class': 0,
 'classID': 0}

In [7]:
# Class names listed in the order that defines their integer ids
# (index 0 is "air_conditioner", index 9 is "street_music").
label_names = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music",
]

# Cast the "class" column to a ClassLabel feature so that int2str / names
# become available on it; the number of classes follows from the names.
class_label = ClassLabel(names=label_names)
audio_dataset = audio_dataset.cast_column("class", class_label)
audio_dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'class', 'classID'],
        num_rows: 7857
    })
    test: Dataset({
        features: ['audio', 'class', 'classID'],
        num_rows: 874
    })
})

In [8]:
# Keep a handle on the ClassLabel feature's int2str method so integer class
# ids can be turned back into label names.
class_feature = audio_dataset["train"].features["class"]
id2label_fn = class_feature.int2str

In [9]:
# Checkpoint to fine-tune, and the feature extractor published with it.
model_id = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)

# Sampling rate the extractor is configured for; the dataset is resampled
# to this value before preprocessing.
sampling_rate = feature_extractor.sampling_rate
sampling_rate

16000

In [10]:
# Re-declare the audio column at the feature extractor's sampling rate so
# that examples come back resampled when they are accessed.
resampled_audio = Audio(sampling_rate=sampling_rate)
audio_dataset = audio_dataset.cast_column("audio", resampled_audio)

# Check the first training example: 'sampling_rate' should now match.
audio_dataset["train"][0]["audio"]

{'path': 'C:/GitHub/SED_sandbox/DLAppRealTime/pytorch/data/test/105425-9-0-14.wav',
 'array': array([-0.01866602, -0.0220116 , -0.02827694, ..., -0.10153409,
        -0.10287194, -0.10586938]),
 'sampling_rate': 16000}

In [11]:
# Clip-length budget in seconds that is handed to the feature extractor.
# Reference for the value: https://github.com/karolpiczak/ESC-50
max_duration = 5


def preprocess_function(examples):
    """Convert a batch of decoded audio into feature-extractor outputs.

    Args:
        examples: A batch as passed by ``Dataset.map(batched=True)``. Only the
            ``"audio"`` column is read; each of its entries must provide the
            raw samples under the ``"array"`` key.

    Returns:
        The object returned by ``feature_extractor`` for the whole batch.
    """
    rate = feature_extractor.sampling_rate
    waveforms = [clip["array"] for clip in examples["audio"]]
    # NOTE(review): max_length is expressed here in samples
    # (rate * max_duration). Confirm that the extractor loaded for this
    # checkpoint honours a per-call max_length/truncation instead of using
    # its own configured length.
    return feature_extractor(
        waveforms,
        sampling_rate=rate,
        max_length=int(rate * max_duration),
        truncation=True,
    )

# Run the feature extraction over every split in batches of 100 examples.
# The raw "audio" column and the numeric "classID" column are dropped, and the
# ClassLabel column "class" is renamed to "label", the column name read by the
# later model-setup cell.
audio_dataset_encoded = audio_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=100,
    num_proc=1,
    remove_columns=["audio", "classID"],
).rename_column("class", "label")
audio_dataset_encoded

Map:   0%|          | 0/7857 [00:00<?, ? examples/s]

Map: 100%|██████████| 7857/7857 [02:29<00:00, 52.60 examples/s]
Map: 100%|██████████| 874/874 [00:10<00:00, 80.50 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 7857
    })
    test: Dataset({
        features: ['label', 'input_values'],
        num_rows: 874
    })
})

In [12]:
# Build the label mappings and load the pretrained classifier.

# Put the model on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Label names in class-id order, read from the ClassLabel feature of the
# encoded dataset.
class_names = audio_dataset_encoded["train"].features["label"].names

# Both mappings use integer ids: id2label is {int: str} and label2id is
# {str: int}. Inverting a str-keyed id2label would store the ids in label2id
# (and in the saved config) as strings such as "0".
id2label = dict(enumerate(class_names))
label2id = {name: idx for idx, name in id2label.items()}
num_labels = len(id2label)

model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    # The checkpoint's classification head has a different number of outputs
    # than num_labels, so allow it to be re-initialised instead of raising.
    ignore_mismatched_sizes=True,
).to(device)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Training configuration for the fine-tuning run.

# Output directory / Hub repository name is derived from the checkpoint name.
model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 2
training_args = TrainingArguments(
    f"{model_name}-finetuned-us8k",
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # instead of failing when no GPU is present.
    fp16=torch.cuda.is_available(),
    # Never commit access tokens to the notebook. The token is read from the
    # HF_TOKEN environment variable; when it is unset, None lets the Hub client
    # fall back to the credentials stored by `huggingface-cli login`.
    # SECURITY: the token that used to be hard-coded here must be revoked.
    hub_token=os.environ.get("HF_TOKEN"),
    push_to_hub=True,
)

# Accuracy metric loaded through the `evaluate` library; compute_metrics
# calls its .compute() method.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the reference class ids).

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    scores = eval_pred.predictions
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=eval_pred.label_ids)

# Both splits are handed to the Trainer in torch format.
train_split = audio_dataset_encoded["train"].with_format("torch")
eval_split = audio_dataset_encoded["test"].with_format("torch")

# Tie together the model, the training arguments, the data and the metric.
# The feature extractor is passed through the `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

In [14]:
# Start fine-tuning. The training arguments configured earlier request an
# evaluation and a checkpoint once per epoch.
# NOTE: the saved output of this cell ends in a KeyboardInterrupt, i.e. the
# recorded run was stopped by hand before it finished.
trainer.train()

  0%|          | 5/1966 [00:42<4:23:09,  8.05s/it]

{'loss': 2.4907, 'learning_rate': 1.015228426395939e-06, 'epoch': 0.01}


  1%|          | 10/1966 [01:24<4:21:45,  8.03s/it]

{'loss': 2.3848, 'learning_rate': 2.284263959390863e-06, 'epoch': 0.01}


  1%|          | 15/1966 [02:02<4:07:23,  7.61s/it]

{'loss': 2.3711, 'learning_rate': 3.5532994923857873e-06, 'epoch': 0.02}


  1%|          | 20/1966 [02:40<4:07:03,  7.62s/it]

{'loss': 2.1561, 'learning_rate': 4.822335025380711e-06, 'epoch': 0.02}


  1%|▏         | 25/1966 [03:18<4:08:53,  7.69s/it]

{'loss': 2.0338, 'learning_rate': 6.091370558375635e-06, 'epoch': 0.03}


  2%|▏         | 30/1966 [04:01<4:28:02,  8.31s/it]

{'loss': 1.8668, 'learning_rate': 7.360406091370558e-06, 'epoch': 0.03}


  2%|▏         | 35/1966 [04:49<5:01:58,  9.38s/it]

{'loss': 1.6052, 'learning_rate': 8.629441624365483e-06, 'epoch': 0.04}


  2%|▏         | 40/1966 [05:38<4:56:59,  9.25s/it]

{'loss': 1.5396, 'learning_rate': 9.898477157360408e-06, 'epoch': 0.04}


  2%|▏         | 45/1966 [06:28<4:57:31,  9.29s/it]

{'loss': 1.455, 'learning_rate': 1.116751269035533e-05, 'epoch': 0.05}


  3%|▎         | 50/1966 [07:18<4:57:23,  9.31s/it]

{'loss': 1.2432, 'learning_rate': 1.2436548223350254e-05, 'epoch': 0.05}


  3%|▎         | 55/1966 [08:07<4:54:55,  9.26s/it]

{'loss': 0.9904, 'learning_rate': 1.3705583756345178e-05, 'epoch': 0.06}


  3%|▎         | 60/1966 [08:57<4:53:45,  9.25s/it]

{'loss': 0.7145, 'learning_rate': 1.4974619289340103e-05, 'epoch': 0.06}


  3%|▎         | 65/1966 [09:47<4:55:15,  9.32s/it]

{'loss': 0.7009, 'learning_rate': 1.6243654822335024e-05, 'epoch': 0.07}


  4%|▎         | 70/1966 [10:37<4:55:50,  9.36s/it]

{'loss': 0.5097, 'learning_rate': 1.751269035532995e-05, 'epoch': 0.07}


  4%|▍         | 75/1966 [11:28<5:01:43,  9.57s/it]

{'loss': 0.634, 'learning_rate': 1.8781725888324874e-05, 'epoch': 0.08}


  4%|▍         | 80/1966 [12:18<4:58:09,  9.49s/it]

{'loss': 0.4888, 'learning_rate': 2.0050761421319797e-05, 'epoch': 0.08}


  4%|▍         | 85/1966 [13:09<4:56:38,  9.46s/it]

{'loss': 0.6283, 'learning_rate': 2.1319796954314723e-05, 'epoch': 0.09}


  5%|▍         | 90/1966 [13:59<4:52:08,  9.34s/it]

{'loss': 0.4217, 'learning_rate': 2.2588832487309646e-05, 'epoch': 0.09}


  5%|▍         | 95/1966 [14:49<4:52:10,  9.37s/it]

{'loss': 0.693, 'learning_rate': 2.385786802030457e-05, 'epoch': 0.1}


  5%|▌         | 100/1966 [15:40<4:53:20,  9.43s/it]

{'loss': 0.3811, 'learning_rate': 2.5126903553299492e-05, 'epoch': 0.1}


  5%|▌         | 105/1966 [16:30<4:52:30,  9.43s/it]

{'loss': 0.4011, 'learning_rate': 2.6395939086294418e-05, 'epoch': 0.11}


  6%|▌         | 110/1966 [17:21<4:50:45,  9.40s/it]

{'loss': 0.1926, 'learning_rate': 2.766497461928934e-05, 'epoch': 0.11}


  6%|▌         | 115/1966 [18:10<4:44:24,  9.22s/it]

{'loss': 0.5305, 'learning_rate': 2.8934010152284264e-05, 'epoch': 0.12}


  6%|▌         | 120/1966 [19:00<4:47:44,  9.35s/it]

{'loss': 0.5248, 'learning_rate': 3.020304568527919e-05, 'epoch': 0.12}


  6%|▋         | 125/1966 [19:49<4:47:15,  9.36s/it]

{'loss': 0.3666, 'learning_rate': 3.147208121827411e-05, 'epoch': 0.13}


  7%|▋         | 130/1966 [20:40<4:47:58,  9.41s/it]

{'loss': 0.7253, 'learning_rate': 3.2741116751269036e-05, 'epoch': 0.13}


  7%|▋         | 135/1966 [21:40<5:38:51, 11.10s/it]

{'loss': 0.4015, 'learning_rate': 3.401015228426396e-05, 'epoch': 0.14}


  7%|▋         | 140/1966 [22:28<4:42:22,  9.28s/it]

{'loss': 0.3175, 'learning_rate': 3.527918781725888e-05, 'epoch': 0.14}


  7%|▋         | 145/1966 [23:16<4:31:59,  8.96s/it]

{'loss': 0.2779, 'learning_rate': 3.654822335025381e-05, 'epoch': 0.15}


  8%|▊         | 150/1966 [24:04<4:30:23,  8.93s/it]

{'loss': 0.3516, 'learning_rate': 3.7817258883248735e-05, 'epoch': 0.15}


  8%|▊         | 155/1966 [24:53<4:35:29,  9.13s/it]

{'loss': 0.775, 'learning_rate': 3.9086294416243655e-05, 'epoch': 0.16}


  8%|▊         | 160/1966 [25:42<4:39:17,  9.28s/it]

{'loss': 0.7601, 'learning_rate': 4.035532994923858e-05, 'epoch': 0.16}


  8%|▊         | 165/1966 [26:36<4:59:05,  9.96s/it]

{'loss': 0.4254, 'learning_rate': 4.162436548223351e-05, 'epoch': 0.17}


  9%|▊         | 170/1966 [27:26<4:49:48,  9.68s/it]

{'loss': 0.4614, 'learning_rate': 4.289340101522843e-05, 'epoch': 0.17}


  9%|▉         | 175/1966 [28:20<5:04:00, 10.18s/it]

{'loss': 0.5663, 'learning_rate': 4.416243654822335e-05, 'epoch': 0.18}


  9%|▉         | 180/1966 [29:15<5:10:35, 10.43s/it]

{'loss': 0.5851, 'learning_rate': 4.517766497461929e-05, 'epoch': 0.18}


  9%|▉         | 185/1966 [30:09<5:06:58, 10.34s/it]

{'loss': 0.6889, 'learning_rate': 4.644670050761422e-05, 'epoch': 0.19}


 10%|▉         | 190/1966 [31:15<6:04:41, 12.32s/it]

{'loss': 0.3042, 'learning_rate': 4.771573604060914e-05, 'epoch': 0.19}


 10%|▉         | 195/1966 [32:17<5:58:45, 12.15s/it]

{'loss': 0.5727, 'learning_rate': 4.8984771573604064e-05, 'epoch': 0.2}


 10%|█         | 200/1966 [33:12<5:07:00, 10.43s/it]

{'loss': 0.6626, 'learning_rate': 4.997173544375354e-05, 'epoch': 0.2}


 10%|█         | 205/1966 [34:01<4:33:42,  9.33s/it]

{'loss': 0.3334, 'learning_rate': 4.98304126625212e-05, 'epoch': 0.21}


 11%|█         | 210/1966 [34:50<4:27:20,  9.13s/it]

{'loss': 0.502, 'learning_rate': 4.9689089881288865e-05, 'epoch': 0.21}


 11%|█         | 215/1966 [35:39<4:24:59,  9.08s/it]

{'loss': 0.2736, 'learning_rate': 4.954776710005653e-05, 'epoch': 0.22}


 11%|█         | 220/1966 [36:27<4:24:15,  9.08s/it]

{'loss': 0.7741, 'learning_rate': 4.94064443188242e-05, 'epoch': 0.22}


 11%|█▏        | 225/1966 [37:24<4:57:32, 10.25s/it]

{'loss': 0.5373, 'learning_rate': 4.9265121537591863e-05, 'epoch': 0.23}


 12%|█▏        | 230/1966 [38:14<4:34:00,  9.47s/it]

{'loss': 0.6122, 'learning_rate': 4.912379875635953e-05, 'epoch': 0.23}


 12%|█▏        | 235/1966 [39:05<4:32:22,  9.44s/it]

{'loss': 0.4687, 'learning_rate': 4.89824759751272e-05, 'epoch': 0.24}


 12%|█▏        | 240/1966 [40:04<5:20:26, 11.14s/it]

{'loss': 0.2899, 'learning_rate': 4.8841153193894855e-05, 'epoch': 0.24}


 12%|█▏        | 245/1966 [40:53<4:34:55,  9.58s/it]

{'loss': 0.1854, 'learning_rate': 4.8699830412662525e-05, 'epoch': 0.25}


 13%|█▎        | 250/1966 [41:42<4:25:49,  9.29s/it]

{'loss': 0.4782, 'learning_rate': 4.855850763143019e-05, 'epoch': 0.25}


 13%|█▎        | 255/1966 [42:32<4:23:43,  9.25s/it]

{'loss': 0.868, 'learning_rate': 4.841718485019785e-05, 'epoch': 0.26}


 13%|█▎        | 260/1966 [43:21<4:22:40,  9.24s/it]

{'loss': 0.542, 'learning_rate': 4.827586206896552e-05, 'epoch': 0.26}


 13%|█▎        | 265/1966 [44:11<4:21:48,  9.23s/it]

{'loss': 0.3811, 'learning_rate': 4.813453928773318e-05, 'epoch': 0.27}


 14%|█▎        | 270/1966 [45:06<4:48:12, 10.20s/it]

{'loss': 0.1396, 'learning_rate': 4.799321650650085e-05, 'epoch': 0.27}


 14%|█▍        | 275/1966 [45:56<4:26:18,  9.45s/it]

{'loss': 0.3394, 'learning_rate': 4.7851893725268515e-05, 'epoch': 0.28}


 14%|█▍        | 280/1966 [46:51<4:41:38, 10.02s/it]

{'loss': 0.3855, 'learning_rate': 4.771057094403618e-05, 'epoch': 0.28}


 14%|█▍        | 285/1966 [47:41<4:25:10,  9.46s/it]

{'loss': 1.0194, 'learning_rate': 4.756924816280385e-05, 'epoch': 0.29}


 15%|█▍        | 290/1966 [48:30<4:21:33,  9.36s/it]

{'loss': 0.3222, 'learning_rate': 4.742792538157151e-05, 'epoch': 0.3}


 15%|█▌        | 295/1966 [49:20<4:18:41,  9.29s/it]

{'loss': 0.7682, 'learning_rate': 4.731486715658565e-05, 'epoch': 0.3}


 15%|█▌        | 300/1966 [50:10<4:18:25,  9.31s/it]

{'loss': 0.6657, 'learning_rate': 4.7173544375353305e-05, 'epoch': 0.31}


 16%|█▌        | 305/1966 [51:00<4:19:57,  9.39s/it]

{'loss': 0.0783, 'learning_rate': 4.7032221594120975e-05, 'epoch': 0.31}


 16%|█▌        | 310/1966 [51:50<4:21:34,  9.48s/it]

{'loss': 0.363, 'learning_rate': 4.689089881288864e-05, 'epoch': 0.32}


 16%|█▌        | 315/1966 [52:40<4:20:27,  9.47s/it]

{'loss': 0.4765, 'learning_rate': 4.67495760316563e-05, 'epoch': 0.32}


 16%|█▋        | 320/1966 [53:31<4:23:22,  9.60s/it]

{'loss': 0.5189, 'learning_rate': 4.660825325042397e-05, 'epoch': 0.33}


 17%|█▋        | 325/1966 [54:21<4:15:49,  9.35s/it]

{'loss': 0.3011, 'learning_rate': 4.646693046919164e-05, 'epoch': 0.33}


 17%|█▋        | 330/1966 [55:10<4:13:09,  9.28s/it]

{'loss': 0.794, 'learning_rate': 4.63256076879593e-05, 'epoch': 0.34}


 17%|█▋        | 335/1966 [55:59<4:10:45,  9.22s/it]

{'loss': 0.5238, 'learning_rate': 4.6184284906726965e-05, 'epoch': 0.34}


 17%|█▋        | 340/1966 [56:49<4:09:50,  9.22s/it]

{'loss': 0.5591, 'learning_rate': 4.6042962125494635e-05, 'epoch': 0.35}


 18%|█▊        | 345/1966 [57:38<4:08:55,  9.21s/it]

{'loss': 0.6256, 'learning_rate': 4.59016393442623e-05, 'epoch': 0.35}


 18%|█▊        | 350/1966 [58:28<4:09:25,  9.26s/it]

{'loss': 0.9194, 'learning_rate': 4.576031656302996e-05, 'epoch': 0.36}


 18%|█▊        | 355/1966 [59:17<4:08:40,  9.26s/it]

{'loss': 0.3829, 'learning_rate': 4.5618993781797626e-05, 'epoch': 0.36}


 18%|█▊        | 360/1966 [1:00:06<4:07:06,  9.23s/it]

{'loss': 0.6711, 'learning_rate': 4.547767100056529e-05, 'epoch': 0.37}


 19%|█▊        | 365/1966 [1:00:56<4:05:52,  9.21s/it]

{'loss': 0.3075, 'learning_rate': 4.533634821933296e-05, 'epoch': 0.37}


 19%|█▉        | 370/1966 [1:01:45<4:05:50,  9.24s/it]

{'loss': 0.2734, 'learning_rate': 4.5195025438100624e-05, 'epoch': 0.38}


 19%|█▉        | 375/1966 [1:02:34<4:04:48,  9.23s/it]

{'loss': 0.4384, 'learning_rate': 4.505370265686829e-05, 'epoch': 0.38}


 19%|█▉        | 380/1966 [1:03:24<4:04:42,  9.26s/it]

{'loss': 0.348, 'learning_rate': 4.491237987563596e-05, 'epoch': 0.39}


 20%|█▉        | 385/1966 [1:04:13<4:03:28,  9.24s/it]

{'loss': 0.2559, 'learning_rate': 4.4771057094403616e-05, 'epoch': 0.39}


 20%|█▉        | 390/1966 [1:05:03<4:03:02,  9.25s/it]

{'loss': 0.3688, 'learning_rate': 4.4629734313171286e-05, 'epoch': 0.4}


 20%|██        | 395/1966 [1:05:52<4:01:54,  9.24s/it]

{'loss': 0.2312, 'learning_rate': 4.448841153193895e-05, 'epoch': 0.4}


 20%|██        | 400/1966 [1:06:42<4:01:56,  9.27s/it]

{'loss': 0.2256, 'learning_rate': 4.4347088750706614e-05, 'epoch': 0.41}


 21%|██        | 405/1966 [1:07:31<3:59:53,  9.22s/it]

{'loss': 0.5448, 'learning_rate': 4.4205765969474284e-05, 'epoch': 0.41}


 21%|██        | 410/1966 [1:08:20<3:59:22,  9.23s/it]

{'loss': 0.6834, 'learning_rate': 4.406444318824194e-05, 'epoch': 0.42}


 21%|██        | 415/1966 [1:09:10<3:58:11,  9.21s/it]

{'loss': 0.8011, 'learning_rate': 4.392312040700961e-05, 'epoch': 0.42}


 21%|██▏       | 420/1966 [1:09:59<3:57:49,  9.23s/it]

{'loss': 0.1566, 'learning_rate': 4.3781797625777276e-05, 'epoch': 0.43}


 22%|██▏       | 425/1966 [1:10:48<3:57:06,  9.23s/it]

{'loss': 0.4086, 'learning_rate': 4.3640474844544946e-05, 'epoch': 0.43}


 22%|██▏       | 430/1966 [1:11:38<3:57:05,  9.26s/it]

{'loss': 0.5207, 'learning_rate': 4.349915206331261e-05, 'epoch': 0.44}


 22%|██▏       | 435/1966 [1:12:27<3:55:27,  9.23s/it]

{'loss': 0.548, 'learning_rate': 4.3357829282080274e-05, 'epoch': 0.44}


 22%|██▏       | 440/1966 [1:13:17<3:55:31,  9.26s/it]

{'loss': 0.1134, 'learning_rate': 4.321650650084794e-05, 'epoch': 0.45}


 23%|██▎       | 445/1966 [1:14:06<3:54:18,  9.24s/it]

{'loss': 0.5937, 'learning_rate': 4.30751837196156e-05, 'epoch': 0.45}


 23%|██▎       | 450/1966 [1:14:55<3:53:26,  9.24s/it]

{'loss': 0.8978, 'learning_rate': 4.293386093838327e-05, 'epoch': 0.46}


 23%|██▎       | 455/1966 [1:15:45<3:54:24,  9.31s/it]

{'loss': 0.4702, 'learning_rate': 4.2792538157150935e-05, 'epoch': 0.46}


 23%|██▎       | 460/1966 [1:16:35<3:52:30,  9.26s/it]

{'loss': 0.4619, 'learning_rate': 4.26512153759186e-05, 'epoch': 0.47}


 24%|██▎       | 465/1966 [1:17:24<3:51:45,  9.26s/it]

{'loss': 0.7688, 'learning_rate': 4.250989259468627e-05, 'epoch': 0.47}


 24%|██▍       | 470/1966 [1:18:13<3:50:10,  9.23s/it]

{'loss': 0.3993, 'learning_rate': 4.236856981345393e-05, 'epoch': 0.48}


 24%|██▍       | 475/1966 [1:19:03<3:49:28,  9.23s/it]

{'loss': 0.3377, 'learning_rate': 4.22272470322216e-05, 'epoch': 0.48}


 24%|██▍       | 480/1966 [1:19:52<3:49:22,  9.26s/it]

{'loss': 0.2268, 'learning_rate': 4.208592425098926e-05, 'epoch': 0.49}


 25%|██▍       | 485/1966 [1:20:42<3:48:17,  9.25s/it]

{'loss': 0.2586, 'learning_rate': 4.1944601469756925e-05, 'epoch': 0.49}


 25%|██▍       | 490/1966 [1:21:31<3:46:48,  9.22s/it]

{'loss': 0.3697, 'learning_rate': 4.1803278688524595e-05, 'epoch': 0.5}


 25%|██▌       | 495/1966 [1:22:20<3:46:20,  9.23s/it]

{'loss': 0.7545, 'learning_rate': 4.166195590729225e-05, 'epoch': 0.5}


 25%|██▌       | 500/1966 [1:23:10<3:46:27,  9.27s/it]

{'loss': 0.2125, 'learning_rate': 4.152063312605992e-05, 'epoch': 0.51}


 26%|██▌       | 505/1966 [1:23:59<3:45:05,  9.24s/it]

{'loss': 0.3292, 'learning_rate': 4.1379310344827587e-05, 'epoch': 0.51}


 26%|██▌       | 510/1966 [1:24:48<3:44:22,  9.25s/it]

{'loss': 0.0856, 'learning_rate': 4.123798756359526e-05, 'epoch': 0.52}


 26%|██▌       | 515/1966 [1:25:38<3:43:02,  9.22s/it]

{'loss': 0.4193, 'learning_rate': 4.109666478236292e-05, 'epoch': 0.52}


 26%|██▋       | 520/1966 [1:26:27<3:42:31,  9.23s/it]

{'loss': 0.2557, 'learning_rate': 4.0955342001130585e-05, 'epoch': 0.53}


 27%|██▋       | 525/1966 [1:27:16<3:41:13,  9.21s/it]

{'loss': 0.2731, 'learning_rate': 4.081401921989825e-05, 'epoch': 0.53}


 27%|██▋       | 530/1966 [1:28:06<3:41:15,  9.25s/it]

{'loss': 0.5343, 'learning_rate': 4.067269643866591e-05, 'epoch': 0.54}


 27%|██▋       | 535/1966 [1:28:55<3:39:45,  9.21s/it]

{'loss': 0.6328, 'learning_rate': 4.053137365743358e-05, 'epoch': 0.54}


 27%|██▋       | 540/1966 [1:29:44<3:39:12,  9.22s/it]

{'loss': 0.3299, 'learning_rate': 4.0390050876201246e-05, 'epoch': 0.55}


 28%|██▊       | 545/1966 [1:30:34<3:38:30,  9.23s/it]

{'loss': 0.1572, 'learning_rate': 4.024872809496891e-05, 'epoch': 0.55}


 28%|██▊       | 550/1966 [1:31:23<3:37:41,  9.22s/it]

{'loss': 0.2111, 'learning_rate': 4.0107405313736574e-05, 'epoch': 0.56}


 28%|██▊       | 555/1966 [1:32:12<3:37:05,  9.23s/it]

{'loss': 0.2271, 'learning_rate': 3.996608253250424e-05, 'epoch': 0.56}


 28%|██▊       | 560/1966 [1:33:02<3:36:34,  9.24s/it]

{'loss': 0.2677, 'learning_rate': 3.982475975127191e-05, 'epoch': 0.57}


 29%|██▊       | 565/1966 [1:33:51<3:35:34,  9.23s/it]

{'loss': 0.0674, 'learning_rate': 3.968343697003957e-05, 'epoch': 0.57}


 29%|██▉       | 570/1966 [1:34:40<3:35:08,  9.25s/it]

{'loss': 0.0255, 'learning_rate': 3.9542114188807236e-05, 'epoch': 0.58}


 29%|██▉       | 575/1966 [1:35:30<3:33:42,  9.22s/it]

{'loss': 0.0517, 'learning_rate': 3.9400791407574906e-05, 'epoch': 0.58}


 30%|██▉       | 580/1966 [1:36:19<3:33:05,  9.22s/it]

{'loss': 0.6591, 'learning_rate': 3.925946862634256e-05, 'epoch': 0.59}


 30%|██▉       | 585/1966 [1:37:08<3:32:04,  9.21s/it]

{'loss': 0.417, 'learning_rate': 3.9118145845110234e-05, 'epoch': 0.6}


 30%|███       | 590/1966 [1:37:58<3:31:55,  9.24s/it]

{'loss': 0.2383, 'learning_rate': 3.89768230638779e-05, 'epoch': 0.6}


 30%|███       | 595/1966 [1:38:47<3:30:32,  9.21s/it]

{'loss': 0.3566, 'learning_rate': 3.883550028264557e-05, 'epoch': 0.61}


 31%|███       | 600/1966 [1:39:36<3:30:37,  9.25s/it]

{'loss': 0.1252, 'learning_rate': 3.869417750141323e-05, 'epoch': 0.61}


 31%|███       | 605/1966 [1:40:26<3:29:05,  9.22s/it]

{'loss': 0.2828, 'learning_rate': 3.8552854720180896e-05, 'epoch': 0.62}


 31%|███       | 610/1966 [1:41:15<3:29:01,  9.25s/it]

{'loss': 0.2339, 'learning_rate': 3.841153193894856e-05, 'epoch': 0.62}


 31%|███▏      | 615/1966 [1:42:05<3:28:22,  9.25s/it]

{'loss': 0.2154, 'learning_rate': 3.827020915771622e-05, 'epoch': 0.63}


 32%|███▏      | 620/1966 [1:42:54<3:27:23,  9.25s/it]

{'loss': 0.8429, 'learning_rate': 3.8128886376483894e-05, 'epoch': 0.63}


 32%|███▏      | 625/1966 [1:43:43<3:26:42,  9.25s/it]

{'loss': 0.4116, 'learning_rate': 3.798756359525156e-05, 'epoch': 0.64}


 32%|███▏      | 630/1966 [1:44:33<3:25:30,  9.23s/it]

{'loss': 0.0798, 'learning_rate': 3.784624081401922e-05, 'epoch': 0.64}


 32%|███▏      | 635/1966 [1:45:22<3:24:33,  9.22s/it]

{'loss': 0.0408, 'learning_rate': 3.7704918032786885e-05, 'epoch': 0.65}


 33%|███▎      | 640/1966 [1:46:12<3:24:39,  9.26s/it]

{'loss': 0.3732, 'learning_rate': 3.756359525155455e-05, 'epoch': 0.65}


 33%|███▎      | 645/1966 [1:47:01<3:23:20,  9.24s/it]

{'loss': 0.2023, 'learning_rate': 3.742227247032222e-05, 'epoch': 0.66}


 33%|███▎      | 650/1966 [1:47:50<3:22:35,  9.24s/it]

{'loss': 0.2371, 'learning_rate': 3.728094968908988e-05, 'epoch': 0.66}


 33%|███▎      | 655/1966 [1:48:40<3:21:50,  9.24s/it]

{'loss': 0.1207, 'learning_rate': 3.713962690785755e-05, 'epoch': 0.67}


 34%|███▎      | 660/1966 [1:49:29<3:21:38,  9.26s/it]

{'loss': 0.1362, 'learning_rate': 3.699830412662522e-05, 'epoch': 0.67}


 34%|███▍      | 665/1966 [1:50:18<3:19:24,  9.20s/it]

{'loss': 0.0971, 'learning_rate': 3.6856981345392874e-05, 'epoch': 0.68}


 34%|███▍      | 670/1966 [1:51:08<3:19:25,  9.23s/it]

{'loss': 0.4678, 'learning_rate': 3.6715658564160545e-05, 'epoch': 0.68}


 34%|███▍      | 675/1966 [1:51:57<3:18:23,  9.22s/it]

{'loss': 0.4763, 'learning_rate': 3.657433578292821e-05, 'epoch': 0.69}


 35%|███▍      | 680/1966 [1:52:46<3:18:31,  9.26s/it]

{'loss': 0.3474, 'learning_rate': 3.643301300169588e-05, 'epoch': 0.69}


 35%|███▍      | 685/1966 [1:53:36<3:18:08,  9.28s/it]

{'loss': 0.3829, 'learning_rate': 3.629169022046354e-05, 'epoch': 0.7}


 35%|███▌      | 690/1966 [1:54:25<3:16:24,  9.24s/it]

{'loss': 0.1908, 'learning_rate': 3.61503674392312e-05, 'epoch': 0.7}


 35%|███▌      | 695/1966 [1:55:14<3:15:14,  9.22s/it]

{'loss': 0.0578, 'learning_rate': 3.600904465799887e-05, 'epoch': 0.71}


 36%|███▌      | 700/1966 [1:56:04<3:14:44,  9.23s/it]

{'loss': 0.3792, 'learning_rate': 3.5867721876766534e-05, 'epoch': 0.71}


 36%|███▌      | 705/1966 [1:56:53<3:14:14,  9.24s/it]

{'loss': 0.0788, 'learning_rate': 3.5726399095534205e-05, 'epoch': 0.72}


 36%|███▌      | 710/1966 [1:57:43<3:13:15,  9.23s/it]

{'loss': 0.29, 'learning_rate': 3.558507631430187e-05, 'epoch': 0.72}


 36%|███▋      | 715/1966 [1:58:32<3:12:15,  9.22s/it]

{'loss': 0.1019, 'learning_rate': 3.544375353306953e-05, 'epoch': 0.73}


 37%|███▋      | 720/1966 [1:59:21<3:11:45,  9.23s/it]

{'loss': 0.2025, 'learning_rate': 3.5302430751837196e-05, 'epoch': 0.73}


 37%|███▋      | 725/1966 [2:00:11<3:10:44,  9.22s/it]

{'loss': 0.0614, 'learning_rate': 3.516110797060486e-05, 'epoch': 0.74}


 37%|███▋      | 730/1966 [2:01:00<3:10:20,  9.24s/it]

{'loss': 0.447, 'learning_rate': 3.501978518937253e-05, 'epoch': 0.74}


 37%|███▋      | 735/1966 [2:01:49<3:09:54,  9.26s/it]

{'loss': 0.096, 'learning_rate': 3.4878462408140194e-05, 'epoch': 0.75}


 38%|███▊      | 740/1966 [2:02:39<3:09:01,  9.25s/it]

{'loss': 0.6711, 'learning_rate': 3.473713962690786e-05, 'epoch': 0.75}


 38%|███▊      | 745/1966 [2:03:28<3:08:13,  9.25s/it]

{'loss': 0.0526, 'learning_rate': 3.459581684567553e-05, 'epoch': 0.76}


 38%|███▊      | 750/1966 [2:04:18<3:07:35,  9.26s/it]

{'loss': 0.3408, 'learning_rate': 3.4454494064443185e-05, 'epoch': 0.76}


 38%|███▊      | 755/1966 [2:05:08<3:09:49,  9.41s/it]

{'loss': 0.4047, 'learning_rate': 3.4313171283210856e-05, 'epoch': 0.77}


 39%|███▊      | 760/1966 [2:05:57<3:07:45,  9.34s/it]

{'loss': 0.2621, 'learning_rate': 3.417184850197852e-05, 'epoch': 0.77}


 39%|███▉      | 765/1966 [2:06:47<3:06:46,  9.33s/it]

{'loss': 0.0649, 'learning_rate': 3.403052572074619e-05, 'epoch': 0.78}


 39%|███▉      | 770/1966 [2:07:37<3:06:29,  9.36s/it]

{'loss': 0.4385, 'learning_rate': 3.3889202939513854e-05, 'epoch': 0.78}


 39%|███▉      | 775/1966 [2:08:26<3:04:19,  9.29s/it]

{'loss': 0.1674, 'learning_rate': 3.374788015828151e-05, 'epoch': 0.79}


 40%|███▉      | 780/1966 [2:09:14<3:08:07,  9.52s/it]

{'loss': 0.0204, 'learning_rate': 3.360655737704918e-05, 'epoch': 0.79}


 40%|███▉      | 785/1966 [2:10:04<3:05:27,  9.42s/it]

{'loss': 0.1874, 'learning_rate': 3.3465234595816845e-05, 'epoch': 0.8}


 40%|████      | 790/1966 [2:11:01<3:38:56, 11.17s/it]

{'loss': 0.1412, 'learning_rate': 3.3323911814584516e-05, 'epoch': 0.8}


 40%|████      | 795/1966 [2:11:59<3:24:32, 10.48s/it]

{'loss': 0.1957, 'learning_rate': 3.318258903335218e-05, 'epoch': 0.81}


 41%|████      | 800/1966 [2:12:48<3:03:08,  9.42s/it]

{'loss': 0.0531, 'learning_rate': 3.304126625211984e-05, 'epoch': 0.81}


 41%|████      | 805/1966 [2:13:38<2:59:23,  9.27s/it]

{'loss': 0.4136, 'learning_rate': 3.289994347088751e-05, 'epoch': 0.82}


 41%|████      | 810/1966 [2:14:32<3:21:02, 10.43s/it]

{'loss': 0.2269, 'learning_rate': 3.275862068965517e-05, 'epoch': 0.82}


 41%|████▏     | 815/1966 [2:15:21<2:58:48,  9.32s/it]

{'loss': 0.4349, 'learning_rate': 3.261729790842284e-05, 'epoch': 0.83}


 42%|████▏     | 820/1966 [2:16:21<3:20:01, 10.47s/it]

{'loss': 0.0418, 'learning_rate': 3.2475975127190505e-05, 'epoch': 0.83}


 42%|████▏     | 825/1966 [2:17:09<2:54:21,  9.17s/it]

{'loss': 0.1913, 'learning_rate': 3.233465234595817e-05, 'epoch': 0.84}


 42%|████▏     | 830/1966 [2:17:57<2:49:31,  8.95s/it]

{'loss': 0.1142, 'learning_rate': 3.219332956472583e-05, 'epoch': 0.84}


 42%|████▏     | 835/1966 [2:18:45<2:47:22,  8.88s/it]

{'loss': 0.193, 'learning_rate': 3.2052006783493496e-05, 'epoch': 0.85}


 43%|████▎     | 840/1966 [2:19:33<2:47:08,  8.91s/it]

{'loss': 0.3401, 'learning_rate': 3.191068400226117e-05, 'epoch': 0.85}


 43%|████▎     | 845/1966 [2:20:20<2:45:47,  8.87s/it]

{'loss': 0.2552, 'learning_rate': 3.176936122102883e-05, 'epoch': 0.86}


 43%|████▎     | 850/1966 [2:21:08<2:45:20,  8.89s/it]

{'loss': 0.1238, 'learning_rate': 3.16280384397965e-05, 'epoch': 0.86}


 43%|████▎     | 855/1966 [2:21:56<2:44:24,  8.88s/it]

{'loss': 0.1164, 'learning_rate': 3.1486715658564165e-05, 'epoch': 0.87}


 44%|████▎     | 860/1966 [2:22:44<2:43:52,  8.89s/it]

{'loss': 0.0149, 'learning_rate': 3.134539287733182e-05, 'epoch': 0.87}


 44%|████▍     | 865/1966 [2:23:28<2:34:24,  8.41s/it]

{'loss': 0.0632, 'learning_rate': 3.120407009609949e-05, 'epoch': 0.88}


 44%|████▍     | 870/1966 [2:24:08<2:27:44,  8.09s/it]

{'loss': 0.1789, 'learning_rate': 3.1062747314867156e-05, 'epoch': 0.89}


 45%|████▍     | 875/1966 [2:24:48<2:25:39,  8.01s/it]

{'loss': 0.1523, 'learning_rate': 3.092142453363483e-05, 'epoch': 0.89}


 45%|████▍     | 880/1966 [2:25:28<2:24:46,  8.00s/it]

{'loss': 0.2684, 'learning_rate': 3.078010175240249e-05, 'epoch': 0.9}


 45%|████▌     | 885/1966 [2:26:08<2:23:53,  7.99s/it]

{'loss': 0.7595, 'learning_rate': 3.0638778971170154e-05, 'epoch': 0.9}


 45%|████▌     | 890/1966 [2:26:50<2:36:32,  8.73s/it]

{'loss': 0.2128, 'learning_rate': 3.0497456189937818e-05, 'epoch': 0.91}


 46%|████▌     | 895/1966 [2:27:39<2:40:39,  9.00s/it]

{'loss': 0.1614, 'learning_rate': 3.0356133408705485e-05, 'epoch': 0.91}


 46%|████▌     | 900/1966 [2:28:28<2:41:03,  9.07s/it]

{'loss': 0.3929, 'learning_rate': 3.021481062747315e-05, 'epoch': 0.92}


 46%|████▌     | 905/1966 [2:29:16<2:39:33,  9.02s/it]

{'loss': 0.1317, 'learning_rate': 3.0073487846240816e-05, 'epoch': 0.92}


 46%|████▋     | 910/1966 [2:30:05<2:39:13,  9.05s/it]

{'loss': 0.2647, 'learning_rate': 2.9932165065008483e-05, 'epoch': 0.93}


 47%|████▋     | 915/1966 [2:30:53<2:38:07,  9.03s/it]

{'loss': 0.1838, 'learning_rate': 2.9790842283776144e-05, 'epoch': 0.93}


 47%|████▋     | 920/1966 [2:31:42<2:37:20,  9.03s/it]

{'loss': 0.1501, 'learning_rate': 2.964951950254381e-05, 'epoch': 0.94}


 47%|████▋     | 925/1966 [2:32:31<2:37:15,  9.06s/it]

{'loss': 0.0114, 'learning_rate': 2.9508196721311478e-05, 'epoch': 0.94}


 47%|████▋     | 930/1966 [2:33:19<2:36:39,  9.07s/it]

{'loss': 0.0607, 'learning_rate': 2.936687394007914e-05, 'epoch': 0.95}


 48%|████▊     | 935/1966 [2:34:08<2:35:35,  9.06s/it]

{'loss': 0.0023, 'learning_rate': 2.922555115884681e-05, 'epoch': 0.95}


 48%|████▊     | 940/1966 [2:34:57<2:34:36,  9.04s/it]

{'loss': 0.0278, 'learning_rate': 2.9084228377614476e-05, 'epoch': 0.96}


 48%|████▊     | 945/1966 [2:35:45<2:33:45,  9.04s/it]

{'loss': 0.5414, 'learning_rate': 2.8942905596382136e-05, 'epoch': 0.96}


 48%|████▊     | 950/1966 [2:36:34<2:33:17,  9.05s/it]

{'loss': 0.1027, 'learning_rate': 2.8801582815149803e-05, 'epoch': 0.97}


 49%|████▊     | 955/1966 [2:37:23<2:32:14,  9.04s/it]

{'loss': 0.0016, 'learning_rate': 2.8660260033917467e-05, 'epoch': 0.97}


 49%|████▉     | 960/1966 [2:38:11<2:32:07,  9.07s/it]

{'loss': 0.002, 'learning_rate': 2.8518937252685134e-05, 'epoch': 0.98}


 49%|████▉     | 965/1966 [2:39:00<2:30:30,  9.02s/it]

{'loss': 0.1005, 'learning_rate': 2.83776144714528e-05, 'epoch': 0.98}


 49%|████▉     | 970/1966 [2:39:48<2:30:21,  9.06s/it]

{'loss': 0.2159, 'learning_rate': 2.8236291690220462e-05, 'epoch': 0.99}


 50%|████▉     | 975/1966 [2:40:37<2:29:09,  9.03s/it]

{'loss': 0.0357, 'learning_rate': 2.809496890898813e-05, 'epoch': 0.99}


 50%|████▉     | 980/1966 [2:41:26<2:28:35,  9.04s/it]

{'loss': 0.1089, 'learning_rate': 2.7953646127755796e-05, 'epoch': 1.0}


                                                      
 50%|█████     | 983/1966 [2:47:51<2:06:51,  7.74s/it]

{'eval_loss': 0.12016572058200836, 'eval_accuracy': 0.959954233409611, 'eval_runtime': 360.1553, 'eval_samples_per_second': 2.427, 'eval_steps_per_second': 0.305, 'epoch': 1.0}


 50%|█████     | 985/1966 [2:48:16<23:24:03, 85.87s/it] 

{'loss': 0.0694, 'learning_rate': 2.781232334652346e-05, 'epoch': 1.0}


 50%|█████     | 990/1966 [2:49:06<6:01:41, 22.24s/it] 

{'loss': 0.0455, 'learning_rate': 2.7671000565291127e-05, 'epoch': 1.01}


 50%|█████     | 992/1966 [2:49:30<4:36:28, 17.03s/it]

KeyboardInterrupt: 

In [None]:
# Build the id -> label-name conversion function.
# You have to create the ClassLabel yourself in order to use int2str,
# because the label column was added manually to the CSV file.
# label_names = [
#     "air_conditioner",
#     "car_horn",
#     "children_playing",
#     "dog_bark",
#     "drilling",
#     "engine_idling",
#     "gun_shot",
#     "jackhammer",
#     "siren",
#     "street_music"
#     ]
# label_names
# audio_dataset = audio_dataset.cast_column("label", Sequence(ClassLabel(names=label_names)))
# # # https://github.com/huggingface/datasets/issues/5262
# id2label_fn = audio_dataset["train"].features["label"].int2str()