In [104]:
import configparser
import os

import evaluate
import numpy as np
import torch
from datasets import Audio, ClassLabel, load_dataset
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    TrainingArguments,
    Trainer,
    AwqConfig,
)
# Notebook settings (for example the checkpoint id read in a later cell) live in config.ini.
CONFIG_PATH = "config.ini"
config_ini = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so an empty result means the settings were never loaded.
# Duplicate options inside the file still raise DuplicateOptionError (strict mode is
# the default); fix the file rather than silencing that error.
if not config_ini.read(CONFIG_PATH, encoding="utf-8"):
    raise FileNotFoundError(
        f"Could not read {CONFIG_PATH!r}; run the notebook from the directory that contains it."
    )

DuplicateOptionError: While reading from 'config.ini' [line 18]: option 'model_name' in section 'DEFAULT' already exists

In [None]:

# Class names in label-id order; a later cell turns them into a ClassLabel feature.
# NOTE(review): "caughing" looks like a misspelling of "coughing". It is left unchanged
# because the string ends up in the model's label maps - confirm before renaming.
label_names = ["pingpong", "caughing", "clapping", "silence"]

# Generic "audiofolder" loader, see https://huggingface.co/docs/datasets/audio_load
# (a folder of audio files, optionally described by a metadata.csv).
# Other sources tried earlier: data_dir="./data/UrbanSound8K/audio" and
# load_dataset("marsyas/gtzan", "all").
audio_dataset = load_dataset("audiofolder", data_dir="./data/test/")
audio_dataset

Resolving data files: 100%|██████████| 160/160 [00:00<00:00, 113340.42it/s]


DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 160
    })
})

In [None]:
# Hold out 10% of the examples as a test split; the fixed seed keeps the shuffle reproducible.
# NOTE(review): this rebinds audio_dataset, so re-running the cell splits the
# already-split "train" portion again - restart the kernel before re-running.
audio_dataset = audio_dataset["train"].train_test_split(
    test_size=0.1,
    shuffle=True,
    seed=42,
)

In [None]:
# Re-type the "label" column as a ClassLabel that carries the names from label_names.
class_label = ClassLabel(num_classes=len(label_names), names=label_names)
audio_dataset = audio_dataset.cast_column("label", class_label)
# Only the last expression of a cell is rendered, so the summary of the
# re-typed DatasetDict is the single value shown here.
audio_dataset


DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 144
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 16
    })
})

In [None]:
# Callable that maps an integer class id to its name, taken from the "label" feature.
# (An earlier dataset variant kept the target in a "class" column instead.)
label_feature = audio_dataset["train"].features["label"]
id2label_fn = label_feature.int2str

In [None]:
# Resolve the checkpoint to fine-tune from config.ini and load its feature extractor.
default_settings = config_ini['DEFAULT']
model_id = default_settings['model_id_for_finetuneing']
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)

# Sampling rate expected by the extractor; the dataset is resampled to it in a later cell.
sampling_rate = feature_extractor.sampling_rate
sampling_rate

preprocessor_config.json: 100%|██████████| 215/215 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  class HfFileMetadata:


16000

In [None]:
# Re-type the "audio" column so clips are delivered at the feature extractor's sampling rate.
resampled_audio = Audio(sampling_rate=sampling_rate)
audio_dataset = audio_dataset.cast_column("audio", resampled_audio)

# Inspect the first training clip to confirm the reported sampling rate.
audio_dataset["train"][0]["audio"]

{'path': 'C:/GitHub/SED_sandbox/DLAppRealTime/data/test/0_pingpong/ball_racket-32.wav',
 'array': array([ 0.00305176,  0.00405884,  0.00286865, ..., -0.00485229,
        -0.0043335 , -0.00323486]),
 'sampling_rate': 16000}

In [None]:
# Turn raw clips into the model's input format.
# Reference: https://github.com/karolpiczak/ESC-50
# Length of audio (in seconds) kept from each clip for training.
max_duration = 1


def preprocess_function(examples):
    """Run the feature extractor over one batch of decoded audio.

    Args:
        examples: batch mapping whose "audio" entry is a list of items, each
            exposing the waveform under the "array" key.

    Returns:
        Whatever ``feature_extractor`` returns for the batch; clips longer than
        ``max_duration`` seconds are truncated to that many samples.
    """
    rate = feature_extractor.sampling_rate
    waveforms = [clip["array"] for clip in examples["audio"]]
    # No padding is requested here; only over-long clips are affected (truncation).
    return feature_extractor(
        waveforms,
        sampling_rate=rate,
        max_length=int(rate * max_duration),
        truncation=True,
    )

# Encode the dataset in batches of 100 in a single process and drop the raw
# "audio" column, which is no longer needed once the features are extracted.
map_options = dict(
    remove_columns=["audio"],
    batched=True,
    batch_size=100,
    num_proc=1,
)
audio_dataset_encoded = audio_dataset.map(preprocess_function, **map_options)
audio_dataset_encoded

Map: 100%|██████████| 144/144 [00:00<00:00, 202.26 examples/s]
Map: 100%|██████████| 16/16 [00:00<00:00, 172.08 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 144
    })
    test: Dataset({
        features: ['label', 'input_values'],
        num_rows: 16
    })
})

In [None]:
# Build the label maps and load the classifier with a head sized for our classes.
device = "cuda" if torch.cuda.is_available() else "cpu"

class_names = audio_dataset_encoded["train"].features["label"].names
# Keys of id2label (and therefore values of label2id) are the class ids as strings.
id2label = {str(idx): id2label_fn(idx) for idx in range(len(class_names))}
label2id = {name: idx for idx, name in id2label.items()}
num_labels = len(id2label)

# ignore_mismatched_sizes lets the checkpoint load even though its
# classification head was trained for a different number of labels.
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)
model = model.to(device)

config.json: 100%|██████████| 2.51k/2.51k [00:00<?, ?B/s]
model.safetensors: 100%|██████████| 378M/378M [00:33<00:00, 11.3MB/s]
Some weights of the model checkpoint at dima806/music_genres_classification were not used when initializing Wav2Vec2ForSequenceClassification: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at dima806/music_genres_classifi

In [None]:
# Training hyper-parameters and output location for the fine-tuning run.
model_name = model_id.split("/")[-1]
batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 10

# fp16 mixed precision is only usable on a CUDA device; enabling it unconditionally
# makes TrainingArguments fail on CPU-only machines.
use_fp16 = torch.cuda.is_available()

# Never commit a Hub access token to the notebook. Read it from the environment;
# when HF_TOKEN is unset this is None and the cached `huggingface-cli login`
# credentials are used instead.
# NOTE: any token that was ever stored in this file's history should be revoked.
hub_token = os.environ.get("HF_TOKEN")

training_args = TrainingArguments(
    f"./model/pingpong-{model_name}-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    # Keep the checkpoint with the best evaluation accuracy at the end of training.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=use_fp16,
    hub_token=hub_token,
    push_to_hub=True,
)

# Accuracy metric loaded through the `evaluate` library; compute_metrics calls metric.compute() on it.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: object exposing ``predictions`` (per-class scores; the arg-max
            is taken along axis 1) and ``label_ids`` (reference class ids).

    Returns:
        The result of ``metric.compute`` for the predicted vs. reference ids.
    """
    scores, reference_ids = eval_pred.predictions, eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=reference_ids)

# Hand both splits to the Trainer as torch-formatted datasets.
train_split = audio_dataset_encoded["train"].with_format("torch")
eval_split = audio_dataset_encoded["test"].with_format("torch")

# The feature extractor is passed in the `tokenizer` slot of the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; the returned training summary is kept and shown as the cell result.
train_output = trainer.train()
train_output


  0%|          | 0/180 [07:28<?, ?it/s]        

{'loss': 1.4434, 'learning_rate': 1.1111111111111112e-05, 'epoch': 0.28}



  0%|          | 0/180 [07:28<?, ?it/s]         


{'loss': 1.3582, 'learning_rate': 1.9444444444444445e-05, 'epoch': 0.56}


  7%|▋         | 12/180 [00:01<00:17,  9.60it/s][A
  0%|          | 0/180 [07:29<?, ?it/s]         

{'loss': 1.3156, 'learning_rate': 3.055555555555556e-05, 'epoch': 0.83}



[A

[A[A                               
                                                
  0%|          | 0/180 [07:29<?, ?it/s]
[A

{'eval_loss': 1.2557373046875, 'eval_accuracy': 0.375, 'eval_runtime': 0.0844, 'eval_samples_per_second': 189.52, 'eval_steps_per_second': 23.69, 'epoch': 1.0}



  0%|          | 0/180 [07:31<?, ?it/s]         


{'loss': 1.2382, 'learning_rate': 4.4444444444444447e-05, 'epoch': 1.11}


 12%|█▏        | 22/180 [00:04<00:41,  3.79it/s][A
  0%|          | 0/180 [07:32<?, ?it/s]         


{'loss': 1.1712, 'learning_rate': 4.9074074074074075e-05, 'epoch': 1.39}


 15%|█▌        | 27/180 [00:04<00:20,  7.39it/s][A
  0%|          | 0/180 [07:32<?, ?it/s]         


{'loss': 1.1268, 'learning_rate': 4.7530864197530866e-05, 'epoch': 1.67}


 18%|█▊        | 32/180 [00:05<00:16,  9.16it/s][A
  0%|          | 0/180 [07:33<?, ?it/s]         

{'loss': 1.1418, 'learning_rate': 4.5987654320987656e-05, 'epoch': 1.94}



[A

[A[A                               
                                                
  0%|          | 0/180 [07:33<?, ?it/s]
[A

{'eval_loss': 0.960784912109375, 'eval_accuracy': 0.625, 'eval_runtime': 0.1041, 'eval_samples_per_second': 153.705, 'eval_steps_per_second': 19.213, 'epoch': 2.0}



  0%|          | 0/180 [07:35<?, ?it/s]         

{'loss': 1.0179, 'learning_rate': 4.4444444444444447e-05, 'epoch': 2.22}



  0%|          | 0/180 [07:35<?, ?it/s]         


{'loss': 0.9313, 'learning_rate': 4.290123456790124e-05, 'epoch': 2.5}


 26%|██▌       | 47/180 [00:08<00:16,  8.18it/s][A
  0%|          | 0/180 [07:36<?, ?it/s]         

{'loss': 0.9312, 'learning_rate': 4.135802469135803e-05, 'epoch': 2.78}



[A

[A[A                               
                                                
  0%|          | 0/180 [07:36<?, ?it/s]
[A

{'eval_loss': 0.8342742919921875, 'eval_accuracy': 0.625, 'eval_runtime': 0.0842, 'eval_samples_per_second': 189.931, 'eval_steps_per_second': 23.741, 'epoch': 3.0}



  0%|          | 0/180 [07:37<?, ?it/s]         

{'loss': 0.964, 'learning_rate': 3.981481481481482e-05, 'epoch': 3.06}



  0%|          | 0/180 [07:38<?, ?it/s]         

{'loss': 0.783, 'learning_rate': 3.82716049382716e-05, 'epoch': 3.33}



  0%|          | 0/180 [07:39<?, ?it/s]         

{'loss': 0.6839, 'learning_rate': 3.67283950617284e-05, 'epoch': 3.61}



  0%|          | 0/180 [07:39<?, ?it/s]         


{'loss': 0.6728, 'learning_rate': 3.518518518518519e-05, 'epoch': 3.89}


 40%|████      | 72/180 [00:12<00:12,  8.83it/s][A
[A

[A[A                               
                                                
  0%|          | 0/180 [07:40<?, ?it/s]
[A

{'eval_loss': 0.6115798950195312, 'eval_accuracy': 0.9375, 'eval_runtime': 0.081, 'eval_samples_per_second': 197.495, 'eval_steps_per_second': 24.687, 'epoch': 4.0}



  0%|          | 0/180 [07:41<?, ?it/s]         

{'loss': 0.5707, 'learning_rate': 3.364197530864198e-05, 'epoch': 4.17}



  0%|          | 0/180 [07:42<?, ?it/s]         


{'loss': 0.6076, 'learning_rate': 3.209876543209876e-05, 'epoch': 4.44}


 46%|████▌     | 82/180 [00:14<00:12,  7.64it/s][A
  0%|          | 0/180 [07:42<?, ?it/s]         


{'loss': 0.6148, 'learning_rate': 3.055555555555556e-05, 'epoch': 4.72}


 48%|████▊     | 87/180 [00:15<00:10,  9.09it/s][A
  0%|          | 0/180 [07:43<?, ?it/s]         

{'loss': 0.7242, 'learning_rate': 2.9320987654320992e-05, 'epoch': 5.0}



[A

[A[A                               
                                                
  0%|          | 0/180 [07:43<?, ?it/s]
[A

{'eval_loss': 0.6493301391601562, 'eval_accuracy': 0.8125, 'eval_runtime': 0.0711, 'eval_samples_per_second': 224.928, 'eval_steps_per_second': 28.116, 'epoch': 5.0}



  0%|          | 0/180 [07:45<?, ?it/s]         


{'loss': 0.5649, 'learning_rate': 2.777777777777778e-05, 'epoch': 5.28}


 54%|█████▍    | 97/180 [00:17<00:13,  6.00it/s][A
  0%|          | 0/180 [07:45<?, ?it/s]          


{'loss': 0.5224, 'learning_rate': 2.623456790123457e-05, 'epoch': 5.56}


 57%|█████▋    | 102/180 [00:18<00:09,  8.46it/s][A
  0%|          | 0/180 [07:46<?, ?it/s]          

{'loss': 0.5707, 'learning_rate': 2.4691358024691357e-05, 'epoch': 5.83}



[A

[A[A                               
                                                 
  0%|          | 0/180 [07:46<?, ?it/s]
[A

{'eval_loss': 0.5224800109863281, 'eval_accuracy': 0.875, 'eval_runtime': 0.0868, 'eval_samples_per_second': 184.306, 'eval_steps_per_second': 23.038, 'epoch': 6.0}



  0%|          | 0/180 [07:47<?, ?it/s]          


{'loss': 0.4865, 'learning_rate': 2.314814814814815e-05, 'epoch': 6.11}


 62%|██████▏   | 112/180 [00:20<00:15,  4.31it/s][A
  0%|          | 0/180 [07:48<?, ?it/s]          

{'loss': 0.4902, 'learning_rate': 2.1604938271604937e-05, 'epoch': 6.39}



  0%|          | 0/180 [07:49<?, ?it/s]          


{'loss': 0.3766, 'learning_rate': 2.006172839506173e-05, 'epoch': 6.67}


 68%|██████▊   | 122/180 [00:21<00:06,  8.84it/s][A
  0%|          | 0/180 [07:49<?, ?it/s]          

{'loss': 0.3726, 'learning_rate': 1.8518518518518518e-05, 'epoch': 6.94}



[A

[A[A                               
                                                 
  0%|          | 0/180 [07:49<?, ?it/s]
[A

{'eval_loss': 0.2706298828125, 'eval_accuracy': 1.0, 'eval_runtime': 0.0863, 'eval_samples_per_second': 185.502, 'eval_steps_per_second': 23.188, 'epoch': 7.0}



  0%|          | 0/180 [07:51<?, ?it/s]          


{'loss': 0.5325, 'learning_rate': 1.697530864197531e-05, 'epoch': 7.22}


 73%|███████▎  | 132/180 [00:24<00:08,  5.89it/s][A
  0%|          | 0/180 [07:51<?, ?it/s]          

{'loss': 0.4047, 'learning_rate': 1.54320987654321e-05, 'epoch': 7.5}



  0%|          | 0/180 [07:52<?, ?it/s]          

{'loss': 0.3227, 'learning_rate': 1.388888888888889e-05, 'epoch': 7.78}



[A

[A[A                               
                                                 
  0%|          | 0/180 [07:52<?, ?it/s]
[A

{'eval_loss': 0.3758735656738281, 'eval_accuracy': 0.9375, 'eval_runtime': 0.1014, 'eval_samples_per_second': 157.78, 'eval_steps_per_second': 19.723, 'epoch': 8.0}



  0%|          | 0/180 [07:54<?, ?it/s]          

{'loss': 0.3792, 'learning_rate': 1.2345679012345678e-05, 'epoch': 8.06}



  0%|          | 0/180 [07:54<?, ?it/s]          

{'loss': 0.2411, 'learning_rate': 1.0802469135802469e-05, 'epoch': 8.33}



  0%|          | 0/180 [07:55<?, ?it/s]          

{'loss': 0.3123, 'learning_rate': 9.259259259259259e-06, 'epoch': 8.61}



  0%|          | 0/180 [07:55<?, ?it/s]          

{'loss': 0.1951, 'learning_rate': 7.71604938271605e-06, 'epoch': 8.89}



[A

[A[A                               
                                                 
  0%|          | 0/180 [07:56<?, ?it/s]
[A

{'eval_loss': 0.1656341552734375, 'eval_accuracy': 1.0, 'eval_runtime': 0.0776, 'eval_samples_per_second': 206.265, 'eval_steps_per_second': 25.783, 'epoch': 9.0}



  0%|          | 0/180 [07:57<?, ?it/s]          


{'loss': 0.1958, 'learning_rate': 6.172839506172839e-06, 'epoch': 9.17}


 93%|█████████▎| 167/180 [00:30<00:02,  5.24it/s][A
  0%|          | 0/180 [07:58<?, ?it/s]          


{'loss': 0.2469, 'learning_rate': 4.6296296296296296e-06, 'epoch': 9.44}


 96%|█████████▌| 172/180 [00:30<00:00,  8.21it/s][A
  0%|          | 0/180 [07:58<?, ?it/s]          

{'loss': 0.1863, 'learning_rate': 3.0864197530864196e-06, 'epoch': 9.72}



  0%|          | 0/180 [07:59<?, ?it/s]          

{'loss': 0.1796, 'learning_rate': 1.5432098765432098e-06, 'epoch': 10.0}



[A

[A[A                               
                                                 
  0%|          | 0/180 [07:59<?, ?it/s]
[A

{'eval_loss': 0.14742469787597656, 'eval_accuracy': 1.0, 'eval_runtime': 0.0735, 'eval_samples_per_second': 217.636, 'eval_steps_per_second': 27.204, 'epoch': 10.0}



100%|██████████| 180/180 [00:33<00:00,  5.42it/s]


{'train_runtime': 33.2384, 'train_samples_per_second': 43.323, 'train_steps_per_second': 5.415, 'train_loss': 0.6632374657524956, 'epoch': 10.0}


TrainOutput(global_step=180, training_loss=0.6632374657524956, metrics={'train_runtime': 33.2384, 'train_samples_per_second': 43.323, 'train_steps_per_second': 5.415, 'train_loss': 0.6632374657524956, 'epoch': 10.0})