In [40]:
from datasets import load_dataset
import os

# Custom filter to ignore _background_noise_
def filter_noise(example):
    return "_background_noise_" not in example["file"]

# Load all .wav files with directory names as labels
dataset = load_dataset(
    "audiofolder",
    data_dir="audio_data/train/audio"
)['train']

# Filter out the background noise folder
dataset = dataset.filter(lambda ex: "_background_noise_" not in ex["audio"]["path"])

In [41]:
labels = dataset.features["label"].names
label2id, id2label = dict(), dict()

for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

def add_label_str(example):
    example["label_str"] = id2label[str(example["label"])]
    return example

dataset = dataset.map(add_label_str)
dataset[0]

{'audio': {'path': '/Users/mateuszw/files/transformers_project/audio_data/train/audio/bed/00176480_nohash_0.wav',
  'array': array([ 9.15527344e-05,  3.05175781e-05,  1.83105469e-04, ...,
         -3.05175781e-05, -9.15527344e-05,  1.22070312e-04], shape=(16000,)),
  'sampling_rate': 16000},
 'label': 1,
 'label_str': 'bed'}

In [42]:
labels = dataset.features["label"].names  # List of 32 label names (30 normal, silence, and filtered out _background_silence_)

categories_to_predict = {'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence'}

known_label_ids = {i for i, name in enumerate(labels) if name in categories_to_predict}

def map_labels_str(example):
    label_str = example['label_str']
    if label_str in categories_to_predict:
        return {"label_str": label_str}
    else:
        return {"label_str": "unknown"}

def map_labels(example):
    label = example['label']
    if label in known_label_ids:
        return {"label": label}
    else:
        return {"label": -1} # unknown label

dataset = dataset.map(map_labels_str)
dataset = dataset.map(map_labels)

In [43]:
from collections import Counter
labels_list = dataset['label_str']
label_counts = Counter(labels_list)

# Print nicely
for label, count in label_counts.items():
    print(f"{label}: {count}")

unknown: 41039
down: 2359
go: 2372
left: 2353
no: 2375
off: 2357
on: 2367
right: 2367
silence: 2400
stop: 2380
up: 2375
yes: 2377


In [44]:
def load_split_list(txt_path):
    with open(txt_path, "r") as f:
        return [line.strip() for line in f.readlines()]
        
test_list = load_split_list("audio_data/train/testing_list_with_silence_balanced.txt")
val_list = load_split_list("audio_data/train/validation_list_with_silence_balanced.txt")
train_list = load_split_list("audio_data/train/training_list_with_silence_balanced.txt")

def get_split(dataset, split_list):
    
    split_set = set(split_list)

    def is_in_split(example):
        return '/'.join(example['audio']['path'].split('/')[-2:]) in split_set

    return dataset.filter(is_in_split)

val_dataset = get_split(dataset, val_list)
test_dataset = get_split(dataset, test_list)
train_dataset = get_split(dataset, train_list)

Filter: 100%|██████████| 67121/67121 [00:17<00:00, 3782.53 examples/s]
Filter: 100%|██████████| 67121/67121 [00:18<00:00, 3562.86 examples/s]
Filter: 100%|██████████| 67121/67121 [00:17<00:00, 3769.89 examples/s]


In [50]:
train_dataset[0]

{'audio': {'path': '/Users/mateuszw/files/transformers_project/audio_data/train/audio/bed/12529547_nohash_0.wav',
  'array': array([0.00244141, 0.00363159, 0.00079346, ..., 0.00112915, 0.00045776,
         0.00125122], shape=(16000,)),
  'sampling_rate': 16000},
 'label': -1,
 'label_str': 'unknown'}

In [51]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True
    )
    return inputs

encoded_train_dataset = train_dataset.map(preprocess_function, remove_columns=['audio', 'label_str'], batched=True)
encoded_val_dataset = val_dataset.map(preprocess_function, remove_columns=['audio', 'label_str'], batched=True)
encoded_test_dataset = test_dataset.map(preprocess_function, remove_columns=['audio', 'label_str'], batched=True)
encoded_train_dataset

Map: 100%|██████████| 22346/22346 [00:11<00:00, 2029.85 examples/s]
Map: 100%|██████████| 3011/3011 [00:01<00:00, 1523.04 examples/s]
Map: 100%|██████████| 3004/3004 [00:02<00:00, 1460.49 examples/s]


Dataset({
    features: ['label', 'input_values'],
    num_rows: 22346
})

In [52]:
import evaluate

accuracy = evaluate.load("accuracy")

# more on evaluating: https://huggingface.co/docs/evaluate/a_quick_tour

import numpy as np

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)

In [53]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
training_args = TrainingArguments(
    output_dir="wav2vec2",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_val_dataset,
    processing_class=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

# More fine-tuning: https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb

# 10h?

Epoch,Training Loss,Validation Loss,Accuracy
0,1.5184,1.333074,0.880771
1,0.7637,0.6191,0.892395
2,0.483,0.393047,0.893723
3,0.3895,0.298636,0.894055
4,0.2976,0.267599,0.896712


TrainOutput(global_step=870, training_loss=0.9493377063466215, metrics={'train_runtime': 7325.9812, 'train_samples_per_second': 15.251, 'train_steps_per_second': 0.119, 'total_flos': 1.0134032744448e+18, 'train_loss': 0.9493377063466215, 'epoch': 4.9957081545064375})

In [55]:
trainer.save_model('wav2vec2_final_model')

In [56]:
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
)

i = 2
training_args = TrainingArguments(
    output_dir=f"wav2vec2_{i}",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_val_dataset,
    processing_class=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.save_model(f'wav2vec2_final_model_{i}')

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
0,2.2806,2.149126,0.792096
1,1.6332,1.487366,0.881767
2,1.3513,1.215643,0.890734
3,1.2129,1.086569,0.89007
4,1.1271,1.042052,0.890734


In [57]:
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
)

i = 3
training_args = TrainingArguments(
    output_dir=f"wav2vec2_{i}",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_val_dataset,
    processing_class=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.save_model(f'wav2vec2_final_model_{i}')

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
0,1.0685,0.838428,0.891066
1,0.4338,0.310661,0.893059
2,0.2558,0.189701,0.896048
3,0.2235,0.14957,0.899701
4,0.1654,0.144163,0.899701


In [58]:
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
)

i = 4
training_args = TrainingArguments(
    output_dir=f"wav2vec2_{i}",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_val_dataset,
    processing_class=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.save_model(f'wav2vec2_final_model_{i}')

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
0,0.6445,0.450856,0.874128
1,0.303,0.184015,0.885752
2,0.2147,0.125981,0.895051
3,0.1765,0.111799,0.894055
4,0.1141,0.088547,0.900697


In [59]:
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
)

i = 5
training_args = TrainingArguments(
    output_dir=f"wav2vec2_{i}",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_val_dataset,
    processing_class=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.save_model(f'wav2vec2_final_model_{i}')

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
0,2.238,2.239277,0.085022
1,2.2136,2.217638,0.089671
2,2.196,2.219946,0.085354
3,2.2095,2.214852,0.086682
4,2.1926,2.212704,0.08635
