In [1]:
from datasets import load_dataset
import os

# Custom filter to ignore _background_noise_
def filter_noise(example):
    """Return True when the example does not come from the ``_background_noise_`` folder.

    Args:
        example: A mapping describing one dataset row. The file path is read
            from ``example["file"]`` when that key exists; otherwise from
            ``example["audio"]["path"]``, which is the layout of the rows
            loaded in this notebook.

    Returns:
        bool: ``False`` if the path contains ``"_background_noise_"``, else ``True``.
    """
    # Rows in this notebook carry only 'audio' and 'label' keys, so indexing
    # "file" unconditionally would raise KeyError; fall back to the audio path.
    if "file" in example:
        path = example["file"]
    else:
        path = example["audio"]["path"]
    return "_background_noise_" not in path

# Load all .wav files with directory names as labels
# "audiofolder" builds the dataset from the directory tree under data_dir;
# only the 'train' split of the result is used from here on.
audio_folder = load_dataset("audiofolder", data_dir="audio_data/train/audio")
dataset = audio_folder['train']


def _is_not_background_noise(ex):
    """Keep a row unless its audio path points into the background-noise folder."""
    return "_background_noise_" not in ex["audio"]["path"]


# Filter out the background noise folder
dataset = dataset.filter(_is_not_background_noise)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(dataset[0])

# Class names, in the order of their integer ids in the 'label' feature.
labels = dataset.features["label"].names

# Both lookup tables use the *string* form of the integer id.
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

def add_label_str(example):
    """Attach the readable class name to a row under the ``"label_str"`` key.

    The name is looked up in the module-level ``id2label`` table using the
    stringified ``example["label"]``. The row is modified in place and returned.
    """
    class_id = str(example["label"])
    example["label_str"] = id2label[class_id]
    return example

# Add the readable class name to every row, then show the first row to
# confirm the new 'label_str' column is present.
dataset = dataset.map(add_label_str)
dataset[0]

{'audio': {'path': '/Users/mateuszw/files/transformers_project/audio_data/train/audio/bed/00176480_nohash_0.wav', 'array': array([ 9.15527344e-05,  3.05175781e-05,  1.83105469e-04, ...,
       -3.05175781e-05, -9.15527344e-05,  1.22070312e-04], shape=(16000,)), 'sampling_rate': 16000}, 'label': 1}


{'audio': {'path': '/Users/mateuszw/files/transformers_project/audio_data/train/audio/bed/00176480_nohash_0.wav',
  'array': array([ 9.15527344e-05,  3.05175781e-05,  1.83105469e-04, ...,
         -3.05175781e-05, -9.15527344e-05,  1.22070312e-04], shape=(16000,)),
  'sampling_rate': 16000},
 'label': 1,
 'label_str': 'bed'}

In [3]:
import os

def load_split_list(txt_path):
    """Read a split manifest and return its entries as a list of strings.

    Args:
        txt_path: Path to a text file holding one entry per line.

    Returns:
        list[str]: Entries in file order with surrounding whitespace removed.
        Blank (or whitespace-only) lines are skipped so that a trailing
        newline never produces an empty ``""`` entry.
    """
    # Explicit encoding keeps the result independent of the platform locale.
    with open(txt_path, "r", encoding="utf-8") as f:
        # Iterate the file lazily instead of materialising it with readlines().
        stripped = (line.strip() for line in f)
        return [entry for entry in stripped if entry]
        
# Read the two held-out manifests (testing first, then validation).
test_list, val_list = (
    load_split_list(list_path)
    for list_path in (
        "audio_data/train/testing_list.txt",
        "audio_data/train/validation_list.txt",
    )
)

def get_split(dataset, split_list):
    """Select the rows of ``dataset`` that are named in ``split_list``.

    A row matches when the last two ``/``-separated components of its
    ``example['audio']['path']`` (joined back with ``/``) appear in
    ``split_list``.

    Args:
        dataset: Object exposing ``filter(predicate)``.
        split_list: Iterable of path tails to keep.

    Returns:
        Whatever ``dataset.filter`` returns for the membership predicate.
    """
    wanted = frozenset(split_list)

    def _belongs(example):
        # rsplit with maxsplit=2 yields the same final two components as a full split.
        tail = example['audio']['path'].rsplit('/', 2)[-2:]
        return '/'.join(tail) in wanted

    return dataset.filter(_belongs)

# Validation and test subsets come straight from their manifests.
val_dataset = get_split(dataset, val_list)
test_dataset = get_split(dataset, test_list)

# Train = everything else
used_paths = set(val_list) | set(test_list)


def _is_training_example(example):
    """True when the row's '<dir>/<file>' path tail is in neither held-out manifest."""
    path_tail = '/'.join(example['audio']['path'].split('/')[-2:])
    return path_tail not in used_paths


train_dataset = dataset.filter(_is_training_example)

Filter: 100%|██████████| 64721/64721 [00:12<00:00, 5202.89 examples/s]
Filter: 100%|██████████| 64721/64721 [00:07<00:00, 9145.64 examples/s]
Filter: 100%|██████████| 64721/64721 [00:07<00:00, 8737.27 examples/s]


In [4]:
# Print the summary of each split in turn: train, test, validation.
for split in (train_dataset, test_dataset, val_dataset):
    print(split)

Dataset({
    features: ['audio', 'label', 'label_str'],
    num_rows: 51088
})
Dataset({
    features: ['audio', 'label', 'label_str'],
    num_rows: 6835
})
Dataset({
    features: ['audio', 'label', 'label_str'],
    num_rows: 6798
})


In [5]:
# Display the first training row (bare last expression, so the notebook renders it).
train_dataset[0]

{'audio': {'path': '/Users/mateuszw/files/transformers_project/audio_data/train/audio/bed/00176480_nohash_0.wav',
  'array': array([ 9.15527344e-05,  3.05175781e-05,  1.83105469e-04, ...,
         -3.05175781e-05, -9.15527344e-05,  1.22070312e-04], shape=(16000,)),
  'sampling_rate': 16000},
 'label': 1,
 'label_str': 'bed'}

In [12]:
from transformers import AutoFeatureExtractor

# Load the feature extractor published with the "facebook/wav2vec2-base" checkpoint;
# preprocess_function below calls it on the raw waveforms.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

def preprocess_function(examples):
    """Convert a batch of audio rows into feature-extractor outputs.

    Args:
        examples: Batch mapping whose ``"audio"`` entry is a sequence of
            dicts, each holding the waveform under ``"array"``.

    Returns:
        The object returned by the module-level ``feature_extractor`` when
        called on the batch's waveforms with truncation to ``max_length=16000``.
    """
    waveforms = []
    for audio in examples["audio"]:
        waveforms.append(audio["array"])

    # 16000 matches the sample count of the clip shown earlier in this notebook.
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

def _encode_split(split):
    """Run preprocess_function over a split in batches, dropping 'audio' and 'label_str'."""
    return split.map(preprocess_function, remove_columns=['audio', 'label_str'], batched=True)


encoded_train_dataset = _encode_split(train_dataset)
encoded_val_dataset = _encode_split(val_dataset)
encoded_test_dataset = _encode_split(test_dataset)
encoded_train_dataset



Dataset({
    features: ['label', 'input_values'],
    num_rows: 51088
})

In [13]:
import evaluate

# Load the "accuracy" metric from the `evaluate` library; compute_metrics below uses it.
accuracy = evaluate.load("accuracy")

# more on evaluating: https://huggingface.co/docs/evaluate/a_quick_tour

import numpy as np

def compute_metrics(eval_pred):
    """Score predictions with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, reduced
            with argmax over axis 1) and ``label_ids`` (reference labels).

    Returns:
        The result of ``accuracy.compute`` for the argmax predictions.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [14]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# One output class per entry in the label table.
num_labels = len(id2label)

# Start from the pretrained checkpoint and pass the label tables through to the model.
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
training_args = TrainingArguments(
    # Output location.
    output_dir="wav2vec2",
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=3e-5,
    warmup_ratio=0.1,
    # Batching: 32 examples per device x 4 accumulation steps per optimizer update.
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the best "accuracy" when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Logging cadence.
    logging_steps=10,
)

# Wire the model, hyper-parameters, encoded splits and metric into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=feature_extractor,
    compute_metrics=compute_metrics,
    # Fit on the training split; the validation split is scored during training.
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_val_dataset,
)

# Long-running step.
trainer.train()

# More fine-tuning: https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb

# Runtime: roughly 14 h on this machine (train_runtime ≈ 51,073 s in the recorded output).

Epoch,Training Loss,Validation Loss,Accuracy
1,1.4206,1.30132,0.956899
2,0.5429,0.436563,0.959841
3,0.3577,0.237641,0.966314
4,0.2471,0.17565,0.968667
5,0.2567,0.154245,0.972051
6,0.1944,0.135382,0.971462
7,0.161,0.123315,0.973816
8,0.1718,0.120812,0.973227
9,0.1557,0.116433,0.973816


TrainOutput(global_step=3990, training_loss=0.5617014444860301, metrics={'train_runtime': 51073.0545, 'train_samples_per_second': 10.003, 'train_steps_per_second': 0.078, 'total_flos': 4.627852693019136e+18, 'train_loss': 0.5617014444860301, 'epoch': 9.97683155917345})

In [18]:
# Write the trainer's current model to the 'wav2vec2_final_model' directory.
# NOTE(review): with load_best_model_at_end=True this is presumably the best
# checkpoint by validation accuracy rather than the last one — confirm.
trainer.save_model('wav2vec2_final_model')

In [None]:
# Display the trainer object. The dangling `trainer.` was an unfinished
# attribute access and a SyntaxError under Restart & Run All.
# TODO: encoded_test_dataset is prepared earlier but never scored in the
# visible cells — consider evaluating on it here.
trainer

<transformers.trainer.Trainer at 0x30250d950>