In [None]:
# pip install librosa

In [None]:
# pip install soundfile

In [None]:
# pip install accelerate -U

In [None]:
# pip install wandb

In [None]:
# wandb login

In [87]:
from datasets import load_dataset
from transformers import AutoFeatureExtractor
import evaluate
import numpy as np
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from huggingface_hub import notebook_login
from enum import Enum
import random
import torch
from pydub import AudioSegment
import os

In [88]:
class Model(Enum):
    """Pretrained backbones this notebook can fine-tune for audio classification."""
    # Selects the "facebook/wav2vec2-base" checkpoint when the model is loaded.
    FacebookWav2Vec2 = 1
    # Selects the "MIT/ast-finetuned-audioset-10-10-0.4593" checkpoint when the model is loaded.
    AudioSpectrogramTransformer = 2

In [89]:
# Seed for the random number generators initialised in the next cell.
SEED = 1
# When True, the clips under `_background_noise_` are cut into chunks by split_audio.
SPLIT_SILENCE = False
# When True, the file paths of the train/test splits are written to train.txt / test.txt.
SAVE_TRAIN_TEST = False
# Root folder of the audio dataset; background-noise clips live in its
# `_background_noise_` sub-folder.
DATASET_PATH = 'train/audio'
# Hyper-parameters forwarded unchanged to TrainingArguments.
LEARNING_RATE = 3e-5
PER_DEVICE_TRAIN_BATCH_SIZE = 32
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_EVAL_BATCH_SIZE = 32
NUM_TRAIN_EPOCHS = 5
WARMUP_RATIO = 0.1
LOGGING_STEPS = 10
# Backbone to fine-tune; decides which pretrained checkpoint is loaded.
MODEL = Model.FacebookWav2Vec2
# Short name per backbone; MODEL_NAME is used as the Trainer's output_dir.
MODEL_NAMES = { Model.FacebookWav2Vec2: "Wav2Vec", Model.AudioSpectrogramTransformer: "AST" }
MODEL_NAME = MODEL_NAMES[MODEL]

In [90]:
# Seed every random number generator the pipeline may touch so a fresh
# "Restart & Run All" reproduces the same results. NumPy's global generator is
# seeded too, because NumPy-based code (e.g. shuffling/splitting helpers) would
# otherwise draw different numbers on every run.
random.seed(SEED)
np.random.seed(SEED)
# Kept last so the cell still displays the torch generator as its output.
torch.manual_seed(SEED)

<torch._C.Generator at 0x2ecb415b990>

In [91]:
# Running chunk index; it is passed to and returned by split_audio so that
# chunk file names keep counting up across several input files.
k = 0

def split_audio(file_path, output_folder, k, chunk_ms=1000):
    """Cut one audio file into consecutive fixed-length WAV chunks.

    Args:
        file_path: Path of the audio file to split.
        output_folder: Directory that receives the chunks; created if missing.
        k: Index used in the first chunk's file name (``chunk_{k:03d}.wav``).
        chunk_ms: Length of each chunk in milliseconds (default 1000). The
            last chunk is shorter when the audio length is not a multiple of
            this value.

    Returns:
        The next unused chunk index, to be passed as ``k`` for the next file.

    Raises:
        ValueError: If ``chunk_ms`` is not a positive number.
    """
    if chunk_ms <= 0:
        raise ValueError(f"chunk_ms must be positive, got {chunk_ms}")

    # Exporting into a folder that does not exist would fail, so create it up front.
    os.makedirs(output_folder, exist_ok=True)

    audio = AudioSegment.from_file(file_path)
    length_ms = len(audio)
    written = 0

    for start in range(0, length_ms, chunk_ms):
        # Clamp the final window to the end of the recording.
        end = min(start + chunk_ms, length_ms)
        chunk_name = f"{output_folder}/chunk_{k + written:03d}.wav"
        audio[start:end].export(chunk_name, format="wav")
        written += 1

    # Report the number of files actually written, including a trailing
    # partial chunk (length_ms // chunk_ms would miss it).
    print(f"Audio split into {written} chunks.")

    return k + written

if SPLIT_SILENCE:
    noise_dir = f"{DATASET_PATH}/_background_noise_"
    # Must be an f-string: a plain "{DATASET_PATH}/silence" literal would send
    # the chunks to a folder literally named "{DATASET_PATH}".
    silence_dir = f"{DATASET_PATH}/silence"
    os.makedirs(silence_dir, exist_ok=True)

    # Sort the listing so chunk numbering is the same on every run/platform.
    for file in sorted(os.listdir(noise_dir)):
        if file.endswith(".wav"):
            k = split_audio(f"{noise_dir}/{file}", silence_dir, k)

In [92]:
# Load the audio folder from the configured location rather than a duplicated
# hard-coded path, so changing DATASET_PATH affects every cell consistently.
data = load_dataset(DATASET_PATH, name="en-US", split='train')
# Drop the raw background-noise recordings; only the remaining clips are used.
data = data.filter(lambda example: "_background_noise_" not in example["audio"]["path"])
# Seed the split so the train/test partition is identical on every run.
data = data.train_test_split(test_size=0.2, seed=SEED)
data

Resolving data files:   0%|          | 0/65129 [00:00<?, ?it/s]

Found cached dataset audiofolder (C:/Users/User/.cache/huggingface/datasets/audiofolder/audio-51c9dfcf18469dc6/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)
Loading cached processed dataset at C:\Users\User\.cache\huggingface\datasets\audiofolder\audio-51c9dfcf18469dc6\0.0.0\6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc\cache-6ebeeea247822a5f.arrow


DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 52098
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 13025
    })
})

In [105]:
def save_file_paths(dataset, file_name):
    """Write the audio path of every example in ``dataset`` to a text file.

    Each path is made relative to the current working directory, rewritten
    with forward slashes, and written on its own line.

    Args:
        dataset: Iterable of examples exposing ``example['audio']['path']``.
        file_name: Path of the text file to create (overwritten if present).
    """
    base_dir = os.getcwd()
    # Explicit UTF-8 keeps the listing readable across machines; the platform
    # default encoding can fail on, or mangle, non-ASCII path components.
    with open(file_name, 'w', encoding='utf-8') as file:
        for example in dataset:
            relative_path = os.path.relpath(example['audio']['path'], start=base_dir)
            file.write(relative_path.replace("\\", "/") + '\n')

if SAVE_TRAIN_TEST:
    # Persist one path listing per split.
    for split_name, listing_file in (("train", "train.txt"), ("test", "test.txt")):
        save_file_paths(data[split_name], listing_file)

In [106]:
# Inspect one training example: audio path, sample array, sampling rate and label id.
data["train"][0]

{'audio': {'path': 'c:\\Users\\User\\Documents\\Studia\\dlm-2\\train\\audio\\sheila\\e8b6f6fe_nohash_0.wav',
  'array': array([0.01608276, 0.02075195, 0.02236938, ..., 0.02816772, 0.02255249,
         0.0229187 ]),
  'sampling_rate': 16000},
 'label': 21}

In [107]:
# Class names of the `label` feature; an example's label id indexes into this list.
labels = data["train"].features["label"].names
labels

['_background_noise_',
 'bed',
 'bird',
 'cat',
 'dog',
 'down',
 'eight',
 'five',
 'four',
 'go',
 'happy',
 'house',
 'left',
 'marvin',
 'nine',
 'no',
 'off',
 'on',
 'one',
 'right',
 'seven',
 'sheila',
 'silence',
 'six',
 'stop',
 'three',
 'tree',
 'two',
 'up',
 'wow',
 'yes',
 'zero']

In [108]:
# Two-way lookup between class names and their ids, with ids stored as strings.
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

# Spot-check one entry.
id2label[str(2)]

'bird'

In [109]:
# Hub checkpoint for the selected backbone; anything other than Wav2Vec2 uses AST.
if MODEL == Model.FacebookWav2Vec2:
    model_name = "facebook/wav2vec2-base"
else:
    model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"

feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)

def preprocess_function(examples):
    """Convert a batch of decoded audio into model inputs.

    Args:
        examples: Batch whose ``"audio"`` entry is a list of items, each with
            an ``"array"`` field holding the waveform.

    Returns:
        Whatever ``feature_extractor`` produces for the batch's waveforms when
        called with ``max_length=16000`` and ``truncation=True``.
    """
    waveforms = [clip["array"] for clip in examples["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

# Run the feature extractor over both splits in batches and drop the raw "audio" column.
encoded_data = data.map(preprocess_function, remove_columns="audio", batched=True)



Map:   0%|          | 0/52098 [00:00<?, ? examples/s]

Map:   0%|          | 0/13025 [00:00<?, ? examples/s]

In [110]:
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score an evaluation pass with the accuracy metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, argmax is
            taken along axis 1) and ``label_ids`` (reference labels).

    Returns:
        The result of ``accuracy.compute`` for the arg-max predictions.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    return accuracy.compute(predictions=np.argmax(scores, axis=1), references=references)

In [111]:
# Authenticate with the Hugging Face Hub; the training run is configured with push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load the selected pretrained checkpoint as an audio classifier, passing the
# number of classes and both label mappings.
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    model_name, num_labels=num_labels, label2id=label2id, id2label=id2label
)

# Training configuration. Evaluation and checkpointing both happen once per
# epoch, and the checkpoint with the best accuracy is restored at the end.
training_args = TrainingArguments(
    output_dir=MODEL_NAME,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    warmup_ratio=WARMUP_RATIO,
    logging_steps=LOGGING_STEPS,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Forward the notebook-wide seed; otherwise the Trainer uses its own
    # default seed and SEED has no effect on the training run.
    seed=SEED,
    push_to_hub=True,
)

# Wire model, arguments, encoded splits and metric together. Both splits are
# handed over in torch format; the feature extractor is passed in the
# `tokenizer` slot.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_data["train"].with_format("torch"),
    eval_dataset=encoded_data["test"].with_format("torch"),
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()