In [None]:
# pip install librosa

In [None]:
# pip install soundfile

In [None]:
# pip install accelerate -U

In [None]:
# pip install wandb

In [None]:
# wandb login

In [None]:
# huggingface-cli login

In [None]:
from datasets import load_dataset
from transformers import AutoFeatureExtractor
import evaluate
import numpy as np
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from enum import Enum
import random
import torch
from pydub import AudioSegment
import os

In [None]:
class Model(Enum):
    """Pretrained speech models this notebook can fine-tune."""
    FacebookWav2Vec2 = 1  # selects the "facebook/wav2vec2-base" checkpoint
    HUBERT = 2  # selects the "facebook/hubert-base-ls960" checkpoint

In [None]:
# Seed used for the Python/torch RNGs and for the train/test split.
SEED = 1
# When True, the background-noise recordings are cut into chunks before loading.
SPLIT_SILENCE = False
# Dataset root passed to load_dataset; `_background_noise_/` and `silence/` live under it.
DATASET_PATH = 'train/audio'
# The following seven values are forwarded unchanged to TrainingArguments.
LEARNING_RATE = 3e-5
PER_DEVICE_TRAIN_BATCH_SIZE = 32
GRADIENT_ACCUMULATION_STEPS = 4
PER_DEVICE_EVAL_BATCH_SIZE = 32
NUM_TRAIN_EPOCHS = 5
WARMUP_RATIO = 0.1
LOGGING_STEPS = 10
# Which pretrained model family to fine-tune.
MODEL = Model.HUBERT
# Short name per model; the selected one is used as the training output directory.
MODEL_NAMES = { Model.FacebookWav2Vec2: "Wav2Vec", Model.HUBERT: "HUBERT" }
MODEL_NAME = MODEL_NAMES[MODEL]

In [None]:
# Seed every global RNG the notebook has in scope so a fresh
# "Restart & Run All" starts from the same random state each time.
random.seed(SEED)
np.random.seed(SEED)  # numpy is imported and used, so seed it alongside the others
torch.manual_seed(SEED)

# Split background noise into smaller silence samples

In [None]:
# Running index for chunk file names; it is passed into split_audio and replaced
# by its return value so numbering continues across source files.
k = 0

def split_audio(file_path, output_folder, k, chunk_ms=1000):
    """Cut one audio file into consecutive fixed-length WAV chunks.

    Chunks are written to ``output_folder`` as ``chunk_<index>.wav``. The index
    starts at ``k`` and grows by one per chunk. The final chunk holds whatever
    audio remains and may therefore be shorter than ``chunk_ms``.

    Args:
        file_path: Path of the audio file, read with ``AudioSegment.from_file``.
        output_folder: Directory that receives the chunks; created if missing.
        k: Index used for the first chunk written by this call.
        chunk_ms: Chunk length in milliseconds (defaults to 1000).

    Returns:
        The next unused chunk index, suitable as ``k`` for the following call.

    Raises:
        ValueError: If ``chunk_ms`` is not positive.
    """
    if chunk_ms <= 0:
        raise ValueError("chunk_ms must be a positive number of milliseconds")

    # Make sure the destination exists before anything is exported into it.
    os.makedirs(output_folder, exist_ok=True)

    audio = AudioSegment.from_file(file_path)
    length_ms = len(audio)
    first_index = k

    for start in range(0, length_ms, chunk_ms):
        end = min(start + chunk_ms, length_ms)
        chunk = audio[start:end]
        chunk_name = f"{output_folder}/chunk_{k:03d}.wav"
        chunk.export(chunk_name, format="wav")
        k += 1

    # Report what was actually written, including a trailing partial chunk.
    print(f"Audio split into {k - first_index} chunks.")

    return k

# Optionally turn every background-noise recording into numbered "silence" chunks.
if SPLIT_SILENCE:
    noise_dir = f"{DATASET_PATH}/_background_noise_"
    silence_dir = f"{DATASET_PATH}/silence"

    # os.listdir gives no ordering guarantee; sorting keeps the chunk numbering
    # (and therefore which audio ends up in which file) reproducible.
    for file in sorted(os.listdir(noise_dir)):
        if file.endswith(".wav"):
            k = split_audio(f"{noise_dir}/{file}", silence_dir, k)

# Split the dataset into train/test sets in an 80/20 ratio

In [None]:
# Load everything under DATASET_PATH as one split, then hold out 20% of it
# for evaluation; the split is seeded so it is repeatable.
data = (
    load_dataset(DATASET_PATH, split='train')
    .train_test_split(test_size=0.2, seed=SEED)
)

In [None]:
# Peek at the first training example.
data["train"][0]

In [None]:
# Peek at the first held-out example.
data['test'][0]

In [None]:
# Class names taken from the `label` feature of the training split; the position
# of a name in this list is what the notebook uses as its class id.
labels = data["train"].features["label"].names
labels

In [None]:
# Lookup tables between class names and their ids. Ids are the positions of the
# names in `labels`, stored as strings in both directions.
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

# Spot-check: which class name sits behind id "2".
id2label["2"]

# Extract features from audio

In [None]:
# Hub checkpoint for each supported model. An explicit table (rather than an
# if/else) makes an unmapped Model member fail with KeyError instead of
# silently falling back to HuBERT.
MODEL_CHECKPOINTS = {
    Model.FacebookWav2Vec2: "facebook/wav2vec2-base",
    Model.HUBERT: "facebook/hubert-base-ls960",
}
model_name = MODEL_CHECKPOINTS[MODEL]

# Feature extractor matching the selected checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)

def preprocess_function(examples):
    """Convert a batch of audio examples into model inputs.

    Collects the ``"array"`` entry of every item in ``examples["audio"]`` and
    passes the batch to the module-level ``feature_extractor`` with truncation
    enabled and ``max_length=16000``. The extractor's own ``sampling_rate`` is
    reported as the rate of the audio.

    Args:
        examples: Batch mapping with an ``"audio"`` list whose items expose an
            ``"array"`` entry.

    Returns:
        Whatever ``feature_extractor`` returns for the batch.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])

    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

# Run feature extraction over the train and test splits in batches and drop the raw `audio` column.
# NOTE(review): `data` is rebound here, so re-running this cell after it has succeeded
# presumably fails because the `audio` column is already gone — confirm, or restart the kernel.
data = data.map(preprocess_function, remove_columns="audio", batched=True)

# Fine-tune the model

In [None]:
# Accuracy metric object; compute_metrics calls its `.compute(...)` during evaluation.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (reference class ids).

    Returns:
        The result of ``accuracy.compute`` for the arg-max predictions.
    """
    scores = eval_pred.predictions
    references = eval_pred.label_ids

    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(scores, axis=1)

    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Load the pretrained checkpoint with a classification head that has one output
# per class, and attach the label lookup tables to it.
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

# Training configuration. Hyperparameters come from the config cell; evaluation
# and checkpointing both happen once per epoch, and the checkpoint with the
# best accuracy is restored when training finishes.
training_args = TrainingArguments(
    output_dir=MODEL_NAME,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    warmup_ratio=WARMUP_RATIO,
    logging_steps=LOGGING_STEPS,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Pass the notebook-level seed explicitly; without it TrainingArguments
    # falls back to its own library default and SEED would not govern training.
    seed=SEED,
    push_to_hub=True,
)

# Expose both splits as torch tensors for the Trainer.
train_split = data["train"].with_format("torch")
eval_split = data["test"].with_format("torch")

# NOTE(review): the feature extractor is passed through the `tokenizer` slot —
# presumably so it is used for batching and saved with the model; confirm
# against the Trainer documentation for the installed transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned training summary is displayed as the cell output.
trainer.train()