In [None]:
# pip install librosa

In [None]:
# pip install soundfile

In [None]:
# pip install accelerate -U

In [None]:
# pip install wandb

In [None]:
# wandb login

In [None]:
import random
from enum import Enum

import evaluate
import numpy as np
from datasets import load_dataset, Audio
from huggingface_hub import notebook_login
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from transformers import set_seed

In [None]:
class Model(Enum):
    """Pretrained audio backbones this notebook can fine-tune.

    The selected member decides which Hub checkpoint name is used when the
    feature extractor and the classification model are loaded.
    """

    # Selects the "facebook/wav2vec2-base" checkpoint.
    FacebookWav2Vec2 = 1
    # Selects the "MIT/ast-finetuned-audioset-10-10-0.4593" checkpoint.
    AudioSpectrogramTransformer = 2

In [None]:
# Run configuration: every value a reader is likely to tune lives here.

# Seed used for random number generation in this notebook.
SEED = 1
# Learning rate forwarded to TrainingArguments(learning_rate=...).
LEARNING_RATE = 3e-5
# Training batch size per device.
PER_DEVICE_TRAIN_BATCH_SIZE = 32
# Number of batches whose gradients are accumulated before an optimizer step;
# with the batch size above that is 32 * 4 = 128 samples per step per device.
GRADIENT_ACCUMULATION_STEPS = 4
# Evaluation batch size per device.
PER_DEVICE_EVAL_BATCH_SIZE = 32
# Number of passes over the training split.
NUM_TRAIN_EPOCHS = 10
# Forwarded to TrainingArguments(warmup_ratio=...).
WARMUP_RATIO = 0.1
# Forwarded to TrainingArguments(logging_steps=...).
LOGGING_STEPS = 10
# Which pretrained backbone to fine-tune (see the Model enum).
MODEL = Model.FacebookWav2Vec2

In [None]:
# Seed every random number generator the pipeline can touch, not just Python's
# `random`: transformers.set_seed seeds `random`, NumPy and PyTorch in one call,
# so NumPy- or PyTorch-driven randomness later in the notebook (for example the
# initialisation of newly created model layers) is reproducible as well.
set_seed(SEED)

In [None]:
# Load the audio dataset from the local "train/audio" directory.
# split="train" makes load_dataset return a single Dataset; without it a
# DatasetDict comes back, and DatasetDict has no train_test_split method.
# NOTE(review): name="en-US" is kept from the original call - confirm it is
# actually needed for this local dataset.
raw_dataset = load_dataset("train/audio", name="en-US", split="train")

# Hold out 20% of the examples for evaluation. The split is seeded so the
# same train/test partition is produced on every run.
data = raw_dataset.train_test_split(test_size=0.2, seed=SEED)
data

In [None]:
# Display the first training example to inspect the loaded fields.
data["train"][0]

In [None]:
# Build the two lookup tables handed to the model configuration later on.
# Ids are stored as strings on both sides: label -> "0" and "0" -> label.
labels = data["train"].features["label"].names
label2id = {class_name: str(index) for index, class_name in enumerate(labels)}
id2label = {str(index): class_name for index, class_name in enumerate(labels)}

# Spot-check: show the class name stored under id "2".
id2label["2"]

In [None]:
# Re-declare the "audio" column with a 16 kHz sampling rate, then display the
# first training example again to check the result.
audio_feature_16k = Audio(sampling_rate=16_000)
data = data.cast_column("audio", audio_feature_16k)
data["train"][0]

In [None]:
# Translate the MODEL choice from the configuration cell into a Hub checkpoint
# id; any value other than FacebookWav2Vec2 selects the AST checkpoint.
if MODEL == Model.FacebookWav2Vec2:
    model_name = "facebook/wav2vec2-base"
else:
    model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"

# Load the feature extractor that belongs to the chosen checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)

def preprocess_function(examples):
    """Run the feature extractor over one batch of examples.

    Args:
        examples: A batch whose "audio" entry is a sequence of items, each
            exposing its waveform under the "array" key.

    Returns:
        The feature extractor's output for the batch, requested with
        ``max_length=16000`` and ``truncation=True``.
    """
    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])

    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

# Apply the preprocessing to every split in batches and drop the "audio"
# column from the result.
encoded_data = data.map(
    preprocess_function,
    batched=True,
    remove_columns="audio",
)

In [None]:
# Load the "accuracy" metric once; compute_metrics reuses it for every evaluation.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, reduced
            with argmax along axis 1) and ``label_ids`` (reference labels).

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted vs. reference
        class ids.
    """
    scores = eval_pred.predictions
    references = eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Authenticate with the Hugging Face Hub; the training configuration below
# sets push_to_hub=True, which needs credentials.
notebook_login()

In [None]:
# Load the pretrained backbone with a classification head sized for our labels.
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    # The AST checkpoint already ships a fine-tuned classification head whose
    # output size differs from num_labels; without this flag loading fails on
    # the shape mismatch. The mismatched head is re-initialised instead. The
    # flag has no effect for checkpoints that have no such head.
    ignore_mismatched_sizes=True,
)

# Training configuration, driven by the constants in the configuration cell.
training_args = TrainingArguments(
    # Use only the last component of the checkpoint id as the output directory.
    # Writing to "<org>/<name>" would create a local directory with the same
    # relative path as the Hub id, and from_pretrained(model_name) would then
    # load from that directory instead of the Hub on the next run.
    output_dir=model_name.split("/")[-1],
    # Evaluate and checkpoint on the same schedule, which
    # load_best_model_at_end relies on.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    warmup_ratio=WARMUP_RATIO,
    logging_steps=LOGGING_STEPS,
    # Keep the checkpoint with the best value of the "accuracy" metric
    # reported by compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
    # Train with the notebook-wide seed instead of the library default.
    seed=SEED,
)

# Hand the encoded splits to the Trainer as torch-formatted datasets.
train_split = encoded_data["train"].with_format("torch")
eval_split = encoded_data["test"].with_format("torch")

# The feature extractor is passed through the Trainer's `tokenizer` argument.
# NOTE(review): presumably so it is used for batch collation and saved with
# the model - confirm against the installed transformers version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# Start fine-tuning; the returned training summary is shown as the cell output.
trainer.train()