In [1]:
# pip install librosa

In [2]:
# pip install soundfile

In [3]:
# pip install accelerate -U

In [4]:
# pip install wandb

In [5]:
from datasets import load_dataset, Audio

In [6]:
# Load the audio clips from the local "train/train/audio" folder as one Dataset.
# NOTE(review): name="en-US" looks carried over from another dataset's config —
# confirm it is actually needed for this local folder.
data = load_dataset(
    "train/train/audio",
    name="en-US",
    split="train",
)

Resolving data files:   0%|          | 0/64727 [00:00<?, ?it/s]

Found cached dataset audiofolder (C:/Users/User/.cache/huggingface/datasets/audiofolder/audio-683520afa982ca98/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


In [7]:
# Hold out 20% of the clips for evaluation. The fixed seed pins the shuffle, so the
# same examples land in train/test after every Restart & Run All and the reported
# metrics stay comparable between runs.
data = data.train_test_split(test_size=0.2, seed=42)

In [8]:
# Show the resulting DatasetDict (split names, features and row counts).
data

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 51781
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 12946
    })
})

In [9]:
# Inspect the first training example.
data["train"][0]

{'audio': {'path': 'c:\\Users\\User\\Documents\\Studia\\dlm-2\\train\\train\\audio\\marvin\\e48a80ed_nohash_0.wav',
  'array': array([-0.00036621, -0.00143433, -0.0020752 , ...,  0.00415039,
          0.00299072,  0.00195312]),
  'sampling_rate': 16000},
 'label': 13}

In [10]:
# Build both directions of the class-name <-> class-id lookup.
# Ids are stored as strings on both sides (values of label2id, keys of id2label).
labels = data["train"].features["label"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [11]:
# Spot-check the mapping: class name stored under the string id "2".
id2label["2"]

'bird'

In [12]:
from transformers import AutoFeatureExtractor

# Feature extractor published with the wav2vec2-base checkpoint; it is used below
# to turn raw waveforms into model inputs.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "facebook/wav2vec2-base"
)



In [13]:
# Cast the audio column to the sampling rate the feature extractor reports instead of
# a hard-coded 16 kHz, so this cell and preprocess_function (which also reads
# feature_extractor.sampling_rate) cannot drift apart if the checkpoint changes.
data = data.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
# Re-inspect the first training example after the cast.
data["train"][0]

{'audio': {'path': 'c:\\Users\\User\\Documents\\Studia\\dlm-2\\train\\train\\audio\\marvin\\e48a80ed_nohash_0.wav',
  'array': array([-0.00036621, -0.00143433, -0.0020752 , ...,  0.00415039,
          0.00299072,  0.00195312]),
  'sampling_rate': 16000},
 'label': 13}

In [14]:
def preprocess_function(examples):
    """Convert a batch of audio examples into model inputs.

    Args:
        examples: A batch mapping whose "audio" entry is a sequence of dicts,
            each holding the raw waveform under the "array" key.

    Returns:
        Whatever the module-level ``feature_extractor`` returns for the batch,
        called with its own sampling rate, ``max_length=16000`` and
        ``truncation=True``.
    """
    waveforms = [clip["array"] for clip in examples["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )

In [15]:
# Run the preprocessing over both splits in batches; the raw "audio" column is
# dropped from the result.
encoded_data = data.map(
    preprocess_function,
    remove_columns="audio",
    batched=True,
)

Map:   0%|          | 0/51781 [00:00<?, ? examples/s]

Map:   0%|          | 0/12946 [00:00<?, ? examples/s]

In [16]:
import evaluate

# Load the "accuracy" metric from the evaluate library; compute_metrics calls it.
accuracy = evaluate.load("accuracy")

In [17]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a batch of predictions with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (reference class ids).

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row.
    """
    scores = eval_pred.predictions
    references = eval_pred.label_ids
    # The predicted class is the index of the highest score along axis 1.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [18]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# Load wav2vec2-base for audio classification, sized to the number of classes and
# carrying the label mappings built earlier.
num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'classifier.weight', 'classifier.bias', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
# wandb login (run in the command line)

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub (the TrainingArguments below set push_to_hub=True).
notebook_login()

In [22]:
# Configuration of the fine-tuning run; outputs are written to "audio_model".
training_args = TrainingArguments(
    output_dir="audio_model",
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=3e-5,
    warmup_ratio=0.1,
    # Batching: 32 examples per device step, gradients accumulated over 4 steps
    # (32 * 4 = 128 examples per optimizer update on each device).
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    # Evaluate and checkpoint once per epoch; the best checkpoint by the
    # "accuracy" metric is reloaded when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Reporting and publishing.
    logging_steps=10,
    push_to_hub=True,
)

# Both encoded splits are handed to the Trainer in "torch" format.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    train_dataset=encoded_data["train"].with_format("torch"),
    eval_dataset=encoded_data["test"].with_format("torch"),
)

trainer.train()

  0%|          | 0/4040 [00:00<?, ?it/s]

{'loss': 3.4336, 'learning_rate': 7.425742574257426e-07, 'epoch': 0.02}
{'loss': 3.4317, 'learning_rate': 1.4851485148514852e-06, 'epoch': 0.05}
{'loss': 3.4321, 'learning_rate': 2.227722772277228e-06, 'epoch': 0.07}
{'loss': 3.4298, 'learning_rate': 2.9702970297029703e-06, 'epoch': 0.1}


KeyboardInterrupt: 