In [1]:
# pip install librosa

In [2]:
# pip install soundfile

In [3]:
# pip install accelerate -U

In [4]:
# pip install wandb

In [None]:
# pip install git+https://github.com/jimbozhang/hf_transformers_custom_model_ced.git

In [5]:
# wandb login

In [20]:
from datasets import load_dataset, DatasetDict
from transformers import AutoFeatureExtractor
from ced_model.feature_extraction_ced import CedFeatureExtractor
from ced_model.modeling_ced import CedForAudioClassification
import evaluate
import numpy as np
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from huggingface_hub import notebook_login
from enum import Enum
import random
import torch
from pydub import AudioSegment
import os

In [7]:
class Model(Enum):
    """Backbone choices that the notebook's MODEL setting can select."""
    # Shown as "Wav2Vec" in MODEL_NAMES.
    FacebookWav2Vec2 = 1
    # Shown as "AST" in MODEL_NAMES.
    AST = 2

In [8]:
# --- Reproducibility -------------------------------------------------------
SEED = 1

# --- Data ------------------------------------------------------------------
# Root folder of the audio dataset (one sub-folder per class label).
DATASET_PATH = 'train/audio'
# When True, the background-noise recordings are cut into chunks that are
# written to a "silence" folder before the dataset is loaded.
SPLIT_SILENCE = False

# --- Model selection -------------------------------------------------------
MODEL = Model.AST
MODEL_NAMES = {
    Model.FacebookWav2Vec2: "Wav2Vec",
    Model.AST: "AST",
}
# Short display name; used as the Trainer output directory.
MODEL_NAME = MODEL_NAMES[MODEL]

# --- Optimisation hyper-parameters (forwarded to TrainingArguments) ---------
LEARNING_RATE = 3e-5
NUM_TRAIN_EPOCHS = 5
WARMUP_RATIO = 0.1
PER_DEVICE_TRAIN_BATCH_SIZE = 32
PER_DEVICE_EVAL_BATCH_SIZE = 32
GRADIENT_ACCUMULATION_STEPS = 4
LOGGING_STEPS = 10

In [9]:
# Seed every RNG the notebook has imported so a Restart & Run All is repeatable.
random.seed(SEED)
# NumPy keeps its own global generator; seeding `random` does not affect it.
np.random.seed(SEED)
# Kept last so the cell still displays the torch Generator it returns.
torch.manual_seed(SEED)

<torch._C.Generator at 0x1fd37f2bab0>

In [10]:
# Running index used in chunk file names. It is passed into and returned from
# split_audio so numbering continues across several input files.
k = 0

def split_audio(file_path, output_folder, k):
    """Cut one audio file into consecutive chunks of at most one second each.

    Each chunk is exported as ``{output_folder}/chunk_{k:03d}.wav`` and ``k`` is
    incremented after every export. A trailing remainder shorter than one
    second is still written as its own (shorter) chunk.

    Args:
        file_path: Path of the audio file to split.
        output_folder: Directory that receives the chunks; created if missing.
        k: Index to use for the first chunk written by this call.

    Returns:
        The index to use for the next chunk (``k`` plus the number of chunks
        written by this call).
    """
    audio = AudioSegment.from_file(file_path)
    length_ms = len(audio)

    # Make sure the destination exists before exporting into it.
    os.makedirs(output_folder, exist_ok=True)

    first_index = k
    for start in range(0, length_ms, 1000):
        # Clamp the last window so it never runs past the end of the audio.
        end = min(start + 1000, length_ms)

        chunk = audio[start:end]
        chunk_name = f"{output_folder}/chunk_{k:03d}.wav"
        chunk.export(chunk_name, format="wav")

        k += 1

    # Report the number of files actually written, including a partial last chunk.
    print(f"Audio split into {k - first_index} chunks.")

    return k

# Optionally turn the long background-noise recordings into short clips
# stored under the dataset's "silence" folder.
if SPLIT_SILENCE:
    noise_dir = f"{DATASET_PATH}/_background_noise_"
    # The target must be an f-string; a plain string would create a folder
    # literally named "{DATASET_PATH}".
    silence_dir = f"{DATASET_PATH}/silence"
    # os.listdir returns entries in arbitrary order; sort so chunk numbering
    # is the same on every run.
    for file in sorted(os.listdir(noise_dir)):
        if file.endswith(".wav"):
            k = split_audio(f"{noise_dir}/{file}", silence_dir, k)

In [11]:
# Load from the configured dataset root so that changing DATASET_PATH in the
# config cell changes both the silence splitting and what is loaded here.
# NOTE(review): name="en-US" looks carried over from another dataset's loading
# call — confirm it is needed for a local audio folder.
data = load_dataset(DATASET_PATH, name="en-US", split='train')
# Hold out 20% of the examples for evaluation; seeded for a repeatable split.
data = data.train_test_split(test_size=0.2, seed=SEED)

Resolving data files:   0%|          | 0/65123 [00:00<?, ?it/s]

Downloading and preparing dataset audiofolder/audio to C:/Users/User/.cache/huggingface/datasets/audiofolder/audio-e208182fc5c155b5/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc...


Downloading data files:   0%|          | 0/65123 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset audiofolder downloaded and prepared to C:/Users/User/.cache/huggingface/datasets/audiofolder/audio-e208182fc5c155b5/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc. Subsequent calls will reuse this data.


In [12]:
# Inspect the first example of the training split.
data["train"][0]

{'audio': {'path': 'c:\\Users\\User\\Documents\\Studia\\dlm-2\\train\\audio\\marvin\\3a789a0d_nohash_1.wav',
  'array': array([-0.02600098, -0.02432251, -0.02545166, ..., -0.02835083,
         -0.0284729 , -0.02923584]),
  'sampling_rate': 16000},
 'label': 12}

In [13]:
# Inspect the first example of the held-out test split.
data['test'][0]

{'audio': {'path': 'c:\\Users\\User\\Documents\\Studia\\dlm-2\\train\\audio\\yes\\ec74a8a5_nohash_1.wav',
  'array': array([-9.15527344e-05, -9.15527344e-05, -6.10351562e-05, ...,
         -5.79833984e-04, -2.44140625e-04, -3.66210938e-04]),
  'sampling_rate': 16000},
 'label': 29}

In [14]:
# Class names of the "label" feature; a name's position in this list is its
# integer label id.
labels = data["train"].features["label"].names
labels

['bed',
 'bird',
 'cat',
 'dog',
 'down',
 'eight',
 'five',
 'four',
 'go',
 'happy',
 'house',
 'left',
 'marvin',
 'nine',
 'no',
 'off',
 'on',
 'one',
 'right',
 'seven',
 'sheila',
 'silence',
 'six',
 'stop',
 'three',
 'tree',
 'two',
 'up',
 'wow',
 'yes',
 'zero']

In [15]:
# Two lookup tables between class names and their ids. Ids are stored as
# strings (the list position of each name, stringified).
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

# Spot-check one entry.
id2label[str(2)]

'cat'

In [25]:
# Choose the checkpoint that matches the model selected in the config cell.
if MODEL == Model.FacebookWav2Vec2:
    model_name = "facebook/wav2vec2-base"
else:
    model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"

# NOTE(review): a CED feature-extractor class is being loaded from a
# Wav2Vec2/AST checkpoint — confirm the class and the checkpoint are meant to
# be paired.
feature_extractor = CedFeatureExtractor.from_pretrained(model_name)


def preprocess_function(examples):
    """Run the feature extractor over a batch of decoded audio examples.

    Args:
        examples: A batch from ``Dataset.map(batched=True)``; each entry of
            ``examples["audio"]`` is read for its ``"array"`` waveform.

    Returns:
        Whatever the feature extractor returns for the batch.
    """
    waveforms = [clip["array"] for clip in examples["audio"]]
    # The extractor is told the clips are already at its own sampling rate;
    # nothing in this cell resamples them.
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=16000,
        truncation=True,
    )


# Replace the raw "audio" column with the extracted features.
data = data.map(preprocess_function, batched=True, remove_columns="audio")

NameError: name 'FeatureExtractor' is not defined

In [22]:
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the accuracy metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference labels).

    Returns:
        The result of ``accuracy.compute`` for the arg-max predictions.
    """
    # The highest-scoring class along axis 1 is the predicted label.
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    reference_ids = eval_pred.label_ids
    return accuracy.compute(predictions=predicted_ids, references=reference_ids)

In [23]:
# Show the Hugging Face Hub login widget. Presumably required because the
# training arguments set push_to_hub=True — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [26]:
num_labels = len(id2label)
# NOTE(review): a CED model class is being loaded from a Wav2Vec2/AST
# checkpoint, and the logged loss sits near ln(2) for a 31-class task —
# confirm the class and checkpoint are meant to be paired and that the
# pretrained weights are actually being loaded.
model = CedForAudioClassification.from_pretrained(
    model_name, num_labels=num_labels, label2id=label2id, id2label=id2label
)

training_args = TrainingArguments(
    output_dir=MODEL_NAME,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    warmup_ratio=WARMUP_RATIO,
    logging_steps=LOGGING_STEPS,
    # Keep the checkpoint with the best evaluation accuracy.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
    # Pass the notebook seed explicitly; otherwise the Trainer uses its own
    # default seed and SEED from the config cell would not control training.
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"].with_format("torch"),
    eval_dataset=data["test"].with_format("torch"),
    # Passing the feature extractor here saves it alongside the model.
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


  0%|          | 0/2035 [00:00<?, ?it/s]

{'loss': 0.6947, 'learning_rate': 1.4705882352941177e-06, 'epoch': 0.02}
{'loss': 0.6946, 'learning_rate': 2.9411764705882355e-06, 'epoch': 0.05}
{'loss': 0.6945, 'learning_rate': 4.411764705882353e-06, 'epoch': 0.07}
{'loss': 0.6945, 'learning_rate': 5.882352941176471e-06, 'epoch': 0.1}
{'loss': 0.6943, 'learning_rate': 7.3529411764705884e-06, 'epoch': 0.12}
{'loss': 0.6942, 'learning_rate': 8.823529411764707e-06, 'epoch': 0.15}
{'loss': 0.694, 'learning_rate': 1.0294117647058824e-05, 'epoch': 0.17}
{'loss': 0.6938, 'learning_rate': 1.1764705882352942e-05, 'epoch': 0.2}
{'loss': 0.6937, 'learning_rate': 1.323529411764706e-05, 'epoch': 0.22}
{'loss': 0.6935, 'learning_rate': 1.4705882352941177e-05, 'epoch': 0.25}
{'loss': 0.6934, 'learning_rate': 1.6176470588235293e-05, 'epoch': 0.27}
{'loss': 0.6933, 'learning_rate': 1.7647058823529414e-05, 'epoch': 0.29}
{'loss': 0.6933, 'learning_rate': 1.9117647058823528e-05, 'epoch': 0.32}
{'loss': 0.6933, 'learning_rate': 2.058823529411765e-05, '

KeyboardInterrupt: 