In [52]:
from pathlib import Path
import torch
import torch.nn.functional as F
import torchaudio
from torch.utils.data import DataLoader, Dataset
from speechbrain.pretrained import EncoderClassifier

from sklearn.model_selection import train_test_split
from datasets import load_metric

import numpy as np
import pandas as pd

In [53]:
SEED = 1234
DATA_PATH = Path("../data")
WEIGHTS_PATH = Path("speechbrain/google_speech_command_xvector")
EXP_NAME = WEIGHTS_PATH.name

MAX_AUDIO_LEN = 16000  # в отсчётах sr
DEVICE = "cuda"
BATCH_SIZE = 128
N_EPOCHS = 100
VAL_ITER = 10
CLASSES = [
    "down",
    "go",
    "left",
    "no",
    "off",
    "on",
    "right",
    "stop",
    "up",
    "yes",
]
label2id = dict(dict([[k, v] for k, v in enumerate(CLASSES)]))
id2label = dict(dict([[v, k] for k, v in enumerate(CLASSES)]))

In [54]:
enc_classifier = EncoderClassifier.from_hparams(
    source=WEIGHTS_PATH,
    savedir=Path("pretrained_models") / EXP_NAME,
    run_opts={"device": DEVICE},
)

In [55]:
enc_classifier.mods.keys()

odict_keys(['compute_features', 'mean_var_norm', 'embedding_model', 'classifier'])

In [4]:
audio_normalizer = enc_classifier.audio_normalizer
feature_extractor1 = enc_classifier.mods.compute_features
normalizer = enc_classifier.mods.mean_var_norm
embedding_model = enc_classifier.mods.embedding_model

In [40]:
class TrainData(Dataset):
    def __init__(self, audio_filepaths: list, noise_dir: Path, aug=False) -> None:
        super().__init__()
        self.audio_len = MAX_AUDIO_LEN
        self.aug = aug

        self.audios = list()
        self.noises = list()
        self.classes = list()

        for wav_path in audio_filepaths:
            audio, _ = torchaudio.load(wav_path)
            self.audios.append(audio)
            self.classes.append(wav_path.parts[-2])

        if aug:
            for wav_path in noise_dir.iterdir():
                noise, _ = torchaudio.load(wav_path)
                self.noises.append(noise)

    def __len__(self):
        return len(self.classes)

    def __getitem__(self, idx):
        if self.aug:
            noise_idx = np.random.randint(0, len(self.noises))
            noise = self.noises[noise_idx]

            if noise.shape[0] == 2:
                noise = noise[np.random.randint(0, 2)] # TODO: try mean instead random
                noise = noise.unsqueeze(0)

        audio = self.audios[idx]
        audio = self.__pad_audio(audio)
        if self.aug:
            audio = self.__add_noise(audio, noise, 0, 6)
        audio = audio / audio.abs().max()

        return audio, CLASSES.index(self.classes[idx])

    def __pad_audio(self, audio):
        if self.audio_len - audio.shape[-1] > 0:
            i = np.random.randint(0, self.audio_len - audio.shape[-1])
        else:
            i = 0
        pad_patern = (i, self.audio_len - audio.shape[-1] - i)
        audio = F.pad(audio, pad_patern, "constant").detach()
        return audio

    def __add_noise(self, clean, noise, min_amp, max_amp):
        noise_amp = np.random.uniform(min_amp, max_amp)
        # так как шумная запись длиннее, то выбираем случайный момент начала шумной записи
        start = np.random.randint(0, noise.shape[1] - clean.shape[1] + 1)
        noise_part = noise[:, start : start + clean.shape[1]]

        if noise_part.abs().max() == 1:
            return clean

        # накладываем шум
        noise_mult = clean.abs().max() / noise_part.abs().max() * noise_amp
        return (clean + noise_part * noise_mult) / (1 + noise_amp)


class TestData(Dataset):
    def __init__(self, audio_filepaths: list) -> None:
        super().__init__()
        self.audio_len = MAX_AUDIO_LEN

        self.audios = list()
        for file_name in audio_filepaths:
            audio, _ = torchaudio.load(file_name)
            self.audios.append(audio)

    def __len__(self):
        return len(self.audios)

    def __getitem__(self, idx):
        audio = self.audios[idx]
        audio = self.__pad_audio(audio)
        audio = audio / audio.abs().max()

        return audio

    def __pad_audio(self, audio):
        if self.audio_len - audio.shape[-1] > 0:
            i = np.random.randint(0, self.audio_len - audio.shape[-1])
        else:
            i = 0
        pad_patern = (i, self.audio_len - audio.shape[-1] - i)
        audio = F.pad(audio, pad_patern, "constant").detach()

        return audio

In [41]:
# Split Data
audio_filpaths = sorted(list((DATA_PATH / "train").rglob("*.wav")))
classes = [p.parts[-2] for p in audio_filpaths]
train_audio_paths, val_audio_paths, train_classes, val_classes = \
    train_test_split(audio_filpaths, classes, test_size=0.2, random_state=SEED)

In [42]:
train_dataset = TrainData(train_audio_paths, noise_dir=DATA_PATH / "noises", aug=True)
val_dataset = TrainData(val_audio_paths, noise_dir=DATA_PATH / "noises", aug=False)

In [43]:
len(train_dataset), len(val_dataset)

(71032, 17758)

In [51]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

model = AutoModelForAudioClassification.from_pretrained(
    WEIGHTS_PATH, 
    num_labels=len(CLASSES),
    label2id=label2id,
    id2label=id2label,
)
# model = embedding_model

loading configuration file https://huggingface.co/speechbrain/google_speech_command_xvector/resolve/main/config.json from cache at /home/and/.cache/huggingface/transformers/189eff8c2b250ceb71be715e2335959234fc8bdbb1b504c508dda42f3d6ea844.b7fc0832e158a106fdd233d8ba1e7b54eb928fdb08345ba2e6510001573ad668


ValueError: Unrecognized model in speechbrain/google_speech_command_xvector. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: yoso, swin, vilt, vit_mae, realm, nystromformer, imagegpt, qdqbert, vision-encoder-decoder, trocr, fnet, segformer, vision-text-dual-encoder, perceiver, gptj, layoutlmv2, beit, rembert, visual_bert, canine, roformer, clip, bigbird_pegasus, deit, luke, detr, gpt_neo, big_bird, speech_to_text_2, speech_to_text, vit, wav2vec2, m2m_100, convbert, led, blenderbot-small, retribert, ibert, mt5, t5, mobilebert, distilbert, albert, bert-generation, camembert, xlm-roberta, pegasus, marian, mbart, megatron-bert, mpnet, bart, blenderbot, reformer, longformer, roberta, deberta-v2, deberta, flaubert, fsmt, squeezebert, hubert, bert, openai-gpt, gpt2, transfo-xl, xlnet, xlm-prophetnet, prophetnet, xlm, ctrl, electra, speech-encoder-decoder, encoder-decoder, funnel, lxmert, dpr, layoutlm, rag, tapas, splinter, sew-d, sew, unispeech-sat, unispeech, wavlm

In [45]:
args = TrainingArguments(
    f"{EXP_NAME}-finetuned-ks",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

In [47]:
metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

Downloading:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [48]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [49]:
trainer.train()

***** Running training *****
  Num examples = 71032
  Num Epochs = 10
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 4
  Total optimization steps = 1380


TypeError: vars() argument must have __dict__ attribute