In [None]:
"""
OthmaneJ/distil-wav2vec2

"""

In [1]:
import random
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import os

import torch.nn.functional as F
import torch.optim as opt
import torchaudio
import torchaudio.transforms as T
import torch
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from datasets import Dataset


In [2]:
SEED = 1234
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(SEED)

DATA_PATH = Path("../data")
WEIGHTS_PATH = Path("OthmaneJ/distil-wav2vec2")
EXP_NAME = WEIGHTS_PATH.name

max_duration = 1.0  # seconds
MAX_AUDIO_LEN = 16000  # в отсчётах sr

batch_size = 64
DEVICE = "cuda"
N_EPOCHS = 5

CLASSES = [
    "down",
    "go",
    "left",
    "no",
    "off",
    "on",
    "right",
    "stop",
    "up",
    "yes",
]
label2id = dict(dict([[k, v] for k, v in enumerate(CLASSES)]))
id2label = dict(dict([[v, k] for k, v in enumerate(CLASSES)]))


In [3]:
from datasets import load_metric
metric = load_metric("accuracy")

In [4]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained(WEIGHTS_PATH)
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [5]:
def preprocess_function(audio_arrays):
    """
    {'input_values': [array([ 1.7378503,  1.5420032,  1.3440062, ..., -0.8847088, -1.2148459, -1.2481029], dtype=float32)]}
    """
    inputs = feature_extractor(
        audio_arrays, 
        sampling_rate=feature_extractor.sampling_rate, 
        max_length=int(feature_extractor.sampling_rate * max_duration), 
        truncation=True, 
        
    )
    inputs["input_values"] = torch.tensor(np.stack(inputs["input_values"]))
    return inputs

In [6]:
class TrainData(Dataset):
    def __init__(self, audio_filepaths: list, noise_dir: Path, aug=False) -> None:
        # super().__init__()
        self.audio_len = MAX_AUDIO_LEN
        self.aug = aug

        self.audios = list()
        self.noises = list()
        self.classes = list()

        for wav_path in audio_filepaths:
            audio, _ = torchaudio.load(wav_path)
            self.audios.append(audio)
            self.classes.append(wav_path.parts[-2])

        if aug:
            for wav_path in noise_dir.iterdir():
                if wav_path.is_dir():
                    continue
                noise, _ = torchaudio.load(wav_path)
                self.noises.append(noise)

    def __len__(self):
        return len(self.classes)

    def __getitem__(self, idx):
        if self.aug:
            noise_idx = np.random.randint(0, len(self.noises))
            noise = self.noises[noise_idx]

            if noise.shape[0] == 2:
                noise = noise[np.random.randint(0, 2)] # TODO: try mean instead random
                noise = noise.unsqueeze(0)

        audio = self.audios[idx]
        audio = self.__pad_audio(audio)
        if self.aug:
            audio = self.__add_noise(audio, noise, 0, 6)

        inputs = preprocess_function(audio[0])

        # audio = audio / audio.abs().max()

        return inputs, CLASSES.index(self.classes[idx])

    def __pad_audio(self, audio):
        if self.audio_len - audio.shape[-1] > 0:
            i = np.random.randint(0, self.audio_len - audio.shape[-1])
        else:
            i = 0
        pad_patern = (i, self.audio_len - audio.shape[-1] - i)
        audio = F.pad(audio, pad_patern, "constant").detach()
        return audio

    def __add_noise(self, clean, noise, min_amp, max_amp):
        noise_amp = np.random.uniform(min_amp, max_amp)
        # так как шумная запись длиннее, то выбираем случайный момент начала шумной записи
        start = np.random.randint(0, noise.shape[1] - clean.shape[1] + 1)
        noise_part = noise[:, start : start + clean.shape[1]]

        if noise_part.abs().max() == 1:
            return clean

        # накладываем шум
        noise_mult = clean.abs().max() / noise_part.abs().max() * noise_amp
        return (clean + noise_part * noise_mult) / (1 + noise_amp)


class TestData(Dataset):
    def __init__(self, audio_filepaths: list) -> None:
        super().__init__()
        self.audio_len = MAX_AUDIO_LEN

        self.audios = list()
        for file_name in audio_filepaths:
            audio, _ = torchaudio.load(file_name)
            self.audios.append(audio)

    def __len__(self):
        return len(self.audios)

    def __getitem__(self, idx):
        audio = self.audios[idx]
        audio = self.__pad_audio(audio)

        inputs = preprocess_function(audio.numpy())
        # audio = audio / audio.abs().max()

        return inputs

    def __pad_audio(self, audio):
        if self.audio_len - audio.shape[-1] > 0:
            i = np.random.randint(0, self.audio_len - audio.shape[-1])
        else:
            i = 0
        pad_patern = (i, self.audio_len - audio.shape[-1] - i)
        audio = F.pad(audio, pad_patern, "constant").detach()

        return audio

In [7]:
# def collate_fn(data):
#     """
#        data: is a list of tuples with (example, label)
#              where 'example' is a tensor of arbitrary shape
#              and label are scalar
#     """
#     return {
#         "input_values": np.array([d[0]["input_values"][0] for d in data]),
#         "label": np.array([d[1] for d in data])
#     }

In [8]:
# Split Data
audio_filpaths = sorted(list((DATA_PATH / "train").rglob("*.wav")))
classes = [p.parts[-2] for p in audio_filpaths]
train_audio_paths, val_audio_paths, train_classes, val_classes = \
    train_test_split(audio_filpaths, classes, test_size=0.2, random_state=SEED)

In [9]:
train_dataset = TrainData(train_audio_paths, noise_dir=DATA_PATH / "noises", aug=True)
val_dataset = TrainData(val_audio_paths, noise_dir=DATA_PATH / "noises", aug=False)

In [10]:
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [11]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

wav2vec = AutoModelForAudioClassification.from_pretrained(
    WEIGHTS_PATH, 
    num_labels=len(CLASSES),
    label2id=label2id,
    id2label=id2label,
)


Some weights of the model checkpoint at OthmaneJ/distil-wav2vec2 were not used when initializing Wav2Vec2ForSequenceClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at OthmaneJ/distil-wav2vec2 and are newly initialized: ['projector.weight', 'projector.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and i

In [12]:
inputs, _ = train_dataset[0]
inputs["input_values"] = inputs["input_values"].to(DEVICE)
wav2vec.to(DEVICE)
print(inputs)
print(inputs['input_values'].shape)
wav2vec.eval()
with torch.no_grad():
    pred = wav2vec(**inputs)
pred

{'input_values': tensor([[-1.8141, -1.7691, -1.6758,  ..., -0.8816, -0.8477, -0.7326]],
       device='cuda:0')}
torch.Size([1, 16000])


SequenceClassifierOutput(loss=None, logits=tensor([[ 0.0141,  0.0085, -0.0117, -0.0131,  0.0430, -0.0060,  0.0144, -0.0412,
         -0.0164, -0.0096]], device='cuda:0'), hidden_states=None, attentions=None)

In [16]:
batch

{'input_values': tensor([[ 0.2070,  0.0518, -0.0618,  ...,  0.4190,  0.4657,  0.3676],
         [-1.3232, -0.9358,  1.0901,  ...,  0.1622, -0.8691, -0.7778],
         [ 0.3843,  0.4121,  0.4399,  ...,  0.6081,  0.6880,  0.8491],
         ...,
         [ 1.7361, -0.1130, -0.0816,  ...,  0.2289,  0.7946, -0.4290],
         [ 1.6754,  1.4497,  1.3224,  ..., -2.7199, -2.4121, -2.3129],
         [-1.1061, -1.1019, -1.0525,  ...,  1.1632,  1.1945,  1.1776]],
        device='cuda:0')}

In [17]:
wav2vec(**batch)

SequenceClassifierOutput(loss=None, logits=tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan

In [13]:
wav2vec = wav2vec.to(DEVICE)

In [14]:
lr = 1e-5

param_optimizer = list(wav2vec.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters_vit = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

# Зададим оптимизатор
optimizer = torch.optim.AdamW(optimizer_grouped_parameters_vit, lr) # ЗДЕСЬ ВАШ КОД

criterion = torch.nn.CrossEntropyLoss()

In [15]:
for epoch in range(N_EPOCHS):
    
    wav2vec.train()
        
    train_loss = []
    train_predictions = []
    train_targets = []
    for batch, targets in tqdm(train_dl, desc=f"Epoch: {epoch}"):

        batch["input_values"] = batch["input_values"].to(DEVICE)
        batch["input_values"] = batch["input_values"].reshape(
            (batch["input_values"].shape[0], batch["input_values"].shape[-1])
        )

        targets = targets.to(DEVICE)
        
        logits = wav2vec(**batch).logits
        print(logits)
        loss = criterion(logits, targets) 
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())

        predictions = logits.argmax(axis=1)
        
        train_predictions.extend(predictions.cpu().detach().numpy())
        train_targets.extend(targets.cpu().detach().numpy())
        
    train_predictions = np.array(train_predictions).astype(int)

    print('Training loss:', np.mean(train_loss))
    print('Train Accuracy:', accuracy_score(train_targets, train_predictions))
    
    wav2vec.eval()
        
    val_predictions = []
    val_targets = []
    for batch, targets in tqdm(val_dl, desc=f"Epoch: {epoch}"):
        
        with torch.no_grad():
            for i in batch:
                batch[i] = batch[i][:, 0].to(DEVICE)
            targets = targets.to(DEVICE)

            logits = wav2vec(**batch).logits
            
            predictions = logits.argmax(axis=1)            
            val_predictions.extend(predictions.cpu().numpy())
            val_targets.extend(targets.cpu().numpy())
        
    val_predictions = np.array(val_predictions).astype(int)

    print('Val Accuracy:', accuracy_score(val_targets, val_predictions))

    wav2vec.save_pretrained('pretrained_models/wav2vec_finetuned')

Epoch: 0:   0%|          | 0/1110 [00:00<?, ?it/s]

tensor([[ 1.6551e-02,  1.2008e-02, -7.6334e-03, -1.8614e-02,  4.3402e-02,
         -8.7597e-03,  1.2948e-02, -4.6222e-02, -6.7940e-03, -1.0908e-02],
        [ 1.8752e-02,  1.0632e-02, -1.1477e-03, -1.1894e-02,  4.0279e-02,
         -6.3752e-03,  1.1366e-02, -4.8121e-02, -9.9872e-03, -1.3281e-02],
        [ 1.7441e-02,  7.2031e-03, -1.6650e-03, -1.0155e-02,  4.2349e-02,
         -1.5398e-02,  6.9229e-03, -4.9575e-02, -4.2170e-03, -1.1349e-02],
        [ 1.7478e-02,  1.3717e-02, -4.4010e-03, -7.9082e-03,  3.6825e-02,
         -6.1113e-03,  9.7633e-03, -4.6259e-02, -1.2524e-02, -8.3350e-03],
        [ 1.9698e-02,  1.7497e-02, -6.4424e-03, -5.4242e-03,  4.5362e-02,
         -9.6035e-03,  4.6848e-03, -4.5016e-02, -1.8917e-02, -8.0932e-03],
        [ 5.4673e-03,  9.3220e-03, -6.7102e-03, -1.3638e-02,  3.1403e-02,
         -4.2224e-03,  1.5218e-02, -4.3534e-02, -1.6065e-02, -5.0242e-03],
        [ 2.0009e-02,  1.9996e-02, -9.3919e-03, -1.0440e-02,  4.9953e-02,
         -7.0457e-03,  1.1474e-0

KeyboardInterrupt: 

In [183]:
args = TrainingArguments(
    f"pretrained_models/{EXP_NAME}-finetuned-ks",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=N_EPOCHS,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [184]:
def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

In [185]:
trainer = Trainer(
    wav2vec,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

In [186]:
trainer.train()

AttributeError: 'TrainData' object has no attribute '_data'