In [1]:
"""
OthmaneJ/distil-wav2vec2

"""

'\nOthmaneJ/distil-wav2vec2\n\n'

In [2]:
import random
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import os

import torch.nn.functional as F
import torch.optim as opt
import torchaudio
import torchaudio.transforms as T
import torch
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from datasets import Dataset


In [3]:
SEED = 1234
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything(SEED)

DATA_PATH = Path("../data")
WEIGHTS_PATH = Path("OthmaneJ/distil-wav2vec2")
EXP_NAME = WEIGHTS_PATH.name

max_duration = 1.0  # seconds
MAX_AUDIO_LEN = 16000  # в отсчётах sr

batch_size = 8
DEVICE = "cuda"
N_EPOCHS = 10


In [4]:
from datasets import load_dataset, load_metric
dataset = load_dataset("superb", "ks")
metric = load_metric("accuracy")

Reusing dataset superb (/home/and/.cache/huggingface/datasets/superb/ks/1.9.0/fc1f59e1fa54262dfb42de99c326a806ef7de1263ece177b59359a1a3354a9c9)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained(WEIGHTS_PATH)
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [6]:
labels = dataset["train"].features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [7]:
import random
from IPython.display import Audio, display

for _ in range(2):
    rand_idx = random.randint(0, len(dataset["train"])-1)
    example = dataset["train"][rand_idx]
    audio = example["audio"]

    print(f'Label: {id2label[str(example["label"])]}')
    print(f'Shape: {audio["array"].shape}, sampling rate: {audio["sampling_rate"]}')
    display(Audio(audio["array"], rate=audio["sampling_rate"]))
    print()

Label: _unknown_
Shape: (16000,), sampling rate: 16000



Label: _unknown_
Shape: (16000,), sampling rate: 16000





In [8]:
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays, 
        sampling_rate=feature_extractor.sampling_rate, 
        max_length=int(feature_extractor.sampling_rate * max_duration), 
        truncation=True, 
    )
    return inputs

In [9]:
encoded_dataset = dataset.map(preprocess_function, remove_columns=["audio", "file"], batched=True)
encoded_dataset

Loading cached processed dataset at /home/and/.cache/huggingface/datasets/superb/ks/1.9.0/fc1f59e1fa54262dfb42de99c326a806ef7de1263ece177b59359a1a3354a9c9/cache-64ee3b5ff5a81430.arrow
Loading cached processed dataset at /home/and/.cache/huggingface/datasets/superb/ks/1.9.0/fc1f59e1fa54262dfb42de99c326a806ef7de1263ece177b59359a1a3354a9c9/cache-20f0bb61fb915262.arrow
Loading cached processed dataset at /home/and/.cache/huggingface/datasets/superb/ks/1.9.0/fc1f59e1fa54262dfb42de99c326a806ef7de1263ece177b59359a1a3354a9c9/cache-5dc33054dbcbad25.arrow


DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 51094
    })
    validation: Dataset({
        features: ['label', 'input_values'],
        num_rows: 6798
    })
    test: Dataset({
        features: ['label', 'input_values'],
        num_rows: 3081
    })
})

In [10]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

wav2vec = AutoModelForAudioClassification.from_pretrained(
    WEIGHTS_PATH, 
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)


Some weights of the model checkpoint at OthmaneJ/distil-wav2vec2 were not used when initializing Wav2Vec2ForSequenceClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at OthmaneJ/distil-wav2vec2 and are newly initialized: ['projector.bias', 'classifier.weight', 'classifier.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and i

In [11]:
args = TrainingArguments(
    f"pretrained_models/{EXP_NAME}-finetuned-ks",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=N_EPOCHS,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

In [12]:
def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)

In [13]:
trainer = Trainer(
    wav2vec,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

In [14]:
trainer.train()

***** Running training *****
  Num examples = 51094
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 15960


Epoch,Training Loss,Validation Loss,Accuracy
0,1.2851,0.973176,0.709032
1,0.4239,0.258839,0.953663
2,0.3392,0.119113,0.973227
3,0.2983,0.115438,0.972492
4,0.173,0.109443,0.976464
5,0.1299,0.097782,0.976611
6,0.1644,0.106641,0.975875
7,0.169,0.108619,0.975287
8,0.1217,0.102527,0.978523
9,0.1368,0.103134,0.979553


***** Running Evaluation *****
  Num examples = 6798
  Batch size = 8
Saving model checkpoint to pretrained_models/distil-wav2vec2-finetuned-ks/checkpoint-1596
Configuration saved in pretrained_models/distil-wav2vec2-finetuned-ks/checkpoint-1596/config.json
Model weights saved in pretrained_models/distil-wav2vec2-finetuned-ks/checkpoint-1596/pytorch_model.bin
Configuration saved in pretrained_models/distil-wav2vec2-finetuned-ks/checkpoint-1596/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 6798
  Batch size = 8
Saving model checkpoint to pretrained_models/distil-wav2vec2-finetuned-ks/checkpoint-3192
Configuration saved in pretrained_models/distil-wav2vec2-finetuned-ks/checkpoint-3192/config.json
Model weights saved in pretrained_models/distil-wav2vec2-finetuned-ks/checkpoint-3192/pytorch_model.bin
Configuration saved in pretrained_models/distil-wav2vec2-finetuned-ks/checkpoint-3192/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 6798
 

TrainOutput(global_step=15960, training_loss=0.3842006083157726, metrics={'train_runtime': 8211.7092, 'train_samples_per_second': 62.221, 'train_steps_per_second': 1.944, 'total_flos': 2.552681766230784e+18, 'train_loss': 0.3842006083157726, 'epoch': 10.0})