# Finetuning Hugging Face Wav2Vec2 model on LibriSpeech dataset 

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
%cd drive/MyDrive

/content/drive/MyDrive


In [3]:
!pip install huggingface_hub
!pip install datasets>=1.18.3
!pip install transformers==4.11.3
!pip install librosa
!pip install jiwer

Collecting huggingface_hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[?25l[K     |█████                           | 10 kB 18.7 MB/s eta 0:00:01[K     |█████████▉                      | 20 kB 10.7 MB/s eta 0:00:01[K     |██████████████▊                 | 30 kB 8.2 MB/s eta 0:00:01[K     |███████████████████▋            | 40 kB 3.4 MB/s eta 0:00:01[K     |████████████████████████▌       | 51 kB 3.4 MB/s eta 0:00:01[K     |█████████████████████████████▍  | 61 kB 4.0 MB/s eta 0:00:01[K     |████████████████████████████████| 67 kB 2.4 MB/s 
Installing collected packages: huggingface-hub
Successfully installed huggingface-hub-0.4.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
Collecting transformers==4.11.3
  Downloading transforme

In [4]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 2s (874 kB/s)
Selecting previously unselected package git-lfs.
(Reading database ... 155335 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.3.4-1_amd64.deb ...
Unpacking git-lfs (2.3.4-1) ...
Setting up git-lfs (2.3.4-1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [5]:
!git config --global credential.helper store

## Init HuggingFace hub

In [6]:
from huggingface_hub import notebook_login

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

In [7]:
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [8]:
repo_name = "wav2vec2-finetuning-model"

## Load data

In [9]:
from datasets import load_dataset

In [10]:
dataset = load_dataset("timit_asr")

Downloading:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading and preparing dataset timit_asr/clean (download: 828.75 MiB, generated: 7.90 MiB, post-processed: Unknown size, total: 836.65 MiB) to /root/.cache/huggingface/datasets/timit_asr/clean/2.0.1/b11b576ddcccbcefa7c9f0c4e6c2a43756f3033adffe0fb686aa61043d0450ad...


Downloading:   0%|          | 0.00/869M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset timit_asr downloaded and prepared to /root/.cache/huggingface/datasets/timit_asr/clean/2.0.1/b11b576ddcccbcefa7c9f0c4e6c2a43756f3033adffe0fb686aa61043d0450ad. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
dataset = dataset.remove_columns(['phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'])
dataset

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'text'],
        num_rows: 4620
    })
    test: Dataset({
        features: ['file', 'audio', 'text'],
        num_rows: 1680
    })
})

## Vocab

In [12]:
import string
import json
import numpy as np

In [None]:
vocab = {w: idx for idx, w in enumerate(string.ascii_lowercase)}

In [None]:
vocab.update({
    "|": len(vocab), 
    "'": len(vocab) + 1, 
    "<UNK>": len(vocab) + 2, 
    "<PAD>": len(vocab) + 3
})

In [None]:
vocab

{"'": 27,
 '<PAD>': 29,
 '<UNK>': 28,
 'a': 0,
 'b': 1,
 'c': 2,
 'd': 3,
 'e': 4,
 'f': 5,
 'g': 6,
 'h': 7,
 'i': 8,
 'j': 9,
 'k': 10,
 'l': 11,
 'm': 12,
 'n': 13,
 'o': 14,
 'p': 15,
 'q': 16,
 'r': 17,
 's': 18,
 't': 19,
 'u': 20,
 'v': 21,
 'w': 22,
 'x': 23,
 'y': 24,
 'z': 25,
 '|': 26}

In [None]:
with open(r'vocab.json', 'w') as vocab_file:
    json.dump(vocab, vocab_file)

## Text tokenizer

In [14]:
from transformers import Wav2Vec2CTCTokenizer

In [None]:
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json", unk_token="<UNK>", pad_token="<PAD>", word_delimiter_token="|"
)

In [None]:
tokenizer.push_to_hub(repo_name)

In [15]:
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    'timkakhanovich/wav2vec2-finetuning-model', from_pt=True
)

Downloading:   0%|          | 0.00/268 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/181 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Audio Wav2Vec2 processing

In [16]:
from transformers import (
    Wav2Vec2FeatureExtractor, 
    Wav2Vec2Processor
)

In [17]:
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, sampling_rate=16000, padding_value=0.0, 
    do_normalize=True, return_attntion_mask=False
)

In [18]:
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

## Preprocessing dataset

In [19]:
def data_preprocessing(batch):
    sample = batch['audio']

    batch['input_values'] = processor(sample['array'], sampling_rate=sample['sampling_rate']).input_values[0]
    
    with processor.as_target_processor():
        batch['labels'] = processor(batch['text']).input_ids
    
    return sample

In [20]:
dataset_preprocessed = dataset.map(data_preprocessing, remove_columns=dataset.column_names['train'], 
                                   num_proc=4)

## Data Collator

In [21]:
import torch

from dataclasses import dataclass, field
from typing import Optional, Union

In [22]:
@dataclass
class DataCollatorCTCWithPadding:

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, samples):
        input_features = [{'input_values': s['input_values']} for s in samples]
        label_features = [{'input_ids': s['labels']} for s in samples]

        batch = self.processor.pad(
            input_features, 
            padding=self.padding, 
            max_length=self.max_length, 
            pad_to_multiple_of=self.pad_to_multiple_of, 
            return_tensors="pt"
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features, 
                padding=self.padding, 
                max_length=self.max_length_labels, 
                pad_to_multiple_of=self.pad_to_multiple_of_labels, 
                return_tensors="pt"
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch['input_ids'].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch['labels'] = labels
        
        return batch

In [23]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

## Metrics

In [24]:
from datasets import load_metric

In [25]:
wer_metric = load_metric('wer')

Downloading:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

In [26]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    
    return {'wer': wer}

## Model

In [27]:
from transformers import Wav2Vec2ForCTC

In [28]:
model = Wav2Vec2ForCTC.from_pretrained(
    'timkakhanovich/finetuned-asr', 
    ctc_loss_reduction='mean', 
    pad_token_id=processor.tokenizer.pad_token_id
)

Downloading:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/360M [00:00<?, ?B/s]

In [None]:
model = Wav2Vec2ForCTC.from_pretrained(
    'facebook/wav2vec2-base', 
    ctc_loss_reduction='mean', 
    pad_token_id=processor.tokenizer.pad_token_id
)

Downloading:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Downloading:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForCTC: ['project_hid.weight', 'project_q.weight', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'project_hid.bias', 'quantizer.codevectors', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

In [None]:
model.freeze_feature_extractor()

## Init arguments

In [None]:
from transformers import TrainingArguments

In [None]:
training_args = TrainingArguments(
    output_dir=repo_name, 
    group_by_length=True, 
    per_device_train_batch_size=32, 
    evaluation_strategy='steps', 
    num_train_epochs=10, 
    fp16=True, 
    gradient_checkpointing=True, 
    save_steps=500, 
    eval_steps=500, 
    logging_steps=500, 
    learning_rate=1e-4, 
    weight_decay=5e-3, 
    warmup_steps=1000, 
    save_total_limit=2
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


## Training...

In [None]:
from transformers import Trainer

In [None]:
trainer = Trainer(
    model=model, 
    data_collator=data_collator, 
    args=training_args, 
    compute_metrics=compute_metrics, 
    train_dataset=dataset_preprocessed['train'], 
    eval_dataset=dataset_preprocessed['test'], 
    tokenizer=processor.feature_extractor
)

Using amp fp16 backend


In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: sampling_rate, array, path.
***** Running training *****
  Num examples = 4620
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1450
  return (input_length - kernel_size) // stride + 1


Step,Training Loss,Validation Loss,Wer
500,3.1175,1.01893,0.717316
1000,0.5352,0.444786,0.402259


The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: sampling_rate, array, path.
***** Running Evaluation *****
  Num examples = 1680
  Batch size = 8
Saving model checkpoint to wav2vec2-finetuning-model/checkpoint-500
Configuration saved in wav2vec2-finetuning-model/checkpoint-500/config.json
Model weights saved in wav2vec2-finetuning-model/checkpoint-500/pytorch_model.bin
Configuration saved in wav2vec2-finetuning-model/checkpoint-500/preprocessor_config.json
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: sampling_rate, array, path.
***** Running Evaluation *****
  Num examples = 1680
  Batch size = 8
Saving model checkpoint to wav2vec2-finetuning-model/checkpoint-1000
Configuration saved in wav2vec2-finetuning-model/checkpoint-1000/config.json
Model weights saved in wav2

TrainOutput(global_step=1450, training_loss=1.3240619370033002, metrics={'train_runtime': 9201.3375, 'train_samples_per_second': 5.021, 'train_steps_per_second': 0.158, 'total_flos': 1.3288091020869658e+18, 'train_loss': 1.3240619370033002, 'epoch': 10.0})

In [None]:
model_name = "finetuned-asr"

In [None]:
model.push_to_hub(model_name)

Cloning https://huggingface.co/timkakhanovich/finetuned-asr into local empty directory.
Configuration saved in finetuned-asr/config.json
Model weights saved in finetuned-asr/pytorch_model.bin


Upload file pytorch_model.bin:   0%|          | 3.38k/360M [00:00<?, ?B/s]

To https://huggingface.co/timkakhanovich/finetuned-asr
   92d6c6d..73d64f6  main -> main



'https://huggingface.co/timkakhanovich/finetuned-asr/commit/73d64f6e2504c7b4eea8d8545cf9808e632d6dbc'

In [29]:
def inference_model(batch):
    with torch.inference_mode():
        input_values = torch.tensor(batch['input_values'], device='cuda').unsqueeze(0)
        logits = model(input_values).logits
    
    batch['logits'] = logits
    pred_ids = torch.argmax(logits, dim=-1)
    batch['pred_str'] = processor.batch_decode(pred_ids)[0]
    batch['text'] = processor.decode(batch['labels'], group_tokens=False)

    return batch

In [30]:
model = model.to(torch.device('cuda'))

In [31]:
results = dataset_preprocessed['test'].map(inference_model, remove_columns=dataset_preprocessed['test'].column_names)

0ex [00:00, ?ex/s]

In [32]:
results['pred_str'][2], results['text'][2]

('<UNK>re you looking for employment<UNK>',
 '<UNK>re you looking for employment<UNK>')

In [40]:
l = torch.tensor(results['logits'][1])

In [41]:
l.shape

torch.Size([1, 140, 32])

In [None]:
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["text"])))

Test WER: 0.252
