<a href="https://colab.research.google.com/github/xinyueli2896/RapRec/blob/main/main_Wav2Vec2Base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Fine-tuning Speech Model with 🤗 Transformers**

In [None]:
import librosa
import torch
import transformers
# Hub id of the pretrained checkpoint that the config, feature extractor and model are loaded from.
model_checkpoint = "facebook/wav2vec2-base"
# Intended per-device batch size for training.
batch_size = 32

  from .autonotebook import tqdm as notebook_tqdm


## Prepare Data, Tokenizer, Feature Extractor

### Create Wav2Vec2CTCTokenizer

In [None]:
from datasets import load_dataset, load_metric
# Root folder of the local dataset, handed to the "audiofolder" loader below.
dataset_path = "dataset_cleaned"
# NOTE: despite the name `timit` (apparently inherited from a TIMIT tutorial), this holds the
# dataset loaded from `dataset_path`; the name is kept because later cells refer to it.
timit = load_dataset("audiofolder", data_dir=dataset_path)

Resolving data files: 100%|██████████| 12533/12533 [00:00<00:00, 54369.68it/s]
Resolving data files: 100%|██████████| 3086/3086 [00:00<00:00, 46279.62it/s]


In [None]:
timit

DatasetDict({
    train: Dataset({
        features: ['audio', 'lyrics'],
        num_rows: 12532
    })
    test: Dataset({
        features: ['audio', 'lyrics'],
        num_rows: 3085
    })
})

In [None]:
# timit = timit.remove_columns(["phonetic_detail", "word_detail", "dialect_region", "id", "sentence_type", "speaker_id"])

In [None]:
print(timit['train'][0])
type(timit['train'])

{'audio': {'path': 'E:/rap_rec/dataset_cleaned/train/chunk_0.mp3', 'array': array([-0.1511101 , -0.20103808, -0.18242885, ..., -0.03254318,
       -0.0436329 , -0.03384233]), 'sampling_rate': 44100}, 'lyrics': 'Wait wait a minute'}


datasets.arrow_dataset.Dataset

In [None]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    """Render `num_examples` distinct, randomly chosen rows of `dataset` as an HTML table.

    Args:
        dataset: Object supporting `len()` and indexing with a list of integer
            positions; the indexed result is passed to `pd.DataFrame`.
        num_examples: Number of distinct rows to display.

    Raises:
        AssertionError: If `num_examples` is larger than `len(dataset)`.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement, so the picks are unique without
    # a retry loop.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [None]:
show_random_elements(timit["train"].remove_columns(["audio"]), num_examples=10)

Unnamed: 0,lyrics
0,Translation I will probably kill us both
1,Keepin' it short don't gotta give it a thought
2,Waist on thinner
3,If I throw up this money
4,I been living in the ghetto
5,Ayy welcome to the party
6,So now we stuck wit' the crazy pushin' a lotta negative up in the babies
7,A new clip
8,Gloves off bitch I'm feelin' impolite get lost
9,We gon' shoot that


In [None]:
import re
# Punctuation stripped from the transcripts. Written as a raw string so the
# backslash reaches the regex engine; the previous non-raw literal relied on
# invalid string escapes such as "\," which newer Python versions warn about.
# NOTE(review): "(" and "/" are not in this set and do appear in the vocabulary
# printed later in the notebook — confirm that is intended.
chars_to_ignore_regex = r'[,?.!\-;:*$%"]'

# Digit -> spelled-out word. Applied one character at a time, so a multi-digit
# number such as "21" becomes "twoone" (no separator is inserted).
_DIGIT_WORDS = (
    ('1', 'one'),
    ('2', 'two'),
    ('3', 'three'),
    ('4', 'four'),
    ('5', 'five'),
    ('6', 'six'),
    ('7', 'seven'),
    ('8', 'eight'),
    ('9', 'nine'),
    ('0', 'zero'),
)


def remove_special_characters(batch):
    """Normalise the `lyrics` field of a single example.

    The text is transliterated with `unidecode`, stripped of the characters in
    `chars_to_ignore_regex`, lower-cased, given one trailing space, and has
    every digit replaced by its English word.

    Args:
        batch: Mapping with a string under the key "lyrics".

    Returns:
        The same `batch` object, with "lyrics" replaced by the normalised text.
    """
    from unidecode import unidecode

    text = unidecode(batch["lyrics"])
    text = re.sub(chars_to_ignore_regex, '', text).lower() + " "

    for num, word in _DIGIT_WORDS:
        text = text.replace(num, word)

    batch["lyrics"] = text
    return batch

In [None]:
# Normalise every transcript. NOTE: this rebinds `timit` to the mapped result, and
# `remove_special_characters` appends a trailing space, so re-running this cell on the
# already-mapped data adds another space to each transcript.
timit = timit.map(remove_special_characters)

In [None]:
show_random_elements(timit["train"].remove_columns(["audio"]))

Unnamed: 0,lyrics
0,yeah yeah yeah
1,cookie woo
2,right now like right now like
3,see they said that i wouldn't
4,diamonds dancin' on your neck nigga tap in
5,i never gave a how nigga feel
6,and you know we out here every day with it
7,i don't do this shit to entertain entertain
8,ooh
9,finally tired of all of this


In [None]:
def extract_all_chars(batch):
  """Collect the distinct characters used across a batch of transcripts.

  Args:
      batch: Mapping whose "lyrics" entry is a list of strings.

  Returns:
      A dict with two single-element lists: "vocab" holds the sorted list of
      distinct characters, "all_text" holds the transcripts joined by spaces.
  """
  all_text = " ".join(batch["lyrics"])
  # sorted() instead of list(set(...)): the iteration order of a set of strings
  # is not stable between interpreter runs, which made the result irreproducible.
  vocab = sorted(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [None]:
# Run `extract_all_chars` over each split; with batched=True and batch_size=-1 the function
# receives the transcripts as one list (presumably the whole split in a single batch — TODO confirm).
vocabs = timit.map(
  extract_all_chars,
  batched=True,
  batch_size=-1,
  keep_in_memory=True,
  # Drop the original columns so only "vocab" and "all_text" remain.
  remove_columns=timit.column_names["train"]
)

Map: 100%|██████████| 12532/12532 [00:00<00:00, 124761.50 examples/s]
Map: 100%|██████████| 3085/3085 [00:00<00:00, 29430.13 examples/s]


In [None]:
# Union of the characters seen in both splits. Sorted so that the character -> id
# mapping built from this list (and written to vocab.json) is identical on every run;
# iterating a bare set of strings gives an order that changes between kernel restarts.
vocab_list = sorted(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))

In [None]:
# Map each character to its position in `vocab_list`; that position becomes the token id.
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'p': 0,
 ' ': 1,
 'y': 2,
 't': 3,
 'f': 4,
 '(': 5,
 'v': 6,
 '/': 7,
 's': 8,
 'h': 9,
 'r': 10,
 'g': 11,
 'o': 12,
 'x': 13,
 'e': 14,
 'j': 15,
 'q': 16,
 "'": 17,
 'i': 18,
 'd': 19,
 'n': 20,
 'l': 21,
 'u': 22,
 'z': 23,
 'a': 24,
 'w': 25,
 'c': 26,
 'b': 27,
 'm': 28,
 'k': 29}

In [None]:
# Re-key the space character as "|" (the word delimiter token passed to the tokenizer),
# keeping the id it already had.
vocab_dict["|"] = vocab_dict.pop(" ")

In [None]:
# Append the two special tokens; each takes the next free id (the current dict size).
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
# Final vocabulary size, special tokens included.
len(vocab_dict)

32

In [None]:
import json
# Write the vocabulary to the working directory; the tokenizer cell below loads from "./".
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [None]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained(model_checkpoint)

# If the checkpoint's config names no tokenizer class, pass the model type as `tokenizer_type`
# and drop the config; otherwise keep the config and leave `tokenizer_type` as None.
tokenizer_type = config.model_type if config.tokenizer_class is None else None
config = config if config.tokenizer_class is not None else None



In [None]:
from transformers import AutoTokenizer

# Build the tokenizer from the current directory (where vocab.json was written above),
# using the same special-token strings that were added to `vocab_dict`.
tokenizer = AutoTokenizer.from_pretrained(
  "./",
  config=config,
  tokenizer_type=tokenizer_type,
  unk_token="[UNK]",
  pad_token="[PAD]",
  word_delimiter_token="|",
)

`use_fast` is set to `True` but the tokenizer class does not have a fast version.  Falling back to the slow version.


In [None]:
# Last path component of the checkpoint id, e.g. "facebook/wav2vec2-base" -> "wav2vec2-base".
model_checkpoint_name = model_checkpoint.split("/")[-1]

### Preprocess Data

In [None]:
# timit["train"][0]["file"]

In [None]:
timit["train"][0]["audio"]

{'path': 'E:/rap_rec/dataset_cleaned/train/chunk_0.mp3',
 'array': array([-0.1511101 , -0.20103808, -0.18242885, ..., -0.03254318,
        -0.0436329 , -0.03384233]),
 'sampling_rate': 44100}

In [None]:
import IPython.display as ipd
import numpy as np
import random

# randrange excludes the upper bound. The previous randint(0, len(...)) is inclusive on
# both ends and could return len(...) itself, which is one past the last valid index.
rand_int = random.randrange(len(timit["train"]))
# Index the row once so the audio is not decoded separately for each field.
_sample = timit["train"][rand_int]

print(_sample["lyrics"])
# Play back at the rate stored with the clip rather than a hard-coded 44100.
ipd.Audio(data=np.asarray(_sample["audio"]["array"]), autoplay=True, rate=_sample["audio"]["sampling_rate"])

brace for impact 


In [None]:
import librosa
# Listen to the same clip after resampling to 16 kHz. The source rate is read from the
# clip itself instead of assuming 44100.
_audio = timit["train"][rand_int]["audio"]
_data = librosa.resample(_audio["array"], orig_sr=_audio["sampling_rate"], target_sr=16000)
ipd.Audio(data=_data, autoplay=True, rate=16000)

In [None]:
# randrange excludes the upper bound. The previous randint(0, len(...)) is inclusive on
# both ends and could return len(...) itself, which is one past the last valid index.
rand_int = random.randrange(len(timit["train"]))
# Index the row once instead of once per printed field.
_sample = timit["train"][rand_int]

print("Target text:", _sample["lyrics"])
print("Input array shape:", np.asarray(_sample["audio"]["array"]).shape)
print("Sampling rate:", _sample["audio"]["sampling_rate"])

Target text: i shine my wrist 
Input array shape: (40131,)
Sampling rate: 44100


In [None]:
from transformers import AutoFeatureExtractor

# Feature extractor loaded from the pretrained checkpoint; its `sampling_rate` is used
# later when filtering by input length.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)

In [None]:
from transformers import Wav2Vec2Processor

# Bundle the feature extractor (audio side) and the tokenizer (text side) into one object.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
def prepare_dataset(batch, processor=processor):
    """Turn one raw example into model inputs and CTC labels.

    Args:
        batch: A single example with an "audio" dict (keys "array" and
            "sampling_rate") and a "lyrics" string.
        processor: Processor providing the feature extractor and tokenizer.
            Bound as a default argument at definition time.

    Returns:
        The same `batch`, extended with "input_values" (processed waveform),
        "input_length" (its length) and "labels" (token ids of the lyrics).
    """
    import librosa

    audio = batch["audio"]
    # Take both rates from the data / feature extractor instead of hard-coding
    # 44100 -> 16000, so clips stored at another rate are resampled correctly.
    target_sr = processor.feature_extractor.sampling_rate
    wave = librosa.resample(audio["array"], orig_sr=audio["sampling_rate"], target_sr=target_sr)

    # batched output is "un-batched" to ensure mapping is correct
    batch["input_values"] = processor(wave, sampling_rate=target_sr).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # Tokenise through the tokenizer directly; this replaces the deprecated
    # `processor.as_target_processor()` context manager.
    batch["labels"] = processor.tokenizer(batch["lyrics"]).input_ids
    return batch

In [None]:
# dir(timit)
timit.column_names

{'train': ['audio', 'lyrics'], 'test': ['audio', 'lyrics']}

In [None]:
# Apply `prepare_dataset` to every example in both splits using 4 worker processes, and drop
# the raw "audio" and "lyrics" columns so only the fields the function adds remain.
timit = timit.map(prepare_dataset, remove_columns=["audio", "lyrics"], num_proc=4)

Map (num_proc=4): 100%|██████████| 12532/12532 [00:15<00:00, 792.33 examples/s] 
Map (num_proc=4): 100%|██████████| 3085/3085 [00:07<00:00, 408.41 examples/s] 


In [None]:
max_input_length_in_sec = 4.0
min_input_length_in_sec = 1.0

# Convert the duration bounds to a number of samples at the feature extractor's rate.
_sampling_rate = processor.feature_extractor.sampling_rate
_min_input_length = min_input_length_in_sec * _sampling_rate
_max_input_length = max_input_length_in_sec * _sampling_rate

# Keep only examples with min <= input_length < max. One range check per split
# replaces the four copy-pasted filter calls and keeps the same rows.
for _split in ("train", "test"):
    timit[_split] = timit[_split].filter(
        lambda length: _min_input_length <= length < _max_input_length,
        input_columns=["input_length"],
    )

Filter: 100%|██████████| 12532/12532 [00:00<00:00, 563002.27 examples/s]
Filter: 100%|██████████| 12204/12204 [00:00<00:00, 312292.79 examples/s]
Filter: 100%|██████████| 3085/3085 [00:00<?, ? examples/s]
Filter: 100%|██████████| 2918/2918 [00:00<00:00, 182203.58 examples/s]


## Training

The data is now processed, so we are ready to set up the training pipeline. We will use 🤗's [Trainer](https://huggingface.co/transformers/master/main_classes/trainer.html?highlight=trainer), for which we need to define a data collator, an evaluation metric, the pretrained model, and the training configuration, as done in the cells below.

### Set-up Trainer

Let's start by defining the data collator. The code for the data collator was copied from [this example](https://github.com/huggingface/transformers/blob/9a06b6b11bdfc42eea08fa91d0c737d1863c99e3/examples/research_projects/wav2vec2/run_asr.py#L81).

In [None]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the inputs and labels it receives.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding if
              only a single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the ``input_values`` to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Same as ``pad_to_multiple_of``, applied to the ``labels``.
    """

    # String annotation: the class definition does not need the processor class
    # to be imported first.
    processor: "Wav2Vec2Processor"
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch.

        Each element of `features` must provide "input_values" and "labels".
        Returns the padded input batch with an added "labels" tensor in which
        padded positions are set to -100.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Pad the labels with the tokenizer directly; this replaces the deprecated
        # `processor.as_target_processor()` context manager.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [None]:
# Collator instance handed to the Trainer; padding=True is also the dataclass default.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
# Word-error-rate metric used by `compute_metrics`.
# NOTE(review): this call emits a warning when run (see the cell output); `load_metric` is
# reported as deprecated in newer `datasets` releases — confirm and consider migrating.
wer_metric = load_metric("wer")

  wer_metric = load_metric("wer")


In [None]:
def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Object exposing `predictions` (logits, arg-maxed over the last
            axis) and `label_ids` (reference token ids, with -100 marking
            positions to ignore).

    Returns:
        A dict `{"wer": <value returned by wer_metric.compute>}`.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the -100 markers for the pad token id on a copy, so the caller's
    # `pred.label_ids` array is left untouched.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    lyrics_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=lyrics_str)

    return {"wer": wer}

In [None]:
from transformers import AutoModelForCTC

# Load the pretrained checkpoint with a CTC head. The cell output reports `lm_head` weights as
# newly initialised, i.e. the head has to be trained on this task.
model = AutoModelForCTC.from_pretrained(
    model_checkpoint,
    ctc_loss_reduction="mean",
    # Use the tokenizer's pad token id in the model config.
    pad_token_id=processor.tokenizer.pad_token_id,
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'lm_head.bias', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir='output',
  group_by_length=True,
  # Use the notebook-level `batch_size` constant instead of repeating the literal 32 here.
  per_device_train_batch_size=batch_size,
  # NOTE(review): newer transformers releases reportedly rename this argument to
  # `eval_strategy` — confirm against the installed version before upgrading.
  evaluation_strategy="steps",
  # num_train_epochs=30,
  num_train_epochs=20,
  fp16=True,
  gradient_checkpointing=True,
  # Checkpoint, evaluate and log on the same 500-step cadence.
  save_steps=500,
  eval_steps=500,
  logging_steps=500,
  learning_rate=1e-4,
  # learning_rate=1e-5,
  weight_decay=0.001,
  warmup_steps=1000,
  save_total_limit=2,
)

In [None]:
from transformers import Trainer

# Wire the model, collator, metric function and the two filtered splits into the Trainer.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=timit["train"],
    eval_dataset=timit["test"],
    # The feature extractor is passed in the `tokenizer` slot (presumably so it is saved
    # alongside checkpoints — TODO confirm).
    tokenizer=processor.feature_extractor,
)

### Training

In [None]:
import warnings

# Silence warnings raised from the torch and transformers modules during training.
warnings.filterwarnings(action='ignore', module='torch')
warnings.filterwarnings(action='ignore', module='transformers')
# Run training with the configuration above; the logs below show loss / WER every 500 steps.
trainer.train()

  8%|▊         | 500/6520 [03:22<39:13,  2.56it/s] 

{'loss': 3.7857, 'learning_rate': 4.9500000000000004e-05, 'epoch': 1.53}


                                                  
  8%|▊         | 500/6520 [03:58<39:13,  2.56it/s]

{'eval_loss': 3.0108823776245117, 'eval_wer': 1.0, 'eval_runtime': 36.111, 'eval_samples_per_second': 66.351, 'eval_steps_per_second': 8.308, 'epoch': 1.53}


 15%|█▌        | 1000/6520 [07:18<39:04,  2.35it/s]  

{'loss': 2.8254, 'learning_rate': 9.95e-05, 'epoch': 3.07}


                                                   
 15%|█▌        | 1000/6520 [07:52<39:04,  2.35it/s]

{'eval_loss': 2.7002673149108887, 'eval_wer': 1.023023023023023, 'eval_runtime': 34.1938, 'eval_samples_per_second': 70.071, 'eval_steps_per_second': 8.774, 'epoch': 3.07}


 23%|██▎       | 1500/6520 [11:11<23:20,  3.59it/s]   

{'loss': 2.2868, 'learning_rate': 9.103260869565218e-05, 'epoch': 4.6}


                                                   
 23%|██▎       | 1500/6520 [11:48<23:20,  3.59it/s]

{'eval_loss': 2.2584116458892822, 'eval_wer': 0.9467888941573153, 'eval_runtime': 36.9613, 'eval_samples_per_second': 64.825, 'eval_steps_per_second': 8.117, 'epoch': 4.6}


 31%|███       | 2000/6520 [15:09<22:07,  3.40it/s]   

{'loss': 1.9579, 'learning_rate': 8.197463768115942e-05, 'epoch': 6.13}


                                                   
 31%|███       | 2000/6520 [15:48<22:07,  3.40it/s]

{'eval_loss': 2.0172243118286133, 'eval_wer': 0.9113850692798061, 'eval_runtime': 38.5561, 'eval_samples_per_second': 62.143, 'eval_steps_per_second': 7.781, 'epoch': 6.13}


 38%|███▊      | 2500/6520 [19:11<30:07,  2.22it/s]   

{'loss': 1.7305, 'learning_rate': 7.295289855072465e-05, 'epoch': 7.67}


                                                   
 38%|███▊      | 2500/6520 [19:51<30:07,  2.22it/s]

{'eval_loss': 2.106318473815918, 'eval_wer': 0.8753490332437701, 'eval_runtime': 39.6834, 'eval_samples_per_second': 60.378, 'eval_steps_per_second': 7.56, 'epoch': 7.67}


 46%|████▌     | 3000/6520 [23:13<26:55,  2.18it/s]   

{'loss': 1.5786, 'learning_rate': 6.389492753623188e-05, 'epoch': 9.2}


                                                   
 46%|████▌     | 3000/6520 [23:49<26:55,  2.18it/s]

{'eval_loss': 2.130814790725708, 'eval_wer': 0.8561719614351193, 'eval_runtime': 36.1015, 'eval_samples_per_second': 66.368, 'eval_steps_per_second': 8.31, 'epoch': 9.2}


 54%|█████▎    | 3500/6520 [27:10<16:02,  3.14it/s]   

{'loss': 1.4707, 'learning_rate': 5.4836956521739136e-05, 'epoch': 10.74}


                                                   
 54%|█████▎    | 3500/6520 [27:47<16:02,  3.14it/s]

{'eval_loss': 2.0220561027526855, 'eval_wer': 0.8407881565776303, 'eval_runtime': 37.4568, 'eval_samples_per_second': 63.967, 'eval_steps_per_second': 8.009, 'epoch': 10.74}


 61%|██████▏   | 4000/6520 [31:09<14:00,  3.00it/s]  

{'loss': 1.3545, 'learning_rate': 4.577898550724638e-05, 'epoch': 12.27}


                                                   
 61%|██████▏   | 4000/6520 [32:00<14:00,  3.00it/s]

{'eval_loss': 1.942319631576538, 'eval_wer': 0.8516937990622201, 'eval_runtime': 50.5346, 'eval_samples_per_second': 47.413, 'eval_steps_per_second': 5.937, 'epoch': 12.27}


 69%|██████▉   | 4500/6520 [37:04<16:42,  2.02it/s]   

{'loss': 1.2841, 'learning_rate': 3.6721014492753626e-05, 'epoch': 13.8}


                                                   
 69%|██████▉   | 4500/6520 [37:40<16:42,  2.02it/s]

{'eval_loss': 2.0597755908966064, 'eval_wer': 0.8244033507191402, 'eval_runtime': 35.8141, 'eval_samples_per_second': 66.901, 'eval_steps_per_second': 8.377, 'epoch': 13.8}


 77%|███████▋  | 5000/6520 [42:27<12:48,  1.98it/s]  

{'loss': 1.2076, 'learning_rate': 2.7681159420289854e-05, 'epoch': 15.34}


                                                   
 77%|███████▋  | 5000/6520 [43:07<12:48,  1.98it/s]

{'eval_loss': 2.0824501514434814, 'eval_wer': 0.8182392919235024, 'eval_runtime': 39.9186, 'eval_samples_per_second': 60.022, 'eval_steps_per_second': 7.515, 'epoch': 15.34}


 84%|████████▍ | 5500/6520 [52:35<18:24,  1.08s/it]  

{'loss': 1.1459, 'learning_rate': 1.8623188405797102e-05, 'epoch': 16.87}


                                                   
 84%|████████▍ | 5500/6520 [54:22<18:24,  1.08s/it]

{'eval_loss': 2.039092540740967, 'eval_wer': 0.8339918866234656, 'eval_runtime': 106.5217, 'eval_samples_per_second': 22.493, 'eval_steps_per_second': 2.816, 'epoch': 16.87}


 92%|█████████▏| 6000/6520 [1:04:40<09:31,  1.10s/it]

{'loss': 1.0938, 'learning_rate': 9.583333333333334e-06, 'epoch': 18.4}


                                                     
 92%|█████████▏| 6000/6520 [1:06:03<09:31,  1.10s/it]

{'eval_loss': 2.1278722286224365, 'eval_wer': 0.8278805120910384, 'eval_runtime': 82.4913, 'eval_samples_per_second': 29.046, 'eval_steps_per_second': 3.637, 'epoch': 18.4}


100%|█████████▉| 6500/6520 [1:15:12<00:12,  1.57it/s]  

{'loss': 1.0706, 'learning_rate': 5.253623188405797e-07, 'epoch': 19.94}


                                                     
100%|█████████▉| 6500/6520 [1:15:44<00:12,  1.57it/s]

{'eval_loss': 2.0967183113098145, 'eval_wer': 0.8177124492913966, 'eval_runtime': 32.0515, 'eval_samples_per_second': 74.755, 'eval_steps_per_second': 9.36, 'epoch': 19.94}


100%|██████████| 6520/6520 [1:15:57<00:00,  1.43it/s]

{'train_runtime': 4557.132, 'train_samples_per_second': 45.682, 'train_steps_per_second': 1.431, 'train_loss': 1.751242885999153, 'epoch': 20.0}





TrainOutput(global_step=6520, training_loss=1.751242885999153, metrics={'train_runtime': 4557.132, 'train_samples_per_second': 45.682, 'train_steps_per_second': 1.431, 'train_loss': 1.751242885999153, 'epoch': 20.0})