# Training (run on Google Colab)

In [None]:
# Check pytorch versions and verify environment setup:
# print the installed PyTorch version and whether CUDA is available to it.
import torch

print(torch.__version__)
print(torch.cuda.is_available()) 

# verify versions: the bare tuple on the last line is the cell's displayed
# output (transformers version first, accelerate version second).
import accelerate
import transformers

transformers.__version__, accelerate.__version__

In [None]:
# List of imports for this notebook
from datasets import load_dataset
from datasets import Dataset
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2Processor, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC, TrainingArguments, Trainer

import pandas as pd
import numpy as np
import librosa
import io
import json
import re
import evaluate

In [5]:
# Load training dataset from file
# `data` is later passed to bytes_to_waveform() and Dataset.from_pandas(),
# so it is presumably a pandas DataFrame with an "audio" column -- TODO
# confirm against the code that wrote the pickle.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load train_data.pkl when it comes from a trusted source.
import pickle

with open('../data/train_data.pkl', 'rb') as f:
    data = pickle.load(f)

In [6]:
# convert audio from bytes into a waveform
def bytes_to_waveform(data, target_sr=16000):
    """Decode each row's encoded audio bytes into a waveform at ``target_sr``.

    Args:
        data: pandas DataFrame with an "audio" column whose entries are
            mappings holding the encoded audio under the "bytes" key.
        target_sr: sampling rate (Hz) every waveform is resampled to.
            Defaults to 16000, the rate used everywhere else in this notebook.

    Returns:
        The same DataFrame object, with a new "waveform" column holding one
        decoded waveform per row. The frame is modified in place.
    """
    waveforms = []

    # Iterate the column directly; iterrows() would build a Series per row.
    for audio in data["audio"]:
        # sr=None keeps the file's native rate so it can be compared below.
        y, sr = librosa.load(io.BytesIO(audio["bytes"]), sr=None)

        # Resample only when the native rate differs from the target.
        if sr != target_sr:
            y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)

        waveforms.append(y)

    data["waveform"] = waveforms
    return data

# Add the resampled "waveform" column to the training frame
# (bytes_to_waveform also modifies the frame it is given in place).
data = bytes_to_waveform(data)

In [7]:
# load testing dataset: the first 10% of the "test" split of Jzuluaga/uwb_atcc
# (plain string literal: the split spec has no placeholders to format)
test_dataset = load_dataset("Jzuluaga/uwb_atcc", split="test[:10%]")
print(test_dataset)

# pandas copy of the split, used for the audio decoding / resampling step
test_data = test_dataset.to_pandas()


Dataset({
    features: ['id', 'audio', 'text', 'segment_start_time', 'segment_end_time', 'duration'],
    num_rows: 282
})


In [None]:
# Decode and resample the test audio, then discard the raw audio and the
# segment metadata so only the remaining columns are kept.
test_data = bytes_to_waveform(test_data).drop(
    columns=["id", "audio", "segment_start_time", "segment_end_time", "duration"]
)

# Wrap the trimmed frame in a Hugging Face dataset again.
test_dataset = Dataset.from_pandas(test_data)

In [None]:
# Training dataset: build it from the pandas frame and drop the raw audio
# and segment metadata columns in one chained expression.
dataset = Dataset.from_pandas(data).remove_columns(
    ["id", "audio", "segment_start_time", "segment_end_time", "duration"]
)

In [None]:
# remove special characters from both training and testing datasets
# Punctuation stripped from every transcript. Raw string so the regex escapes
# (\, \? \. ...) are not parsed as invalid Python string escapes.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"]'

def remove_special_chars(batch):
    """Strip punctuation from ``batch["text"]`` and lower-case the result.

    Args:
        batch: a single example (mapping) whose "text" entry is a string.

    Returns:
        The same mapping, with "text" replaced by the cleaned transcript.
    """
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
    return batch

# Apply the transcript cleanup to both splits; remove_special_chars treats
# its argument as a single example (it reads batch["text"] as one string).
dataset = dataset.map(remove_special_chars)
test_dataset = test_dataset.map(remove_special_chars)

Map:   0%|          | 0/1129 [00:00<?, ? examples/s]

In [None]:
# Create vocab list for the dataset (by batch)
def extract_chars(batch):
    """Return, per transcript in the batch, its unique characters and the text.

    "vocab" holds one list of distinct characters for each entry of
    ``batch["text"]``; "all_text" holds the transcripts themselves, in order.
    """
    transcripts = list(batch["text"])
    unique_chars = [list(set(transcript)) for transcript in transcripts]
    return {"vocab": unique_chars, "all_text": transcripts}

# Collect the per-transcript character lists of both splits. batch_size=-1
# passes each split to extract_chars as a single batch.
vocabs = dataset.map(extract_chars, batched=True, batch_size=-1, keep_in_memory=True)
test_vocabs = test_dataset.map(extract_chars, batched=True, batch_size=-1, keep_in_memory=True)

# extract_chars returns one character list PER transcript, so the vocabulary
# is the union over every row of both splits. Indexing [0] would only see the
# characters of the first transcript.
all_chars = set()
for split_vocab in (vocabs["vocab"], test_vocabs["vocab"]):
    for chars in split_vocab:
        all_chars.update(chars)

# Sorted so the character -> id mapping is identical on every run
# (plain set ordering of strings is not stable across interpreter runs).
vocab_list = sorted(all_chars)

vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

In [None]:
# add unk and padding to the vocab dict
# replace " " (spaces) with | so the word delimiter is a visible token.
# The cell is written so it can be re-run: once " " has been renamed to "|"
# the rename is skipped instead of raising KeyError.
if " " in vocab_dict:
    vocab_dict["|"] = vocab_dict.pop(" ")
elif "|" not in vocab_dict:
    raise KeyError('vocab_dict has neither " " nor "|"; cannot create the word delimiter token')

# setdefault keeps an id assigned by an earlier run of this cell; on a first
# run each special token gets the next free id, exactly as before.
for special_token in ("[UNK]", "[PAD]"):
    vocab_dict.setdefault(special_token, len(vocab_dict))
print(len(vocab_dict))

# save vocab json (read back by the tokenizer)
with open('../data/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [None]:
# Build the processor from its two parts: a CTC tokenizer that reads the
# saved vocab file, and a feature extractor configured for 16000 Hz input.

tokenizer = Wav2Vec2CTCTokenizer(
    "../data/vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [None]:
# Verify input array: print the shape of one example's waveform (index 10)
# after converting it to a NumPy array.
print("Input array shape:", np.asarray(dataset[10]["waveform"]).shape)

Input array shape: (46560,)


In [None]:
def preprocess(batch):
    """Turn one example into the inputs and labels the model is trained on.

    Args:
        batch: a single example (mapping) with a "waveform" entry holding the
            audio samples and a "text" entry holding the transcript.

    Returns:
        The same mapping with two entries added: "input_values" (the
        processor's features for the waveform) and "labels" (the token ids
        of the transcript).

    Raises:
        Whatever the processor or tokenizer raises. Errors are deliberately
        not swallowed: returning None from a mapped function would hide the
        failure instead of reporting the example that caused it.
    """
    batch["input_values"] = processor(batch["waveform"], sampling_rate=16000).input_values[0]

    # Tokenise the transcript with the processor's tokenizer directly; this
    # is what the deprecated `as_target_processor()` context switched to.
    batch["labels"] = processor.tokenizer(batch["text"]).input_ids
    return batch

In [None]:
# leave as single threaded for this environment
dataset = dataset.map(preprocess, num_proc=1)

# The evaluation split needs the same "input_values" / "labels" columns:
# the data collator reads both from every example it is given, and this
# split is passed to the Trainer as eval_dataset.
test_dataset = test_dataset.map(preprocess, num_proc=1)

Map:   0%|          | 0/1129 [00:00<?, ? examples/s]



: 

In [None]:
# https://huggingface.co/blog/fine-tune-wav2vec2-english
# set up trainer 
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the audio inputs and the labels of a batch.

    Inputs and labels are padded separately because they have different
    lengths and are padded by different components of the processor.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data. Its ``feature_extractor``
            pads the ``input_values`` and its ``tokenizer`` pads the labels.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the ``input_values`` to a multiple of the provided value.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            If set will pad the ``labels`` to a multiple of the provided value.
    """

    # String annotation: documentation only, never evaluated at runtime.
    processor: "Wav2Vec2Processor"
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch of PyTorch tensors.

        Each example must provide "input_values" and "labels". The returned
        batch holds the padded "input_values" and a "labels" tensor in which
        every padded position is set to -100.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad the audio with the feature extractor ...
        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # ... and the label ids with the tokenizer. Calling the two components
        # directly replaces the deprecated `as_target_processor()` context.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)

        batch["labels"] = labels

        return batch
    
# Collator instance passed to the Trainer below; padding=True pads each
# batch to its longest sequence (see the class docstring).
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
# load pretrained wav2vec2 base model

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    # Size the newly initialised CTC head (lm_head) to the custom vocabulary.
    # Without this the head keeps the checkpoint config's vocab size, which
    # does not have to match the tokenizer built from vocab.json.
    vocab_size=len(processor.tokenizer),
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Disable gradient updates for the model's feature encoder.
# freeze_feature_extractor() is the deprecated name of the same operation;
# it is kept as a fallback for transformers releases without the new name.
if hasattr(model, "freeze_feature_encoder"):
    model.freeze_feature_encoder()
else:
    model.freeze_feature_extractor()



In [None]:
# Load the "wer" metric from the `evaluate` library; compute_metrics below
# calls wer_metric.compute(...) on the decoded predictions and references.
wer_metric = evaluate.load("wer")

In [None]:
# https://huggingface.co/blog/fine-tune-wav2vec2-english

def compute_metrics(pred):
    """Compute the word error rate of a batch of predictions.

    Args:
        pred: object exposing ``predictions`` (logits; the last axis is
            arg-maxed to get token ids) and ``label_ids`` (reference token
            ids in which -100 marks positions to ignore).

    Returns:
        ``{"wer": value}`` where value is what ``wer_metric.compute`` returns
        for the decoded predictions and references.
    """
    # Decode model predictions
    pred_ids = pred.predictions.argmax(-1)
    pred_texts = processor.batch_decode(pred_ids)

    # Decode true labels. -100 is swapped for the pad token id on a copy
    # (np.where) so the caller's label_ids array is left untouched.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )
    # group_tokens=False: do not collapse repeated ids in the references
    label_texts = processor.batch_decode(label_ids, group_tokens=False)

    # Compute WER
    wer = wer_metric.compute(predictions=pred_texts, references=label_texts)
    return {"wer": wer}

In [None]:
# set training arguments for model
# Checkpoints are written to ./wav2vec2 and at most one is kept
# (save_total_limit=1).
# NOTE(review): no eval_steps is given, so the evaluation interval presumably
# falls back to logging_steps (500) -- TODO confirm. The map progress bars in
# this notebook show 1129 training examples; with batch size 16 and 5 epochs
# on a single device the run may finish before step 500, i.e. before any
# evaluation / logging / checkpoint happens -- verify.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
# NOTE(review): fp16=True presumably needs a CUDA device -- confirm.

training_args = TrainingArguments(
    output_dir="./wav2vec2",
    evaluation_strategy="steps",
    per_device_train_batch_size=16,
    num_train_epochs=5,
    save_steps=500,
    logging_steps=500,
    learning_rate=1e-4,
    save_total_limit=1,
    fp16=True,
)



In [None]:
# start model training
# The Trainer gets the CTC model, the padding collator, the WER metric
# function and the train / evaluation datasets.
# NOTE(review): the collator reads "input_values" and "labels" from every
# example, so eval_dataset must carry those columns as well -- TODO confirm
# test_dataset has been run through preprocess().
# NOTE(review): the feature extractor is passed through the `tokenizer`
# argument, presumably so it is saved with the checkpoints -- confirm.
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset,
    eval_dataset=test_dataset,
    tokenizer=processor.feature_extractor,
)

trainer.train()

In [None]:
# store the trainer and the test dataset with IPython's %store magic so they
# can be used in the evaluation notebook (restored there with `%store -r`)
# NOTE(review): %store presumably relies on pickling -- confirm that a
# Trainer object round-trips before depending on this.
%store trainer
%store test_dataset