In [1]:
import pandas as pd
import numpy as np

import transformers
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import TrainingArguments, Trainer

import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader, IterableDataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm
from typing import Dict, List, Tuple, Any, Union, Optional
import os
import re
import json
import random
from pprint import pprint

import unicodedata
from bnunicodenormalizer import Normalizer 

from pandarallel import pandarallel

import evaluate

from datasets import load_dataset, load_metric
from dataclasses import dataclass, field
import librosa
from IPython.display import display, Audio, HTML, Markdown

# Start pandarallel with a progress bar and as many workers as os.cpu_count() reports.
pandarallel.initialize(progress_bar=True,nb_workers=os.cpu_count())
# Register tqdm's pandas integration so pandas operations can show progress bars.
tqdm.pandas()

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from audioconverter import AudioConverter
from normalize import removeOptionalZW, removePunc, normalizeUnicode

25370427392

In [None]:
from utils import *

In [None]:
# Seed the run with 42. seed_everything is not defined in this notebook; presumably it
# is provided by `from utils import *` — TODO confirm which random generators it seeds.
seed_everything(42)

In [None]:
# Set WANDB_DISABLED in the process environment before the Trainer is created.
# Presumably meant to switch off Weights & Biases reporting — TODO confirm that the
# installed transformers version still honours this variable.
os.environ["WANDB_DISABLED"] = "true"

# Config

In [4]:
# Central configuration for this training run.
class Config:
    # Patience value handed to the early-stopping callback when the Trainer is built.
    early_stopping_patience = 3

    # Trainer arguments, grouped by purpose.
    trainer = TrainingArguments(
        # Run identity and checkpoint handling.
        output_dir="runs/1b_v32",
        run_name="runs/1b_v32",
        push_to_hub=False,
        resume_from_checkpoint=True,
        save_steps=5000,
        save_total_limit=100,
        load_best_model_at_end=True,
        # Batching and data loading.
        per_device_train_batch_size=16,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=1,
        group_by_length=False,
        dataloader_num_workers=os.cpu_count(),
        remove_unused_columns=False,
        # Optimisation schedule.
        num_train_epochs=10,
        learning_rate=3e-5,
        lr_scheduler_type="cosine",
        warmup_steps=300,
        # Memory / speed trade-offs.
        fp16=True,
        gradient_checkpointing=True,
        # Evaluation and logging cadence (all in optimisation steps).
        evaluation_strategy="steps",
        eval_steps=5000,
        logging_steps=5000,
    )

# Vocab & training data

In [11]:
# Load the training table from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
train = pd.read_pickle("../data/train_full_v2.pkl")
# Prefix every entry of the 'path' column with the data directory.
train['path'] = "../data/" + train['path']

In [12]:
# Build the character-level CTC vocabulary once and cache it as JSON.
vocab_path = "../data/vocab/vocab_v2.json"
if not os.path.exists(vocab_path):
    # Punctuation removed before collecting characters. Same character set as before,
    # written as a raw string so Python does not warn about invalid escape sequences.
    chars_to_remove_regex = r'[,?.!\-;:"“%‘”�\']'

    # Clean a local copy only. Overwriting train['sentence'] here would make the
    # training labels depend on whether the vocab file already existed on disk.
    vocab_sentences = train['sentence'].apply(
        lambda s: re.sub(chars_to_remove_regex, '', str(s)).lower()
    )

    unique_characters = set(' '.join(vocab_sentences))

    # Ids follow the sorted order of the characters.
    vocab_dict = {char: idx for idx, char in enumerate(sorted(unique_characters))}

    # The word delimiter "|" takes over the id of the space character; if the corpus
    # contains no space at all it simply receives the next free id.
    vocab_dict["|"] = vocab_dict.pop(" ", len(vocab_dict))

    # Special tokens are appended after all regular characters.
    for special_token in ('<unk>', '<pad>', '<s>', '</s>'):
        vocab_dict[special_token] = len(vocab_dict)

    os.makedirs(os.path.dirname(vocab_path), exist_ok=True)
    with open(vocab_path, 'w', encoding='utf-8') as vocab_file:
        json.dump(vocab_dict, vocab_file)

In [13]:
# Keep only clips whose 'duration' value is at most 10 and mark them with
# use_duration = 1 so they can be selected after joining with the training table.
duration = (
    pd.read_pickle("../data/train_with_duration.pkl")
    .loc[lambda frame: frame['duration'] <= 10.]
    .reset_index(drop=True)
    .assign(use_duration=1)
)

In [14]:
# Attach the use_duration flag to the training rows and keep only the flagged ones
# (rows without a match get NaN after the left join and are dropped by the filter).
train = (
    train
    .merge(duration[['id', 'use_duration']], how='left')
    .loc[lambda frame: frame['use_duration'] == 1]
    .reset_index(drop=True)
)

In [19]:
class SprintDataset(Dataset):
    """Map-style dataset that yields model inputs and CTC label ids for one clip.

    Args:
        df: DataFrame with a 'path' column (audio file location) and a 'sentence'
            column (transcript).
        processor: Callable that turns a waveform into ``input_values`` and exposes
            a ``tokenizer`` attribute used to encode the transcript.
        audioConverter: Object with ``getAudio(path)``; the first element of its
            return value is used as the waveform.
        loopDataset: The reported length is ``len(df) * loopDataset``; indices wrap
            around the frame, so each row is visited ``loopDataset`` times per epoch.
    """

    def __init__(self, df, processor, audioConverter, loopDataset=1):
        self.df = df
        self.paths = df['path']
        self.sentences = df['sentence']
        self.len = len(self.df) * loopDataset

        self.processor = processor
        self.ac = audioConverter

    def __len__(self):
        """Return the virtual length (rows times ``loopDataset``)."""
        return self.len

    def loadSample(self, idx):
        """Load, featurise and tokenise the row that ``idx`` maps to.

        Returns:
            dict with 'input_values' (processed waveform), 'input_length' (its
            length) and 'labels' (token ids of the transcript).
        """
        # Wrap the virtual index back into the range of real rows.
        idx %= len(self.df)
        # Positional lookup: stays correct even when the frame's index is not 0..n-1.
        audio_path = self.paths.iloc[idx]
        sentence = self.sentences.iloc[idx]

        wave = self.ac.getAudio(audio_path)[0]
        # Use the processor given to this dataset rather than a notebook-level global.
        input_values = self.processor(wave, sampling_rate=16000).input_values[0]

        input_length = len(input_values)
        # Encode the transcript with the processor's tokenizer directly.
        labels = self.processor.tokenizer(sentence).input_ids

        return {
            'input_values': input_values,
            'input_length': input_length,
            'labels': labels
        }

    def __getitem__(self, idx):
        """Return the sample for ``idx``; raise IndexError past the virtual length."""
        if idx >= self.len:
            raise IndexError('index out of range')
        return self.loadSample(idx)

In [21]:
# One converter per split, both constructed with 16000 (presumably the target sample
# rate in Hz — confirm in audioconverter). Judging by the flag name, the training
# converter keeps augmentation on and the evaluation converter turns it off.
train_ac = AudioConverter(16000, disableAug=False)
test_ac = AudioConverter(16000, disableAug=True)

In [22]:
def _hf_dataset(df):
    paths = []
    audios = []
    sentences = []
    full_samples = df['id'].tolist()
    
    for i, row in df.iterrows():
        temp_path = f"train_mp3s/{row['id']}.mp3"
        paths.append(temp_path)
        #audios.append({"path":temp_path,
        #               "array":librosa.load(temp_path, sr=16000)[0],
        #               "sampling_rate":16000})
        audios.append(temp_path)
        sentences.append(row['sentence'])
        
    return {"path":paths,"audio":audios,"sentence":sentences}

In [23]:
# Separate the table by its 'split' column; each part gets a fresh 0..n-1 index.
train_df = train[train['split'].eq('train')].reset_index(drop=True)
valid_df = train[train['split'].eq('valid')].reset_index(drop=True)
# Preview the first validation rows as the cell output.
valid_df.head(3)

Unnamed: 0,id,sentence,split,path,use,use_duration
0,0000e711c2b1,তিনি এবং তাঁর মা তাদের পৈতৃক বাড়িতে থেকে প্রত...,valid,/home/benedikt/deeplearning/bengaliai-speech/d...,1.0,1.0
1,00065e317123,তিনি তার সুশৃঙ্খল সামরিক বাহিনী এবং সুগঠিত শাস...,valid,/home/benedikt/deeplearning/bengaliai-speech/d...,1.0,1.0
2,00065f40df52,তিনি বিজয়নগর সাম্রাজ্যের বিরুদ্ধে এবং বিজাপুর...,valid,/home/benedikt/deeplearning/bengaliai-speech/d...,1.0,1.0


In [26]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate CTC training examples into one dynamically padded batch.

    Attributes:
        processor (:class:`~transformers.Wav2Vec2Processor`):
            Processor used to pad both the audio inputs and the label ids.
        padding (:obj:`bool` or :obj:`str`, defaults to :obj:`True`):
            Padding strategy forwarded to ``processor.pad``:
            * :obj:`True` or :obj:`'longest'`: pad to the longest sequence in the batch.
            * :obj:`'max_length'`: pad to a fixed maximum length.
            * :obj:`False` or :obj:`'do_not_pad'`: no padding.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio inputs and labels have different lengths and need different padding
        # methods, so they are collected and padded separately.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_values": example["input_values"]})
            label_items.append({"input_ids": example["labels"]})

        batch = self.processor.pad(
            audio_items,
            padding=self.padding,
            return_tensors="pt",
        )

        # Inside this context the processor pads with its target-side component.
        with self.processor.as_target_processor():
            padded_labels = self.processor.pad(
                label_items,
                padding=self.padding,
                return_tensors="pt",
            )

        # Padded label positions are set to -100 so that the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch


# Training data, tokenizer and model

In [28]:
# CTC tokenizer backed by the cached vocabulary file.
tokenizer = Wav2Vec2CTCTokenizer(
    '../data/vocab/vocab_v2.json',
    bos_token='<s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    # The vocabulary cell stores the space character under "|".
    # TODO (from the original author): this setting was flagged as an open question.
    word_delimiter_token='|',
)

In [29]:
# Feature extractor for single-channel 16 kHz input; it normalises the waveform and
# returns an attention mask alongside the padded values.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)
# Show the resulting configuration as the cell output.
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

In [30]:
# Bundle the feature extractor and the tokenizer into a single processor object.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
# Show the processor's configuration as the cell output.
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='', vocab_size=65, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True)

In [31]:
# Collator that pads each batch with the shared processor (default padding=True).
data_collator = DataCollatorCTCWithPadding(processor=processor)

In [32]:
len(tokenizer)

65

In [33]:
# Load the "wer" metric from the evaluate library; compute_metrics uses it for scoring.
wer_metric = evaluate.load("wer")

In [34]:
# Training rows are read through train_ac (created with disableAug=False),
# validation rows through test_ac (created with disableAug=True).
train_dataset = SprintDataset(train_df, processor, train_ac)
test_dataset = SprintDataset(valid_df, processor, test_ac)

In [35]:
train_dataset[0]



{'input_values': array([0.00021892, 0.00021892, 0.00021892, ..., 0.00018963, 0.00021073,
        0.00023368], dtype=float32),
 'input_length': 75456,
 'labels': [5,
  39,
  49,
  0,
  30,
  56,
  39,
  48,
  41,
  0,
  15,
  44,
  58,
  25,
  25,
  48,
  0,
  37,
  51,
  23,
  21,
  49,
  0,
  15,
  49,
  34,
  58,
  30,
  51,
  0,
  11,
  25,
  48,
  0,
  45,
  26,
  49,
  15,
  0,
  35,
  31,
  0,
  34,
  48]}

In [36]:
def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Object with ``predictions`` (logits; the argmax over the last axis is
            taken as the predicted token id) and ``label_ids`` (reference ids in
            which -100 marks positions to ignore).

    Returns:
        dict with a single key ``"wer"``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Work on a copy so the caller's label array keeps its -100 markers; the copy gets
    # the pad token id in those positions so the labels can be decoded.
    label_ids = pred.label_ids.copy()
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # Reference ids are not CTC output, so repeated tokens must not be grouped.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [37]:
# Load the pretrained encoder with a CTC head on top.
model = Wav2Vec2ForCTC.from_pretrained(
    'facebook/wav2vec2-xls-r-1b',
    #'facebook/wav2vec2-large-xlsr-53',
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.1,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    ctc_zero_infinity=True,  # original author's note: "dark magic to avoid nan"
    pad_token_id=processor.tokenizer.pad_token_id,
    # Original author's note: "dark magic to avoid nan".
    # NOTE(review): confirm this weight has any effect when fine-tuning with CTC.
    diversity_loss_weight=100
)

# Replace the CTC head so that its output size matches the tokenizer. The input width
# is read from the head created by from_pretrained instead of hard-coding 1280, so the
# cell keeps working when the checkpoint above is swapped for one of a different size.
model.lm_head = torch.nn.Linear(model.lm_head.in_features, len(processor.tokenizer))
model.config.vocab_size = len(processor.tokenizer)

# Keep the convolutional feature encoder fixed during fine-tuning.
model.freeze_feature_encoder()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-1b and are newly initialized: ['lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
len(processor.tokenizer)

65

In [39]:
# Move the model to the first CUDA device.
model.to('cuda:0')
# Print an empty string; presumably here so that the cell does not end on the
# model.to(...) expression and display the full model repr.
print("")




# Training

In [40]:
# Assemble the Trainer from the model, the datasets and the shared configuration.
trainer = Trainer(
    model=model,
    args=Config.trainer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Only the feature extractor is passed here, not the full processor.
    tokenizer=processor.feature_extractor,
    callbacks=[
        transformers.EarlyStoppingCallback(
            early_stopping_patience=Config.early_stopping_patience
        )
    ],
)

In [42]:
# The resume_from_checkpoint flag stored in TrainingArguments is not acted on by the
# Trainer itself; the checkpoint has to be handed to train() explicitly. Resume from
# the newest checkpoint in the output directory when the config asks for it and one
# exists, otherwise start from scratch. (The log below comes from a resumed run.)
last_checkpoint = None
if Config.trainer.resume_from_checkpoint and os.path.isdir(Config.trainer.output_dir):
    # Returns None when the directory holds no checkpoint-* folder.
    last_checkpoint = transformers.trainer_utils.get_last_checkpoint(Config.trainer.output_dir)

trainer.train(resume_from_checkpoint=last_checkpoint)

[34m[1mwandb[0m: Currently logged in as: [33mbd317[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Wer
115000,0.2199,0.083017,0.117088
120000,0.2275,0.084399,0.116731
125000,0.2265,0.082726,0.119526




TrainOutput(global_step=125000, training_loss=0.026953169921875, metrics={'train_runtime': 14042.8916, 'train_samples_per_second': 266.026, 'train_steps_per_second': 16.627, 'total_flos': 1.6195656293103284e+21, 'train_loss': 0.026953169921875, 'epoch': 5.35})