In [1]:
import pandas as pd
import numpy as np

import transformers
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import TrainingArguments, Trainer

import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader, IterableDataset
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm
from typing import Dict, List, Tuple, Any, Union, Optional
import os
import re
import json
import random
from pprint import pprint

import unicodedata
from bnunicodenormalizer import Normalizer 

from pandarallel import pandarallel

import evaluate

from datasets import load_dataset, load_metric
from dataclasses import dataclass, field
import librosa
from IPython.display import display, Audio, HTML, Markdown

# Start pandarallel with a progress bar and one worker per CPU reported by os.cpu_count().
pandarallel.initialize(progress_bar=True,nb_workers=os.cpu_count())
# Register tqdm's pandas integration (adds the progress_apply family to pandas objects).
tqdm.pandas()

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [7]:
from audioconverter import AudioConverter
from normalize import removeOptionalZW, removePunc, normalizeUnicode

In [None]:
from utils import *

In [None]:
# Fix the random seed at 42 for reproducibility.
# NOTE(review): seed_everything is presumably supplied by the `from utils import *` star-import —
# confirm where it comes from and which RNGs it seeds.
seed_everything(42)

In [None]:
# Set WANDB_DISABLED, presumably to keep the Trainer from reporting to Weights & Biases.
# NOTE(review): the saved output of trainer.train() in this notebook still shows a wandb login
# banner and this cell has no execution count — confirm it runs before the Trainer is created.
os.environ["WANDB_DISABLED"] = "true"

# Config

In [2]:
class Config:
    """Central place for the training hyper-parameters used by the cells below."""

    # Patience value handed to EarlyStoppingCallback where the Trainer is built.
    early_stopping_patience = 3

    # Trainer arguments.
    trainer = TrainingArguments(
        # --- run bookkeeping -------------------------------------------------
        output_dir="./run-003-wav2vec2-fulldata-cosine-lr3e-5_v7_5_3_2",
        run_name="run-003-wav2vec2-fulldata-cosine-lr3e-5_v7_5_3_2",
        push_to_hub=False,
        # NOTE(review): per the TrainingArguments docs this field is not consumed by
        # Trainer itself; resuming is requested via trainer.train(resume_from_checkpoint=...).
        # Confirm whether resuming was actually intended.
        resume_from_checkpoint=True,
        # --- batching and data loading --------------------------------------
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=1,
        group_by_length=False,
        dataloader_num_workers=os.cpu_count(),
        remove_unused_columns=False,
        # --- optimisation ----------------------------------------------------
        num_train_epochs=10,
        learning_rate=3e-5,
        lr_scheduler_type="cosine",
        warmup_steps=300,
        fp16=True,
        gradient_checkpointing=True,
        # --- evaluation, logging and checkpointing (all every 1000 steps) ---
        evaluation_strategy="steps",
        eval_steps=1000,
        save_steps=1000,
        logging_steps=1000,
        save_total_limit=100,
        load_best_model_at_end=True,
    )

# Vocab

In [3]:
# Read the training metadata and derive each clip's mp3 location from its id.
train = pd.read_csv("../data/train.csv")
train = train.assign(path='train_mp3s/' + train['id'] + '.mp3')

In [4]:
# Character class of punctuation to strip from the transcripts.
# Raw string: the pattern is made of regex escapes (\, \? \. ...) that a normal string literal
# reports as invalid escape sequences; the set of characters matched is unchanged.
chars_to_remove_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�\']'
# str(s) guards against non-string cells; the result is lower-cased after stripping.
# NOTE(review): the vocabulary cells further down read train['sentence'], not this
# 'sentence_cleaned' column — confirm that is intended.
train['sentence_cleaned'] = train['sentence'].apply(lambda s: re.sub(chars_to_remove_regex, '', str(s)).lower())

In [5]:
def extract_all_chars(x):
    """Collect the distinct characters used across an iterable of strings.

    Args:
        x: Iterable of strings.

    Returns:
        dict with two single-element lists:
            "vocab":    [sorted list of the distinct characters in the joined text]
            "all_text": [the input strings joined with single spaces]
    """
    all_text = " ".join(x)
    # Sort for a reproducible vocabulary: the iteration order of a set of strings
    # changes between interpreter runs because of hash randomisation.
    vocab = sorted(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

In [6]:
# Join every transcript with single spaces and collect the distinct characters that occur.
# NOTE(review): this reads the raw train['sentence'] column, not the punctuation-stripped
# 'sentence_cleaned' column computed earlier — confirm that is intended.
all_words = ' '.join(train['sentence'])
unique_characters = set(all_words)

In [7]:
# Map each character to a consecutive integer id, assigned in sorted character order.
vocab_dict = {
    char: index
    for index, char in enumerate(sorted(unique_characters))
}

In [8]:
# Re-key the space character as "|" while keeping its id.
vocab_dict["|"] = vocab_dict.pop(" ")

In [9]:
# Append the special tokens in this order; each one takes the next free id.
for special_token in ('<unk>', '<pad>', '<s>', '</s>'):
    vocab_dict[special_token] = len(vocab_dict)

In [11]:
# Persist the vocabulary as JSON.
vocab_path = '../data/vocab/vocab_v1.json'
# Create the target folder first so a fresh checkout does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(vocab_path), exist_ok=True)
with open(vocab_path, 'w', encoding='utf-8') as vocab_file:
    json.dump(vocab_dict, vocab_file)

# Dataset

In [13]:
class SprintDataset(Dataset):
    """Map-style dataset that turns DataFrame rows into wav2vec2 inputs and label ids.

    Args:
        df: DataFrame with a 'path' column (audio file location) and a 'sentence'
            column (transcript). Any index is accepted; rows are addressed by position.
        processor: Processor object. It is called on the waveform with
            sampling_rate=16000 and, inside ``as_target_processor()``, on the transcript.
        audioConverter: Object exposing ``getAudio(path)``; element ``[0]`` of its
            result is used as the waveform.
        loopDataset: Number of times the rows are repeated. The reported length is
            ``len(df) * loopDataset`` and indices wrap around modulo ``len(df)``.
    """

    def __init__(self, df, processor, audioConverter, loopDataset=1):
        self.df = df
        self.paths = df['path']
        self.sentences = df['sentence']
        self.len = len(self.df) * loopDataset

        self.processor = processor
        self.ac = audioConverter

    def __len__(self):
        return self.len

    def loadSample(self, idx):
        """Load, featurise and tokenise the sample at position ``idx`` (wrapped modulo len(df)).

        Returns:
            dict with 'input_values' (processor output for the waveform),
            'input_length' (its length) and 'labels' (token ids of the transcript).
        """
        idx %= len(self.df)
        # Positional lookup: label-based ``series[idx]`` fails (or returns the wrong row)
        # when the DataFrame does not carry a default 0..n-1 index.
        audio_path = self.paths.iloc[idx]
        sentence = self.sentences.iloc[idx]

        wave = self.ac.getAudio(audio_path)[0]
        # Use the processor handed to this dataset rather than a notebook-level global.
        input_values = self.processor(wave, sampling_rate=16000).input_values[0]

        input_length = len(input_values)
        with self.processor.as_target_processor():
            labels = self.processor(sentence).input_ids

        return {
            'input_values': input_values,
            'input_length': input_length,
            'labels': labels
        }

    def __getitem__(self, idx):
        """Return the sample at ``idx``.

        Raises:
            IndexError: if ``idx`` is at or beyond the (looped) dataset length.
        """
        if idx >= self.len:
            raise IndexError('index out of range')
        return self.loadSample(idx)

In [16]:
def _hf_dataset(df):
    paths = []
    audios = []
    sentences = []
    full_samples = df['id'].tolist()
    
    for i, row in df.iterrows():
        temp_path = f"train_mp3s/{row['id']}.mp3"
        paths.append(temp_path)
        audios.append(temp_path)
        sentences.append(row['sentence'])
        
    return {"path":paths,"audio":audios,"sentence":sentences}

In [20]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate individual samples into one dynamically padded batch for CTC training.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            Processor whose ``pad`` method is used for both the audio inputs and the labels.
        padding (:obj:`bool` or :obj:`str`, `optional`, defaults to :obj:`True`):
            Padding strategy forwarded unchanged to ``processor.pad``:
            * :obj:`True` or :obj:`'longest'`: pad to the longest sequence in the batch.
            * :obj:`'max_length'`: pad to a given maximum length.
            * :obj:`False` or :obj:`'do_not_pad'`: no padding.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio inputs and labels have different lengths and need different padding,
        # so they are gathered and padded separately.
        audio_items = []
        label_items = []
        for sample in features:
            audio_items.append({"input_values": sample["input_values"]})
            label_items.append({"input_ids": sample["labels"]})

        batch = self.processor.pad(audio_items, padding=self.padding, return_tensors="pt")

        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(label_items, padding=self.padding, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; mark them with -100 so
        # the loss ignores them.
        is_padding = labels_batch.attention_mask.ne(1)
        batch["labels"] = labels_batch["input_ids"].masked_fill(is_padding, -100)

        return batch


# Training data, tokenizer and model

In [None]:
# Audio loader shared by the datasets below.
# NOTE(review): 16000 is presumably the target sampling rate in Hz (it matches the
# sampling_rate given to the processor) — confirm against the audioconverter module.
ac = AudioConverter(16000)

In [None]:
# Load the train / validation metadata splits.
train_df = pd.read_csv("../data/train_v20.csv")
valid_df = pd.read_csv("../data/val_v20.csv")

# Both splits take their audio from the same mp3 folder, keyed by clip id.
for split_df in (train_df, valid_df):
    split_df['path'] = '../data/train_mp3s/' + split_df['id'] + '.mp3'

# Normalise the transcripts (train first, then validation), with a progress bar per split.
for split_df in (train_df, valid_df):
    split_df["sentence"] = [normalizeUnicode(text) for text in tqdm(split_df["sentence"])]

In [21]:
# CTC tokenizer backed by the vocabulary JSON written earlier in this notebook.
tokenizer = Wav2Vec2CTCTokenizer(
    '../data/vocab/vocab_v1.json',
    bos_token='<s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    # The vocabulary stores the space character under "|".
    # TODO: the original author flagged this delimiter choice as an open question.
    word_delimiter_token='|',
)

In [22]:
# Feature extractor for mono 16 kHz audio: normalises inputs, pads with 0.0 and
# returns an attention mask.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

In [23]:
# Bundle the tokenizer and feature extractor into the single processor object that
# the dataset, collator and metric code use.
processor = Wav2Vec2Processor(
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='', vocab_size=91, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True)

In [34]:
# Collator that pads inputs and labels per batch; padding keeps its default (True).
data_collator = DataCollatorCTCWithPadding(processor=processor)

In [26]:
# Word-error-rate metric from the `evaluate` library; used by compute_metrics.
wer_metric = evaluate.load("wer")

In [27]:
# Wrap both splits in the dataset class; they share the processor and audio converter.
train_dataset = SprintDataset(df=train_df, processor=processor, audioConverter=ac)
test_dataset = SprintDataset(df=valid_df, processor=processor, audioConverter=ac)

In [29]:
def compute_metrics(pred):
    """Compute the word error rate for a set of predictions.

    Args:
        pred: Object exposing ``predictions`` (logits; the argmax over the last axis
            gives the predicted token ids) and ``label_ids`` (reference ids, with
            -100 marking positions to ignore).

    Returns:
        ``{"wer": value}`` where ``value`` is what ``wer_metric.compute`` returns.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the -100 ignore marker for the pad id so the labels can be decoded.
    # np.where builds a new array, so the caller's pred.label_ids is left untouched.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # Labels are reference text, not CTC output: do not collapse repeated tokens.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [30]:
# Load the pretrained Bengali wav2vec2 checkpoint with fine-tuning regularisation settings.
model = Wav2Vec2ForCTC.from_pretrained(
    'ai4bharat/indicwav2vec_v1_bengali',
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.1,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    ctc_zero_infinity=True, # dark magic to avoid nan
    pad_token_id=processor.tokenizer.pad_token_id,
    # NOTE(review): diversity_loss_weight looks like a pre-training loss weight; it may
    # have no effect on the CTC objective — confirm before relying on it.
    diversity_loss_weight=100 # dark magic to avoid nan
)

# Replace the CTC head with a freshly initialised one sized to our tokenizer. The input
# width is read from the existing head instead of hard-coding 1024, so the cell keeps
# working if a checkpoint with a different hidden size is used.
model.lm_head = torch.nn.Linear(model.lm_head.in_features, len(processor.tokenizer))
model.config.vocab_size = len(processor.tokenizer)

# Keep the feature encoder fixed during fine-tuning.
model.freeze_feature_encoder()

In [31]:
len(processor.tokenizer)

91

In [32]:
# Move the model to the first CUDA device.
model.to('cuda:0')
# Ending the cell on print("") rather than on model.to(...) keeps the notebook from
# displaying the full module repr.
print("")




# Training

In [35]:
# Assemble the Trainer from the pieces defined above.
trainer = Trainer(
    model=model,
    args=Config.trainer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # The feature extractor is passed in the `tokenizer` slot.
    tokenizer=processor.feature_extractor,
    callbacks=[
        transformers.EarlyStoppingCallback(
            early_stopping_patience=Config.early_stopping_patience,
        ),
    ],
)

In [36]:
# Launch fine-tuning.
# NOTE(review): train() is called without resume_from_checkpoint even though Config.trainer
# sets resume_from_checkpoint=True — confirm whether resuming from a checkpoint was intended.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mbd317[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Wer
1000,6.016,2.003505,0.982469
2000,1.3999,0.734921,0.410085
3000,1.049,0.685827,0.379539
4000,0.9692,0.663313,0.369383
5000,0.9484,0.643196,0.362289
6000,0.9237,0.629195,0.358997
7000,0.9024,0.616806,0.353178
8000,0.9064,0.604386,0.354709
9000,0.8763,0.621475,0.354735
10000,0.8771,0.604756,0.350269




TrainOutput(global_step=35000, training_loss=1.0101252005440848, metrics={'train_runtime': 16368.2679, 'train_samples_per_second': 382.914, 'train_steps_per_second': 23.932, 'total_flos': 1.5104053049314617e+20, 'train_loss': 1.0101252005440848, 'epoch': 0.89})