In [1]:
import pandas as pd
import numpy as np
import torch

from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor

import torchaudio
import torchaudio.transforms as tat

from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

from typing import Dict, List, Tuple, Any, Union, Optional

import os
import re
import json
import random
from pprint import pprint

import unicodedata
from bnunicodenormalizer import Normalizer 

import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
from pandarallel import pandarallel
from tqdm.auto import tqdm
import evaluate
import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader, IterableDataset

import transformers
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import TrainingArguments, Trainer

from datasets import load_dataset, load_metric
from dataclasses import dataclass, field
import librosa
from transformers import Wav2Vec2CTCTokenizer
from IPython.display import display, Audio, HTML, Markdown

bnorm = Normalizer()
pandarallel.initialize(progress_bar=True,nb_workers=os.cpu_count())
tqdm.pandas()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
from audioconverter import AudioConverter
from normalize import *

In [None]:
from utils import *

In [None]:
seed_everything(33)

In [None]:
os.environ["WANDB_DISABLED"] = "true"

In [2]:
torch.cuda.get_device_name()

'NVIDIA RTX A6000'

# Config

In [3]:
class Config:
    early_stopping_patience = 3

    # Trainer arugments.
    trainer = TrainingArguments(
      output_dir="runs/wav2vec_indic_v34",
      group_by_length=False,
      per_device_train_batch_size=8,
      per_device_eval_batch_size=4,
      gradient_accumulation_steps=1,
      evaluation_strategy="steps",
      num_train_epochs=10,
      #gradient_checkpointing=True,
      fp16=True,
      save_steps=1000,
      eval_steps=1000,
      logging_steps=1000,
      learning_rate=1e-5,
      dataloader_num_workers=os.cpu_count(),
      warmup_steps=300,
      save_total_limit=100,
      push_to_hub=False,
      run_name="runs/wav2vec_indic_v34",
      load_best_model_at_end=True,
      lr_scheduler_type="cosine",
      resume_from_checkpoint=True,
      remove_unused_columns=False,
        max_grad_norm=1.0
    )

# Vocab

In [10]:
train = pd.read_pickle("../data/train_full_v2.pkl")
train['path'] = "../data/" + train['path']

In [13]:
duration = pd.read_pickle("../data/train_with_duration.pkl")
duration = duration.loc[duration['duration'] <= 10].reset_index(drop=True).copy()
duration['use_duration'] = 1
train = pd.merge(train, duration[['id','use_duration']], how='left')
train = train.loc[train['use_duration'] == 1]

In [15]:
class SprintDataset(Dataset):
        
    def __init__(self, df, processor, audioConverter, loopDataset=1):
        self.df = df
        self.paths = df['path']
        self.sentences = df['sentence']
        self.len = len(self.df) * loopDataset

        self.processor = processor
        self.ac = audioConverter

    def __len__(self):
        return self.len

    def loadSample(self, idx):
        idx %= len(self.df)
        audio_path = self.paths[idx]
        sentence = self.sentences[idx]

        wave = self.ac.getAudio(audio_path)[0]
        input_values = processor(wave, sampling_rate=16000).input_values[0]

        input_length = len(input_values)
        with self.processor.as_target_processor():
            labels = self.processor(sentence).input_ids

        return {
            'input_values':input_values,
            'input_length':input_length,
            'labels':labels
        }

    def __getitem__(self, idx): 
        if idx >= self.len:
            raise IndexError('index out of range')
        return self.loadSample(idx)

In [17]:
train_ac = AudioConverter(16000)
test_ac = AudioConverter(16000, disableAug=True)

In [18]:
def _hf_dataset(df):
    paths = []
    audios = []
    sentences = []
    full_samples = df['id'].tolist()
    
    for i, row in df.iterrows():
        temp_path = f"train_mp3s/{row['id']}.mp3"
        paths.append(temp_path)
        #audios.append({"path":temp_path,
        #               "array":librosa.load(temp_path, sr=16000)[0],
        #               "sampling_rate":16000})
        audios.append(temp_path)
        sentences.append(row['sentence'])
        
    return {"path":paths,"audio":audios,"sentence":sentences}

In [19]:
train_df = train.loc[train['split'] == 'train'].reset_index(drop=True)
valid_df = train.loc[train['split'] == 'valid'].reset_index(drop=True)
valid_df.head(3)

Unnamed: 0,id,sentence,split,path,use,use_duration
0,0000e711c2b1,তিনি এবং তাঁর মা তাদের পৈতৃক বাড়িতে থেকে প্রত...,valid,/notebooks/data/base_files/train_mp3s/0000e711...,1.0,1.0
1,00065e317123,তিনি তার সুশৃঙ্খল সামরিক বাহিনী এবং সুগঠিত শাস...,valid,/notebooks/data/base_files/train_mp3s/00065e31...,1.0,1.0
2,00065f40df52,তিনি বিজয়নগর সাম্রাজ্যের বিরুদ্ধে এবং বিজাপুর...,valid,/notebooks/data/base_files/train_mp3s/00065f40...,1.0,1.0


In [20]:
valid_df = valid_df.sample(frac=0.2, random_state=42).reset_index(drop=True)

In [23]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch


# Training data, tokenizer and model

In [25]:
tokenizer = Wav2Vec2CTCTokenizer(
    '../data/vocab/vocab_v2.json',
    unk_token='<unk>',
    pad_token='<pad>',
    word_delimiter_token='|',
    bos_token='<s>',
    eos_token='</s>',
)

In [26]:
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

In [27]:
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: PreTrainedTokenizer(name_or_path='', vocab_size=65, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'})

In [28]:
data_collator = DataCollatorCTCWithPadding(processor=processor)

In [29]:
len(tokenizer)

65

In [30]:
wer_metric = evaluate.load("wer")

In [31]:
train_dataset = SprintDataset(train_df, processor, train_ac)
test_dataset = SprintDataset(valid_df, processor, test_ac)

In [32]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

# Training ensemble

In [36]:
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from transformers.modeling_outputs import (
    BaseModelOutput,
    CausalLMOutput,
    MaskedLMOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
    Wav2Vec2BaseModelOutput,
    XVectorOutput,
)

from torch.utils.checkpoint import checkpoint_sequential

In [37]:
model1 = Wav2Vec2Model.from_pretrained("/notebooks/experiments/v35_210k")
print("")

Some weights of the model checkpoint at /notebooks/experiments/v35_210k were not used when initializing Wav2Vec2Model: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).





In [38]:
model2 = Wav2Vec2Model.from_pretrained("/notebooks/experiments/v32_130k")
print("")

Some weights of the model checkpoint at /notebooks/experiments/v32_130k were not used when initializing Wav2Vec2Model: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).





In [39]:
debug = False

if debug:

    test_loader =  torch.utils.data.DataLoader(train_dataset,
                                 batch_size=1,
                                 shuffle=False)

    sample = next(iter(test_loader))
    
    res = model1(input_values=sample['input_values'], output_hidden_states=True)
    res2 = model2(input_values=sample['input_values'], output_hidden_states=True)

In [42]:
class CustomModel(nn.Module):
    def __init__(self, model1, model2):
        super().__init__()
        self.model1 = self.freeze_model(model1)
        self.model2 = self.freeze_model(model2)
        self.encoder_layers = nn.TransformerEncoderLayer(d_model=1024+1280, nhead=6)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layers, num_layers=2)
        self.lm_head = nn.Linear(1024+1280, len(processor.tokenizer))
        self.config = self.model1.config

    def freeze_model(self, model):
        for param in model.parameters():
            param.requires_grad = False
        return model

    def forward(self, input_values, labels=None, **kwargs):

        with torch.no_grad():
            feature1 = self.model1(input_values=input_values, output_hidden_states=True).last_hidden_state
            feature2 = self.model2(input_values=input_values, output_hidden_states=True).last_hidden_state

        concatenated_features = torch.cat((feature1, feature2), dim=-1)
        
        encoded_features = self.transformer_encoder(concatenated_features)
        logits = self.lm_head(encoded_features)

        loss = None
        
        attention_mask = torch.ones_like(input_values, dtype=torch.long)
        input_lengths = self.model1._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)

        labels_mask = labels >= 0
        target_lengths = labels_mask.sum(-1)
        flattened_targets = labels.masked_select(labels_mask)

        log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)

        loss = nn.functional.ctc_loss(
            log_probs,
            flattened_targets,
            input_lengths,
            target_lengths,
            blank=62,
            reduction='mean',
            zero_infinity=True,
        )

        return {'loss': loss, 'logits': logits}

In [44]:
new_model = CustomModel(model1, model2)



In [45]:
new_model.to('cuda:0')
print("")




In [46]:
trainer = Trainer(
    model=new_model,
    data_collator=data_collator,
    args=Config.trainer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,   
    tokenizer=processor.feature_extractor,
    callbacks=[transformers.EarlyStoppingCallback(early_stopping_patience=Config.early_stopping_patience)],
)

Using cuda_amp half precision backend


In [47]:
trainer.train()

***** Running training *****
  Num examples = 373577
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 466980
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mbd317[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Wer
1000,2.0218,0.068197,0.100452
2000,0.1329,0.065643,0.096494
3000,0.1286,0.065213,0.099887
4000,0.1261,0.064076,0.095957
5000,0.1199,0.06397,0.09477
6000,0.1169,0.063112,0.092847
7000,0.1232,0.063515,0.093073
8000,0.1197,0.064042,0.095052
9000,0.1174,0.064434,0.094911


  check = torch.cuda.FloatTensor(1).fill_(0)
***** Running Evaluation *****
  Num examples = 3836
  Batch size = 4
Saving model checkpoint to runs/wav2vec_indic_v34/checkpoint-1000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Feature extractor saved in runs/wav2vec_indic_v34/checkpoint-1000/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 3836
  Batch size = 4
Saving model checkpoint to runs/wav2vec_indic_v34/checkpoint-2000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Feature extractor saved in runs/wav2vec_indic_v34/checkpoint-2000/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 3836
  Batch size = 4
Saving model checkpoint to runs/wav2vec_indic_v34/checkpoint-3000
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Feature extractor saved in runs/wav2vec_indic_v34/checkpoint-3000/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 3836
  Batch siz

TrainOutput(global_step=9000, training_loss=0.3340475082397461, metrics={'train_runtime': 4117.221, 'train_samples_per_second': 907.352, 'train_steps_per_second': 113.421, 'total_flos': 0.0, 'train_loss': 0.3340475082397461, 'epoch': 0.19})