In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
else:
    print(gpu_info)

In [None]:
%%capture
!pip install datasets
!pip install transformers
!pip install torchaudio
!pip install jiwer

## Prepare Data, Tokenizer, Feature Extractor

Common Voice has many different splits including `invalidated`, which refers to data that was not rated as "clean enough" to be considered useful. In this notebook, we will only make use of the splits `"train"`, `"validation"` and `"test"`. 

In [None]:
from datasets import load_dataset, load_metric, Audio

In [None]:
import re

In [None]:
import os

import pandas as pd

# Training metadata; the `path` column is read here to derive the WAV location.
train_df=pd.read_csv('../input/dlsprint/train.csv')
parent_dir='../input/train-wavs-all-dl-sprint/train_wavs/'
# Replace only the final extension with ".wav". os.path.splitext keeps any earlier
# dots in the file name intact, whereas split('.')[0] would cut at the first dot.
wav_path=[parent_dir+os.path.splitext(x)[0]+'.wav' for x in train_df['path']]
train_df['wav_path']=wav_path

In [None]:
import os

# Validation metadata; the `path` column is read here to derive the WAV location.
valid_df=pd.read_csv('../input/dlsprint/validation.csv')
parent_dir='../input/dl-sprint-validation-16k/validation_files_wav/'
# Replace only the final extension with ".wav". os.path.splitext keeps any earlier
# dots in the file name intact, whereas split('.')[0] would cut at the first dot.
wav_path=[parent_dir+os.path.splitext(x)[0]+'.wav' for x in valid_df['path']]
valid_df['wav_path']=wav_path

In [None]:
# Display the training table (including the derived `wav_path` column) as the cell output.
train_df

In [None]:
import numpy as np
from scipy.io import wavfile


def wav_read(filename):
    """Read a WAV file and return its samples as float32 together with the sampling rate.

    Args:
        filename: Path of the WAV file, passed to ``scipy.io.wavfile.read``.

    Returns:
        A ``(speech_array, sampling_rate)`` tuple. ``speech_array`` is the raw sample
        array cast to float32 and divided by ``2**15 - 1``; ``sampling_rate`` is the
        rate reported by the file. No resampling is performed.
    """
    sampling_rate, wave = wavfile.read(filename)

    # 2**15 - 1 is the largest positive int16 value, so 16-bit samples end up in
    # roughly [-1, 1].
    # NOTE(review): assumes the files are 16-bit PCM; other sample formats would be
    # scaled incorrectly - confirm.
    speech_array = np.float32(wave) / (2**15 - 1)
    return speech_array, sampling_rate

The vocabulary dictionary is kept exactly the same as that of the pretrained model.

In [None]:
# Character-level vocabulary: maps each token string to its integer id.
# "<pad>" is id 0, "<s>" 1, "</s>" 2, "<unk>" 3 and "|" 62.
# NOTE(review): the keys for ids 30, 64 and 69 appear to be invisible zero-width
# characters - do not retype this line by hand; confirm against the pretrained vocab.
vocab_dict={"<s>": 1, "<pad>": 0, "</s>": 2, "<unk>": 3, "ই": 4, "3": 5, "হ": 6, "…": 7, "ল": 8, "্": 9, "ৈ": 10, "ো": 11, "৪": 12, "ধ": 13, "উ": 14, "া": 15, "ঞ": 16, "F": 17, "অ": 18, "ও": 19, "ট": 20, "খ": 21, "ড়": 22, "স": 23, "০": 24, "ম": 25, "ং": 26, "ৌ": 27, "গ": 28, "ঃ": 29, "‌": 30, "থ": 31, "e": 32, "ি": 33, "ষ": 34, "৯": 35, "়": 36, "চ": 37, "শ": 38, "ৗ": 39, "ঊ": 40, "৬": 41, "ঈ": 42, "ঋ": 43, "ঠ": 44, "ত": 45, "এ": 46, "৫": 47, "আ": 48, "ছ": 49, "ূ": 50, "ব": 51, "ঐ": 52, "প": 53, "ী": 54, "ড": 55, "৭": 56, "ণ": 57, "ফ": 58, "ু": 59, "ৃ": 60, "১": 61, "|": 62, "৮": 63, "‍": 64, "i": 65, "ৰ": 66, "ঔ": 67, "ভ": 68, "‎": 69, "ঙ": 70, "ৎ": 71, "ঘ": 72, "দ": 73, "২": 74, "ঝ": 75, "l": 76, "য়": 77, "জ": 78, "ক": 79, "ন": 80, "য": 81, "ে": 82, "র": 83, "৩": 84, "ঢ": 85, "ঁ": 86}

In [None]:
import json
# Persist the vocabulary as vocab.json in the working directory; the tokenizer is
# loaded from "./" afterwards.
with open('vocab.json', 'w') as vocab_file:
    vocab_file.write(json.dumps(vocab_dict))

In [None]:
from transformers import Wav2Vec2CTCTokenizer

# Build the CTC tokenizer from the working directory ("./"), where vocab.json was
# written. No special-token arguments are passed, so the tokenizer's defaults apply.
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./")


In [None]:
from transformers import Wav2Vec2FeatureExtractor

# Feature extractor for raw mono waveforms.
# NOTE(review): wav_read does not resample, so the WAV files are presumably already
# sampled at 16 kHz to match sampling_rate below - confirm.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

In [None]:
from transformers import Wav2Vec2Processor

# Bundle the feature extractor and tokenizer into one processor object; the dataset,
# the data collator and compute_metrics all reach both components through it.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
from torch.utils.data import Dataset
class dldata(Dataset):
    """Map-style dataset that yields one (audio, transcript) example per DataFrame row.

    Args:
        df: DataFrame with a ``wav_path`` column (path of a WAV file) and a
            ``sentence`` column (transcript text). Rows are addressed by position,
            so the frame may carry any index (e.g. a slice such as ``df[1000:2000]``).

    Relies on the notebook-level ``wav_read`` function and ``processor`` object.
    """

    def __init__(self,df):
        self.df = df
        self.all_paths=df['wav_path']
        self.sen=df['sentence']

    def __getitem__(self,i):
        """Load and encode the example at position ``i``.

        Returns:
            dict with ``input_values`` (waveform returned by ``wav_read``),
            ``input_ids`` (token ids of the transcript) and ``input_length``
            (number of audio samples).

        Raises:
            IndexError: if ``i`` is outside ``[-len(self), len(self))``. Plain
                iteration over the dataset depends on this to terminate.
        """
        n_rows = len(self.df)
        if not -n_rows <= i < n_rows:
            raise IndexError('Index out of range')

        # .iloc selects by position. Plain [] on a Series is a label lookup and
        # breaks (or returns the wrong row) when the index does not start at 0.
        aud_path=self.all_paths.iloc[i]
        aud_arr,_=wav_read(aud_path)
        label=self.sen.iloc[i]

        # Call the tokenizer directly instead of going through the deprecated
        # processor.as_target_processor() context manager; the result is the same.
        label_en = processor.tokenizer(label).input_ids

        return {'input_values':aud_arr,
                'input_ids':label_en,
                'input_length':len(aud_arr)}

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)
        


Because I ran out of GPU time (among other issues), I trained the model on only the first 1000 training samples and evaluated it on the first 200 validation samples.

In [None]:
# Only a prefix of each split is used: the first 1000 training rows and the
# first 200 validation rows.
train_subset = train_df.iloc[:1000]
valid_subset = valid_df.iloc[:200]

train_dataset = dldata(train_subset)
valid_dataset = dldata(valid_subset)

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """Collate variable-length examples into a single padded batch for CTC training.

    Audio inputs and label ids have different lengths and need different padding,
    so they are padded separately: audio with the processor's feature extractor,
    labels with its tokenizer.
    """

    # Processor exposing `.feature_extractor` and `.tokenizer`, each with a `pad` method.
    # (Forward-reference annotation so the class does not need the name at definition time.)
    processor: "Wav2Vec2Processor"
    # Padding strategy forwarded unchanged to both `pad` calls.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples and return the batch.

        Args:
            features: examples, each with an ``input_values`` entry (audio) and an
                ``input_ids`` entry (label token ids); other keys are ignored.

        Returns:
            The padded audio batch with an added ``labels`` tensor in which padded
            label positions are set to -100.
        """
        audio_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["input_ids"]} for feature in features]

        batch = self.processor.feature_extractor.pad(
            audio_features,
            padding=self.padding,
            return_tensors="pt",
        )
        # Pad the labels with the tokenizer directly rather than through the
        # deprecated processor.as_target_processor() context manager.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [None]:
# Collator instance later handed to the Trainer as `data_collator`.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
# Word-error-rate metric object; compute_metrics calls its
# .compute(predictions=..., references=...) method.
# NOTE(review): load_metric is deprecated in newer `datasets` releases - confirm it
# is still available in the installed version.
wer_metric = load_metric("wer")

In [None]:
def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: object with ``predictions`` (logits; the arg-max over the last axis
            gives the predicted token ids) and ``label_ids`` (reference token ids,
            with -100 marking padded positions).

    Returns:
        ``{"wer": <value returned by wer_metric.compute>}``.

    Relies on the notebook-level ``processor`` and ``wer_metric`` objects.
    """
    # Imported here because the notebook never imports numpy at top level.
    import numpy as np

    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the -100 loss-mask value for the pad token id so the labels can be
    # decoded. np.where builds a new array, leaving pred.label_ids untouched.
    pad_id = processor.tokenizer.pad_token_id
    label_ids = np.where(pred.label_ids == -100, pad_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

`ai4bharat/indicwav2vec_v1_bengali` is the best pretrained model that I have found. Its authors describe it as follows: IndicWav2Vec is a multilingual speech model pretrained on 40 Indian languages. This model represents the largest diversity of Indian languages in the pool of multilingual speech models. "We fine-tune this model for downstream ASR for 9 languages and obtain state-of-the-art results on 3 public benchmarks, namely MUCS, MSR and OpenSLR."

LINK: https://indicnlp.ai4bharat.org/indicwav2vec/

In [None]:
from transformers import Wav2Vec2ForCTC

# Keyword overrides passed to from_pretrained alongside the checkpoint name.
# The pad token id and vocabulary size are taken from the notebook's tokenizer.
model_overrides = dict(
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.0,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

model = Wav2Vec2ForCTC.from_pretrained(
    "ai4bharat/indicwav2vec_v1_bengali",
    **model_overrides,
)

In [None]:
# Freeze the model's feature encoder so it is not updated during fine-tuning.
# freeze_feature_extractor() is the deprecated name of freeze_feature_encoder();
# prefer the current name and fall back only on transformers releases without it.
if hasattr(model, "freeze_feature_encoder"):
    model.freeze_feature_encoder()
else:
    model.freeze_feature_extractor()

I started this competition only one week ago, which is why I could not find enough time to train this model. I also ran out of GPU time. I have only trained this model on 1000 training samples for 30 epochs. I believe that if I train this model on more training data, the performance will be much higher.

In [None]:
from transformers import TrainingArguments

# Saving, evaluation and logging all share this interval.
step_n = 100

training_args = TrainingArguments(
    # --- output / checkpoints ---
    output_dir="./",
    save_steps=step_n,
    save_total_limit=2,
    push_to_hub=False,
    # --- evaluation / logging cadence ---
    evaluation_strategy="steps",
    eval_steps=step_n,
    logging_steps=step_n,
    # --- batching: 16 per device x 2 accumulation steps ---
    group_by_length=True,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    # --- optimisation schedule ---
    num_train_epochs=30,
    learning_rate=3e-4,
    warmup_steps=500,
    # --- memory / speed ---
    gradient_checkpointing=True,
    fp16=True,
)

Now, all instances can be passed to Trainer and we are ready to start training!

In [None]:
from transformers import Trainer

# Wire the model, training configuration, datasets, collator and metric together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # The feature extractor (not the text tokenizer) is passed in the tokenizer slot.
    tokenizer=processor.feature_extractor,
)

### Training

In [None]:
import os
# Set the WANDB_DISABLED environment variable for this process before training starts.
# NOTE(review): this runs after TrainingArguments and Trainer were constructed -
# confirm the setting still takes effect at that point, or move it earlier.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Start fine-tuning with the Trainer configured above (output_dir is "./").
trainer.train()

In [None]:
# List the working directory - presumably to see which checkpoint folders were written.
!ls

In [None]:
# Archive one checkpoint folder for download.
# NOTE(review): "checkpoint-900" is hard-coded; the folder only exists if training
# saved a checkpoint at step 900 - confirm against the directory listing.
!zip -r my_model.zip ./checkpoint-900

In [None]:
from IPython.display import FileLink 
# Last expression of the cell: shows a FileLink object for the zipped checkpoint.
FileLink(r'./my_model.zip')

# Resources:
* blog: https://huggingface.co/blog/fine-tune-xlsr-wav2vec2
* pretrained model: https://huggingface.co/ai4bharat/indicwav2vec_v1_bengali
