In [None]:
%%capture
! pip install transformers
! pip install jiwer
! pip install --upgrade wandb
! pip install --upgrade librosa

In [None]:
import os
import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

import wandb
from kaggle_secrets import UserSecretsClient
from datetime import datetime
from wandb.keras import WandbCallback

import librosa

import warnings
warnings.simplefilter('ignore')

<img src="https://camo.githubusercontent.com/dd842f7b0be57140e68b2ab9cb007992acd131c48284eaf6b1aca758bfea358b/68747470733a2f2f692e696d6775722e636f6d2f52557469567a482e706e67">

> I will be integrating W&B for visualizations and logging artifacts!
> 
> [SEW - Bengali.ai Speech Recognition on W&B Dashboard](https://wandb.ai/usharengaraju/SEW-Bengali.ai)
> 
> - To get the API key, create an account on the [W&B website](https://wandb.ai/site).
> - Store the API key in Kaggle Secrets instead of hard-coding it in the notebook.

In [None]:
# Setup user secrets for login
# The W&B API key is read from the Kaggle secret stored under the label "api_key",
# so it never appears in the notebook source.
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("api_key") 

# Login
wandb.login(key = wandb_api)

# Start a W&B run whose name carries the current timestamp (ddmmYYYYHHMMSS).
# NOTE(review): the config logged here (batch_size=32, epochs=30,
# learning_rate=0.005) differs from the `Config` dict defined further down
# (train_bs=16, nb_epochs=5, lr=3e-4) — confirm which values are intended.
run = wandb.init(project = "SEW-Bengali.ai",
                 name = f"Run_{datetime.now().strftime('%d%m%Y%H%M%S')}", 
                 notes = "add some features",
                 tags = [],
                 config = dict(competition = 'Bengali.ai',
                               _wandb_kernel = 'tensorgirl',
                               batch_size = 32,
                               epochs = 30,
                               learning_rate = 0.005)
)


In [None]:
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor
) 

In [None]:
# Shared settings for every experiment in this notebook: data location,
# optimiser hyper-parameters, epoch count, batch sizes and audio sampling rate.
Config = dict(
    audio_dir='/kaggle/input/bengaliai-speech/train_mp3s',
    model_name='facebook/wav2vec2-base',
    lr=3e-4,
    wd=1e-5,
    T_0=10,
    T_mult=2,
    eta_min=1e-6,
    nb_epochs=5,
    train_bs=16,
    valid_bs=16,
    sampling_rate=16000,
)

In [None]:
def read_audio(mp3_path, target_sr=16000):
    """Load an audio file as a waveform sampled at ``target_sr``.

    The file is decoded and resampled in a single ``librosa.load`` call instead
    of first forcing a 32 kHz intermediate and then resampling a second time.

    Args:
        mp3_path: Path to the audio file to read.
        target_sr: Sampling rate (Hz) of the returned waveform.

    Returns:
        The waveform array produced by ``librosa.load``.
    """
    audio_array, _ = librosa.load(mp3_path, sr=target_sr)
    return audio_array

def construct_vocab(texts):
    """Return the distinct characters that occur in ``texts``.

    The texts are joined with single spaces before the characters are
    collected, so a space is part of the vocabulary whenever at least two
    texts are given (or any text contains one).

    The result is sorted so that the character order — and therefore every
    token id derived from it — is identical on every run. Iterating a bare
    ``set`` of strings gives an order that can change between interpreter
    sessions, which would silently invalidate previously saved checkpoints.

    Args:
        texts: Iterable of strings.

    Returns:
        Sorted list of unique single-character strings.
    """
    all_text = " ".join(texts)
    return sorted(set(all_text))

def wandb_log(**kwargs):
    """Log all given metrics to the active W&B run in a single call.

    Sending every metric in one ``wandb.log`` call records them together;
    logging each key with its own call spreads metrics that belong to the same
    epoch (e.g. ``train_loss`` and ``val_loss``) over separate log entries.

    Args:
        **kwargs: Metric name -> value pairs. Nothing is logged when empty.
    """
    if kwargs:
        wandb.log(kwargs)

def save_vocab(dataframe):
    """Build the character vocabulary and write it to ``vocab.json``.

    Characters come from ``construct_vocab`` applied to the ``sentence`` column.
    The space character is renamed to the word-delimiter token ``"__"`` (keeping
    its id), and ``"[UNK]"`` / ``"[PAD]"`` are appended with the next free ids.

    Args:
        dataframe: DataFrame with a ``sentence`` column of strings.
    """
    vocab = construct_vocab(dataframe['sentence'].tolist())
    vocab_dict = {char: idx for idx, char in enumerate(vocab)}

    # Re-key the space as the word delimiter. If the corpus contains no space
    # at all, give the delimiter a fresh id instead of raising KeyError.
    vocab_dict["__"] = vocab_dict.pop(" ", len(vocab_dict))
    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)

    # Explicit encoding so the file is written the same way on every platform.
    with open('vocab.json', 'w', encoding='utf-8') as fl:
        json.dump(vocab_dict, fl)

    print("Created Vocab file!")

In [None]:
class ASRDataset(Dataset):
    """Dataset yielding processed audio (and tokenised transcripts) per row.

    Relies on the module-level ``processor`` and ``read_audio`` being defined
    before items are requested.

    Args:
        df: DataFrame with a ``path`` column and, unless ``is_test`` is set, a
            ``sentence`` column. Rows are looked up with ``df.loc[idx]``.
        config: Mapping that provides ``sampling_rate``.
        is_test: When True no transcript is read and the label is ``-1``.
    """

    def __init__(self, df, config, is_test=False):
        self.df = df
        self.config = config
        self.is_test = is_test

    def __getitem__(self, idx):
        """Return ``{'audio': ..., 'label': ...}`` for the row labelled ``idx``."""
        row = self.df.loc[idx]
        sampling_rate = self.config['sampling_rate']

        # Read the audio at the same rate that is reported to the processor,
        # so the two cannot drift apart if the configured rate is changed.
        audio = read_audio(row['path'], target_sr=sampling_rate)
        audio = processor(
            audio,
            sampling_rate=sampling_rate
        ).input_values[0]

        if self.is_test:
            return {'audio': audio, 'label': -1}

        # Tokenise the transcript by calling the tokenizer directly; this
        # replaces the deprecated `processor.as_target_processor()` context.
        labels = processor.tokenizer(row['sentence']).input_ids
        return {'audio': audio, 'label': labels}

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)
    
def ctc_data_collator(batch):
    """Pad a list of dataset samples into one CTC training batch.

    Relies on the module-level ``processor``. Audio inputs and label ids are
    padded separately (they have different lengths); padded label positions
    are replaced with ``-100``.

    Args:
        batch: List of dicts with ``audio`` and ``label`` entries, as produced
            by ``ASRDataset.__getitem__``.

    Returns:
        The padded input batch with an added ``labels`` tensor.
    """
    input_features = [{"input_values": sample["audio"]} for sample in batch]
    label_features = [{"input_ids": sample["label"]} for sample in batch]

    padded_inputs = processor.feature_extractor.pad(
        input_features,
        padding=True,
        return_tensors="pt",
    )
    # Pad labels with the tokenizer directly rather than through the
    # deprecated `processor.as_target_processor()` context manager.
    labels_batch = processor.tokenizer.pad(
        label_features,
        padding=True,
        return_tensors="pt",
    )

    # Positions where the attention mask is not 1 are padding -> set to -100.
    labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
    padded_inputs["labels"] = labels
    return padded_inputs

In [None]:
def train_one_epoch(model, train_loader, optimizer, device='cuda:0'):
    """Run one optimisation pass over ``train_loader``.

    Each batch is moved to ``device``, the model's loss is logged to W&B as
    ``train_step_loss`` and one optimiser step is taken.

    Args:
        model: Model whose forward output exposes a ``.loss`` attribute.
        train_loader: Iterable of dict batches; must support ``len()``.
        optimizer: Optimiser updating ``model``'s parameters.
        device: Device every batch tensor is moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(train_loader)``.
    """
    model.train()
    progress = tqdm(train_loader, total=len(train_loader))
    running_loss = 0
    for batch in progress:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        step_loss = loss.item()

        running_loss += step_loss
        progress.set_description(f"loss: {step_loss:.4f}")
        wandb_log(train_step_loss=step_loss)

        # Clear old gradients, back-propagate, then update the weights.
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    return running_loss / len(train_loader)

@torch.no_grad()
def valid_one_epoch(model, valid_loader, device='cuda:0'):
    """Compute the mean loss of ``model`` over ``valid_loader`` without gradients.

    The model is switched to evaluation mode for the duration of the pass so
    that training-only behaviour (e.g. dropout) does not distort the validation
    loss; the mode it had on entry is restored afterwards.

    Args:
        model: Model whose forward output exposes a ``.loss`` attribute.
        valid_loader: Iterable of dict batches; must support ``len()``.
        device: Device every batch tensor is moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(valid_loader)``.
    """
    was_training = model.training
    model.eval()
    try:
        pbar = tqdm(valid_loader, total=len(valid_loader))
        avg_loss = 0
        for data in pbar:
            data = {k: v.to(device) for k, v in data.items()}
            loss = model(**data).loss
            loss_itm = loss.item()

            avg_loss += loss_itm
            pbar.set_description(f"val_loss: {loss_itm:.4f}")
            wandb_log(valid_step_loss=loss_itm)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    return avg_loss / len(valid_loader)

# **<span style="color:#F7B2B0;">SEW (Squeezed and Efficient Wav2vec) </span>**

[Source1](https://arxiv.org/pdf/2109.06870.pdf) 
[Source2](https://huggingface.co/docs/transformers/model_doc/sew)

SEW (Squeezed and Efficient Wav2vec) proposed in [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/pdf/2109.06870.pdf) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi introduces a pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a variety of training setups. SEW achieves a 1.9x inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. 

W2V2 consists of a waveform feature extractor and a context network. The waveform feature extractor generates a sequence of continuous feature vectors, each encoding a small segment of audio, and the context network maps these vectors to context-dependent representations.
During pre-training, some of the features produced by the waveform feature extractor are masked out so that the context network does not see them, and their discretized versions are used as the prediction targets.


![](https://i.imgur.com/Oet3lDD.png)

In [None]:
# Open the W&B run for the SEW experiment and attach the shared Config to it.
run = wandb.init(project='bengalispeech', name='sew', config=Config)

In [None]:
# SEW fine-tuning experiment: sample a small subset of the data, build the
# vocabulary / tokenizer / processor, load the pretrained SEW CTC model and
# train it, keeping the checkpoint with the lowest validation loss.
from transformers import SEWForCTC
df = pd.read_csv("/kaggle/input/bengaliai-speech/train.csv")

# Get a paths feature for reading in during dataloading
df['path'] = df['id'].apply(lambda x: os.path.join(Config['audio_dir'], x+'.mp3'))
# Keep 0.5% of each split.
# NOTE(review): .sample() is called without random_state, so the selected rows
# change from run to run — confirm whether reproducibility is required.
train_df = df[df['split'] == 'train'].sample(frac=.005).reset_index(drop=True)
valid_df = df[df['split'] == 'valid'].sample(frac=.005).reset_index(drop=True)
print(f"Training on samples: {len(train_df)}, Validation on samples: {len(valid_df)}")

# Construct and save the vocab file
# (built from the full dataframe, not only the sampled subset)
save_vocab(df)

# Init the tokenizer, feature_extractor, processor and model
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json", 
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="__"
)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, 
    sampling_rate=Config['sampling_rate'], 
    padding_value=0.0, 
    do_normalize=True, 
    return_attention_mask=False
)
# `processor` is read as a global by ASRDataset and ctc_data_collator.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor, 
    tokenizer=tokenizer
)

# vocab_size is overridden with the size of the new tokenizer;
# ignore_mismatched_sizes=True is presumably needed so the checkpoint still
# loads when its output layer has a different size — TODO confirm.
model = SEWForCTC.from_pretrained("asapp/sew-tiny-100k-ft-ls100h",
    ctc_loss_reduction="mean", 
    ignore_mismatched_sizes=True,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size = len(tokenizer),
)
wandb.watch(model)

# Freeze the feature encoder part since we won't be training it
model.to('cuda')
model.freeze_feature_encoder()
optimizer = torch.optim.AdamW(
    model.parameters(), 
    lr=Config['lr'], 
    weight_decay=Config['wd']
)

# Construct training and validation dataloaders
train_ds = ASRDataset(train_df, Config)
valid_ds = ASRDataset(valid_df, Config)

# NOTE(review): no `shuffle` argument is passed for the training loader —
# confirm that iterating in the same order every epoch is intended.
train_loader = DataLoader(
    train_ds, 
    batch_size=Config['train_bs'], 
    collate_fn=ctc_data_collator, 
)
valid_loader = DataLoader(
    valid_ds,
    batch_size=Config['valid_bs'],
    collate_fn=ctc_data_collator,
)

# Train the model
# best_loss tracks the lowest validation loss seen so far; the weights are
# written to disk only when an epoch improves on it.
best_loss = float('inf')
for epoch in range(Config['nb_epochs']):
    print(f"{'='*40} Epoch: {epoch+1} / {Config['nb_epochs']} {'='*40}")
    train_loss = train_one_epoch(model, train_loader, optimizer)
    valid_loss = valid_one_epoch(model, valid_loader)
    wandb_log(train_loss=train_loss, val_loss=valid_loss)
    print(f"train_loss: {train_loss:.4f}, valid_loss: {valid_loss:.4f}")

    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), f"sew_base_bengaliAI.pt")
        print(f"Saved the best model so far with val_loss: {valid_loss:.4f}")

In [None]:
# Once training is done, close the active W&B run.
wandb.finish()

# **<span style="color:#F7B2B0;">Tune Hyperparameters with Wandb Sweeps </span>**

Use W&B Sweeps to automate hyperparameter search and visualize rich, interactive experiment tracking. Pick from popular search methods such as Bayesian, grid search, and random to search the hyperparameter space. Scale and parallelize sweeps across one or more machines.

In [None]:
# Use larger batches for the sweep. The validation DataLoader reads
# Config['valid_bs'], so that is the key that has to be overridden
# (writing to 'val_bs' would leave the validation batch size unchanged).
Config['train_bs'] = 32
Config['valid_bs'] = 32

In [None]:
# Data and processor setup shared by every sweep run: the dataframes,
# processor and loaders built here are read as globals inside main().
df = pd.read_csv("/kaggle/input/bengaliai-speech/train.csv")

# Get a paths feature for reading in during dataloading
df['path'] = df['id'].apply(lambda x: os.path.join(Config['audio_dir'], x+'.mp3'))
# Keep 0.5% of each split.
# NOTE(review): .sample() is called without random_state, so the selected rows
# change from run to run — confirm whether reproducibility is required.
train_df = df[df['split'] == 'train'].sample(frac=.005).reset_index(drop=True)
valid_df = df[df['split'] == 'valid'].sample(frac=.005).reset_index(drop=True)
print(f"Training on samples: {len(train_df)}, Validation on samples: {len(valid_df)}")

# Construct and save the vocab file
save_vocab(df)

# Init the tokenizer, feature_extractor and processor
# (the model itself is created per sweep run inside main())
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json", 
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="__"
)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, 
    sampling_rate=Config['sampling_rate'], 
    padding_value=0.0, 
    do_normalize=True, 
    return_attention_mask=False
)
# `processor` is read as a global by ASRDataset and ctc_data_collator.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor, 
    tokenizer=tokenizer
)
# Construct training and validation dataloaders
train_ds = ASRDataset(train_df, Config)
valid_ds = ASRDataset(valid_df, Config)

# NOTE(review): no `shuffle` argument is passed for the training loader —
# confirm that iterating in the same order every epoch is intended.
train_loader = DataLoader(
    train_ds, 
    batch_size=Config['train_bs'], 
    collate_fn=ctc_data_collator, 
)
valid_loader = DataLoader(
    valid_ds,
    batch_size=Config['valid_bs'],
    collate_fn=ctc_data_collator,
)
      

def main():
    """Train one SEW model for a single W&B sweep trial.

    Reads ``learning_rate`` and ``n_epochs`` from ``wandb.config`` (supplied by
    the sweep agent) and uses the module-level ``processor``, ``tokenizer``,
    ``train_loader``, ``valid_loader`` and ``Config``. Logs ``train_loss`` and
    ``val_loss`` once per epoch and saves the weights whenever the validation
    loss improves.
    """
    run = wandb.init(project='bengalispeech', name='sweep')

    # vocab_size is overridden with the new tokenizer's size, so the load needs
    # ignore_mismatched_sizes=True, as in the other from_pretrained calls in
    # this notebook.
    model = SEWForCTC.from_pretrained(
        "asapp/sew-tiny-100k-ft-ls100h",
        ctc_loss_reduction="mean",
        ignore_mismatched_sizes=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(tokenizer),
    )
    wandb.watch(model)

    # Freeze the feature encoder part since we won't be training it
    model.to('cuda')
    model.freeze_feature_encoder()
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=wandb.config.learning_rate,
        weight_decay=Config['wd']
    )

    # Train the model, keeping the checkpoint with the lowest validation loss
    n_epochs = wandb.config.n_epochs
    best_loss = float('inf')
    for epoch in range(n_epochs):
        print(f"{'='*40} Epoch: {epoch+1} / {n_epochs} {'='*40}")
        train_loss = train_one_epoch(model, train_loader, optimizer)
        valid_loss = valid_one_epoch(model, valid_loader)
        wandb_log(train_loss=train_loss, val_loss=valid_loss)
        print(f"train_loss: {train_loss:.4f}, valid_loss: {valid_loss:.4f}")

        if valid_loss < best_loss:
            best_loss = valid_loss
            torch.save(model.state_dict(), "sewtiny.pt")
            print(f"Saved the best model so far with val_loss: {valid_loss:.4f}")

    wandb.finish()
    
# Sweep definition: Bayesian search over the number of epochs and the
# learning rate.
sweep_configuration = {
    'method': 'bayes',  # random, grid or bayes
    'name': 'sweep-bayes',
    # The optimised metric must be a key that main() actually logs; it logs
    # 'train_loss' and 'val_loss' per epoch, and never a key called 'loss'.
    'metric': {'goal': 'minimize', 'name': 'val_loss'},
    'parameters':
    {
        'n_epochs': {'values': [5, 10]},
        'learning_rate': {'max': 0.001, 'min': 0.00001},
    }
}
# Register the sweep and run 10 trials of main() in this kernel.
sweep_id = wandb.sweep(sweep=sweep_configuration, project='bengalispeech')
wandb.agent(sweep_id, function=main, count=10)

# **<span style="color:#F7B2B0;">WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing </span>**

[Source](https://arxiv.org/pdf/2110.13900.pdf)


Self-supervised learning (SSL) achieves great success in speech recognition, while limited exploration has been attempted for other speech processing tasks. As speech signal contains multi-faceted information including speaker identity, paralinguistics, spoken content, etc., learning universal representations for all speech tasks is challenging. To tackle the problem, we propose a new pre-trained model, WavLM, to solve full-stack downstream speech tasks. WavLM jointly learns
masked speech prediction and denoising in pre-training. By this means, WavLM does not only keep the speech content modeling capability by the masked speech prediction, but also improves the
potential to non-ASR tasks by the speech denoising. In addition, WavLM employs gated relative position bias for the Transformer structure to better capture the sequence ordering of input speech.
We also scale up the training dataset from 60k hours to 94k hours. WavLM Large achieves state-of-the-art performance on the SUPERB benchmark, and brings significant improvements for various speech processing tasks on their representative benchmarks.

![](https://i.imgur.com/RY4Xw3J.png)

In [None]:
# Open the W&B run for the WavLM experiment and attach the shared Config to it.
run = wandb.init(project='bengalispeech', name='wavlm', config=Config)

In [None]:
# Use small batches for WavLM. The validation DataLoader reads
# Config['valid_bs'], so that is the key that has to be overridden
# (writing to 'val_bs' would leave the validation batch size unchanged).
Config['train_bs'] = 4
Config['valid_bs'] = 4

In [None]:
# WavLM fine-tuning experiment: sample a small subset of the data, build the
# vocabulary / tokenizer / processor, load the pretrained WavLM CTC model and
# train it, keeping the checkpoint with the lowest validation loss.
from transformers import WavLMForCTC
df = pd.read_csv("/kaggle/input/bengaliai-speech/train.csv")

# Get a paths feature for reading in during dataloading
df['path'] = df['id'].apply(lambda x: os.path.join(Config['audio_dir'], x+'.mp3'))
# Keep 0.5% of each split.
# NOTE(review): .sample() is called without random_state, so the selected rows
# change from run to run — confirm whether reproducibility is required.
train_df = df[df['split'] == 'train'].sample(frac=.005).reset_index(drop=True)
valid_df = df[df['split'] == 'valid'].sample(frac=.005).reset_index(drop=True)
print(f"Training on samples: {len(train_df)}, Validation on samples: {len(valid_df)}")

# Construct and save the vocab file
# (built from the full dataframe, not only the sampled subset)
save_vocab(df)

# Init the tokenizer, feature_extractor and processor
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json", 
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="__"
)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, 
    sampling_rate=Config['sampling_rate'], 
    padding_value=0.0, 
    do_normalize=True, 
    return_attention_mask=False
)
# `processor` is read as a global by ASRDataset and ctc_data_collator.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor, 
    tokenizer=tokenizer
)

# vocab_size is overridden with the size of the new tokenizer;
# ignore_mismatched_sizes=True is presumably needed so the checkpoint still
# loads when its output layer has a different size — TODO confirm.
model = WavLMForCTC.from_pretrained("patrickvonplaten/wavlm-libri-clean-100h-base-plus",
    ctc_loss_reduction="mean", 
    ignore_mismatched_sizes=True,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size = len(tokenizer),
)
wandb.watch(model)

# Freeze the feature encoder part since we won't be training it
model.to('cuda')
model.freeze_feature_encoder()
optimizer = torch.optim.AdamW(
    model.parameters(), 
    lr=Config['lr'], 
    weight_decay=Config['wd']
)

# Construct training and validation dataloaders
train_ds = ASRDataset(train_df, Config)
valid_ds = ASRDataset(valid_df, Config)

# NOTE(review): no `shuffle` argument is passed for the training loader —
# confirm that iterating in the same order every epoch is intended.
train_loader = DataLoader(
    train_ds, 
    batch_size=Config['train_bs'], 
    collate_fn=ctc_data_collator, 
)
valid_loader = DataLoader(
    valid_ds,
    batch_size=Config['valid_bs'],
    collate_fn=ctc_data_collator,
)

# Train the model
# best_loss tracks the lowest validation loss seen so far; the weights are
# written to disk only when an epoch improves on it.
best_loss = float('inf')
for epoch in range(Config['nb_epochs']):
    print(f"{'='*40} Epoch: {epoch+1} / {Config['nb_epochs']} {'='*40}")
    train_loss = train_one_epoch(model, train_loader, optimizer)
    valid_loss = valid_one_epoch(model, valid_loader)
    wandb_log(train_loss=train_loss, val_loss=valid_loss)
    print(f"train_loss: {train_loss:.4f}, valid_loss: {valid_loss:.4f}")

    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), f"wavlm_base_bengaliAI.pt")
        print(f"Saved the best model so far with val_loss: {valid_loss:.4f}")

In [None]:
# Close the active W&B run now that the WavLM training loop has finished.
wandb.finish()

# **<span style="color:#F7B2B0;">References </span>**

All public notebooks of this competition

https://arxiv.org/pdf/2109.06870.pdf

https://huggingface.co/docs/transformers/model_doc/sew

https://arxiv.org/pdf/2110.13900.pdf
