In [1]:
!cp -r ../input/python-packages2 ./
!tar xvfz ./python-packages2/jiwer.tgz
!pip install ./jiwer/jiwer-2.3.0-py3-none-any.whl -f ./ --no-index
!tar xvfz ./python-packages2/normalizer.tgz
!pip install ./normalizer/bnunicodenormalizer-0.0.24.tar.gz -f ./ --no-index
!tar xvfz ./python-packages2/pyctcdecode.tgz
!pip install ./pyctcdecode/attrs-22.1.0-py2.py3-none-any.whl -f ./ --no-index --no-deps
!pip install ./pyctcdecode/exceptiongroup-1.0.0rc9-py3-none-any.whl -f ./ --no-index --no-deps
!pip install ./pyctcdecode/hypothesis-6.54.4-py3-none-any.whl -f ./ --no-index --no-deps
!pip install ./pyctcdecode/numpy-1.21.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl -f ./ --no-index --no-deps
!pip install ./pyctcdecode/pygtrie-2.5.0.tar.gz -f ./ --no-index --no-deps
!pip install ./pyctcdecode/sortedcontainers-2.4.0-py2.py3-none-any.whl -f ./ --no-index --no-deps
!pip install ./pyctcdecode/pyctcdecode-0.4.0-py2.py3-none-any.whl -f ./ --no-index --no-deps

!tar xvfz ./python-packages2/pypikenlm.tgz
!pip install ./pypikenlm/pypi-kenlm-0.1.20220713.tar.gz -f ./ --no-index --no-deps

! python -m pip install --no-index --find-links=../input/install-notebook -r ../input/install-notebook/requirements.txt

!pip install /kaggle/input/transformers/transformers-4.33.1-py3-none-any.whl

jiwer/
jiwer/jiwer-2.3.0-py3-none-any.whl
jiwer/python-Levenshtein-0.12.2.tar.gz
jiwer/setuptools-65.3.0-py3-none-any.whl
Looking in links: ./
Processing ./jiwer/jiwer-2.3.0-py3-none-any.whl
INFO: pip is looking at multiple versions of jiwer to determine which version is compatible with other requirements. This could take a while.
[31mERROR: Could not find a version that satisfies the requirement python-Levenshtein==0.12.2 (from jiwer) (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for python-Levenshtein==0.12.2[0m[31m
[0mnormalizer/
normalizer/bnunicodenormalizer-0.0.24.tar.gz
Looking in links: ./
Processing ./normalizer/bnunicodenormalizer-0.0.24.tar.gz
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: bnunicodenormalizer
  Building wheel for bnunicodenormalizer (setup.py) ... [?25l- done
[?25h  Created wheel for bnunicodenormalizer: filename=bnunicodenormalizer-0.0.24-py3-none-any.whl s

In [2]:
#Main libraries & misc
import resource
import os
import random
import re
import argparse
import gc
import pickle
from tqdm import tqdm

#Numeric
import pandas as pd
import numpy as np

#Challenge specific
from bnunicodenormalizer import Normalizer
import librosa
import pyctcdecode
from pyctcdecode import BeamSearchDecoderCTC

#Deep Learning
import torch
import torch.nn as nn
import pytorch_lightning
from transformers import pipeline
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2Model, Wav2Vec2Config, Wav2Vec2ForCTC
from transformers import XLMRobertaModel, XLMRobertaTokenizer
from torch.nn import TransformerEncoder, TransformerEncoderLayer

random.seed(42)



# Config

In [3]:
#Competition config
class CCFG:
    #Data
    train_data = "/kaggle/input/bengaliai-speech/train_mp3s"
    test_data = "/kaggle/input/bengaliai-speech/test_mp3s"
    
    #Punctuation model
    punc_base = "/kaggle/input/xlm-roberta-large/xlm-roberta-large"
    punc_tokenizer = "/kaggle/input/xlm-roberta-large/xlm-roberta-large"
    punc_weights = "/kaggle/input/punct-correct-roberta-bn/xlm-roberta-large-bn.pt"
    
    #ASR processor & models
    processor = "/kaggle/input/v20-processor/best_v20_processor"
    small_model = "/kaggle/input/v35-210ksteps"
    large_model = "/kaggle/input/v32-130k"
    ensemble_model = "/kaggle/input/ensemble-v1/pytorch_model.bin"
    
    #Decoder
    decoder = "/kaggle/input/llm-pruned-00011/new_model_bin_mixed"
    
    #Neural rescoring [not used in the final submission due memory restrictions]
    neural_rescoring = "/kaggle/input/neural-rescoring"

In [4]:
device = torch.device('cuda:0')

# Functions and params

In [5]:
bnorm = Normalizer()

def normalize(sen):
    """
    Normalize a sentence by applying the 'bnorm' Normalizer to each word in the sentence.

    Args:
        sen (str): The input sentence to be normalized.

    Returns:
        str: The normalized sentence where each word has been normalized using 'bnorm'.
    """
    _words = [bnorm(word)['normalized'] for word in sen.split()]
    return " ".join([word for word in _words if word is not None])

In [6]:
TOKEN_IDX = {
    'bert': {
        'START_SEQ': 101,
        'PAD': 0,
        'END_SEQ': 102,
        'UNK': 100
    },
    'xlm': {
        'START_SEQ': 0,
        'PAD': 2,
        'END_SEQ': 1,
        'UNK': 3
    },
    'roberta': {
        'START_SEQ': 0,
        'PAD': 1,
        'END_SEQ': 2,
        'UNK': 3
    },
    'albert': {
        'START_SEQ': 2,
        'PAD': 0,
        'END_SEQ': 3,
        'UNK': 1
    },
}

punctuation_dict = {'O': 0, 'COMMA': 1, 'PERIOD': 2, 'QUESTION': 3}

MODELS = {
    'xlm-roberta-base': (XLMRobertaModel, XLMRobertaTokenizer, 768, 'roberta'),
    'xlm-roberta-large': (XLMRobertaModel, XLMRobertaTokenizer, 1024, 'roberta')
}

class DeepPunctuation(nn.Module):
    """
    Initialize a Bengali specific punctuation model.

    Args:
        pretrained_model (str): The name of the pretrained model to use.
        freeze_bert (bool): Whether to freeze the parameters of the BERT layer.
        lstm_dim (int): The dimension of the LSTM hidden state. Set to -1 to use the BERT dimension.

    """
    def __init__(self, pretrained_model, freeze_bert=False, lstm_dim=-1):
        super(DeepPunctuation, self).__init__()
        self.output_dim = len(punctuation_dict)
        self.bert_layer = MODELS[pretrained_model][0].from_pretrained(CCFG.punc_base)
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False
        bert_dim = MODELS[pretrained_model][2]
        if lstm_dim == -1:
            hidden_size = bert_dim
        else:
            hidden_size = lstm_dim
        self.lstm = nn.LSTM(input_size=bert_dim, hidden_size=hidden_size, num_layers=1, bidirectional=True)
        self.linear = nn.Linear(in_features=hidden_size*2, out_features=len(punctuation_dict))

    def forward(self, x, attn_masks):
        if len(x.shape) == 1:
            x = x.view(1, x.shape[0])
        x = self.bert_layer(x, attention_mask=attn_masks)[0]
        x = torch.transpose(x, 0, 1)
        x, (_, _) = self.lstm(x)
        x = torch.transpose(x, 0, 1)
        x = self.linear(x)
        return x

def inference_punc(text):

    text = re.sub(r"[,:\-–.!;?]", '', text)
    words_original_case = text.split()
    words = text.lower().split()

    word_pos = 0
    sequence_len = 256
    result = ""
    decode_idx = 0
    punctuation_map = {0: '', 1: ',', 2: '.', 3: '?'}
    punctuation_map[2] = '।'


    while word_pos < len(words):
        x = [TOKEN_IDX[token_style]['START_SEQ']]
        y_mask = [0]

        while len(x) < sequence_len and word_pos < len(words):
            tokens = tokenizer.tokenize(words[word_pos])
            if len(tokens) + len(x) >= sequence_len:
                break
            else:
                for i in range(len(tokens) - 1):
                    x.append(tokenizer.convert_tokens_to_ids(tokens[i]))
                    y_mask.append(0)
                x.append(tokenizer.convert_tokens_to_ids(tokens[-1]))
                y_mask.append(1)
                word_pos += 1
        x.append(TOKEN_IDX[token_style]['END_SEQ'])
        y_mask.append(0)
        if len(x) < sequence_len:
            x = x + [TOKEN_IDX[token_style]['PAD'] for _ in range(sequence_len - len(x))]
            y_mask = y_mask + [0 for _ in range(sequence_len - len(y_mask))]
        attn_mask = [1 if token != TOKEN_IDX[token_style]['PAD'] else 0 for token in x]

        x = torch.tensor(x).reshape(1,-1)
        y_mask = torch.tensor(y_mask)
        attn_mask = torch.tensor(attn_mask).reshape(1,-1)
        x, attn_mask, y_mask = x.to(device), attn_mask.to(device), y_mask.to(device)

        with torch.no_grad():
        
            y_predict = deep_punctuation(x, attn_mask)

            #Identify the last word and cut the logits (so just the logits for | and ? are taken into account)
            #We will force the model to output | or ? as last sign
            last_id = torch.where(y_mask != 0)[0][-1].item()
            last_sign = torch.argmax(y_predict[0][last_id][2:4]).item() + 2 

            y_predict = y_predict.view(-1, y_predict.shape[2])
            y_predict = torch.argmax(y_predict, dim=1).view(-1)
                
        for i in range(y_mask.shape[0]):
            if y_mask[i] == 1:
                if i == last_id:
                    result += words_original_case[decode_idx] + punctuation_map[last_sign] + ' '
                else:
                    result += words_original_case[decode_idx] + punctuation_map[y_predict[i].item()] + ' '
                decode_idx += 1

    return result

In [7]:
def punctuation_new(sentence):
    """
    Punctuate a given sentence using a punctuation inference model.

    This function takes an input sentence and attempts to punctuate it using an inference model.
    If the sentence does not end with a Bengali full stop (।), it processes the sentence as is.
    If the sentence ends with ।, it processes the sentence without the final । and then adds it back after punctuation.

    Args:
        sentence (str): The input sentence to be punctuated.

    Returns:
        str: The punctuated sentence.

    """
    try:
    
        if sentence[-1]!="।":
            sentence = inference_punc(sentence).strip()
        else:
            sentence = inference_punc(sentence[:-1]).strip()
    except:
        print("error")
        pass
    return sentence

In [8]:
class W2v2Dataset(torch.utils.data.Dataset):
    """
    Custom PyTorch dataset for speech data processing with optional denoising and debugging.

    Args:
        paths (list): List of file paths to the audio files.
        denoising (bool): Whether to apply denoising to the audio. -> not used in the final submission
        debug_style (bool): Whether to use debug-style data loading. -> 

    Attributes:
        paths (list): List of file paths to the audio files.
        denoising (bool): Flag indicating whether denoising is applied.
        debug (bool): Flag indicating whether debug-style data loading is used.
    """
    def __init__(self, paths, denoising=False, debug_style=False):
        self.paths = paths
        self.denoising = denoising
        self.debug = debug_style

    def __getitem__(self, idx):
        apath = self.paths[idx]
        
        if self.debug:
            waveform, sample_rate = librosa.load(f"{CCFG.train_data}/{apath}", sr=16000)
        else:
            waveform, sample_rate = librosa.load(f"{CCFG.test_data}/{apath}", sr=16000)
            
        audio = processor(waveform, sampling_rate=sample_rate).input_values[0]
        
        if self.denoising:
            audio = denoise_infer(audio)
        
        id_name = self.paths[idx].replace('.mp3','')
        
        return audio, id_name

    def __len__(self):
        return len(self.paths)

# Loading the testfiles

In [9]:
test_files = sorted(os.listdir(CCFG.test_data))

In [10]:
debug_mode = False

test_dataset = W2v2Dataset(test_files, denoising=False, debug_style=debug_mode)
test_loader =  torch.utils.data.DataLoader(test_dataset,
                             batch_size=1,
                             shuffle=False,
                             num_workers=os.cpu_count())

# Creating, calculating and saving logits

In [11]:
processor = Wav2Vec2Processor.from_pretrained(CCFG.processor)

In [12]:
class CustomModel(nn.Module):
    """
    Custom model to combine features of finetuned models.
    Inspired by this paper: https://arxiv.org/pdf/2206.05518.pdf

    This model combines two pretrained Wav2Vec2 models, processes their features with a transformer encoder,
    and produces output logits for transcription. It supports training with CTC loss.

    Attributes:
        model1 (nn.Module): The first pretrained Wav2Vec2 model.
        model2 (nn.Module): The second pretrained Wav2Vec2 model.
        encoder_layers (nn.TransformerEncoderLayer): The transformer encoder layer.
        transformer_encoder (nn.TransformerEncoder): The transformer encoder.
        lm_head (nn.Linear): The linear layer for output logits.
        config: The configuration of the model.

    Methods:
        freeze_model(model): Freezes the model's parameters to prevent further training.
        forward(input_values, labels=None, **kwargs): Forward pass of the model.

    """
    def __init__(self):
        super().__init__()
        self.model1 = self.freeze_model(Wav2Vec2Model.from_pretrained(CCFG.small_model))
        self.model2 = self.freeze_model(Wav2Vec2Model.from_pretrained(CCFG.large_model))
        self.encoder_layers = nn.TransformerEncoderLayer(d_model=1024+1280, nhead=6) #hardcoded values, should be initialized
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layers, num_layers=2)
        self.lm_head = nn.Linear(1024+1280, len(processor.tokenizer)) #hardcoded values, should be initialized
        self.config = self.model1.config

    def freeze_model(self, model):
        for param in model.parameters():
            param.requires_grad = False
        return model

    def forward(self, input_values, labels=None, **kwargs):

        with torch.no_grad():
            feature1 = self.model1(input_values=input_values, output_hidden_states=True).last_hidden_state
            feature2 = self.model2(input_values=input_values, output_hidden_states=True).last_hidden_state

        concatenated_features = torch.cat((feature1, feature2), dim=-1)
        
        encoded_features = self.transformer_encoder(concatenated_features)
        logits = self.lm_head(encoded_features)
        
        if labels is None:
            
            return {'logits': logits}
        
        else: 
            loss = None
            attention_mask = torch.ones_like(input_values, dtype=torch.long)
            input_lengths = self.model1._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
            labels_mask = labels >= 0
            target_lengths = labels_mask.sum(-1)
            flattened_targets = labels.masked_select(labels_mask)

            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)

            loss = nn.functional.ctc_loss(
                log_probs,
                flattened_targets,
                input_lengths,
                target_lengths,
                blank=62,
                reduction='mean',
                zero_infinity=True,
            )

            return {'loss': loss, 'logits': logits}

In [13]:
new_model = CustomModel()
new_model.load_state_dict(torch.load(CCFG.ensemble_model))
new_model.to(device)
print("")




In [14]:
torch.cuda.empty_cache()
gc.collect()

38

In [15]:
os.mkdir("/kaggle/working/logits")

In [16]:
with torch.no_grad():
    for i, (aud, id_n) in enumerate(tqdm(test_loader)):

        aud = aud.to(device)
        logits = new_model(aud)['logits']
        logits = logits.detach().cpu().numpy()
        np.save(f'/kaggle/working/logits/{id_n[0]}', logits)

100%|██████████| 3/3 [00:15<00:00,  5.17s/it]


In [17]:
del new_model, logits, test_dataset, test_loader, processor
torch.cuda.empty_cache()
gc.collect()

0

In [18]:
torch.cuda.empty_cache()
gc.collect()

0

# Creating and decoding the logits

In [19]:
decoder = BeamSearchDecoderCTC.load_from_dir(CCFG.decoder)

In [20]:
best_params = {'alpha': 0.5, 'beta': 1.5, 'beam_width': 100} #Standard params
custom_decoding = False
neural_rescoring = False

if neural_rescoring:
    os.mkdir("/kaggle/working/neural_rescoring")
    
if custom_decoding:
    decoder.reset_params(
            alpha=0.75
        )

ids = []
predictions = []

for index, file in enumerate(test_files):

    logits = np.load(f'/kaggle/working/logits/{file.replace(".mp3", ".npy")}')
    for l in logits:
        if neural_rescoring:
            sentence = decoder.decode_beams(l, prune_history=False)[:5]
            sentence = [(x[0],x[-1]) for x in sentence]
            with open(f"/kaggle/working/neural_rescoring/{file.replace('.mp3','.pkl')}", "wb") as f:
                pickle.dump(sentence, f)
        else:
            sentence = decoder.decode(l)
            predictions.append(sentence)
    ids.append(file.replace(".mp3", ""))

In [21]:
decoder.cleanup() #-> important step, prevents from out of memory -> https://github.com/kensho-technologies/pyctcdecode/pull/111

del decoder
gc.collect()

11

In [22]:
torch.cuda.empty_cache()
gc.collect()

0

# Rescoring hypothesis

In [23]:
def rescore_hypotheses(hypotheses, lm_weight=1.0):
    rescored_hypotheses = []
    for transcription, asr_score in hypotheses:

        input_ids = neural_rescore.tokenizer.encode(transcription, truncation=True, max_length=128, return_tensors='pt')

        with torch.no_grad():
            outputs = neural_rescore.model(input_ids.to(device), labels=input_ids.to(device))
            log_likelihood = outputs.loss.item()

        transformer_score = lm_weight * (-log_likelihood)
        combined_score = asr_score + transformer_score 
        rescored_hypotheses.append((transcription, combined_score))
        
    return sorted(rescored_hypotheses, key=lambda x: x[1], reverse=True)

In [24]:
if neural_rescoring:
    neural_rescore = pipeline('text-generation',model="/kaggle/input/neural-rescoring", tokenizer='/kaggle/input/neural-rescoring', device=device)
    
    predictions = []
    for index, file in enumerate(test_files):
        with open(f"/kaggle/working/neural_rescoring/{file.replace('.mp3','.pkl')}", "rb") as f:
            loaded_list = pickle.load(f)
        predictions.append(rescore_hypotheses(loaded_list)[0][0])
        
    del neural_rescore
    torch.cuda.empty_cache()
    gc.collect()

# Adding punctuation

In [25]:
tokenizer = XLMRobertaTokenizer.from_pretrained(CCFG.punc_base)
token_style = MODELS['xlm-roberta-large'][3]

In [26]:
deep_punctuation = DeepPunctuation('xlm-roberta-large', freeze_bert=False, lstm_dim=-1)
deep_punctuation.to(device)
deep_punctuation.load_state_dict(torch.load(CCFG.punc_weights))
deep_punctuation.eval()

DeepPunctuation(
  (bert_layer): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias

In [27]:
predictions = [normalize(punctuation_new(normalize(sentence))) for sentence in predictions]

In [28]:
del deep_punctuation
torch.cuda.empty_cache()
gc.collect()

7

# Cleaning and submission

In [29]:
!rm -r /kaggle/working/*

In [30]:
pred_df = pd.DataFrame({"id":ids,"sentence":predictions})
pred_df["sentence"] = [x if len(x) > 0 else "।" for x in pred_df["sentence"]]
pred_df = pred_df.sort_values(by='id')
pred_df.to_csv("submission.csv", index=False)

In [31]:
pred_df.head(3)

Unnamed: 0,id,sentence
0,0f3dac00655e,একটু বয়স হলে একটি বিদেশি।
1,a9395e01ad21,কী কারণে তুমি এতাবৎ কাল পর্যন্ত এই দারুণ দৈবদু...
2,bf36ea8b718d,এ কারণে সরকার নির্ধারিত হারে পরিবহন জনিত ক্ষতি...
