In [2]:
import pandas as pd
import gc
import numpy as np
import json
import sys
#import nltk

import re
import unicodedata

from bnunicodenormalizer import Normalizer 
bnorm = Normalizer()

In [19]:
from bnlp import NLTKTokenizer

bnltk = NLTKTokenizer()

In [20]:
import string
def normalize_word(token):
    """Normalize a single token for the language-model corpus.

    Steps, in order:
      1. drop every punctuation character (``remove_punctuation``),
      2. collapse each run of ASCII digits into one ``'#'``
         (``replace_numeric`` with ``by_single_digit=True``),
      3. if nothing but that single ``'#'`` is left, use ``'<num>'`` instead,
      4. strip surrounding whitespace and lower-case the result.

    Returns the normalized token, which may be the empty string.
    """
    without_punct = remove_punctuation(token)
    collapsed = replace_numeric(without_punct, by_single_digit=True)
    if collapsed == '#':
        # The token consisted only of digits (and punctuation).
        collapsed = '<num>'
    return collapsed.strip().lower()


def remove_punctuation(text, punctiation_extended=string.punctuation + """"„“‚‘"""):
    """Return ``text`` without any character contained in ``punctiation_extended``.

    The default set is ``string.punctuation`` plus a few typographic quote
    characters. Characters outside that set are kept unchanged and in order.
    """
    kept = []
    for ch in text:
        if ch in punctiation_extended:
            continue
        kept.append(ch)
    return ''.join(kept)


def replace_numeric(text, numeric_pattern=re.compile('[0-9]+'), digit_pattern=re.compile('[0-9]'), repl='#',
                    by_single_digit=False):
    """Replace ASCII digits in ``text`` with ``repl``.

    With ``by_single_digit=True`` every *run* of digits (``numeric_pattern``)
    becomes a single ``repl``; otherwise every individual digit
    (``digit_pattern``) is replaced, so the run length is preserved.
    """
    if by_single_digit:
        chosen_pattern = numeric_pattern
    else:
        chosen_pattern = digit_pattern
    return re.sub(chosen_pattern, repl, text)


def contains_numeric(text):
    """Return True if at least one character of ``text`` satisfies ``str.isdigit``."""
    for ch in text:
        if ch.isdigit():
            return True
    return False

In [3]:
sys.path.append("../")

In [4]:
from experiments.normalize import *

# Tokenized and processed files

In [5]:
def _ai4bharat_tokenized_processed():
    '''
    Tokenize, normalize and de-duplicate the sentences of ``base_files/bn.txt``.

    Source corpus:
    https://objectstore.e2enetworks.net/ai4b-public-nlu-nlg/indic-corp-frozen-for-the-paper-oct-2022/bn.txt

    Each line is split into sentences with ``bnltk.sentence_tokenize`` and each
    sentence into words with ``bnltk.word_tokenize``. Every word is passed
    through ``normalize_word``; words that normalize to the empty string are
    dropped and the remaining words are joined with single spaces.

    Progress is printed every 5,000,000 input lines and the number of unique
    sentences is printed at the end.

    Returns:
        list[str]: the unique, non-empty normalized sentences. The order is
        unspecified because de-duplication goes through a ``set``.
    '''
    # Collect straight into a set: avoids keeping one nested list per input
    # line in memory before flattening and de-duplicating.
    unique_sentences = set()
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open('base_files/bn.txt', 'r', encoding='utf-8') as file:
        for line_count, line in enumerate(file, start=1):
            for sentence in bnltk.sentence_tokenize(line.strip()):
                words = (normalize_word(word) for word in bnltk.word_tokenize(sentence))
                normalized = ' '.join(word for word in words if word).strip()
                if normalized:
                    unique_sentences.add(normalized)
            if line_count % 5000000 == 0:
                print(f"{line_count} samples are processed")
    sentencelist = list(unique_sentences)
    print(f"{len(sentencelist)} unique sentences")
    return sentencelist

In [6]:
%%time
ekstep_sentences_tokenized_normalized = _ai4bharat_tokenized_processed()

5000000 samples are processed
10000000 samples are processed
15000000 samples are processed
20000000 samples are processed
25000000 samples are processed
30000000 samples are processed
35000000 samples are processed
40000000 samples are processed
62858539 unique sentences
CPU times: user 2h 17min 43s, sys: 17.2 s, total: 2h 18min
Wall time: 2h 18min 7s


In [7]:
def _ai4bharat_lm_tokenized_processed():
    '''
    Tokenize, normalize and de-duplicate the sentences of
    ``base_files/lm_ai4bharat.txt``.

    Source corpus:
    https://storage.googleapis.com/vakyansh-open-models/language_model_text/bengali.zip

    Each line is split into sentences with ``bnltk.sentence_tokenize`` and each
    sentence into words with ``bnltk.word_tokenize``. Every word is passed
    through ``normalize_word``; words that normalize to the empty string are
    dropped and the remaining words are joined with single spaces. Sentences
    that end up empty are skipped so that no blank line reaches the corpus.

    Progress is printed every 5,000,000 input lines and the number of unique
    sentences is printed at the end.

    Returns:
        list[str]: the unique, non-empty normalized sentences. The order is
        unspecified because de-duplication goes through a ``set``.
    '''
    # Collect straight into a set: avoids keeping one nested list per input
    # line in memory before flattening and de-duplicating.
    unique_sentences = set()
    with open("base_files/lm_ai4bharat.txt", 'r', encoding='utf-8') as file:
        # 1-based line counter; replaces mutating the enumerate() variable.
        for line_count, line in enumerate(file, start=1):
            for sentence in bnltk.sentence_tokenize(line.strip()):
                words = (normalize_word(word) for word in bnltk.word_tokenize(sentence))
                normalized = ' '.join(word for word in words if word).strip()
                if normalized:
                    unique_sentences.add(normalized)
            if line_count % 5000000 == 0:
                print(f"{line_count} samples are processed")
    sentencelist = list(unique_sentences)
    print(f"{len(sentencelist)} unique sentences")
    return sentencelist

In [8]:
%%time
ai4bharat_data_tokenized_normalized = _ai4bharat_lm_tokenized_processed()

5000000 samples are processed
10000000 samples are processed
15000000 samples are processed
20000000 samples are processed
25000000 samples are processed
30000000 samples are processed
30166470 unique sentences
CPU times: user 52min 28s, sys: 7.02 s, total: 52min 35s
Wall time: 52min 35s


In [9]:
def _lm_train_tokenized_processed():
    '''
    Tokenize, normalize and de-duplicate the sentences of
    ``base_files/lm_train.txt``.

    Source corpus:
    https://storage.googleapis.com/vakyansh-open-models/language_model_text/bengali.zip

    Each line is split into sentences with ``bnltk.sentence_tokenize`` and each
    sentence into words with ``bnltk.word_tokenize``. Every word is passed
    through ``normalize_word``; words that normalize to the empty string are
    dropped and the remaining words are joined with single spaces. Sentences
    that end up empty are skipped so that no blank line reaches the corpus.

    Returns:
        list[str]: the unique, non-empty normalized sentences. The order is
        unspecified because de-duplication goes through a ``set``.
    '''
    unique_sentences = set()
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open('base_files/lm_train.txt', 'r', encoding='utf-8') as file:
        for line in file:
            for sentence in bnltk.sentence_tokenize(line.strip()):
                words = (normalize_word(word) for word in bnltk.word_tokenize(sentence))
                normalized = ' '.join(word for word in words if word).strip()
                if normalized:
                    unique_sentences.add(normalized)
    sentencelist = list(unique_sentences)
    print(f"{len(sentencelist)} unique sentences")
    return sentencelist

In [10]:
%%time
lm_train_tokenized_processed = _lm_train_tokenized_processed()

152917 unique sentences
CPU times: user 17.9 s, sys: 7.98 ms, total: 17.9 s
Wall time: 17.9 s


In [11]:
def _openslr_tokenized_processed():
    '''
    Tokenize, normalize and de-duplicate the transcripts of
    ``base_files/utt_spk_text.tsv``.

    Source file:
    https://openslr.elda.org/resources/53/utt_spk_text.tsv

    Only the last tab-separated field of each line is used. It is split into
    sentences with ``bnltk.sentence_tokenize`` and each sentence into words
    with ``bnltk.word_tokenize``. Every word is passed through
    ``normalize_word``; words that normalize to the empty string are dropped
    and the remaining words are joined with single spaces. Sentences that end
    up empty are skipped so that no blank line reaches the corpus.

    Returns:
        list[str]: the unique, non-empty normalized sentences. The order is
        unspecified because de-duplication goes through a ``set``.
    '''
    unique_sentences = set()
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open('base_files/utt_spk_text.tsv', 'r', encoding='utf-8') as file:
        for line in file:
            # The text to process is the last tab-separated column.
            transcript = line.strip().split('\t')[-1].strip()
            for sentence in bnltk.sentence_tokenize(transcript):
                words = (normalize_word(word) for word in bnltk.word_tokenize(sentence))
                normalized = ' '.join(word for word in words if word).strip()
                if normalized:
                    unique_sentences.add(normalized)
    sentencelist = list(unique_sentences)
    print(f"{len(sentencelist)} unique sentences")
    return sentencelist

In [12]:
%%time
openslr_tokenized_processed = _openslr_tokenized_processed()

109373 unique sentences
CPU times: user 12.7 s, sys: 8 ms, total: 12.7 s
Wall time: 12.7 s


In [13]:
def _kaggle_trains_tokenized_processed():
    """Tokenize, normalize and de-duplicate the ``sentence`` column of the two
    training CSVs (``../data/train.csv`` and ``base_files/train_dl_sprint.csv``).

    Each row is treated as one sentence (no sentence splitting): it is split
    into words with ``bnltk.word_tokenize``, every word is passed through
    ``normalize_word``, words that normalize to the empty string are dropped
    and the remaining words are joined with single spaces. Rows that end up
    empty are skipped so that no blank line reaches the corpus.

    Returns:
        list[str]: the unique, non-empty normalized sentences from both files.
        The order is unspecified because de-duplication goes through a ``set``.
    """
    unique_sentences = set()
    # Both files go through exactly the same pipeline, so loop over the paths
    # instead of duplicating the code once per file.
    for csv_path in ("../data/train.csv", "base_files/train_dl_sprint.csv"):
        for text in pd.read_csv(csv_path)['sentence'].tolist():
            words = (normalize_word(word) for word in bnltk.word_tokenize(text))
            normalized = ' '.join(word for word in words if word).strip()
            if normalized:
                unique_sentences.add(normalized)

    sentencelist = list(unique_sentences)
    print(f"{len(sentencelist)} unique sentences")
    return sentencelist

In [14]:
%%time
kaggle_trains_tokenized_processed = _kaggle_trains_tokenized_processed()

463124 unique sentences
CPU times: user 1min 46s, sys: 80.1 ms, total: 1min 46s
Wall time: 1min 47s


# Saving single lists

In [17]:
from transformers import pipeline
from bnunicodenormalizer import Normalizer
from tqdm import tqdm

In [28]:
# Maps each output file stem to the sentence list produced for that corpus.
txt_overview = {
    'kaggle_trains_tokenized_processed': kaggle_trains_tokenized_processed,
    'ekstep_sentences_tokenized_normalized': ekstep_sentences_tokenized_normalized,
    'ai4bharat_data_tokenized_normalized': ai4bharat_data_tokenized_normalized,
    'lm_train_tokenized_processed': lm_train_tokenized_processed,
    'openslr_tokenized_processed': openslr_tokenized_processed,
}

In [30]:
def _save_list(txt_overview):
    """Write every sentence list in ``txt_overview`` to its own text file.

    Args:
        txt_overview: mapping of name -> iterable of sentence strings. Each
            entry is written to ``processed_files/<name>.txt``, one sentence
            per line.

    Every sentence is stripped and passed through
    ``normalizeUnicode(..., normalize_nukta=False)`` before being written.
    NOTE(review): ``normalizeUnicode`` is not defined in this notebook;
    presumably it comes from ``from experiments.normalize import *`` - confirm.
    """
    for name, sentences in txt_overview.items():
        # Write as UTF-8 explicitly so the files do not depend on the
        # platform's default encoding.
        with open(f'processed_files/{name}.txt', 'w', encoding='utf-8') as f:
            for sentence in tqdm(sentences):
                sentence = normalizeUnicode(sentence.strip(), normalize_nukta=False)
                f.write(f"{sentence}\n")

In [35]:
_save_list(txt_overview)

100%|███████████████████████████████| 463124/463124 [00:02<00:00, 179161.79it/s]
100%|███████████████████████████| 62858539/62858539 [08:30<00:00, 123177.01it/s]
100%|███████████████████████████| 30166470/30166470 [04:17<00:00, 117098.09it/s]
100%|███████████████████████████████| 152917/152917 [00:00<00:00, 286906.39it/s]
100%|███████████████████████████████| 109373/109373 [00:00<00:00, 368129.61it/s]
100%|███████████████████████████| 27810072/27810072 [03:05<00:00, 149823.01it/s]
100%|█████████████████████████████| 6715229/6715229 [01:05<00:00, 101749.35it/s]
100%|█████████████████████████████| 2182535/2182535 [00:15<00:00, 137385.54it/s]
100%|█████████████████████████████| 8064627/8064627 [00:50<00:00, 161126.30it/s]
100%|████████████████████████████| 11007401/11007401 [04:36<00:00, 39808.72it/s]
100%|███████████████████████████| 23784452/23784452 [03:15<00:00, 121698.40it/s]


# Creating LM

In [5]:
import random
random.seed(42)

In [6]:
# File stems (under processed_files/) of the corpora that feed the language model.
source_overview = [
    'kaggle_trains_tokenized_processed',
    'ekstep_sentences_tokenized_normalized',
    'ai4bharat_data_tokenized_normalized',
    'lm_train_tokenized_processed',
    'openslr_tokenized_processed',
]

In [7]:
def _read_processed(sourcename):
    i = 0
    sentencelist = []
    with open(f'processed_files/{sourcename}.txt', 'r') as file:
        for line in file:
            sentencelist.append(line.strip())
            #if i % 1000 == 0:
            #    break
    return sentencelist

In [8]:
all_sentence_list = {}

for source in source_overview:
    print(source)
    all_sentence_list[source] = _read_processed(source)

kaggle_trains_tokenized_processed
ekstep_sentences_tokenized_normalized
ai4bharat_data_tokenized_normalized
lm_train_tokenized_processed
openslr_tokenized_processed


In [12]:
new_all_sentences = (all_sentence_list['kaggle_trains_tokenized_processed']+
                    all_sentence_list['ekstep_sentences_tokenized_normalized']+
                    all_sentence_list['ai4bharat_data_tokenized_normalized']+
                    all_sentence_list['lm_train_tokenized_processed']+
                    all_sentence_list['openslr_tokenized_processed'])

In [13]:
len(new_all_sentences)

93750423

In [14]:
new_all_sentences = list(set(new_all_sentences))
len(new_all_sentences)

89885456

In [15]:
def _lm_model_dw(all_sentences):
    """Write the language-model training text to ``text_full_v12_base.txt``.

    Args:
        all_sentences: iterable of sentence strings.

    Every sentence is stripped, passed through
    ``normalizeUnicode(..., normalize_nukta=False)`` and written on its own
    line.
    NOTE(review): ``normalizeUnicode`` is not defined in this notebook;
    presumably it comes from ``from experiments.normalize import *`` - confirm.
    """
    # Write as UTF-8 explicitly so the file does not depend on the platform's
    # default encoding.
    with open('text_full_v12_base.txt', 'w', encoding='utf-8') as f:
        for sentence in tqdm(all_sentences):
            sentence = normalizeUnicode(sentence.strip(), normalize_nukta=False)
            f.write(f"{sentence}\n")

In [22]:
_lm_model_dw(new_all_sentences)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 89885456/89885456 [12:25<00:00, 120542.06it/s]


In [23]:
!kenlm/build/bin/lmplz -o 5 --prune 0 0 0 1 1 -S 60% < "text_full_v12_base.txt" > "5gram.arpa"

=== 1/5 Counting and sorting n-grams ===
Reading /media/benedikt/T7/text_full_v12_base.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 1231276442 types 8367842
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:100414104 2:7891496448 3:14796556288 4:23674488832 5:34525298688
Statistics:
1 8367842 D1=0.727727 D2=1.04105 D3+=1.29796
2 151674141 D1=0.757378 D2=1.0847 D3+=1.34373
3 514674041 D1=0.839338 D2=1.1802 D3+=1.36572
4 118883901/772172452 D1=0.904396 D2=1.31585 D3+=1.44292
5 96109758/845649270 D1=0.846464 D2=1.46157 D3+=1.73356
Memory estimate for binary LM:
type       MB
probing 19829 assuming -p 1.5
probing 24354 assuming -r models -p 1.5
trie    11423 without quantization
trie     6760 assuming -q 8 -b 8 quantization 
trie     9446 assuming -a 22 array pointer compression
trie     4783 assumin

In [24]:
# Copy 5gram.arpa to 5gram_correct.arpa with two changes:
#   * the unigram count in the "ngram 1=" header line is increased by one;
#   * the first line containing "<s>" is followed by a copy of itself in which
#     "<s>" is replaced with "</s>".
# All other lines are copied unchanged.
# NOTE(review): presumably this adds the end-of-sentence unigram the decoder
# expects - confirm against the pyctcdecode / transformers documentation.
# Both files are opened as UTF-8 explicitly so the copy does not depend on the
# platform's default encoding.
with open("5gram.arpa", "r", encoding="utf-8") as read_file, \
        open("5gram_correct.arpa", "w", encoding="utf-8") as write_file:
    has_added_eos = False
    for line in read_file:
        if not has_added_eos and "ngram 1=" in line:
            # Split at the last "=" and rebuild the line. A plain
            # str.replace(count, ...) would also rewrite matching digits in the
            # "ngram 1" part (e.g. "ngram 1=1" -> "ngram 2=2").
            key, count = line.strip().rsplit("=", 1)
            write_file.write(f"{key}={int(count) + 1}\n")
        elif not has_added_eos and "<s>" in line:
            write_file.write(line)
            write_file.write(line.replace("<s>", "</s>"))
            has_added_eos = True
        else:
            write_file.write(line)

In [25]:
!kenlm/build/bin/build_binary -S 60% 5gram_correct.arpa 5gram.binary

Reading /media/benedikt/T7/5gram_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
SUCCESS


In [26]:
from pyctcdecode import BeamSearchDecoderCTC
from pyctcdecode import build_ctcdecoder

from transformers import pipeline
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor,Wav2Vec2ProcessorWithLM
from transformers import Wav2Vec2ForCTC
from pyctcdecode import build_ctcdecoder
from bnunicodenormalizer import Normalizer 
import librosa
from jiwer import wer
from transformers import Wav2Vec2ProcessorWithLM,pipeline


# Load the processor (feature extractor + tokenizer) saved with the acoustic model.
processor = Wav2Vec2Processor.from_pretrained("../experiments/runs/wav2vec_indic_v35/processor")

# Tokenizer vocabulary ordered by token id (the dict value), so the label list
# handed to the decoder follows the id order.
vocab_dict = dict(sorted(processor.tokenizer.get_vocab().items(), key=lambda pair: pair[1]))

# CTC beam-search decoder backed by the corrected ARPA language model.
# alpha / beta are not passed, so build_ctcdecoder's defaults apply.
decoder = build_ctcdecoder(labels=list(vocab_dict), kenlm_model_path='5gram_correct.arpa')

# Re-wrap the same feature extractor and tokenizer together with the decoder.
processor = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder,
)

processor.save_pretrained("lms/new_model_arpa")

Loading the LM will be faster if you build a binary file.
Reading /media/benedikt/T7/5gram_correct.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
Unigrams and labels don't seem to agree.


In [27]:
from pyctcdecode import BeamSearchDecoderCTC
from pyctcdecode import build_ctcdecoder

from transformers import pipeline
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor,Wav2Vec2ProcessorWithLM
from transformers import Wav2Vec2ForCTC
from pyctcdecode import build_ctcdecoder
from bnunicodenormalizer import Normalizer 
import librosa
from jiwer import wer
from transformers import Wav2Vec2ProcessorWithLM,pipeline


# Load the processor (feature extractor + tokenizer) saved with the acoustic model.
processor = Wav2Vec2Processor.from_pretrained("../experiments/runs/wav2vec_indic_v35/processor")

# Tokenizer vocabulary ordered by token id (the dict value), so the label list
# handed to the decoder follows the id order.
vocab_dict = dict(sorted(processor.tokenizer.get_vocab().items(), key=lambda pair: pair[1]))

# CTC beam-search decoder backed by the binary KenLM model.
# alpha / beta are not passed, so build_ctcdecoder's defaults apply.
decoder = build_ctcdecoder(labels=list(vocab_dict), kenlm_model_path='5gram.binary')

# Re-wrap the same feature extractor and tokenizer together with the decoder.
processor = Wav2Vec2ProcessorWithLM(
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    decoder=decoder,
)

processor.save_pretrained("lms/new_model_bin_mixed")

Unigrams not provided and cannot be automatically determined from LM file (only arpa format). Decoding accuracy might be reduced.
Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
No known unigrams provided, decoding results might be a lot worse.
