In [2]:
# Importing the libraries

import re
from collections import Counter
from collections import defaultdict
from sklearn.metrics import precision_recall_fscore_support
file_path = "hi_100_1.txt"
with open(file_path, "r", encoding="utf-8") as file:
    corpus = file.read()

In [3]:
# Defining all the functions

hindi_vowels = ['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'अं', 'अः']
hindi_matras = ['ा', 'ि', 'ी', 'ु', 'ू', 'े', 'ै', 'ो', 'ौ', 'ं', 'ः', '्', 'ृ']
hindi_punctuation_symbols = "#,।,?!.:;‘’“”-…()▁ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"


character_map = {
    'अ': '',
    'आ': 'ा',
    'इ': 'ि',
    'ई': 'ी',
    'उ': 'ु',
    'ऊ': 'ू',
    'ए': 'े',
    'ऐ': 'ै',
    'ओ': 'ो',
    'औ': 'ौ',
    'अं': 'ं',
    'अः': 'ः'
}

# This function is used to extract words
def extract_words(text):
  pattern = r'[\u0900-\u097F]+'
  words = re.findall(pattern, text)
  return words

# Unicode correction function
def correction(s):

  new = []
  halant = '\u094D'
  for i in range(len(s)):

    if(s[i] == '.' or s[i] == ',' or s[i] == '?' or s[i] == '!' or s[i] == ':' or s[i] == ';' or s[i] == '"' or s[i] == "'" or s[i] == '–' or s[i] == '_' or s[i] == '(' or s[i] == ')' or s[i] == '[' or s[i] == ']' or s[i] == '/' or s[i] == '|' or s[i]== ' '):
      continue
    elif(s[i] in hindi_punctuation_symbols):
      continue
    elif(s[i] in hindi_vowels):
      new.append(s[i])
    elif(s[i]=='ा'):
        new.append('आ')
    elif(s[i]=='ि'):
        new.append('इ')
    elif(s[i]=='ी'):
        new.append('ई')
    elif(s[i]=='ु'):
        new.append('उ')
    elif(s[i]=='ू'):
        new.append('ऊ')
    elif(s[i]=='े'):
        new.append('ए')
    elif(s[i]=='ै'):
        new.append('ऐ')
    elif(s[i]=='ो'):
        new.append('ओ')
    elif(s[i]=='ौ'):
        new.append('औ')
    elif(s[i]=='ं'):
        new.append('अं')
    elif(s[i]=='ः'):
        new.append('अः')
    elif(s[i]=='ृ'):
        new.append('र' + halant)
    elif(s[i] == '्'):
      continue
    elif(i+1<len(s) and s[i+1] in hindi_matras):
      new.append(s[i] + halant)
    else:
      new.append(s[i] + halant)
      new.append('अ')
  return new

# Making function to store character in decreasing order of frequency
def frequency_decreasing(array, array_):
  element_freq = Counter(array_)
  sorted_elements = sorted(element_freq.items(), key=lambda x: x[1], reverse=True)
  element_map = {element: freq for element, freq in sorted_elements}
  return element_map

#This function makes bigram
def makebigram(array):
  bigram = []
  for i in range(len(array)):
    for j in range(len(array[i])-1):
      bigram.append(array[i][j] + array[i][j+1])
  return bigram


# This function make syllables
def make_syllable(k):
  new = []
  j=0
  while(j<len(k)):
    if(k[j] in hindi_vowels):
      while(j<len(k) and k[j] in hindi_vowels):
        new.append(k[j])
        j = j+1
    else:
      m = ''
      while(j<len(k) and k[j] not in hindi_vowels):
        m = m + k[j]
        j=j+1
      if(j>=len(k)):
        new.append(m[:-1])
      else:
        new.append(m[:-1] + character_map[k[j]])
      j=j+1
  return new

# This function calculates precision, recall and f1 score
def prf(ground_truth, tokens):

  precisions = []
  recalls = []
  f1_scores = []

  for ground_truth_sublist, model_sublist in zip(ground_truth, tokens):
      ground_truth_set = set(ground_truth_sublist)
      model_set = set(model_sublist)
      true_positives = len(ground_truth_set.intersection(model_set))
      false_positives = len(model_set - ground_truth_set)
      false_negatives = len(ground_truth_set - model_set)

      precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
      recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
      f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

      precisions.append(precision)
      recalls.append(recall)
      f1_scores.append(f1_score)

  average_precision = sum(precisions) / len(precisions)
  average_recall = sum(recalls) / len(recalls)
  average_f1_score = sum(f1_scores) / len(f1_scores)

  return (average_precision, average_recall, average_f1_score)

Question No 1

In [5]:
#Question 1

# words = ["श्रीराम", "संभावना", "मृत्युंजय", "अमरकन्टक", "ईमेल"]
words = ["मृत्युंजय"]

# words = extract_words(corpus)
corrected_unicode=[]

# Storing the corrected unicode in array of list
for i in range(len(words)):
  corrected_unicode.append(correction(words[i]))
print(corrected_unicode)

[['म्', 'र्', 'त्', 'य्', 'उ', 'अं', 'ज्', 'अ', 'य्', 'अ']]


Question No 2

In [18]:
#Question 2 : Unigram and Bigram of Characters

# character_array stores the converted list from array of list (array).
character_array = [char for sublist in corrected_unicode for char in sublist]
# character_bigram stores the bigrams of corrected_unicode
character_bigram = makebigram(corrected_unicode)
# array_map stores the decreasing order of unigram characters
corrected_unicode_map = frequency_decreasing(corrected_unicode, character_array)
# bigram_map stores the decreasing order of bigram characters
bigram_map = frequency_decreasing(character_bigram, character_bigram)

c = 20

# Printing Top 20 unigrams and bigrams characters
print("Top 20 Frequent Unigram Characters:")
for element, freq in corrected_unicode_map.items():
  if(c==0):
    break
  c = c-1
  print(f"{element}: {freq}")
c=20
print("Top 20 Frequent Bigram Characters:")
for element, freq in bigram_map.items():
  if(c==0):
    break
  c = c-1
  print(f"{element}: {freq}")

Top 20 Frequent Unigram Characters:
अ: 7287721
आ: 2991109
ए: 2318442
क्: 2219964
र्: 2140164
ई: 1460305
इ: 1432973
न्: 1334448
स्: 1283708
अं: 1201207
ह्: 1133159
म्: 1053237
त्: 980066
ल्: 919917
ओ: 896588
प्: 805896
य्: 752819
व्: 624743
द्: 607633
उ: 587149
Top 20 Frequent Bigram Characters:
र्अ: 1173271
अर्: 792442
क्अ: 619365
स्अ: 518851
न्अ: 515849
अन्: 437698
क्ए: 407130
प्अ: 405662
अह्: 390412
आर्: 368512
एअं: 359377
अक्: 355687
त्अ: 353275
ल्अ: 333314
न्ए: 328953
म्अ: 324938
क्आ: 314321
अत्: 308837
य्आ: 297778
ह्ऐ: 297199


In [19]:
# Question 3 : Making Syllable

character_map = {
    'अ': '',
    'आ': 'ा',
    'इ': 'ि',
    'ई': 'ी',
    'उ': 'ु',
    'ऊ': 'ू',
    'ए': 'े',
    'ऐ': 'ै',
    'ओ': 'ो',
    'औ': 'ौ',
    'अं': 'ं',
    'अः': 'ः'
}


syllable = []

# Storing Syllables in array of list (syllable)
for i in range(len(corrected_unicode)):
  syllable.append(make_syllable(corrected_unicode[i]))
print(syllable)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [26]:
# Unigram and Bigram of Syllable

# syllable_array stores the converted list from array of list (array).
syllable_array = [char for sublist in syllable for char in sublist]

# syllable_bigram stores the bigrams of syllable
syllable_bigram = makebigram(syllable)

#syllable_map stores the decreasing order frequency of syllable
syllable_map = frequency_decreasing(syllable, syllable_array)

# syllable_bigram_map stores the decreasing order frequency of syllable_bigram
syllable_bigram_map = frequency_decreasing(syllable_bigram, syllable_bigram)

c = 20
# Printing Top 20 Unigram and Bigram Syllables
print("Top 20 Frequent Unigram Syllables:")
for element, freq in syllable_map.items():
  if(c==0):
    break;
  c=c-1
  print(f"{element}: {freq}")

c=20
print("Top 20 Frequent Bigram Syllables:")
for element, freq in syllable_bigram_map.items():
  if(c==0):
    break;
  c=c-1

  print(f"{element}: {freq}")

Top 20 Frequent Unigram Syllables:
र: 1008411
अं: 988234
क: 609542
न: 507657
स: 494360
के: 405005
प: 392089
ल: 330146
ने: 328102
का: 307954
त: 306238
है: 297019
म: 295118
मे: 292706
ए: 288662
ह: 278771
अ: 253201
ब: 247540
की: 235320
ग: 226337
Top 20 Frequent Bigram Syllables:
मेअं: 259224
कर: 160043
और: 115671
पर: 99665
इस: 82886
हैअं: 80715
हीअं: 55632
एक: 54576
लिए: 54024
नही: 49367
अप: 45157
कार: 39111
किया: 37430
योअं: 34861
रने: 34496
कहा: 33135
यह: 31419
गया: 30278
सर: 30119
उन: 29754


In [20]:
!pip3 install sentencepiece
!pip3 install torch torchvision torchaudio
!pip install tensorflow
!pip install tensorflow-gpu



Collecting tensorflow-gpu
  Using cached tensorflow-gpu-2.12.0.tar.gz (2.6 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[44 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "/Users/tanmaydubey/anaconda3/lib/python3.11/site-packages/setuptools/_vendor/packaging/requirements.py", line 35, in __init__
  [31m   [0m     parsed = _parse_requirement(requirement_string)
  [31m   [0m              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  [31m   [0m   File "/Users/tanmaydubey/anaconda3/lib/python3.11/site-packages/setuptools/_vendor/packaging/_parser.py", line 64, in parse_requirement
  [31m   [0m     return _parse_requirement(Tokenizer(source, rules=DEFAULT_RULES))
  [31m   [0m            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  

Question No 4

In [21]:
# Question 4

import sentencepiece as spm
from transformers import BertTokenizer
from transformers import AutoModel, AutoTokenizer
import torch

corpus_file = "hi_100_1.txt"

#Training the Unigram model
spm_model_file = "unigram(1).model"
spm.SentencePieceTrainer.train(input=corpus_file, model_prefix=spm_model_file, model_type='unigram')

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: hi_100_1.txt
  input_format: 
  model_prefix: unigram(1).model
  model_type: UNIGRAM
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise

In [29]:
#Training the bpe model

spm_model_file1 = "bpe(1).model"
spm.SentencePieceTrainer.train(input=corpus_file, model_prefix=spm_model_file1, model_type='bpe', vocab_size=1000)
spm_model_file2 = "bpe(2).model"
spm.SentencePieceTrainer.train(input=corpus_file, model_prefix=spm_model_file2, model_type='bpe', vocab_size=2000)

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: hi_100_1.txt
  input_format: 
  model_prefix: bpe(1).model
  model_type: BPE
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_level: 

In [30]:
#mbert

tokenizer_mbert1 = BertTokenizer.from_pretrained('bert-base-multilingual-cased' , max_length=1000, truncation=True)
tokenizer_mbert2 = BertTokenizer.from_pretrained('bert-base-multilingual-cased' , max_length=2000, truncation=True)

del_trainer.cc(268) LOG(INFO) Added: freq=28955 size=140 all=18442 active=4713 piece=न्ह
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=24411 size=160 all=20405 active=6676 piece=▁राज
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=21882 size=180 all=22297 active=8568 piece=▁सर
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=19406 size=200 all=24144 active=10415 piece=▁चु
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=19305 min_freq=2919
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=17837 size=220 all=26375 active=3387 piece=गर
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=16215 size=240 all=27984 active=4996 piece=▁जी
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=14802 size=260 all=29833 active=6845 piece=रो
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=13436 size=280 all=31773 active=8785 piece=▁उप
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=12360 size=300 all=32981 active=9993 piece=▁पुलिस
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbol

In [31]:
# Generating tokens from unigram model

#loading the pretrained unigram model
spm_tokenizer = spm.SentencePieceProcessor()
spm_tokenizer.load(spm_model_file + ".model")

unigram_tokens = []
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens = spm_tokenizer.encode_as_pieces(line.strip())
        unigram_tokens.append(line_tokens)

unigram_tokens = [[word.replace('▁', '') for word in sublist] for sublist in unigram_tokens]
# print("Tokens:", unigram_tokens, "\n")

In [32]:
# storing unigram tokens and frequency of unigram tokens
unigram_tokens_array = [char for sublist in unigram_tokens for char in sublist]
unigram_tokens_map = frequency_decreasing(unigram_tokens, unigram_tokens_array)

array_unigram =[]

# Storing the corrected unicode in array of list
for i in range(len(unigram_tokens)):
  array_unigram.append(correction(unigram_tokens[i]))

print("Top 20 frequent unigram bigram characters")
count=0
for element, freq in unigram_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break
# storing characters and bigram characters for unigram token
unigram_array = [char for sublist in array_unigram for char in sublist]
unigram_bigram = makebigram(array_unigram)
unigram_array_map = frequency_decreasing(array_unigram, unigram_array)
unigram_bigram_map = frequency_decreasing(unigram_bigram, unigram_bigram)

count=0
print("Top 20 frequent unigram tokens")
for element, freq in unigram_tokens_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break
print("Top 20 frequent unigram characters")
count=0
for element, freq in unigram_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break
print("Top 20 frequent unigram bigram characters")
count=0
for element, freq in unigram_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

Top 20 frequent unigram tokens
के: 328232
।: 276509
में: 240924
है: 215430
की: 200481
,: 187662
को: 155131
से: 147523
ने: 127318
का: 126932
.: 126589
और: 115832
पर: 97696
कि: 83341
हैं: 80190
ी: 68624
भी: 65963
कर: 64756
न: 62018
ों: 58423
Top 20 frequent unigram characters
अ: 9533555
के्: 328232
में्: 240924
है्: 215430
की्: 200481
को्: 155131
से्: 147523
ने्: 127318
का्: 126932
और्: 115832
पर्: 97696
कि्: 83341
ई: 81703
हैं्: 80190
भी्: 65963
कर्: 64756
न्: 62018
आ: 60828
ों्: 58423
एक्: 54601
Top 20 frequent unigram bigram characters
के्अ: 328144
अके्: 319878
में्अ: 240848
अमें्: 236115
है्अ: 215244
अहै्: 212427
की्अ: 200345
अकी्: 195729
को्अ: 155082
अको्: 150607
से्अ: 147438
असे्: 143507
ने्अ: 127127
का्अ: 126080
अने्: 122790
अका्: 122166
और्अ: 115817
अऔर्: 111071
पर्अ: 96338
अपर्: 95236


In [33]:
# Generating unigram syllable

unigram_syllable = []
for i in range(len(array_unigram)):
  unigram_syllable.append(make_syllable(array_unigram[i]))

# storing syllable and bigram syllable for unigram token
unigram_syllable_array = [char for sublist in unigram_syllable for char in sublist]
unigram_syllable_bigram = makebigram(unigram_syllable)
unigram_syllable_array_map = frequency_decreasing(unigram_syllable, unigram_syllable_array)
unigram_syllable_bigram_map = frequency_decreasing(unigram_syllable_bigram, unigram_syllable_bigram)

print("Top 20 frequent unigram syllable")
count =0
for element, freq in unigram_syllable_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break
print("Top 20 frequent unigram syllable bigram")
count =0
for element, freq in unigram_syllable_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

Top 20 frequent unigram syllable
के: 328101
में: 240827
है: 215240
की: 200325
को: 155018
से: 147428
ने: 127048
का: 126065
और: 115810
पर: 96326
कि: 83187
हैं: 80248
भी: 65950
कर: 63361
न: 60719
ों: 58380
एक: 53760
इस: 50400
लिए: 49269
नहीं: 47367
Top 20 frequent unigram syllable bigram
केलिए: 43527
हैकि: 25213
केसाथ: 18386
कहाकि: 16143
केबाद: 14510
रहाहै: 12017
नेकहा: 11913
हैऔर: 11139
गयाहै: 11052
ोंके: 10859
रहेहैं: 10178
ोंमें: 8908
करनेके: 8850
रहीहै: 8753
ोंको: 8419
जाताहै: 7335
ोंकी: 7247
कियागया: 6892
सकताहै: 6516
नहींहै: 6508


In [22]:
# generating tokens from whitespace tokenizer
vocab = set()
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        tokens = line.strip().split()
        vocab.update(tokens)

def tokenize_with_whitespace(text):
    return text.split()

whitespace_tokens = []
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens = tokenize_with_whitespace(line.strip())
        whitespace_tokens.append(line_tokens)

# print("Corpus tokens:", whitespace_tokens)


In [23]:

whitespace_tokens_array = [char for sublist in whitespace_tokens for char in sublist]
whitespace_tokens_map = frequency_decreasing(whitespace_tokens, whitespace_tokens_array)

array_whitespace =[]

# Storing the corrected unicode in array of list
for i in range(len(whitespace_tokens)):
  array_whitespace.append(correction(whitespace_tokens[i]))

# storing characters and character bigram from whitespace tokens
whitespace_array = [char for sublist in array_whitespace for char in sublist]
whitespace_bigram = makebigram(array_whitespace)
whitespace_array_map = frequency_decreasing(array_whitespace, whitespace_array)
whitespace_bigram_map = frequency_decreasing(whitespace_bigram,whitespace_bigram)

print("Top 20 frequent whitespace tokens")
count=0
for element, freq in whitespace_tokens_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break
print("Top 20 frequent whitespace characters")
count=0
for element, freq in whitespace_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent whitespace bigram characters")
count=0
for element, freq in whitespace_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

Top 20 frequent whitespace tokens
के: 316314
में: 237426
की: 189769
को: 145266
से: 136979
और: 114563
का: 109743
ने: 102378
पर: 88560
है।: 88260
कि: 76741
है: 72656
भी: 63920
एक: 49279
लिए: 47955
इस: 47558
कर: 44563
नहीं: 44168
ही: 40104
तो: 33469
Top 20 frequent whitespace characters
अ: 7979923
के्: 316314
में्: 237426
की्: 189769
को्: 145266
से्: 136979
और्: 114563
का्: 109743
ने्: 102378
पर्: 88560
है।्: 88260
कि्: 76741
है्: 72656
भी्: 63920
एक्: 49279
लिए्: 47955
इस्: 47558
कर्: 44563
नहीं्: 44168
ही्: 40104
Top 20 frequent whitespace bigram characters
के्अ: 316314
अके्: 316168
में्अ: 237426
अमें्: 237351
की्अ: 189769
अकी्: 189711
को्अ: 145265
अको्: 145224
से्अ: 136978
असे्: 136926
और्अ: 114562
अऔर्: 113826
का्अ: 109743
अका्: 109702
ने्अ: 102378
अने्: 102357
पर्अ: 88560
अपर्: 88270
अहै।्: 88260
है।्अ: 88259


In [24]:

whitespace_syllable = []
for i in range(len(array_whitespace)):
  whitespace_syllable.append(make_syllable(array_whitespace[i]))

# storing syllable and bigram syllable
whitespace_syllable_array = [char for sublist in whitespace_syllable for char in sublist]
whitespace_syllable_bigram = makebigram(whitespace_syllable)
whitespace_syllable_array_map = frequency_decreasing(whitespace_syllable, whitespace_syllable_array)
whitespace_syllable_bigram_map = frequency_decreasing(whitespace_syllable_bigram, whitespace_syllable_bigram)

print("Top 20 frequent whitespace syllable")
count =0
for element, freq in whitespace_syllable_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent whitespace syllable bigram")
count =0
for element, freq in whitespace_syllable_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

Top 20 frequent whitespace syllable
के: 316315
में: 237429
की: 189781
को: 145285
से: 136979
और: 114562
का: 109752
ने: 102378
पर: 88560
है।: 88259
कि: 76742
है: 72651
भी: 63920
एक: 49279
लिए: 47955
इस: 47558
कर: 44563
नहीं: 44168
ही: 40104
तो: 33467
Top 20 frequent whitespace syllable bigram
केलिए: 42449
हैकि: 24818
केसाथ: 17075
कहाकि: 15922
केबाद: 14163
हैऔर: 10176
नेकहा: 9098
करनेके: 8826
बतायाकि: 6393
कोलेकर: 5977
गयाहै।: 5678
रहाहै।: 5483
केखिलाफ: 5312
केदौरान: 5162
केबीच: 5117
बारेमें: 5059
करतेहुए: 4817
रहेहैं।: 4701
मेंभी: 4682
कररहे: 4637


In [37]:
# bpe(1k)

# Loading the pretrained SentencePiece model
spm_tokenizer1 = spm.SentencePieceProcessor()
spm_tokenizer1.load(spm_model_file2 + ".model")

# Tokenizing text using the pretrained SentencePiece model
bpe1_tokens = []
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens = spm_tokenizer1.encode_as_pieces(line.strip())
        bpe1_tokens.append(line_tokens)

bpe1_tokens = [[word.replace('▁', '') for word in sublist] for sublist in bpe1_tokens]

# print("Tokens:", bpe1_tokens, "\n")

In [38]:
bpe1_tokens_array = [char for sublist in bpe1_tokens for char in sublist]
bpe1_tokens_map = frequency_decreasing(bpe1_tokens, bpe1_tokens_array)

array_bpe1 =[]

# Storing the corrected unicode in array of list
for i in range(len(bpe1_tokens)):
  array_bpe1.append(correction(bpe1_tokens[i]))
# print(array_bpe1)

# generating characters and bigram characters map
bpe1_array = [char for sublist in array_bpe1 for char in sublist]
bpe1_bigram = makebigram(array_bpe1)
bpe1_array_map = frequency_decreasing(array_bpe1, bpe1_array)
bpe1_bigram_map = frequency_decreasing(bpe1_bigram, bpe1_bigram)

print("Top 20 frequent bpe1 tokens")
count=0
for element, freq in bpe1_tokens_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent bpe1 characters")
count =0
for element, freq in bpe1_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent bpe1 bigram characters")
count =0
for element, freq in bpe1_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

Top 20 frequent bpe1 tokens
के: 344230
।: 276966
में: 245494
है: 215519
की: 214647
,: 189721
ने: 178508
को: 164642
से: 161960
का: 147974
क: 133331
.: 128642
न: 124503
और: 115365
म: 114131
स: 111941
पर: 105525
प: 99336
व: 96389
ल: 95954
Top 20 frequent bpe1 characters
अ: 12231325
के्: 344230
में्: 245494
है्: 215519
की्: 214647
ने्: 178508
को्: 164642
से्: 161960
का्: 147974
क्: 133331
न्: 124503
और्: 115365
म्: 114131
स्: 111941
पर्: 105525
आ: 103570
प्: 99336
व्: 96389
ल्: 95954
र्: 94915
Top 20 frequent bpe1 bigram characters
के्अ: 344183
अके्: 331989
में्अ: 245417
अमें्: 240224
है्अ: 215418
की्अ: 214455
अहै्: 211637
अकी्: 206486
ने्अ: 178474
अने्: 170637
को्अ: 164620
से्अ: 161938
अको्: 157836
असे्: 155233
का्अ: 146247
अका्: 141103
क्अ: 128022
अक्: 119440
न्अ: 117931
और्अ: 115333


In [39]:

bpe1_syllable = []

# Storing Syllables in array of list (syllable)
for i in range(len(array_bpe1)):
  bpe1_syllable.append(make_syllable(array_bpe1[i]))
# storing syllable and bigram syllable
bpe1_syllable_array = [char for sublist in bpe1_syllable for char in sublist]
bpe1_syllable_bigram = makebigram(bpe1_syllable)
bpe1_syllable_array_map = frequency_decreasing(bpe1_syllable, bpe1_syllable_array)
bpe1_syllable_bigram_map = frequency_decreasing(bpe1_syllable_bigram, bpe1_syllable_bigram)

print("Top 20 frequent bpe1 syllable")
count =0
for element, freq in bpe1_syllable_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent bpe1 syllable bigram")
count =0
for element, freq in bpe1_syllable_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

Top 20 frequent bpe1 syllable
के: 343997
में: 245409
है: 216118
की: 214227
ने: 178414
को: 164568
से: 161904
का: 146105
क: 126772
न: 117230
और: 115327
म: 108427
स: 106006
पर: 104157
कि: 92752
व: 92328
प: 91697
ल: 90946
त: 89628
कर: 87811
Top 20 frequent bpe1 syllable bigram
के: 343997
में: 245409
है: 216118
की: 214227
ने: 178414
को: 164568
से: 161904
का: 146105
क: 126772
न: 117230
और: 115327
म: 108427
स: 106006
पर: 104157
कि: 92752
व: 92328
प: 91697
ल: 90946
त: 89628
कर: 87811


In [40]:
# bpe(2k)

# Loading the pretrained SentencePiece model
spm_tokenizer2 = spm.SentencePieceProcessor()
spm_tokenizer2.load(spm_model_file + ".model")

# Tokenizing text using the pretrained SentencePiece model
bpe2_tokens = []
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens = spm_tokenizer2.encode_as_pieces(line.strip())
        bpe2_tokens.append(line_tokens)

bpe2_tokens = [[word.replace('▁', '') for word in sublist] for sublist in bpe2_tokens]

# print("Tokens:", bpe2_tokens, "\n")

In [41]:
bpe2_tokens_array = [char for sublist in bpe2_tokens for char in sublist]
bpe2_tokens_map = frequency_decreasing(bpe2_tokens, bpe2_tokens_array)

array_bpe2 =[]

# Storing the corrected unicode in array of list
for i in range(len(bpe2_tokens)):
  array_bpe2.append(correction(bpe2_tokens[i]))

# storing character and bigram character map
bpe2_array = [char for sublist in array_bpe2 for char in sublist]
bpe2_bigram = makebigram(array_bpe2)
bpe2_array_map = frequency_decreasing(array_bpe2, bpe2_array)
bpe2_bigram_map = frequency_decreasing(bpe2_bigram, bpe2_bigram)

print("Top 20 frequent bpe2 tokens")
count=0
for element, freq in bpe2_tokens_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent bpe2 characters")
count =0
for element, freq in bpe2_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent bpe2 bigram characters")
count =0
for element, freq in bpe2_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

Top 20 frequent bpe2 tokens
के: 328232
।: 276509
में: 240924
है: 215430
की: 200481
,: 187662
को: 155131
से: 147523
ने: 127318
का: 126932
.: 126589
और: 115832
पर: 97696
कि: 83341
हैं: 80190
ी: 68624
भी: 65963
कर: 64756
न: 62018
ों: 58423
Top 20 frequent bpe2 characters
अ: 9533555
के्: 328232
में्: 240924
है्: 215430
की्: 200481
को्: 155131
से्: 147523
ने्: 127318
का्: 126932
और्: 115832
पर्: 97696
कि्: 83341
ई: 81703
हैं्: 80190
भी्: 65963
कर्: 64756
न्: 62018
आ: 60828
ों्: 58423
एक्: 54601
Top 20 frequent bpe2 bigram characters
के्अ: 328144
अके्: 319878
में्अ: 240848
अमें्: 236115
है्अ: 215244
अहै्: 212427
की्अ: 200345
अकी्: 195729
को्अ: 155082
अको्: 150607
से्अ: 147438
असे्: 143507
ने्अ: 127127
का्अ: 126080
अने्: 122790
अका्: 122166
और्अ: 115817
अऔर्: 111071
पर्अ: 96338
अपर्: 95236


In [42]:
bpe2_syllable = []

# Storing Syllables in array of list (syllable)
for i in range(len(array_bpe2)):
  bpe2_syllable.append(make_syllable(array_bpe2[i]))
# storing syllable and bigram syllable map
bpe2_syllable_array = [char for sublist in bpe2_syllable for char in sublist]
bpe2_syllable_bigram = makebigram(bpe2_syllable)
bpe2_syllable_array_map = frequency_decreasing(bpe2_syllable, bpe2_syllable_array)
bpe2_syllable_bigram_map = frequency_decreasing(bpe2_syllable_bigram, bpe2_syllable_bigram)

print("Top 20 frequent bpe2 syllable")
count =0
for element, freq in bpe2_syllable_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent bpe2 syllable bigram")
count =0
for element, freq in bpe2_syllable_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

Top 20 frequent bpe2 syllable
के: 328101
में: 240827
है: 215240
की: 200325
को: 155018
से: 147428
ने: 127048
का: 126065
और: 115810
पर: 96326
कि: 83187
हैं: 80248
भी: 65950
कर: 63361
न: 60719
ों: 58380
एक: 53760
इस: 50400
लिए: 49269
नहीं: 47367
Top 20 frequent bpe2 syllable bigram
केलिए: 43527
हैकि: 25213
केसाथ: 18386
कहाकि: 16143
केबाद: 14510
रहाहै: 12017
नेकहा: 11913
हैऔर: 11139
गयाहै: 11052
ोंके: 10859
रहेहैं: 10178
ोंमें: 8908
करनेके: 8850
रहीहै: 8753
ोंको: 8419
जाताहै: 7335
ोंकी: 7247
कियागया: 6892
सकताहै: 6516
नहींहै: 6508


In [10]:
#mbert(1k)

# Tokenizing text from the corpus file
mbert1_tokens = []
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens = tokenizer_mbert1.tokenize(line.strip())
        mbert1_tokens.append(line_tokens)

mbert1_tokens = [[word.replace('#', '') for word in sublist] for sublist in mbert1_tokens]

# print("Tokens:", mbert1_tokens)

NameError: name 'corpus_file' is not defined

In [None]:
mbert1_tokens_array = [char for sublist in mbert1_tokens for char in sublist]
mbert1_tokens_map = frequency_decreasing(mbert1_tokens, mbert1_tokens_array)

array_mbert1 =[]

# Storing the corrected unicode in array of list
for i in range(len(mbert1_tokens)):
  array_mbert1.append(correction(mbert1_tokens[i]))

# storing character and bigram character map
mbert1_array = [char for sublist in array_mbert1 for char in sublist]
mbert1_bigram = makebigram(array_mbert1)
mbert1_array_map = frequency_decreasing(array_mbert1, mbert1_array)
mbert1_bigram_map = frequency_decreasing(mbert1_bigram, mbert1_bigram)

print("Top 20 frequent mbert1 tokens")
count=0
for element, freq in mbert1_tokens_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent mbert1 characters")
count =0
for element, freq in mbert1_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent mbert1 bigram characters")
count =0
for element, freq in mbert1_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

In [None]:
mbert1_syllable = []

# Storing Syllables in array of list (syllable)
for i in range(len(array_mbert1)):
  mbert1_syllable.append(make_syllable(array_mbert1[i]))

# storing syllable and bigram syllable map
mbert1_syllable_array = [char for sublist in mbert1_syllable for char in sublist]
mbert1_syllable_bigram = makebigram(mbert1_syllable)
mbert1_syllable_array_map = frequency_decreasing(mbert1_syllable, mbert1_syllable_array)
mbert1_syllable_bigram_map = frequency_decreasing(mbert1_syllable_bigram, mbert1_syllable_bigram)

print("Top 20 frequent mbert1 syllable")
count =0
for element, freq in mbert1_syllable_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent mbert1 syllable bigram")
count =0
for element, freq in mbert1_syllable_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

In [None]:
#mbert(2k)

# Tokenizing text from the corpus file
mbert2_tokens = []
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens = tokenizer_mbert2.tokenize(line.strip())
        mbert2_tokens.append(line_tokens)

mbert2_tokens = [[word.replace('#', '') for word in sublist] for sublist in mbert2_tokens]

# print("Tokens:", mbert2_tokens)

In [None]:
mbert2_tokens_array = [char for sublist in mbert2_tokens for char in sublist]
mbert2_tokens_map = frequency_decreasing(mbert2_tokens, mbert2_tokens_array)


array_mbert2 =[]

# Storing the corrected unicode in array of list
for i in range(len(mbert2_tokens)):
  array_mbert2.append(correction(mbert2_tokens[i]))

# storing character and bigram character map
mbert2_array = [char for sublist in array_mbert2 for char in sublist]
mbert2_bigram = makebigram(array_mbert2)
mbert2_array_map = frequency_decreasing(array_mbert2, mbert2_array)
mbert2_bigram_map = frequency_decreasing(mbert2_bigram, mbert2_bigram)

print("Top 20 frequent mbert2 tokens")
count=0
for element, freq in mbert2_tokens_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent mbert2 characters")
count =0
for element, freq in mbert2_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent mbert2 bigram characters")
count =0
for element, freq in mbert2_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break


In [None]:
mbert2_syllable = []

# Storing Syllables in array of list (syllable)
for i in range(len(array_mbert2)):
  mbert2_syllable.append(make_syllable(array_mbert2[i]))

# storing syllable and bigram syllable map
mbert2_syllable_array = [char for sublist in mbert2_syllable for char in sublist]
mbert2_syllable_bigram = makebigram(mbert2_syllable)
mbert2_syllable_array_map = frequency_decreasing(mbert2_syllable, mbert2_syllable_array)
mbert2_syllable_bigram_map = frequency_decreasing(mbert2_syllable_bigram, mbert2_syllable_bigram)

print("Top 20 frequent mbert2 syllable")
count =0
for element, freq in mbert2_syllable_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent mbert2 syllable bigram")
count =0
for element, freq in mbert2_syllable_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

In [None]:
#indicbert
import torch
from transformers import AutoModel, AutoTokenizer
indic_model = AutoModel.from_pretrained('ai4bharat/indic-bert')
tokenizer_indicbert1 = AutoTokenizer.from_pretrained('ai4bharat/indic-bert',max_length=1000, truncation=True)
tokenizer_indicbert2 = AutoTokenizer.from_pretrained('ai4bharat/indic-bert', max_length=2000, truncation=True)

In [None]:
#indic_bert(1k)

indicbert1_tokens = []
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens1 = tokenizer_indicbert1.tokenize(line.strip())
        indicbert1_tokens.append(line_tokens1)

indicbert1_tokens = [[word.replace('_', '') for word in sublist] for sublist in indicbert1_tokens]

# for i in range(500):
#print("Tokenized Output:", indicbert1_tokens[i])

In [None]:
indicbert1_tokens_array = [char for sublist in indicbert1_tokens for char in sublist]
indicbert1_tokens_map = frequency_decreasing(indicbert1_tokens, indicbert1_tokens_array)


array_indicbert1 =[]

# Storing the corrected unicode in array of list
for i in range(len(indicbert1_tokens)):
  array_indicbert1.append(correction(indicbert1_tokens[i]))

# storing character and bigram character map

indicbert1_array = [char for sublist in array_indicbert1 for char in sublist]
indicbert1_bigram = makebigram(array_indicbert1)
indicbert1_array_map = frequency_decreasing(array_indicbert1, indicbert1_array)
indicbert1_bigram_map = frequency_decreasing(indicbert1_bigram, indicbert1_bigram)

print("Top 20 frequent indicbert1 tokens")
count=0
for element, freq in indicbert1_tokens_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent indicbert1 characters")
count =0
for element, freq in indicbert1_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent indicbert1 bigram characters")
count =0
for element, freq in indicbert1_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

In [None]:
indicbert1_syllable = []

# Storing Syllables in array of list (syllable)
for i in range(len(array_indicbert1)):
  indicbert1_syllable.append(make_syllable(array_indicbert1[i]))

# storing syllable and bigram syllable map
indicbert1_syllable_array = [char for sublist in indicbert1_syllable for char in sublist]
indicbert1_syllable_bigram = makebigram(indicbert1_syllable)
indicbert1_syllable_array_map = frequency_decreasing(indicbert1_syllable, indicbert1_syllable_array)
indicbert1_syllable_bigram_map = frequency_decreasing(indicbert1_syllable_bigram, indicbert1_syllable_bigram)

print("Top 20 frequent indicbert1 syllable")
count =0
for element, freq in indicbert1_syllable_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent indicbert1 syllable bigram")
count =0
for element, freq in indicbert1_syllable_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

In [None]:
#indic_bert(2k)

indicbert2_tokens = []

with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens2 = tokenizer_indicbert2.tokenize(line.strip())
        indicbert2_tokens.append(line_tokens2)

indicbert2_tokens = [[word.replace('#', '') for word in sublist] for sublist in indicbert2_tokens]


In [None]:
indicbert2_tokens_array = [char for sublist in indicbert2_tokens for char in sublist]
indicbert2_tokens_map = frequency_decreasing(indicbert2_tokens, indicbert2_tokens_array)

array_indicbert2 =[]

# Storing the corrected unicode in array of list
for i in range(len(indicbert2_tokens)):
  array_indicbert2.append(correction(indicbert2_tokens[i]))

# storing character and bigram character map
indicbert2_array = [char for sublist in array_indicbert2 for char in sublist]
indicbert2_bigram = makebigram(array_indicbert2)
indicbert2_array_map = frequency_decreasing(array_indicbert2, indicbert2_array)
indicbert2_bigram_map = frequency_decreasing(indicbert2_bigram, indicbert2_bigram)

print("Top 20 frequent indicbert2 tokens")
count=0
for element, freq in indicbert2_tokens_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent indicbert2 characters")
count =0
for element, freq in indicbert2_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent indicbert2 bigram characters")
count =0
for element, freq in indicbert2_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

In [None]:
indicbert2_syllable = []

# Storing Syllables in array of list (syllable)
for i in range(len(array_indicbert2)):
  indicbert2_syllable.append(make_syllable(array_indicbert2[i]))

# storing syllable and bigram syllable map
indicbert2_syllable_array = [char for sublist in indicbert2_syllable for char in sublist]
indicbert2_syllable_bigram = makebigram(indicbert2_syllable)
indicbert2_syllable_array_map = frequency_decreasing(indicbert2_syllable, indicbert2_syllable_array)
indicbert2_syllable_bigram_map = frequency_decreasing(indicbert2_syllable_bigram, indicbert2_syllable_bigram)

print("Top 20 frequent indicbert2 syllable")
count =0
for element, freq in indicbert2_syllable_array_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break

print("Top 20 frequent indicbert2 syllable bigram")
count =0
for element, freq in indicbert2_syllable_bigram_map.items():
    print(f"{element}: {freq}")
    count += 1
    if count == 20:
        break