In [17]:
hindi_vowels = ['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'अं', 'अः']
hindi_matras = ['ा', 'ि', 'ी', 'ु', 'ू', 'े', 'ै', 'ो', 'ौ', 'ं', 'ः', '्', 'ृ']
hindi_punctuation_symbols = "#,।,?!.:;‘’“”-…()▁ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"

character_map = {
    'अ': '',
    'आ': 'ा',
    'इ': 'ि',
    'ई': 'ी',
    'उ': 'ु',
    'ऊ': 'ू',
    'ए': 'े',
    'ऐ': 'ै',
    'ओ': 'ो',
    'औ': 'ौ',
    'अं': 'ं',
    'अः': 'ः'
}

# This function is used to extract words
def extract_words(text):
  pattern = r'[\u0900-\u097F]+'
  words = re.findall(pattern, text)
  return words

# Unicode correction function
def correction(s):

  new = []
  halant = '\u094D'
  for i in range(len(s)):

    if(s[i] == '.' or s[i] == ',' or s[i] == '?' or s[i] == '!' or s[i] == ':' or s[i] == ';' or s[i] == '"' or s[i] == "'" or s[i] == '–' or s[i] == '_' or s[i] == '(' or s[i] == ')' or s[i] == '[' or s[i] == ']' or s[i] == '/' or s[i] == '|' or s[i]== ' '):
      continue
    elif(s[i] in hindi_punctuation_symbols):
      continue
    elif(s[i] in hindi_vowels):
      new.append(s[i])
    elif(s[i]=='ा'):
        new.append('आ')
    elif(s[i]=='ि'):
        new.append('इ')
    elif(s[i]=='ी'):
        new.append('ई')
    elif(s[i]=='ु'):
        new.append('उ')
    elif(s[i]=='ू'):
        new.append('ऊ')
    elif(s[i]=='े'):
        new.append('ए')
    elif(s[i]=='ै'):
        new.append('ऐ')
    elif(s[i]=='ो'):
        new.append('ओ')
    elif(s[i]=='ौ'):
        new.append('औ')
    elif(s[i]=='ं'):
        new.append('अं')
    elif(s[i]=='ः'):
        new.append('अः')
    elif(s[i]=='ृ'):
        new.append('र' + halant)
    elif(s[i] == '्'):
      continue
    elif(i+1<len(s) and s[i+1] in hindi_matras):
      new.append(s[i] + halant)
    else:
      new.append(s[i] + halant)
      new.append('अ')
  return new

# Making function to store character in decreasing order of frequency
def frequency_decreasing(array, array_):
  element_freq = Counter(array_)
  sorted_elements = sorted(element_freq.items(), key=lambda x: x[1], reverse=True)
  element_map = {element: freq for element, freq in sorted_elements}
  return element_map

#This function makes bigram
def makebigram(array):
  bigram = []
  for i in range(len(array)):
    for j in range(len(array[i])-1):
      bigram.append(array[i][j] + array[i][j+1])
  return bigram


# This function make syllables
def make_syllable(k):
  new = []
  j=0
  while(j<len(k)):
    if(k[j] in hindi_vowels):
      while(j<len(k) and k[j] in hindi_vowels):
        new.append(k[j])
        j = j+1
    else:
      m = ''
      while(j<len(k) and k[j] not in hindi_vowels):
        m = m + k[j]
        j=j+1
      if(j>=len(k)):
        new.append(m[:-1])
      else:
        new.append(m[:-1] + character_map[k[j]])
      j=j+1
  return new

# This function calculates precision, recall and f1 score
def prf(ground_truth, tokens):

  precisions = []
  recalls = []
  f1_scores = []

  for ground_truth_sublist, model_sublist in zip(ground_truth, tokens):
      ground_truth_set = set(ground_truth_sublist)
      model_set = set(model_sublist)

      true_positives = len(ground_truth_set.intersection(model_set))
      false_positives = len(model_set - ground_truth_set)
      false_negatives = len(ground_truth_set - model_set)

      precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
      recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
      f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
      precisions.append(precision)
      recalls.append(recall)
      f1_scores.append(f1_score)

  average_precision = sum(precisions) / len(precisions)
  average_recall = sum(recalls) / len(recalls)
  average_f1_score = sum(f1_scores) / len(f1_scores)

  return (average_precision, average_recall, average_f1_score)

In [18]:
# Question 4

import sentencepiece as spm
from transformers import BertTokenizer
from transformers import AutoModel, AutoTokenizer

corpus_file = "ques5.txt"

#Unigram
spm_model_file = "unigram(1).model"
spm.SentencePieceTrainer.train(input=corpus_file, model_prefix=spm_model_file, model_type='unigram', vocab_size=200)

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ques5.txt
  input_format: 
  model_prefix: unigram(1).model
  model_type: UNIGRAM
  vocab_size: 200
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_lev

In [19]:
#bpe

spm_model_file1 = "bpe(1).model"
spm.SentencePieceTrainer.train(input=corpus_file, model_prefix=spm_model_file1, model_type='bpe', vocab_size=1000)
spm_model_file2 = "bpe(2).model"
spm.SentencePieceTrainer.train(input=corpus_file, model_prefix=spm_model_file2, model_type='bpe', vocab_size=2000)

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: ques5.txt
  input_format: 
  model_prefix: bpe(1).model
  model_type: BPE
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privacy_noise_level: 0
 

In [20]:
#mbert

tokenizer_mbert1 = BertTokenizer.from_pretrained('bert-base-multilingual-cased' , max_length=1000, truncation=True)
tokenizer_mbert2 = BertTokenizer.from_pretrained('bert-base-multilingual-cased' , max_length=2000, truncation=True)

660 all=509 active=430 piece=▁भा
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=0 size=1680 all=489 active=410 piece=▁वक
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=0 size=1700 all=469 active=390 piece=ओवरऑ
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=0 size=1720 all=449 active=370 piece=ढ़ने
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=0 size=1740 all=429 active=350 piece=रिक्
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=0 size=1760 all=409 active=330 piece=िनों
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=0 size=1780 all=389 active=310 piece=्णाल
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=0 size=1800 all=369 active=290 piece=▁आरए
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=0 min_freq=0
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=0 size=1820 all=349 active=270 piece=▁चौं
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=0 size=1840 all=329 active=250 pie

In [21]:
#indicbert

indic_model = AutoModel.from_pretrained('ai4bharat/indic-bert')
tokenizer_indicbert1 = AutoTokenizer.from_pretrained('ai4bharat/indic-bert', max_length=1000, truncation=True)
tokenizer_indicbert2 = AutoTokenizer.from_pretrained('ai4bharat/indic-bert', max_length=2000, truncation=True)

In [22]:
#indicbert
import torch
from transformers import AutoModel, AutoTokenizer
indic_model = AutoModel.from_pretrained('ai4bharat/indic-bert')
tokenizer_indicbert1 = AutoTokenizer.from_pretrained('ai4bharat/indic-bert',max_length=1000, truncation=True)
tokenizer_indicbert2 = AutoTokenizer.from_pretrained('ai4bharat/indic-bert', max_length=2000, truncation=True)

In [23]:
# Unigram

#loading the pretrained model
spm_tokenizer = spm.SentencePieceProcessor()
spm_tokenizer.load(spm_model_file + ".model")

unigram_tokens = []
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens = spm_tokenizer.encode_as_pieces(line.strip())
        unigram_tokens.append(line_tokens)

unigram_tokens = [[word.replace('▁', '') for word in sublist] for sublist in unigram_tokens]
# print("Tokens:", unigram_tokens, "\n")

In [24]:
#whitespace
vocab = set()
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        tokens = line.strip().split()
        vocab.update(tokens)

def tokenize_with_whitespace(text):
    return text.split()

whitespace_tokens = []
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens = tokenize_with_whitespace(line.strip())
        whitespace_tokens.append(line_tokens)

# print("Corpus tokens:", whitespace_tokens)


In [25]:
# bpe(1k)

spm_tokenizer1 = spm.SentencePieceProcessor()
spm_tokenizer1.load(spm_model_file2 + ".model")

bpe1_tokens = []
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens = spm_tokenizer1.encode_as_pieces(line.strip())
        bpe1_tokens.append(line_tokens)

bpe1_tokens = [[word.replace('▁', '') for word in sublist] for sublist in bpe1_tokens]

# print("Tokens:", bpe1_tokens, "\n")

In [26]:
# bpe(2k)

spm_tokenizer2 = spm.SentencePieceProcessor()
spm_tokenizer2.load(spm_model_file + ".model")

bpe2_tokens = []
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens = spm_tokenizer2.encode_as_pieces(line.strip())
        bpe2_tokens.append(line_tokens)

bpe2_tokens = [[word.replace('▁', '') for word in sublist] for sublist in bpe2_tokens]

# print("Tokens:", bpe2_tokens, "\n")

In [27]:
#mbert(1k)

mbert1_tokens = []
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens = tokenizer_mbert1.tokenize(line.strip())
        mbert1_tokens.append(line_tokens)

mbert1_tokens = [[word.replace('#', '') for word in sublist] for sublist in mbert1_tokens]

# print("Tokens:", mbert1_tokens)

In [28]:
#mbert(2k)

mbert2_tokens = []
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens = tokenizer_mbert2.tokenize(line.strip())
        mbert2_tokens.append(line_tokens)

mbert2_tokens = [[word.replace('#', '') for word in sublist] for sublist in mbert2_tokens]

# print("Tokens:", mbert2_tokens)

In [29]:
#indic_bert(1k)

indicbert1_tokens = []
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens1 = tokenizer_indicbert1.tokenize(line.strip())
        indicbert1_tokens.append(line_tokens1)

indicbert1_tokens = [[word.replace('_', '') for word in sublist] for sublist in indicbert1_tokens]


In [30]:
#indic_bert(2k)

indicbert2_tokens = []
with open(corpus_file, "r", encoding="utf-8") as file:
    for line in file:
        line_tokens2 = tokenizer_indicbert2.tokenize(line.strip())
        indicbert2_tokens.append(line_tokens2)

indicbert2_tokens = [[word.replace('#', '') for word in sublist] for sublist in indicbert2_tokens]


In [31]:
ground_truth = [
    ["या हल्ल्यात", "जखमी झालेल्या तरूणीला", "उपचारासाठी", "खाजगी रुग्णालयात", "दाखल करण्यात", "आले आहे."],
    ["लंच के", "ठीक बाद", "पारी के", "89वें ओवर की", "पहली गेंद पर", "पुजारा का", "विकेट गिरा।"],
    ["प्रधानमंत्री ने", "कहा कि", "कानून का शासन सुनिश्चित करने और आम आदमी को न्‍याय दिलाने की", "‘ईश्‍वरीय भूमिका’ के निर्वहन के लिए", "न्‍यायपालिका को", "‘सशक्‍त’ और ‘समर्थ’ बनाना होगा।"],
    ["बाल्को के", "पॉट लाइन", "प्रचालन प्रमुख निकेत श्रीवास्तव", "प्रभारी रूम-टू पवन एम पाटिल", "और", "पॉट लाइन", "अनुरक्षण प्रमुख दुर्गा प्रसाद पांडा ने", "बाल्कोकर्मियों", "और", "ठेका कामगारों को", "सम्मानित किया।"],
    ["वहीं", "चेन्नई में", "पेट्रोल की कीमत में", "18 पैसे प्रति लीटर की कमी हुई है।"],
    ["ऐसे में", "निखिल ने", "भी", "बैटरी बदलने की", "जगह", "नए हैंडसेट की", "खरीद को", "ही", "तरजीह दी।"],
    ["डॉलर की कीमत", "निर्धारण के", "तीन तरीके हैं।"],
    ["दिशानिर्देश के बिना", "ई-रिक्शा चलने की", "अनुमति नहीं।"],
    ["उन्होंने", "सऊदी अरब की नीतियों को लेकर", "कहा कि", "सऊदी अरब की सरकार को", "अगर अपनी तरक़्क़ी और इलाक़े की सुरक्षा है", "तो उसे", "फूट डालने की अपनी नीतियों", "और पड़ोसियों पर हमले की राजनीति को", "छोड़ना होगा।"],
    ["एक्जिट गेट तक आने के बाद", "वापस चले गए", "दोनों भाई।"],
    ["इस ट्रायल में शामिल रहे", "एक वरिष्ठ वैज्ञानिक डॉक्टर आरएस शर्मा का कहना है", "कि यह इंजेक्शन पूरी तरह तैयार है", "केवल ड्रग कंट्रोलर से इसकी अनुमति लिया जाना बाक़ी है।"],
    ["हुसैनाबाद", "एसडीपीओ", "मनोज कुमार महतो", "ने", "बताया कि", "युवक", "के", "लापता होने की", "जानकारी", "परिजनों", "ने", "पुलिस को", "नहीं दी थी", "न", "हीं", "मामला", "दर्ज करवाया था।", "उन्होंने", "कहा कि", "शव", "देखने से", "लगता है", "कि", "हत्या", "कर", "शव", "को", "फेंका गया है।"],
    ["रेड कारपेट पर", "टीवी शो", "'नागिन' की एक्ट्रेस", "मौनी राय", "'बेहद' की अभिनेत्री", "जेनिफर विंगेट", "रश्मि देसाई", "किश्वर मर्चेंट", "काम्या पंजाबी", "देबिना बनर्जी मौजूद रहीं", "और", "अपने", "स्टाइलिश गेटअप से", "मीडिया का", "अटेंशन बटोरा।"],
    ["युधिष्ठिर", "-", "ऐसे कुछ", "हीरे जो", "इस वक्त", "सबसे चमकदार चौंध के साथ", "जगमगा रहे है", "वे हैं", "गिरिराज सिंह", "रजत शर्मा", "बाबुल सुप्रियो", "आदि आदि।"],
    ["उनका यह भी कहना है", "कि ”1856-57 में", "भले नमाज़ पढ़ने के", "सबूत", "न मिले हों", "लेकिन 1949 से", "यहां", "नमाज़ पढ़ी गई है।"],
    ["समझौते की अवधि के दौरान", "आपके द्वारा", "चुने गए", "यूनिट के", "आधार पर", "और इसकी मात्रा क्या है", "उसके आधार पर", "आपको", "ब्याज का", "भुगतान किया जाएगा।"],
    ["आज की", "तारीख है", "10 अगस्त 2018", "तथा आज", "इस वर्ष का", "222वां दिन है।", "इन 222 दिनों में", "प्रारंग", "अपने", "परिवार के", "जौनपुरवासियों तक", "208 लेख", "पहुंचा चुका है", "तथा यह लेख", "209वां लेख होगा।"],
    ["ऐसे में", "खेल के", "चौथे दिन की", "लाइव अपडेट के लिए", "बने रहे", "हमारे साथ", "10:07 pm।"],
    ["उन्होंने कहा", "डॉ राजेन्द्र प्रसाद", "बाबा साहेब भीमराव अंबेडकर", "सरदार पटेल", "मौलाना आजाद", "सुचेता कृपलानी", "और अनेक", "अनगिनत महापुरुषों ने", "प्रत्यक्ष और अप्रत्यक्ष", "योगदान देकर", "ये महान विरासत", "हमें सौंपी हैं।"],
    ["हालांकि", "पुरस्कार राशि देखने और सुनने में काफी कम है", "लेकिन", "स्वप्ना ने इसे लेकर कभी कोई नकारात्मक बात नहीं कही।", "अपने तैयारी के दिनों से ही", "नकारात्मक लोगों से घिरी स्वप्ना के जीवन में अब रोशनी है", "और इसी कारण", "उन्हें जो कुछ मिला", "उससे वह संतुष्ट नजर आ रही हैं।"],
    ["नवजात बच्चे की मौत", "सिविल अस्पताल के डॉक्टरों पर", "लापरवाही का आरोप।"],
    ["बिजली तो है", "लेकिन", "उसके आने-जाने का कोई समय नहीं है।", "गांव में", "आज तक", "स्ट्रीट लाइट नहीं लग पाई।"],
    ["स्वप्ना ने हेप्टाथलन में", "भारत को", "ऐतिहासिक स्वर्ण दिलाते हुए", "एक तरफ", "जहां खुद को साबित किया", "वहीं", "उन लोगों की बातों को", "खारिज कर दिया", "जो", "जकार्ता जाने से पहले", "उन्हें खारिज किया करते थे।"],
    ["ओवरऑल ग्रोथ अच्छी रही है", "और", "7 इंडस्ट्री वर्टिकल में से", "6 में हमने", "सालाना आधार पर", "वृद्धि दर्ज की है।"],
    ["मार्च 10", "इज़ाफा"]
]

In [32]:
# Q.5

unigram_prf = prf(ground_truth, unigram_tokens)
whitespace_prf = prf(ground_truth, whitespace_tokens)
bpe1_prf = prf(ground_truth, bpe1_tokens)
bpe2_prf = prf(ground_truth, bpe2_tokens)
mbert1_prf = prf(ground_truth, mbert1_tokens)
mbert2_prf = prf(ground_truth, mbert2_tokens)
indicbert1_prf = prf(ground_truth, indicbert1_tokens)
indicbert2_prf = prf(ground_truth, indicbert2_tokens)

print("Precision, Recall, F1-score\n")
print("unigram", unigram_prf)
print("whitespace", whitespace_prf)
print("bpe1", bpe1_prf)
print("bpe2", bpe2_prf)
print("mbert1", mbert1_prf)
print("mbert2", mbert2_prf)
print("indicbert1", indicbert1_prf)
print("indicbert2", indicbert2_prf)

Precision, Recall, F1-score

unigram (0.008276351186109074, 0.022991452991452988, 0.01207808642615605)
whitespace (0.0701434187890525, 0.14877700077700076, 0.09294035049869073)
bpe1 (0.06733853286729422, 0.14877700077700076, 0.09013520855437668)
bpe2 (0.008276351186109074, 0.022991452991452988, 0.01207808642615605)
mbert1 (0.019007541831275912, 0.07672183372183372, 0.029850727717042235)
mbert2 (0.019007541831275912, 0.07672183372183372, 0.029850727717042235)
indicbert1 (0.0008695652173913043, 0.0015384615384615385, 0.0011111111111111111)
indicbert2 (0.0008695652173913043, 0.0015384615384615385, 0.0011111111111111111)
