## If you are not using google colab then skip this cell

## Install all Required modules 

In [None]:
!pip install sentencepiece 
!pip install pandas
!pip install transformers
!pip install protobuf


# Answer 1: Unicode correction

In [90]:
# Mapping all consonant characters to their halant form
hindi_consonants_with_halant = {
    'क': 'क्', 'ख': 'ख्', 'ग': 'ग्', 'घ': 'घ्', 'ङ': 'ङ्',
    'च': 'च्', 'छ': 'छ्', 'ज': 'ज्', 'झ': 'झ्', 'ञ': 'ञ्',
    'ट': 'ट्', 'ठ': 'ठ्', 'ड': 'ड्', 'ढ': 'ढ्', 'ण': 'ण्',
    'त': 'त्', 'थ': 'थ्', 'द': 'द्', 'ध': 'ध्', 'न': 'न्',
    'प': 'प्', 'फ': 'फ्', 'ब': 'ब्', 'भ': 'भ्', 'म': 'म्',
    'य': 'य्', 'र': 'र्', 'ल': 'ल्', 'ळ': 'ळ्', 'व': 'व्',
    'श': 'श्', 'ष': 'ष्', 'स': 'स्', 'ह': 'ह् ','क़': 'क़्',
    'ख़': 'ख़्', 'ग़': 'ग़्', 'ज़': 'ज़्', 'ड़': 'ड़्', 'ढ़': 'ढ़्',
    'फ़': 'फ़्', 'य़': 'य़्', ' ':True
}


# Mapping maatras to their corresponding vowels
map_to_vowel = {
    'अ': 'अ', 'आ': 'आ', 'इ': 'इ', 'ई': 'ई', 'उ': 'उ',
    'ऊ': 'ऊ', 'ऋ': 'ऋ', 'ॠ': 'ॠ', 'ऍ': 'ऍ', 'ऎ': 'ऎ',
    'ए': 'ए', 'ऐ': 'ऐ', 'ऑ': 'ऑ', 'ऒ': 'ऒ', 'ओ': 'ओ',
    'औ': 'औ','ा': 'आ', 'ि': 'इ', 'ी': 'ई', 'ु': 'उ', 'ू': 'ऊ',
    'ृ': 'ऋ', 'ॄ': 'ॠ', 'ॅ': 'ऍ', 'ॆ': 'ऎ', 'े': 'ए',
    'ै': 'ऐ', 'ॉ': 'ऑ', 'ॊ': 'ऒ', 'ो': 'ओ', 'ौ': 'औ','ं':'अं','ः':'अः','ँ':'अं','ृ':'र्'
}

# Defining speacial symbols
special_symbols = ['।', '?', ',', ':', ';', '-', '—', '–', '"', "'", '(', ')', '[', ']']


# Function to for unicide correctoin
def transform(sentence):
    sentence_char = []
    
    for i in range(0,len(sentence)):
        char = sentence[i]
        next_char = False

        # if char is space then move to next symbol
        if char == ' ' :
            continue

        # next_char is variable to check if word is ending with full consonant(without matras) or not
        if i<(len(sentence)-1):
            if (sentence[i+1] in special_symbols and sentence[i+1] not in map_to_vowel) or sentence[i+1] in hindi_consonants_with_halant:
                next_char = True


        if char in hindi_consonants_with_halant:
            if i+1 >= len(sentence):
                next_char = True
            sentence_char.append(hindi_consonants_with_halant.get(char))
            if next_char:
                sentence_char.append('अ')

        elif char in map_to_vowel:
            sentence_char.append(map_to_vowel.get(char))
            
    return sentence_char
print(transform(" ऐसी स्थिति में एक न्यायपूर्ण सरकार, सार्वजनिक वित्त का इस तरह इस्तेमाल करती है कि संसाधनों   का आवंटन "))


['ऐ', 'स्', 'ई', 'स्', 'थ्', 'इ', 'त्', 'इ', 'म्', 'ए', 'अं', 'ए', 'क्', 'अ', 'न्', 'य्', 'आ', 'य्', 'अ', 'प्', 'ऊ', 'र्', 'ण्', 'अ', 'स्', 'अ', 'र्', 'अ', 'क्', 'आ', 'र्', 'अ', 'स्', 'आ', 'र्', 'व्', 'अ', 'ज्', 'अ', 'न्', 'इ', 'क्', 'अ', 'व्', 'इ', 'त्', 'त्', 'अ', 'क्', 'आ', 'इ', 'स्', 'अ', 'त्', 'अ', 'र्', 'अ', 'ह् ', 'अ', 'इ', 'स्', 'त्', 'ए', 'म्', 'आ', 'ल्', 'अ', 'क्', 'अ', 'र्', 'अ', 'त्', 'ई', 'ह् ', 'ऐ', 'क्', 'इ', 'स्', 'अं', 'स्', 'आ', 'ध्', 'अ', 'न्', 'ओ', 'अं', 'क्', 'आ', 'आ', 'व्', 'अं', 'ट्', 'अ', 'न्', 'अ']


# Answer 2:Finding characters and syllables

## Function to read files

In [91]:
import pandas as pd

# Function to read data file and create a dataframe 
def read_file(path):
    # Read the text file into a list of lines and remove '\n' from each line
    with open(path, "r",encoding = "UTF8") as file:
        lines = [line.rstrip("\n") for line in file.readlines()]

    # Create a DataFrame from the list of lines
    df = pd.DataFrame(lines, columns=["Text"])
    return df


In [92]:
# reading data corpus of hindi
data = read_file("./data/hi_100.txt")
data.head()

Unnamed: 0,Text
0,"आवेदन करने की आखिरी तारीख 31 जनवरी, 2020 है।"
1,इतनी दुआ कर दो हमारे लिए कि जितना प्यार दुनिया...
2,मोदी सरकार के पहले कार्यकाल में भी तीन तलाक को...
3,भाजपा के दिवंगत नेता प्रमोद महाजन की बेटी पूनम...
4,ऐसी स्थिति में एक न्यायपूर्ण सरकार सार्वजनिक व...


## Let's define some useful functions 

### (A)  Function to sort dictionary based on values

In [93]:
def sort_all(name):
    return dict(sorted(name.items(),key = lambda item:item[1],reverse=True))


### (B) Function to generate csv files , (result parameter)  is a dictionary of dataframes

In [94]:
def generate_files(result,prefix=""):
    filenames = {}
    for key, value in result.items():
        df = pd.DataFrame(value.items(), columns=['item', 'frequency'])
        filename = f"{prefix}_{key}.csv"
        df.to_csv(filename, index=False)
        filenames[key] = filename
    return filenames

### (C) Function to process the words/tokens

In [95]:

def process_tokens(tokens):
  # Defining 4 empty dictionaries
  uni_character_dict = {}
  bi_character_dict = {}
  uni_syllables_dict = {}
  bi_syllables_dict = {}
  # iterarting on each token/words of list of token/words
  for word in tokens:
            # processing each the token/word
            character_list = transform(word)
            # print(character_list)
            uni_syll = ""
            bi_syll  = ""
            bi_char=""
            counter = 0
            char_flag = 0
            # iterating character list
            for char in character_list:
                
                # Forming character count
                char_flag += 1
                bi_char += char
                if char in uni_character_dict:
                    uni_character_dict[char] += 1
                else:
                    uni_character_dict[char] = 1


                if char_flag%2==0 :
                    if bi_char in bi_character_dict:
                        bi_character_dict[bi_char] += 1

                    else:
                        bi_character_dict[bi_char] = 1
                    bi_char=""
                    char_flag = 0

                # forming syllables count
                uni_syll += char
                bi_syll +=  char
                if char in map_to_vowel:
                    if uni_syll in uni_syllables_dict:
                        uni_syllables_dict[uni_syll] += 1

                    else:
                        uni_syllables_dict[uni_syll] = 1
                    counter += 1
                    uni_syll=""
                    if counter == 2:
                        if bi_syll in bi_syllables_dict:
                            bi_syllables_dict[bi_syll] += 1

                            bi_syll = ""
                        else:
                            bi_syllables_dict[bi_syll] = 1
                        counter = 0
                        bi_syll = ""
  return {
        "uni_character": uni_character_dict,
        "bi_character": bi_character_dict,
        "uni_syllables": uni_syllables_dict,
        "bi_syllables": bi_syllables_dict
    }

### (D) Function to process the sentences from whole corpus

In [96]:

def process_corpus(corpus):
    result = {
        "uni_character": {},
        "bi_character": {},
        "uni_syllables": {},
        "bi_syllables": {}
    }

    for sentence in corpus:
        words = sentence.split(" ")
        words_result = process_tokens(words)
        for key, value in words_result.items():
            for k, v in value.items():
                if k in result[key]:
                    result[key][k] += v  # Add frequencies to existing keys
                else:
                    result[key][k] = v   # Initialize if key does not exist
    for key in result:
        result[key] = sort_all(result[key])
    return result


### Warning: Following cell will take 5 to 7 minutes to execute 

In [97]:
# generating characters and syllables for whole data set
result = process_corpus(data["Text"])

In [155]:
# Generating files
generated_files = generate_files(result)
print("Files generated :")
for keys,_ in generated_files.items():
    print(keys)

Files generated :
uni_character
bi_character
uni_syllables
bi_syllables


In [168]:

print("Top 20 freqencies:")
top_20_items_uni_char = dict(list(result['uni_character'].items())[:20])
top_20_items_bi_character = dict(list(result['bi_character'].items())[:20])
top_20_items_uni_syllables = dict(list(result['uni_syllables'].items())[:20])
top_20_items_bi_syllables = dict(list(result['bi_syllables'].items())[:20])
print(f"top_20_items_uni_char: {top_20_items_uni_char}")
print("\n")
print(f"top_20_items_bi_character: {top_20_items_bi_character}")
print("\n")
print(f"top_20_items_uni_syllables: {top_20_items_uni_syllables}")
print("\n")
print(f"top_20_items_bi_syllables: {top_20_items_bi_syllables}")


Top 20 freqencies:
top_20_items_uni_char: {'अ': 6911859, 'आ': 2991109, 'ए': 2318442, 'क्': 2219964, 'र्': 2140164, 'ई': 1460305, 'इ': 1432973, 'न्': 1334448, 'स्': 1283708, 'अं': 1215588, 'ह् ': 1133159, 'म्': 1053237, 'त्': 980066, 'ल्': 919917, 'ओ': 896588, 'प्': 805896, 'य्': 752819, 'व्': 624743, 'द्': 607633, 'उ': 587149}


top_20_items_bi_character: {'र्अ': 784810, 'क्अ': 455914, 'क्ए': 363263, 'न्अ': 351174, 'स्अ': 331978, 'ह् ऐ': 296843, 'प्अ': 278831, 'म्ए': 277981, 'न्ए': 267298, 'ल्अ': 261429, 'क्आ': 261002, 'त्अ': 259598, 'ह् अ': 253367, 'म्अ': 243201, 'क्ई': 224699, 'ब्अ': 218062, 'य्आ': 212870, 'क्इ': 188062, 'क्ओ': 185728, 'स्ए': 184827}


top_20_items_uni_syllables: {'र्अ': 998460, 'क्अ': 565081, 'न्अ': 490374, 'स्अ': 467169, 'क्ए': 403141, 'प्अ': 370202, 'ल्अ': 322731, 'न्ए': 308334, 'क्आ': 304067, 'ह् ऐ': 296989, 'म्ए': 293786, 'म्अ': 285874, 'त्अ': 285450, 'ह् अ': 262318, 'ए': 260373, 'अ': 250340, 'क्ई': 243146, 'ब्अ': 230677, 'य्आ': 223239, 'स्ए': 215169}


top_20_i

# Answer 3: Solved on the mentioned website

# Answer 4 : Unigram, BPE , mBERT , IndicBER and White-space tokenizers

In [102]:
from collections import Counter
import sentencepiece as spm

## (4.1) mBERT and IndicBERT tokenization function

In [103]:
# tokization using mBERT and IndicBERT
def tokenization_step(tokenizer, maxlen, model_name,data = data):
    tokens = []
    token_counts = Counter()  # Counter to keep track of token frequencies

    if model_name == 'IndicBERT':
        for sentence in data["Text"]:
            sentence_tokens = tokenizer.tokenize(sentence)
            tokens += sentence_tokens
            token_counts.update(sentence_tokens)
    elif model_name == 'mBERT':
        for sentence in data["Text"]:
            sentence_tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sentence, truncation=True))
            tokens += sentence_tokens
            token_counts.update(sentence_tokens)

    return tokens, token_counts


## (4.2) Unigram and BPE tokenization function

In [104]:

# tokenization using unigram and BPE
def Unigram_and_BPE_tokenization(model_vocab,data =data):
      all_tokens = []
      # Load trained Hindi Unigram model
      hindi_model = f"{model_vocab}.model"

      hindi_tokenizer = spm.SentencePieceProcessor(model_file=hindi_model)


      # Initialize Counter for token frequencies
      token_counts = Counter()

      # Tokenize each sentence in the corpus and update token frequencies
      for sentence in data["Text"]:
          tokens = hindi_tokenizer.encode_as_pieces(sentence)
          all_tokens += tokens
          token_counts.update(tokens)
      return all_tokens,token_counts




## (4.3)  Whitespace tokenization tokenization function

In [105]:
# tokenization using whitespace tokenizer
def whitespace_tokenizer(data=data):
    # List to store all tokens
    all_tokens = []

    # Iterate through each sentence in the group
    for sentence in data["Text"]:
        # Tokenize the sentence based on white spaces
        tokens = sentence.strip().split()
        # Add the tokens to the list of all tokens
        all_tokens.extend(tokens)

    # Count the frequency of each token
    token_freq = Counter(all_tokens)

    return all_tokens,token_freq

### Loading IndicBERT model

In [106]:
from transformers import AutoModel,AutoTokenizer

In [107]:
Indic_tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert', keep_accents=True)

### Tokinizationation using IndicBERT

### warning : it will 5 to 10 minutes to execute

In [None]:
IndicBERT_token_1000,IndicBERT_token_1000_count =  tokenization_step(Indic_tokenizer,1000,'IndicBERT')
IndicBERT_token_2000,IndicBERT_token_2000_count =  tokenization_step(Indic_tokenizer,2000,'IndicBERT')


### Loading mBERT model

In [113]:
from transformers import BertTokenizer, BertModel
mBERT_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')




### Tokenization using mBERT

### warning : it will 5 to 10 minutes to execute

In [114]:
mBERT_token_1000,mBERT_token_1000_count = tokenization_step(mBERT_tokenizer,1000,'mBERT')
mBERT_token_2000,mBERT_token_2000_count = tokenization_step(mBERT_tokenizer,2000,'mBERT')
result_token_count = {
    "mBERT_token_mxln_1000_count":dict(mBERT_token_1000_count),
    "mBERT_token_mxln_2000_count":dict(mBERT_token_2000_count),
    "IndicBERT_maxln_token_1000_count":dict(IndicBERT_token_1000_count),
    "IndicBERT_maxln_token_2000_count":dict(IndicBERT_token_2000_count)


}

### generating  characters and syllables from tokenizes list
### warning : it will 5 to 10 minutes to execute

In [117]:
indiBERT_1000 = process_tokens(IndicBERT_token_1000)
indiBERT_2000 = process_tokens(IndicBERT_token_2000)
mBERT_1000 = process_tokens(mBERT_token_1000)
mBERT_2000 = process_tokens(mBERT_token_2000)



In [118]:
# File generation for IndicBERET and mBERT
generate_files(indiBERT_1000,"indiBERT_mxln_1000")
generate_files(indiBERT_2000,"indiBERT_mxln_2000")
generate_files(mBERT_1000,"mBERT_mxln_1000")
generate_files(mBERT_2000,"mBERT_mxln_2000")
generate_files(result_token_count)
print("All files have Generated !")

All files have Generated !


## Training Unigram model 

### warning : it will approx 10 minutes to execute or you can skip training i have already saved the trained model

In [123]:
import sentencepiece as spm

# Train Unigram model vocab size = 1000
spm.SentencePieceTrainer.train('--input=./data/hi_100.txt --model_prefix=unigram_model_1000_vocab --vocab_size=1000 --model_type=unigram')

# Train Unigram model vocab size = 2000
spm.SentencePieceTrainer.train('--input=./data/hi_100.txt --model_prefix=unigram_model_2000_vocab --vocab_size=2000 --model_type=unigram')

## Training BPE model

### warning : it will approx 10 minutes to execute or you can skip training i have already saved the trained model

In [125]:

# Train SentencePiece tokenizer with BPE
spm.SentencePieceTrainer.train(input="./data/hi_100.txt", model_prefix="BPE_model_1000_vocab", vocab_size=1000, model_type='bpe')
spm.SentencePieceTrainer.train(input="./data/hi_100.txt", model_prefix="BPE_model_2000_vocab", vocab_size=2000, model_type='bpe')


### Tokenization using Unigram and BPE we will use already trained saved models

### warning : it will 5 to 10 minutes to execute

In [126]:

unigram_vocab_1000_token, unigram_vocab_1000_token_freq  = Unigram_and_BPE_tokenization("./Q4-Answer files/Trained model/unigram_model/unigram_model_1000_vocab")
unigram_vocab_2000_token, unigram_vocab_2000_token_freq  = Unigram_and_BPE_tokenization("./Q4-Answer files/Trained model/unigram_model/unigram_model_2000_vocab")

BPE_vocab_1000_token, BPE_vocab_1000_token_freq  = Unigram_and_BPE_tokenization("./Q4-Answer files/Trained model/BPE_model/BPE_model_1000_vocab")
BPE_vocab_2000_token, BPE_vocab_2000_token_freq  = Unigram_and_BPE_tokenization("./Q4-Answer files/Trained model/BPE_model/BPE_model_1000_vocab")

### generating characters and syllables from tokenized list
### warning : it will 5 to 10 minutes to execute

In [128]:
unigram_vocab_1000 = process_tokens(unigram_vocab_1000_token)
unigram_vocab_2000 = process_tokens(unigram_vocab_2000_token)
BPE_vocab_1000 = process_tokens(BPE_vocab_1000_token)
BPE_vocab_2000 = process_tokens(BPE_vocab_2000_token)

In [129]:
result_BPE_uni_token_count = {
    "unigram_vocab_1000_token_freq":dict(unigram_vocab_1000_token_freq),
    "unigram_vocab_2000_token_freq":dict(unigram_vocab_2000_token_freq),
    "BPE_vocab_1000_token_freq":dict(BPE_vocab_1000_token_freq),
    "BPE_vocab_2000_token_freq":dict(BPE_vocab_2000_token_freq)
}


### Tokenization using whitespace 

In [130]:
whitespace_tokenizer_token,whitespace_tokenizer_count = whitespace_tokenizer()

### generating characters and syllables from tokenized list
### warning : it will 5 to 10 minutes to execute

In [131]:
whitespace_token = process_tokens(whitespace_tokenizer_token)

In [134]:
result_whitespcase_token_count = {
    "whitespace_token":dict(whitespace_tokenizer_count)
}

### File generation Unigram , BPE , Whitespace Tokenizer will some  time 

In [133]:
# File generation Unigram , BPE , Whitespace Tokenizer
generate_files(unigram_vocab_1000,"unigram_vocab_1000")
generate_files(unigram_vocab_2000,"unigram_vocab_2000")
generate_files(BPE_vocab_1000,"BPE_vocab_1000")
generate_files(BPE_vocab_2000,"BPE_vocab_2000")
generate_files(result_BPE_uni_token_count)

generate_files(whitespace_token,"whitespace_token")
generate_files(result_whitespcase_token_count)

print("All files have generated succesfully!")

All files have generated succesfully!


# Answer 5 : precision, recall and F-score

## Function to calculate Precision , recall and F-score

In [135]:
def calculate_metrics(actual, predicted):
    # Convert lists to sets
    actual_set = set(actual)
    predicted_set = set(predicted)

    # Calculate true positives, false positives, and false negatives
    true_positives = len(actual_set.intersection(predicted_set))
    false_positives = len(predicted_set.difference(actual_set))
    false_negatives = len(actual_set.difference(predicted_set))

    # Calculate precision, recall, and F1 score
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return precision, recall, f1




### Function to clean token

In [136]:
import re
def clean_token(words):
        # Given list of strings
        cleaned_words = []
        # Remove special symbols from each word using regex
        for word in words:
            cleaned_word = ""
            for x in word:
                if (x in hindi_consonants_with_halant) or (x in map_to_vowel):
                    cleaned_word += x
            
            cleaned_words.append(cleaned_word) 
        return  cleaned_words
                    




In [137]:
my_sen = read_file("./data/25_sentence.txt")
Truth_vlaues = read_file("./data/TrueToken.txt")
# Truth_vlaues.to_list()
Truth_value_list = Truth_vlaues['Text'].tolist()
Truth_value_list = [token.strip() for sentence in Truth_value_list for token in sentence.split(',')]
# print(Truth_value_list)


## unigram tokenization

In [138]:

unigram_vocab_1000_token, _  = Unigram_and_BPE_tokenization("./Q4-Answer files/Trained model/unigram_model/unigram_model_1000_vocab",my_sen)
unigram_vocab_2000_token, _  = Unigram_and_BPE_tokenization("./Q4-Answer files/Trained model/unigram_model/unigram_model_2000_vocab",my_sen)

print(f"unigram_vocab_1000_token: {unigram_vocab_1000_token[:10]}")
print(f"unigram_vocab_2000_token: {unigram_vocab_2000_token[:10]}")

unigram_vocab_1000_token: ['▁हालांकि', ',', '▁म', 'ई', '▁2018', '▁में', '▁ए', 'न', 'आ', 'र']
unigram_vocab_2000_token: ['▁हालांकि', ',', '▁म', 'ई', '▁2018', '▁में', '▁एन', 'आर', 'सी', '▁के']


In [87]:

p1,r1,f1 = calculate_metrics(Truth_value_list,unigram_vocab_1000_token)
print("using Unigran with vocab size 1000:")
print(f"precision = {p1}\nrecall = {r1}\nF-score = {f1}")
print("\n")
p2,r2,f2 = calculate_metrics(Truth_value_list,unigram_vocab_2000_token)
print("using Unigram with vocab size 2000:")
print(f"precision = {p2}\nrecall = {r2}\nF-score = {f2}")



using Unigran with vocab size 1000:
precision = 0.0
recall = 0.0
F-score = 0


using Unigram with vocab size 2000:
precision = 0.0
recall = 0.0
F-score = 0


#### we are getting all values  0 because unigram tokenizer tokenize values are as :
#### unigram_vocab_1000_token: ['▁हालांकि', ',', '▁म', 'ई', '▁2018', '▁में', '▁ए', 'न', 'आ', 'र']
#### unigram_vocab_2000_token: ['▁हालांकि', ',', '▁म', 'ई', '▁2018', '▁में', '▁एन', 'आर', 'सी', '▁के']


#### which is noisy so hardly any token will match with the true value of token that's why we are getting Precision , recall and F-score all as 0.0

### now let's clean all token genereted by Unigram 

In [139]:
unigram_vocab_1000_token_cleaned = clean_token(unigram_vocab_1000_token)
unigram_vocab_2000_token_cleaned = clean_token(unigram_vocab_2000_token)

p1,r1,f1 = calculate_metrics(Truth_value_list,unigram_vocab_1000_token_cleaned)
print("using Unigran with vocab size 1000: For cleaned token")
print(f"precision = {p1}\nrecall = {r1}\nF-score = {f1}")
print("\n")
p2,r2,f2 = calculate_metrics(Truth_value_list,unigram_vocab_2000_token_cleaned)
print("using Unigram with vocab size 2000:For cleaned token")
print(f"precision = {p2}\nrecall = {r2}\nF-score = {f2}")

using Unigran with vocab size 1000: For cleaned token
precision = 0.13099041533546327
recall = 0.13804713804713806
F-score = 0.13442622950819674


using Unigram with vocab size 2000:For cleaned token
precision = 0.14130434782608695
recall = 0.1750841750841751
F-score = 0.15639097744360902


## BPE tokenization

In [140]:
BPE_vocab_1000_token, _  = Unigram_and_BPE_tokenization("./Q4-Answer files/Trained model/BPE_model/BPE_model_1000_vocab",my_sen)
BPE_vocab_2000_token, _  = Unigram_and_BPE_tokenization("./Q4-Answer files/Trained model/BPE_model/BPE_model_2000_vocab",my_sen)

In [141]:
p1,r1,f1 = calculate_metrics(Truth_value_list,BPE_vocab_1000_token)
print("using BPE with vocab size 1000:")
print(f"precision = {p1}\nrecall = {r1}\nF-score = {f1}")
print("\n")
p2,r2,f2 = calculate_metrics(Truth_value_list,BPE_vocab_2000_token)
print("using BPE with vocab size 2000:")
print(f"precision = {p2}\nrecall = {r2}\nF-score = {f2}")

using BPE with vocab size 1000:
precision = 0.002336448598130841
recall = 0.003367003367003367
F-score = 0.0027586206896551726


using BPE with vocab size 2000:
precision = 0.0
recall = 0.0
F-score = 0


In [169]:
BPE_vocab_1000_token_cleaned = clean_token(BPE_vocab_1000_token)
BPE_vocab_1000_token_cleaned = clean_token(BPE_vocab_2000_token)

p1,r1,f1 = calculate_metrics(Truth_value_list,BPE_vocab_1000_token_cleaned)
print("using BPE with vocab size 1000: For cleaned token")
print(f"precision = {p1}\nrecall = {r1}\nF-score = {f1}")
print("\n")
p2,r2,f2 = calculate_metrics(Truth_value_list,BPE_vocab_1000_token_cleaned)
print("using BPE with vocab size 2000:For cleaned token")
print(f"precision = {p2}\nrecall = {r2}\nF-score = {f2}")

using BPE with vocab size 1000: For cleaned token
precision = 0.12596401028277635
recall = 0.16498316498316498
F-score = 0.14285714285714288


using BPE with vocab size 2000:For cleaned token
precision = 0.12596401028277635
recall = 0.16498316498316498
F-score = 0.14285714285714288


## mBERT tokenization

In [142]:
mBERT_token_1000,_ = tokenization_step(mBERT_tokenizer,1000,'mBERT',my_sen)
mBERT_token_2000,_ = tokenization_step(mBERT_tokenizer,2000,'mBERT',my_sen)
p1,r1,f1 = calculate_metrics(Truth_value_list,mBERT_token_1000)
print("using mBERT with maxlength 1000:")
print(f"precision = {p1}\nrecall = {r1}\nF-score = {f1}")
print("\n")
p2,r2,f2 = calculate_metrics(Truth_value_list,mBERT_token_2000)
print("using mBERT with maxlength 2000:")
print(f"precision = {p2}\nrecall = {r2}\nF-score = {f2}")

using mBERT with maxlength 1000:
precision = 0.10501193317422435
recall = 0.14814814814814814
F-score = 0.12290502793296089


using mBERT with maxlength 2000:
precision = 0.10501193317422435
recall = 0.14814814814814814
F-score = 0.12290502793296089


# IndicBERT tokenization

In [172]:
IndicBERT_token_1000,_ =  tokenization_step(Indic_tokenizer,1000,'IndicBERT',my_sen)
IndicBERT_token_2000,_ =  tokenization_step(Indic_tokenizer,2000,'IndicBERT',my_sen)
p1,r1,f1 = calculate_metrics(Truth_value_list,IndicBERT_token_1000)
print("using IndicBERT with maxlength 1000:")
print(f"precision = {p1}\nrecall = {r1}\nF-score = {f1}")
print("\n")
p2,r2,f2 = calculate_metrics(Truth_value_list,IndicBERT_token_2000)
print("using IndicBERT with maxlength 2000:")
print(f"precision = {p2}\nrecall = {r2}\nF-score = {f2}")


using IndicBERT with maxlength 1000:
precision = 0.005277044854881266
recall = 0.006734006734006734
F-score = 0.005917159763313609


using IndicBERT with maxlength 2000:
precision = 0.005277044854881266
recall = 0.006734006734006734
F-score = 0.005917159763313609


## Withespace tokenization

In [173]:
whitespace_tokenizer_token,_ = whitespace_tokenizer(my_sen)


In [174]:
p1,r1,f1 = calculate_metrics(Truth_value_list,whitespace_tokenizer_token)
print("using Whitespace tokenization:")
print(f"precision = {p1}\nrecall = {r1}\nF-score = {f1}")
print("\n")

using Whitespace tokenization:
precision = 0.35174418604651164
recall = 0.4074074074074074
F-score = 0.3775351014040562




# Answer 6

Unigram with Vocab Size 1000 and 2000: Both versions of the unigram tokenization show relatively low precision, recall, and F-score. The precision and recall increase slightly when using a larger vocabulary size.
Unigram tokenization doest not perfoem good because generated tokens are very noisy so calculated values are 0.0
but when we cleaned tokens then results become better.

BPE (Byte Pair Encoding) with Vocab Size 1000 and 2000: BPE tokenization performs poorly in terms of precision, recall, and F-score compared to other methods, with very low values observed for all metrics.

similarly for BPE  tokenization doest not perfoem good because generated tokens are very noisy so calculated values are near to 0.0. but after cleaning results became quite well.

Whitespace Tokenization: Using whitespace tokenization shows higher precision, recall, and F-score compared to other tokenization methods. It achieves the highest precision and recall among all methods.

mBERT with Max Length 1000 and 2000: Both versions of mBERT exhibit similar precision, recall, and F-score. These scores are moderate, falling between those of unigram and whitespace tokenization.

Whitespace tokenization appears to be the most effective method among those tested, with the highest precision, recall, and F-score.
Unigram tokenization performs better than BPE, but still shows lower performance compared to whitespace tokenization
.
mBERT shows moderate performance, but it doesn't seem to improve significantly when increasing the max length from 1000 to 2000
when using IndicBERT with both a maximum length of 1000 and 2000, the precision, recall, and F-score remain the same. This suggests that increasing the maximum length from 1000 to 2000 doesn't lead to any improvement in performance for the given task..
