In [1]:
from transformers import AutoTokenizer
from transformers import PreTrainedTokenizer
import tiktoken
import json
import pandas as pd
import numpy as np
import os
import regex as re
from huggingface_hub import notebook_login


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
# Hugging Face Hub model ids whose tokenizers are benchmarked.
# Every entry is a plain string (parentheses around a single string do not
# create a tuple), so the list can be iterated directly as model names.
tokenizers = [
    "CohereForAI/aya-101",
    "google-bert/bert-base-multilingual-uncased",
    "google/mt5-base",
    "facebook/mbart-large-50",
    "facebook/nllb-200-distilled-600M",
    # Candidates that are currently switched off:
    #   "gpt2"
    #   "meta-llama/Llama-2-7b-chat-hf"
    #   "meta-llama/Meta-Llama-3.1-8B-Instruct"
    #   "google/flan-t5-base"
    #   "google/gemma-2-2b-it"
    #   "bigscience/bloom-560m"
    #   "bigscience/bloomz"
    #   "abhinand/tamil-llama-7b-instruct-v0.1"
    #   "aisingapore/sea-lion-7b-instruct"
    #   "google-bert/bert-base-uncased"
    #   "google-t5/t5-base"
]

# File-name suffix of a corpus file -> language name used as a column key in
# the result tables (see how get_data looks up the text after the last ".").
lang_mapping = dict(
    eng_Latn="english",
    hin_Deva="hindi",
    sin_Sinh="sinhala",
    tam_Taml="tamil",
)

# (name, encoding) pairs for OpenAI tokenizers.
# NOTE(review): only referenced from commented-out code further down; the
# first element looks like a model-name prefix -- confirm before re-enabling.
tokenizers_openai = [
    ("gpt-4o-", "o200k_base"),
    ("gpt-4-", "cl100k_base"),
]

In [6]:
def read_data(input_path):
    """
    Load a UTF-8 text file as a list of lines.

    :param input_path: Path of the text file to read.
    :return: One string per line of the file, each with leading and trailing
             whitespace (including the newline) removed. Blank lines are kept
             as empty strings.
    """
    stripped_lines = []
    with open(input_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

def extract_text_chunks(text, index_pairs):
    """
    Slice ``text`` once per (start, end) pair and collect the pieces.

    :param text: The string to cut pieces out of.
    :param index_pairs: Iterable of two-item (start, end) pairs; each pair is
                        applied as the slice ``text[start:end]``.
    :return: A list with one substring per pair, in the order the pairs were given.
    """
    chunks = []
    for begin, finish in index_pairs:
        chunks.append(text[begin:finish])
    return chunks

def list_files_in_directory(path):
    """
    Collect the paths of the regular files directly inside a directory.

    Parameters:
    path (str): Directory to scan. Sub-directories are not descended into
                and are not included in the result.

    Returns:
    list: ``path`` joined with each file name found. An empty list is
          returned (after printing a message) when the directory does not
          exist or cannot be accessed.
    """
    try:
        file_paths = []
        for entry in os.listdir(path):
            candidate = os.path.join(path, entry)
            # Keep plain files only; directories are skipped.
            if os.path.isfile(candidate):
                file_paths.append(candidate)
        return file_paths
    except FileNotFoundError:
        print(f"Error: The directory '{path}' does not exist.")
        return []
    except PermissionError:
        print(f"Error: Permission denied for accessing the directory '{path}'.")
        return []

def get_data(path):
    """
    Load every file in ``path`` and group the lines by language.

    The language is looked up in ``lang_mapping`` using the text after the
    last "." of the file path, so a file whose suffix is not a key of
    ``lang_mapping`` raises ``KeyError``.

    :param path: Directory containing one text file per language.
    :return: Dict mapping language name to the list of stripped lines of
             the corresponding file.
    """
    corpus = {}
    for file_path in list_files_in_directory(path):
        lines = read_data(file_path)
        suffix = file_path.split(".")[-1]
        corpus[lang_mapping[suffix]] = lines
    return corpus
    
def set_unknown_token(tokenizer: PreTrainedTokenizer, unknown_token: str = '<unk>'):
    """
    Make sure the tokenizer has an unknown token, assigning one if it is missing.

    When ``tokenizer.unk_token`` is already set, the tokenizer is left
    untouched and its current unknown token is only reported. Otherwise
    ``unknown_token`` is installed, added to the vocabulary if absent, and
    its id is stored on the tokenizer.

    Args:
    tokenizer (PreTrainedTokenizer): The tokenizer to check and, if needed, update in place.
    unknown_token (str): The token to install as the unknown token. Default is '<unk>'.

    Returns:
    PreTrainedTokenizer: The same tokenizer object that was passed in.
    """
    # Nothing to do when an unknown token is already configured.
    if tokenizer.unk_token is not None:
        print(f"Tokenizer already has an unknown token: {tokenizer.unk_token}")
        print(f"Unknown token ID: {tokenizer.unk_token_id}")
        return tokenizer

    tokenizer.unk_token = unknown_token

    # Register the token in the vocabulary only when it is missing.
    already_in_vocab = unknown_token in tokenizer.vocab
    if not already_in_vocab:
        tokenizer.add_tokens([unknown_token])

    tokenizer.unk_token_id = tokenizer.convert_tokens_to_ids(unknown_token)

    print(f"Unknown token set to: {tokenizer.unk_token}")
    print(f"Unknown token ID set to: {tokenizer.unk_token_id}")
    return tokenizer



def fertility(input_texts,tokenizer):
    """
    Average number of tokens produced per whitespace-delimited word.

    For each text the ratio ``len(tokenizer.tokenize(text)) / len(text.split())``
    is computed, and the mean of those ratios is returned.

    Texts without any word (empty or whitespace-only lines) are skipped:
    they have no defined fertility and would otherwise raise
    ``ZeroDivisionError``.

    Args:
    input_texts: Iterable of strings to evaluate.
    tokenizer: Object exposing ``tokenize(text)`` that returns a sized
               collection of tokens.

    Returns:
    float: Mean tokens-per-word over the texts that contain at least one
           word, or NaN when there is no such text.
    """
    per_text_fertility = []
    for text in input_texts:
        word_count = len(text.split())
        if word_count == 0:
            # Blank line: no words to divide by, so it cannot contribute.
            continue
        token_count = len(tokenizer.tokenize(text))
        per_text_fertility.append(token_count / word_count)
    return np.mean(per_text_fertility) if per_text_fertility else float("nan")
        
def proportion_of_continued_words(input_texts,tokenizer):
    """
    Average share of words that the tokenizer splits into more than one token.

    Each text is split on whitespace and every word is tokenized on its own;
    a word counts as "continued" when it yields more than one token. The
    per-text share of continued words is averaged over all texts.

    Texts without any word (empty or whitespace-only lines) are skipped
    because their share is undefined and would otherwise raise
    ``ZeroDivisionError``.

    Args:
    input_texts: Iterable of strings to evaluate.
    tokenizer: Object exposing ``tokenize(word)`` that returns a sized
               collection of tokens.

    Returns:
    float: Mean proportion (0.0 - 1.0) over the texts that contain at least
           one word, or NaN when there is no such text.
    """
    per_text_share = []
    for text in input_texts:
        words = text.split()
        if not words:
            # Blank line: nothing to measure.
            continue
        continued_count = sum(
            1 for word in words if len(tokenizer.tokenize(word)) > 1
        )
        per_text_share.append(continued_count / len(words))
    return np.mean(per_text_share) if per_text_share else float("nan")

def unkown_rate(input_texts,tokenizer):
    """
    Average share of tokens that equal the tokenizer's unknown token.

    (The misspelled function name is kept so existing callers keep working.)

    For each text the number of tokens equal to ``tokenizer.unk_token`` is
    divided by the total number of tokens; the mean of those rates is
    returned.

    Texts that produce no tokens at all (e.g. empty lines) are skipped
    because their rate is undefined and would otherwise raise
    ``ZeroDivisionError``.

    Args:
    input_texts: Iterable of strings to evaluate.
    tokenizer: Object exposing ``tokenize(text)`` and an ``unk_token`` attribute.

    Returns:
    float: Mean unknown-token rate (0.0 - 1.0) over the texts that produced
           at least one token, or NaN when there is no such text.
    """
    per_text_rate = []
    for text in input_texts:
        tokens = tokenizer.tokenize(text)
        token_count = len(tokens)
        if token_count == 0:
            # No tokens, hence no rate to report for this text.
            continue
        unknown_count = sum(1 for token in tokens if token == tokenizer.unk_token)
        per_text_rate.append(unknown_count / token_count)
    return np.mean(per_text_rate) if per_text_rate else float("nan")


def closeness(input_texts,tokenizer):
    """
    Average number of tokens per character.

    For each text the ratio ``len(tokenizer.tokenize(text)) / len(text)`` is
    computed, and the mean of those ratios is returned.

    Empty strings are skipped: they have no characters to divide by and
    would otherwise raise ``ZeroDivisionError``.

    Args:
    input_texts: Iterable of strings to evaluate.
    tokenizer: Object exposing ``tokenize(text)`` that returns a sized
               collection of tokens.

    Returns:
    float: Mean tokens-per-character over the non-empty texts, or NaN when
           there is no non-empty text.
    """
    # Named differently from the function so the local list does not shadow it.
    per_text_ratio = []
    for text in input_texts:
        character_count = len(text)
        if character_count == 0:
            continue
        token_count = len(tokenizer.tokenize(text))
        per_text_ratio.append(token_count / character_count)
    return np.mean(per_text_ratio) if per_text_ratio else float("nan")

def evaluate_compression_ratio(input_texts,pre_tokenizer):
    """
    Average number of characters per pre-token.

    Note that this measures the *pre-tokenizer* only: the count used is the
    number of pieces returned by ``pre_tokenize_str``, not the number of
    final sub-word tokens.

    Args:
    input_texts: Iterable of strings to evaluate.
    pre_tokenizer: Object exposing ``pre_tokenize_str(text)`` that returns
                   one entry per pre-token.

    Returns:
    float: Mean of ``len(text) / number_of_pre_tokens`` over all texts.
           A text that yields no pre-tokens contributes ``inf`` (which
           makes the overall mean ``inf``).
    """
    compression_ratio_ls = []
    for text in input_texts:
        original_size = len(text)

        # Only the number of pieces matters, so the pre-tokenizer output is
        # counted directly instead of re-slicing the text into chunks first.
        pretoken_count = len(pre_tokenizer.pre_tokenize_str(text))

        if pretoken_count != 0:
            compression_ratio_ls.append(original_size / pretoken_count)
        else:
            compression_ratio_ls.append(float('inf'))

    return np.mean(compression_ratio_ls)

def evaluate_compression_ratio_openai(input_texts,GPT4_SPLIT_PATTERN):
    """
    Average number of characters per regex-split piece.

    Args:
    input_texts: Iterable of strings to evaluate.
    GPT4_SPLIT_PATTERN: Regular-expression pattern handed to ``re.findall``
                        to split each text into pieces.

    Returns:
    float: Mean of ``len(text) / number_of_pieces`` over all texts; a text
           with no match contributes ``inf``.
    """
    def _chars_per_piece(text):
        # Character count first, then the number of pattern matches.
        char_count = len(text)
        piece_count = len(re.findall(GPT4_SPLIT_PATTERN, text))
        if piece_count == 0:
            return float('inf')
        return char_count / piece_count

    return np.mean([_chars_per_piece(text) for text in input_texts])

def evaluate_context_window(input_texts,pre_tokenizer):
    """
    Average number of pre-tokens per text.

    Args:
    input_texts: Iterable of strings to evaluate.
    pre_tokenizer: Object exposing ``pre_tokenize_str(text)`` that returns
                   one entry per pre-token.

    Returns:
    float: Mean number of pre-tokens over all texts.
    """
    # The pieces are only counted, so there is no need to slice the text
    # back into chunks before taking the length.
    pretoken_counts = [
        len(pre_tokenizer.pre_tokenize_str(text)) for text in input_texts
    ]
    return np.mean(pretoken_counts)

def evaluate_context_window_openai(input_texts,GPT4_SPLIT_PATTERN):
    """
    Average number of regex-split pieces per text.

    Args:
    input_texts: Iterable of strings to evaluate.
    GPT4_SPLIT_PATTERN: Regular-expression pattern handed to ``re.findall``.

    Returns:
    float: Mean number of pattern matches over all texts.
    """
    piece_counts = [
        len(re.findall(GPT4_SPLIT_PATTERN, text)) for text in input_texts
    ]
    return np.mean(piece_counts)

def calculate_tokenizer_parity(input_texts_target,input_text_source,pre_tokenizer):
    """
    Average ratio of target pre-token count to source pre-token count.

    The two inputs are walked in parallel with ``zip``, so they are expected
    to be aligned sentence by sentence; if they differ in length the extra
    items of the longer one are ignored.

    Args:
    input_texts_target: Iterable of strings in the language being evaluated.
    input_text_source: Iterable of reference strings, aligned with the targets.
    pre_tokenizer: Object exposing ``pre_tokenize_str(text)`` that returns
                   one entry per pre-token.

    Returns:
    float: Mean of ``target_pre_tokens / source_pre_tokens`` over the pairs.
           A pair whose source yields no pre-tokens contributes ``inf``.
    """
    tokenizer_parity_ls = []
    for target_text, source_text in zip(input_texts_target, input_text_source):
        # Only the counts are needed; no need to slice the texts into chunks.
        target_count = len(pre_tokenizer.pre_tokenize_str(target_text))
        source_count = len(pre_tokenizer.pre_tokenize_str(source_text))

        # Parity ("premium") of the target relative to the source sentence.
        if source_count != 0:
            tokenizer_parity_ls.append(target_count / source_count)
        else:
            tokenizer_parity_ls.append(float('inf'))
    return np.mean(tokenizer_parity_ls)

def calculate_tokenizer_openai(input_texts_target,input_text_source,GPT4_SPLIT_PATTERN):
    """
    Average ratio of target piece count to source piece count, using a regex split.

    Args:
    input_texts_target: Iterable of strings in the language being evaluated.
    input_text_source: Iterable of reference strings, paired with the targets via ``zip``.
    GPT4_SPLIT_PATTERN: Regular-expression pattern handed to ``re.findall``.

    Returns:
    float: Mean of ``target_pieces / source_pieces`` over the pairs; a pair
           whose source has no match contributes ``inf``.
    """
    def _premium(target_text, source_text):
        target_count = len(re.findall(GPT4_SPLIT_PATTERN, target_text))
        source_count = len(re.findall(GPT4_SPLIT_PATTERN, source_text))
        if source_count == 0:
            return float('inf')
        return target_count / source_count

    paired_texts = zip(input_texts_target, input_text_source)
    return np.mean([_premium(target, source) for target, source in paired_texts])

In [5]:
# Result tables: one list per column, one appended value per tokenizer.
data_tokenizer_parity = {
    "name":[],
    "english":[],
    "tamil":[],
    "sinhala":[],
    "hindi":[]
}

data_compression_ratio = {
    "name":[],
    "english":[],
    "tamil":[],
    "sinhala":[],
    "hindi":[]
}

# The corpus does not depend on the tokenizer, so it is loaded once up front
# instead of being re-read from disk for every model.
sentences_ls = get_data("./flores/")

for name in tokenizers:
    print(name)
    tokenizer = AutoTokenizer.from_pretrained(name,trust_remote_code=True)
    data_compression_ratio["name"].append(name)
    data_tokenizer_parity["name"].append(name)
    print(tokenizer.is_fast)

    # The same pre-tokenizer is used for every language of this model. It must
    # stay bound for the whole inner loop: resetting it to None after the first
    # language would make the next call fail on `None.pre_tokenize_str`.
    pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
    for lang,sentences in sentences_ls.items():
        comp_ratio = evaluate_compression_ratio(sentences,pre_tokenizer)
        # Parity is measured against the English sentences.
        parity = calculate_tokenizer_parity(sentences,sentences_ls["english"],pre_tokenizer)
        data_compression_ratio[lang].append(round(comp_ratio,2))
        data_tokenizer_parity[lang].append(round(parity,2))

# for name,path in tokenizers_openai:
#     print(name)
#     tokenizer = tiktoken.encoding_for_model(name)
#     sentences_ls = get_data("./flores/")
#     data_compression_ratio["name"].append(name)
#     data_context_window["name"].append(name)
#     for lang,sentences in sentences_ls:
#         comp_ratio = evaluate_compression_ratio_openai(sentences,tokenizer)
#         context_window_len = evaluate_context_window_openai(sentences,tokenizer)
#         data_compression_ratio[lang].append(comp_ratio)
#         data_context_window[lang].append(context_window_len)

# GPT4_SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
# data_compression_ratio["name"].append("GPT4")
# data_tokenizer_parity["name"].append("GPT4")
# sentences_ls = get_data("./flores/")
# for lang,sentences in sentences_ls.items():
#     comp_ratio = evaluate_compression_ratio_openai(sentences,GPT4_SPLIT_PATTERN)
#     parity = calculate_tokenizer_openai(sentences,sentences_ls["english"],GPT4_SPLIT_PATTERN)
#     data_compression_ratio[lang].append(round(comp_ratio,2))
#     data_tokenizer_parity[lang].append(round(parity,2))
    


CohereForAI/aya-101
True
google-bert/bert-base-multilingual-uncased




True
google/mt5-base


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


True
facebook/mbart-large-50
True
facebook/nllb-200-distilled-600M
True


In [None]:
# Sanity check: how many tokenizer names have been recorded in the results.
len(data_compression_ratio["name"])

In [6]:
# Turn the column-wise result dictionaries into tables (one row per tokenizer).
df_compression = pd.DataFrame(data_compression_ratio)
df_parity = pd.DataFrame(data_tokenizer_parity)

In [7]:
# Display the parity table (at most the first 15 rows).
df_parity.head(15)

Unnamed: 0,name,english,tamil,sinhala,hindi
0,CohereForAI/aya-101,1.0,0.78,0.96,1.18
1,google-bert/bert-base-multilingual-uncased,1.0,0.8,0.93,1.13
2,google/mt5-base,1.0,0.78,0.96,1.18
3,facebook/mbart-large-50,1.0,0.78,0.96,1.18
4,facebook/nllb-200-distilled-600M,1.0,0.78,0.96,1.18


In [8]:
# Persist both result tables as CSV.
# The output folder is created first so the cell also works on a fresh
# checkout where ./results does not exist yet (to_csv does not create it).
os.makedirs("./results", exist_ok=True)
df_compression.to_csv("./results/compression_ratio_multilingual_models.csv")
df_parity.to_csv("./results/parity_multilingual_models.csv")

In [28]:
# Load the aya-101 tokenizer for the manual inspection cells below.
# Note: this rebinds the global `tokenizer` left over from the benchmark loop.
tokenizer = AutoTokenizer.from_pretrained("CohereForAI/aya-101",trust_remote_code=True)

In [29]:
# Token ids the tokenizer produces for a short sample phrase.
tokenizer("හෙලෝ වර්ල්ඩ්")["input_ids"]

[31114, 71105, 51190, 4858, 30038, 1]

In [30]:
# Round trip: decode the ids of the sample phrase back into text.
tokenizer.decode(tokenizer("හෙලෝ වර්ල්ඩ්")["input_ids"])

'හෙලෝ වර්ල්ඩ්</s>'

In [31]:
# Decode every id on its own to see the individual pieces the phrase was split into.
for token in tokenizer("හෙලෝ වර්ල්ඩ්")["input_ids"]:
    print(tokenizer.decode(token))

හෙ
ලෝ
වර්
ල්
ඩ්
</s>


In [32]:
# Inspect the pre-tokenization step alone on a sample phrase.
sentence = "හෙලෝ වර්ල්ඩ්!"
pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
# Each entry of the output is unpacked below as (piece, (start, end)).
pretokenized_output = pre_tokenizer.pre_tokenize_str(sentence)
# Keep only the offsets and cut the original sentence with them.
index_pairs = [ index_pair for pretokens,index_pair in pretokenized_output]
pretokens = extract_text_chunks(sentence, index_pairs)

In [33]:
# Show the substrings of the sentence selected by the pre-tokenizer offsets.
pretokens

['හෙලෝ', ' වර්ල්ඩ්!']

In [None]:
"""ක්‍රීඩා

&#x0D9A;&#x0DCA;&#x200D;&#x0DBB;&#x0DD3;&#x0DA9;&#x0DCF;"""