In [None]:
from transformers import AutoTokenizer,CanineTokenizer
import os
import numpy as np
import pandas as pd
import grapheme

In [3]:
def read_data(input_path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Args:
        input_path: Path of the text file to read.

    Returns:
        list[str]: One entry per line of the file, in file order, each passed
        through ``str.strip()``. Blank lines are kept as empty strings.
    """
    stripped_lines = []
    with open(input_path, mode='r', encoding='utf-8') as handle:
        # Iterating the handle yields the same lines as readlines().
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

def get_data(path):
    """Load every mapped language file found directly inside ``path``.

    The language of a file is taken from the text after the last ``.`` in its
    path and translated through the module-level ``lang_mapping`` dict.

    Args:
        path (str): Directory containing one text file per language.

    Returns:
        dict: Maps each ``lang_mapping`` value to the list of stripped lines
        returned by ``read_data`` for the matching file. Files whose suffix
        has no entry in ``lang_mapping`` are reported and skipped.
    """
    sentences_by_lang = {}
    for file_path in list_files_in_directory(path):
        suffix = file_path.split(".")[-1]
        language = lang_mapping.get(suffix)
        if language is None:
            # A stray file (OS/editor metadata, README, ...) used to raise
            # KeyError here and abort the whole load; report and skip instead.
            print(f"Skipping '{file_path}': no language mapping for suffix '{suffix}'.")
            continue
        sentences_by_lang[language] = read_data(file_path)
    return sentences_by_lang

def calculate_tokenizer_parity(input_texts_target,input_text_source,tokenizer):
    """Mean token-count ratio (target / source) over aligned sentence pairs.

    Args:
        input_texts_target: Sentences in the language being measured.
        input_text_source: Reference sentences, paired with the targets by position.
        tokenizer: Callable returning a mapping with an ``"input_ids"`` entry
            for a single sentence.

    Returns:
        ``np.mean`` of the per-pair ratios. A pair whose source sentence
        produces zero tokens contributes ``float('inf')``. Pairs are formed
        with ``zip``, so surplus sentences in the longer list are ignored.

    NOTE(review): the tokenizer is called with its default arguments, so any
    special tokens it adds are included in both counts - confirm this is intended.
    """
    def _token_count(sentence):
        return len(tokenizer(sentence)["input_ids"])

    premiums = []
    for target_sentence, source_sentence in zip(input_texts_target, input_text_source):
        target_len = _token_count(target_sentence)
        source_len = _token_count(source_sentence)
        if source_len == 0:
            premiums.append(float('inf'))
        else:
            premiums.append(target_len / source_len)
    return np.mean(premiums)

def calculate_tokenizer_parity_ours(input_texts_target,input_text_source):
    """Mean segment-count ratio (target / source) using ``grapheme.graphemes`` as the tokenizer.

    Args:
        input_texts_target: Sentences in the language being measured.
        input_text_source: Reference sentences, paired with the targets by position.

    Returns:
        ``np.mean`` of the per-pair ratios of segment counts. A pair whose
        source sentence yields zero segments contributes ``float('inf')``.
        Pairs are formed with ``zip``, so surplus sentences in the longer list
        are ignored.
    """
    def _segment_count(sentence):
        return len(list(grapheme.graphemes(sentence)))

    premiums = []
    for target_sentence, source_sentence in zip(input_texts_target, input_text_source):
        target_len = _segment_count(target_sentence)
        source_len = _segment_count(source_sentence)
        if source_len == 0:
            premiums.append(float('inf'))
        else:
            premiums.append(target_len / source_len)
    return np.mean(premiums)

def evaluate_compression_ratio(input_texts,tokenizer):
    """
    Average characters-per-token ratio of a tokenizer over a list of texts.

    Args:
    input_texts (iterable of str): The texts to measure.
    tokenizer (callable): Returns a mapping with an "input_ids" entry for one text.

    Returns:
    The ``np.mean`` of ``len(text) / number_of_tokens`` over all texts. A text
    that produces zero tokens contributes ``float('inf')``.

    NOTE(review): the tokenizer is called with its default arguments, so any
    special tokens it adds are counted as tokens - confirm this is intended.
    """
    def _ratio_for(text):
        char_count = len(text)
        token_count = len(tokenizer(text)["input_ids"])
        if token_count == 0:
            return float('inf')
        return char_count / token_count

    return np.mean([_ratio_for(text) for text in input_texts])

def evaluate_compression_ratio_ours(input_texts):
    """
    Average characters-per-segment ratio when texts are split with ``grapheme.graphemes``.

    Args:
    input_texts (iterable of str): The texts to measure.

    Returns:
    The ``np.mean`` of ``len(text) / number_of_segments`` over all texts. A text
    that yields zero segments contributes ``float('inf')``.
    """
    def _ratio_for(text):
        char_count = len(text)
        segment_count = len(list(grapheme.graphemes(text)))
        if segment_count == 0:
            return float('inf')
        return char_count / segment_count

    return np.mean([_ratio_for(text) for text in input_texts])

def list_files_in_directory(path):
    """
    Collect the paths of the regular files located directly inside a directory.

    Parameters:
    path (str): The directory to scan (sub-directories are not descended into).

    Returns:
    list: ``os.path.join(path, name)`` for every entry that is a file, in the
    order reported by ``os.listdir``. An empty list is returned, after printing
    an error message, when the directory is missing or cannot be accessed.
    """
    try:
        file_paths = []
        for entry in os.listdir(path):
            candidate = os.path.join(path, entry)
            # Keep plain files only; directories are ignored.
            if os.path.isfile(candidate):
                file_paths.append(candidate)
    except FileNotFoundError:
        print(f"Error: The directory '{path}' does not exist.")
        return []
    except PermissionError:
        print(f"Error: Permission denied for accessing the directory '{path}'.")
        return []
    else:
        return file_paths

In [4]:
# Baselines to evaluate: (identifier passed to from_pretrained, tokenizer class used to load it).
tokenizers = [
    ("google/canine-c", CanineTokenizer),
    ("google/byt5-base", AutoTokenizer),
]

# Maps the suffix of each data file name to the column name used in the result tables.
lang_mapping = dict(
    eng_Latn="english",
    hin_Deva="hindi",
    sin_Sinh="sinhala",
    tam_Taml="tamil",
)

In [8]:
# Result tables: one list per column; every evaluated method appends one entry
# to "name" and one to each language column, so rows stay aligned by position.
data_tokenizer_parity = {
    "name":[],
    "english":[],
    "tamil":[],
    "sinhala":[],
    "hindi":[]
}

data_compression_ratio = {
    "name":[],
    "english":[],
    "tamil":[],
    "sinhala":[],
    "hindi":[]
}

# Load the corpus once: it does not depend on the tokenizer, so re-reading it
# from disk for every method (as was done before) was wasted work.
sentences_ls = get_data("./flores/")

for name,t in tokenizers:
    print(name)
    # NOTE(review): trust_remote_code=True permits running code shipped with the
    # model repository; presumably not needed for these tokenizers - confirm and drop.
    tokenizer = t.from_pretrained(name,trust_remote_code=True)
    data_compression_ratio["name"].append(name)
    data_tokenizer_parity["name"].append(name)
    print(tokenizer.is_fast)
    for lang,sentences in sentences_ls.items():
        comp_ratio = evaluate_compression_ratio(sentences,tokenizer)
        # Parity is always measured against the English sentences.
        parity = calculate_tokenizer_parity(sentences,sentences_ls["english"],tokenizer)
        data_compression_ratio[lang].append(round(comp_ratio,2))
        data_tokenizer_parity[lang].append(round(parity,2))

# Append our grapheme-based method as the last row of both tables.
data_compression_ratio["name"].append("Ours")
data_tokenizer_parity["name"].append("Ours")
for lang,sentences in sentences_ls.items():
    comp_ratio = evaluate_compression_ratio_ours(sentences)
    parity = calculate_tokenizer_parity_ours(sentences,sentences_ls["english"])
    data_compression_ratio[lang].append(round(comp_ratio,2))
    data_tokenizer_parity[lang].append(round(parity,2))

google/canine-c
False
google/byt5-base
False


In [9]:
# Show the collected compression-ratio table (dict of column lists) as the cell output.
data_compression_ratio

{'name': ['google/canine-c', 'google/byt5-base', 'Ours'],
 'english': [np.float64(0.98), np.float64(0.99), np.float64(1.0)],
 'tamil': [np.float64(0.99), np.float64(0.37), np.float64(1.55)],
 'sinhala': [np.float64(0.98), np.float64(0.38), np.float64(1.41)],
 'hindi': [np.float64(0.98), np.float64(0.39), np.float64(1.45)]}

In [10]:
# Turn the per-column result lists into DataFrames (one row per evaluated method).
df_compression = pd.DataFrame(data_compression_ratio)
df_parity = pd.DataFrame(data_tokenizer_parity)

In [12]:
# Display up to the first 15 rows of the compression-ratio frame.
df_compression.head(15)

Unnamed: 0,name,english,tamil,sinhala,hindi
0,google/canine-c,0.98,0.99,0.98,0.98
1,google/byt5-base,0.99,0.37,0.38,0.39
2,Ours,1.0,1.55,1.41,1.45


In [13]:
# to_csv does not create missing directories, so make sure the output folder
# exists; otherwise a fresh checkout fails here on Restart & Run All.
os.makedirs("./results", exist_ok=True)
df_compression.to_csv("./results/compression_ratio_byte_level.csv")
df_parity.to_csv("./results/parity_byte_level.csv")