In [None]:
pip install pandas sentencepiece tokenizers transformers protobuf datasets

In [None]:
# make sure tokenizers is up to date (0.15.0+) otherwise install/update
# pip install -U tokenizers

In [None]:
!wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py

In [6]:
import tokenizers
print(f"Tokenizers version: {tokenizers.__version__}")

from tokenizers.implementations import SentencePieceUnigramTokenizer

# checking if there is a from_file method
if hasattr(SentencePieceUnigramTokenizer, 'from_file'):
    print("SentencePieceUnigramTokenizer.from_file() is available.")
else:
    print("SentencePieceUnigramTokenizer.from_file() is not available.")
    # if it is not present getting available methods starting with "from_"
    print("available 'from_...' methods:", [m for m in dir(SentencePieceUnigramTokenizer) if m.startswith('from_')])

# also check the base Tokenizer class maybe from_file there
from tokenizers import Tokenizer
if hasattr(Tokenizer, 'from_file'):
    print("Tokenizer.from_file() is available.")
else:
    print("Tokenizer.from_file() is not available.")

Tokenizers version: 0.21.1
SentencePieceUnigramTokenizer.from_file() is not available.
available 'from_...' methods: ['from_spm']
Tokenizer.from_file() is available.


In [None]:
import pandas as pd
import sentencepiece as spm
from tokenizers import SentencePieceUnigramTokenizer # for .from_spm()
from tokenizers import Tokenizer # for Tokenizer.from_file() if needed (not here)
from transformers import PreTrainedTokenizerFast
from datasets import load_dataset
import os
import tempfile
import traceback # for more detailed error output

# --- Configuration ---
DATASET_NAME = "LocalDoc/AzTC"
TEXT_COLUMN = "text"

# number of rows to train the tokenizer
# If False use the entire dataset
# If a number use the specified number of rows
NUM_TRAINING_SAMPLES = 10000000

TEMP_TRAIN_DATA_FILE_PREFIX = "spm_train_data_"
SPM_MODEL_PREFIX = "aztc_azerbaijani_spm"
VOCAB_SIZE = 32000
CHARACTER_COVERAGE = 0.9995
MODEL_TYPE = "unigram"

# strings we want to use for special tokens
UNK_PIECE_STR = "[UNK]"
PAD_PIECE_STR = "[PAD]"
BOS_PIECE_STR = "[CLS]" # using CLS as BOS
EOS_PIECE_STR = "[SEP]" # using SEP like EOS
MASK_PIECE_STR = "[MASK]" # this is our "extra" special token

# tokens that we explicitly want to add to the dictionary via user_defined_symbols,
# and that are NOT the default unk/pad/bos/eos for SentencePiece.
# SentencePiece will figure out unk/pad/bos/eos via _piece parameters.
SPM_USER_DEFINED_SYMBOLS_ONLY_CUSTOM = MASK_PIECE_STR # Only MASK here

HF_TOKENIZER_OUTPUT_DIR = "./aztc_tokenizer_hf"

temp_train_data_path = None

try:
    # --- 0. Check tokenizers version and method availability (for debugging) ---
    print("--- Tokenizers library information ---")
    import tokenizers as tk_lib # use alias to avoid conflict with variable
    print(f"Tokenizers version: {tk_lib.__version__}")
    if hasattr(SentencePieceUnigramTokenizer, 'from_spm'):
        print("SentencePieceUnigramTokenizer.from_spm() is available.")
    else:
        print("SentencePieceUnigramTokenizer.from_spm() is not available. this could be a problem.")
    print("-------------------------------------------")

    # --- 1.Load AzTC dataset ---
    print(f"Loading dataset {DATASET_NAME}...")
    dataset = load_dataset(DATASET_NAME)
    
    # get train split (usually the main split for such datasets)
    if 'train' in dataset:
        train_data = dataset['train']
    else:
        # If no train split, take the first available one
        available_splits = list(dataset.keys())
        print(f"Available splits: {available_splits}")
        train_data = dataset[available_splits[0]]
    
    print(f"Dataset loaded. Total number of samples: {len(train_data)}")
    
    if TEXT_COLUMN not in train_data.column_names:
        raise ValueError(f"Column '{TEXT_COLUMN}' not found in dataset. Available columns: {train_data.column_names}")

    # --- 2. Prepare training data ---
    print("Preparing texts for training...")
    
    if NUM_TRAINING_SAMPLES is False:
        print("Using entire dataset for tokenizer training")
        texts_to_use = train_data[TEXT_COLUMN]
        samples_count = len(train_data)
    else:
        print(f"Using {NUM_TRAINING_SAMPLES} samples for tokenizer training")
        samples_count = min(NUM_TRAINING_SAMPLES, len(train_data))
        # Take first N samples
        texts_to_use = train_data.select(range(samples_count))[TEXT_COLUMN]
    
    print(f"Number of texts for training: {samples_count}")

    # Create temporary file for SentencePiece training
    fd, temp_train_data_path = tempfile.mkstemp(prefix=TEMP_TRAIN_DATA_FILE_PREFIX, suffix=".txt")
    with os.fdopen(fd, "w", encoding="utf-8") as tmp_file:
        for text in texts_to_use:
            if text and isinstance(text, str) and text.strip(): #  check that text is not empty
                tmp_file.write(text.strip() + "\n")
    
    print(f"Texts written to temporary file: {temp_train_data_path}")

    # --- 3. Train SentencePiece tokenizer ---
    print("\nTraining SentencePiece tokenizer...")

    spm_command_args = [
        f"--input={temp_train_data_path}",
        f"--model_prefix={SPM_MODEL_PREFIX}",
        f"--vocab_size={VOCAB_SIZE}",
        f"--character_coverage={CHARACTER_COVERAGE}",
        f"--model_type={MODEL_TYPE}",
        f"--unk_piece={UNK_PIECE_STR}",
        f"--pad_piece={PAD_PIECE_STR}",
        f"--bos_piece={BOS_PIECE_STR}",
        f"--eos_piece={EOS_PIECE_STR}",
        f"--user_defined_symbols={SPM_USER_DEFINED_SYMBOLS_ONLY_CUSTOM}",
        "--shuffle_input_sentence=true",
        f"--input_sentence_size={min(20000000, samples_count)}", # limit for speed if needed
        "--hard_vocab_limit=false",
    ]
    
    spm_command_str = " ".join(spm_command_args)
    print(f"SPM command: {spm_command_str}")

    spm.SentencePieceTrainer.Train(spm_command_str)
    print(f"SentencePiece model trained and saved with prefix: {SPM_MODEL_PREFIX}")
    spm_model_file = f"{SPM_MODEL_PREFIX}.model"
    if not os.path.exists(spm_model_file):
        raise FileNotFoundError(f"SentencePiece model file {spm_model_file} not found after training.")

    # --- 4. Convert to Hugging Face tokenizer ---
    print("\nConverting to Hugging Face tokenizer...")

    try:
        print(f"Trying via SentencePieceUnigramTokenizer.from_spm('{spm_model_file}')...")
        hf_tokenizer_slow = SentencePieceUnigramTokenizer.from_spm(spm_model_file)
        print("Loaded using SentencePieceUnigramTokenizer.from_spm()")

    except Exception as e_load: # use diferent name for exception variable
        print(f"Error loading via SentencePieceUnigramTokenizer.from_spm(): {e_load}")
        print("If this is AttributeError, make sure the 'tokenizers' version is fresh enough (0.15.0+).")
        traceback.print_exc() 
        raise # re-raise error to stop execution if loading failed

    hf_tokenizer_fast = PreTrainedTokenizerFast(
        tokenizer_object=hf_tokenizer_slow,
        unk_token=UNK_PIECE_STR,
        pad_token=PAD_PIECE_STR,
        cls_token=BOS_PIECE_STR, 
        sep_token=EOS_PIECE_STR, 
        mask_token=MASK_PIECE_STR,
        # additional parameters if needed for our model:
        # model_max_length=512, # if we need to limit default length
        # padding_side='right', # or 'left'
        # truncation_side='right', # or 'left'
    )

    if not os.path.exists(HF_TOKENIZER_OUTPUT_DIR):
        os.makedirs(HF_TOKENIZER_OUTPUT_DIR)

    hf_tokenizer_fast.save_pretrained(HF_TOKENIZER_OUTPUT_DIR)
    print(f"Hugging Face tokenizer saved to: {HF_TOKENIZER_OUTPUT_DIR}")

    # --- 5. Testing ---
    print("\n--- Testing tokenizer ---")
    # load saved hf tokenizer for verification
    tokenizer_test = PreTrainedTokenizerFast.from_pretrained(HF_TOKENIZER_OUTPUT_DIR)

    test_sentences = [
        "Bu Azərbaycan dilində bir cümlədir.",
        "Azərbaycan Respublikasının paytaxtı Bakı şəhəridir.",
    ]

    for sentence in test_sentences:
        encoded = tokenizer_test.encode(sentence)
        tokens = tokenizer_test.convert_ids_to_tokens(encoded)
        print(f"\nOriginal: {sentence}")
        print(f"Encoded IDs: {encoded}")
        print(f"Tokens: {tokens}")
        # print(f"decoded: {tokenizer_test.decode(encoded)}") # decode can behave differently with special tokens

    print(f"\ntokenizer vocabulary contains {tokenizer_test.vocab_size} tokens.")
    print(f"PAD token: '{tokenizer_test.pad_token}' (ID: {tokenizer_test.pad_token_id})")
    print(f"UNK token: '{tokenizer_test.unk_token}' (ID: {tokenizer_test.unk_token_id})")
    print(f"CLS token: '{tokenizer_test.cls_token}' (ID: {tokenizer_test.cls_token_id})")
    print(f"SEP token: '{tokenizer_test.sep_token}' (ID: {tokenizer_test.sep_token_id})")
    print(f"MASK token: '{tokenizer_test.mask_token}' (ID: {tokenizer_test.mask_token_id})")

    # Show examples from original dataset
    print(f"\n--- Examples from AzTC dataset ---")
    sample_texts = train_data.select(range(min(5, len(train_data))))[TEXT_COLUMN]
    for i, text in enumerate(sample_texts):
        print(f"Example {i+1}: {text[:100]}{'...' if len(text) > 100 else ''}")

except Exception as e_main: # name for general exception
    print(f"\nMain script execution error occurred: {e_main}")
    traceback.print_exc()

finally:
    if temp_train_data_path and os.path.exists(temp_train_data_path):
        print(f"\nRemoving temporary file: {temp_train_data_path}")
        os.remove(temp_train_data_path)

print("\nDone!")

In [5]:
import pandas as pd
from transformers import PreTrainedTokenizerFast, XLMRobertaTokenizer, BertTokenizer
from datasets import load_dataset
import numpy as np
import time
from collections import Counter

LOCAL_TOKENIZER_PATH = "./aztc_tokenizer_hf"
DATASET_NAME = "LocalDoc/AzTC"
TEXT_COLUMN = "text"
NUM_TEST_SAMPLES = 1000

def load_tokenizers():
    local_tokenizer = PreTrainedTokenizerFast.from_pretrained(LOCAL_TOKENIZER_PATH)
    xlm_roberta_tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
    mbert_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
    
    return {
        "Local": local_tokenizer,
        "XLM-RoBERTa": xlm_roberta_tokenizer,
        "mBERT": mbert_tokenizer
    }

def test_tokenization_speed(tokenizers, texts):
    results = {}
    
    for name, tokenizer in tokenizers.items():
        start_time = time.time()
        for text in texts:
            tokenizer.encode(text, add_special_tokens=True)
        end_time = time.time()
        results[name] = end_time - start_time
    
    return results

def analyze_tokenization_quality(tokenizers, texts):
    results = {}
    
    for name, tokenizer in tokenizers.items():
        token_counts = []
        unk_counts = []
        
        for text in texts:
            tokens = tokenizer.encode(text, add_special_tokens=True)
            token_counts.append(len(tokens))
            
            token_strings = tokenizer.convert_ids_to_tokens(tokens)
            unk_count = sum(1 for token in token_strings if token in [tokenizer.unk_token, "<unk>", "[UNK]"])
            unk_counts.append(unk_count)
        
        results[name] = {
            "avg_tokens": np.mean(token_counts),
            "avg_unk_tokens": np.mean(unk_counts),
            "unk_percentage": (np.sum(unk_counts) / np.sum(token_counts)) * 100,
            "vocab_size": tokenizer.vocab_size
        }
    
    return results

def detailed_comparison(tokenizers, sample_texts):
    print("=== DETAILED TOKENIZATION COMPARISON ===\n")
    
    for i, text in enumerate(sample_texts[:5]):
        print(f"Sample {i+1}: {text[:100]}...")
        print("-" * 50)
        
        for name, tokenizer in tokenizers.items():
            tokens = tokenizer.encode(text, add_special_tokens=True)
            token_strings = tokenizer.convert_ids_to_tokens(tokens)
            
            print(f"{name}:")
            print(f"  Tokens ({len(tokens)}): {token_strings}")
            print(f"  Token IDs: {tokens}")
        print()

def main():
    print("Loading tokenizers...")
    tokenizers = load_tokenizers()
    
    print("Loading test dataset...")
    dataset = load_dataset(DATASET_NAME)
    if 'train' in dataset:
        test_data = dataset['train']
    else:
        test_data = dataset[list(dataset.keys())[0]]
    
    test_texts = test_data.select(range(min(NUM_TEST_SAMPLES, len(test_data))))[TEXT_COLUMN]
    test_texts = [text for text in test_texts if text and isinstance(text, str) and text.strip()]
    
    print(f"Testing with {len(test_texts)} samples...")
    
    print("\n=== SPEED COMPARISON ===")
    speed_results = test_tokenization_speed(tokenizers, test_texts[:100])
    for name, time_taken in speed_results.items():
        print(f"{name}: {time_taken:.4f} seconds")
    
    print("\n=== QUALITY ANALYSIS ===")
    quality_results = analyze_tokenization_quality(tokenizers, test_texts)
    
    df = pd.DataFrame(quality_results).T
    print(df.round(2))
    
    print("\n=== VOCABULARY EFFICIENCY ===")
    for name, results in quality_results.items():
        efficiency = results['vocab_size'] / results['avg_tokens']
        print(f"{name}: {efficiency:.2f} (vocab_size/avg_tokens)")
    
    detailed_comparison(tokenizers, test_texts)
    
    print("=== AZERBAIJANI SPECIFIC TESTS ===")
    azerbaijani_samples = [
        "Azərbaycan Respublikası Cənubi Qafqazda yerləşən ölkədir.",
        "Bakı şəhəri Azərbaycanın paytaxtı və ən böyük şəhəridir.",
        "Xəzər dənizi Azərbaycanın şərq sərhədini təşkil edir.",
        "Azərbaycan dilində 32 hərf var və latın əlifbasından istifadə olunur.",
        "Naxçıvan Azərbaycanın muxtar respublikasıdır."
    ]
    
    detailed_comparison(tokenizers, azerbaijani_samples)

if __name__ == "__main__":
    main()

Loading tokenizers...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Loading test dataset...


Token indices sequence length is longer than the specified maximum sequence length for this model (572 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (728 > 512). Running this sequence through the model will result in indexing errors


Testing with 1000 samples...

=== SPEED COMPARISON ===
Local: 0.0174 seconds
XLM-RoBERTa: 0.0182 seconds
mBERT: 0.0428 seconds

=== QUALITY ANALYSIS ===
             avg_tokens  avg_unk_tokens  unk_percentage  vocab_size
Local             37.85            0.01            0.02     32000.0
XLM-RoBERTa       50.15            0.00            0.00    250002.0
mBERT             64.46            1.14            1.77    119547.0

=== VOCABULARY EFFICIENCY ===
Local: 845.38 (vocab_size/avg_tokens)
XLM-RoBERTa: 4985.48 (vocab_size/avg_tokens)
mBERT: 1854.45 (vocab_size/avg_tokens)
=== DETAILED TOKENIZATION COMPARISON ===

Sample 1: — Aşağıdakı hallarda işlədilən durğu işarəsi: Nida cümləsinin sonunda....
--------------------------------------------------
Local:
  Tokens (14): ['▁—', '▁Aşağıdakı', '▁hallarda', '▁işlədilən', '▁dur', 'ğu', '▁işarəsi', ':', '▁Ni', 'da', '▁cümləsi', 'nin', '▁sonunda', '.']
  Token IDs: [46, 22805, 1354, 21606, 3765, 5169, 16966, 25, 2162, 21, 26966, 37, 1493, 4]
XLM-