# Demo and test tokenization

## BPE tokenization

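Suppose we have trained the BPE tokenizer. The cell below demonstrates how `update_vocab` dispatches to the vocabulary-update routine for each tokenizer model.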

In [5]:
import os
import json
from typing import List, Tuple, Optional, Union
from functools import wraps
import logging

logger = logging.getLogger(__name__)

def write_json(filepath, data):
    """
    Serialize ``data`` as JSON and write it to ``filepath``.

    Args:
    - filepath: path of the file to write
        The file is opened in "w" mode, so existing content is replaced.

    - data: the object to serialize
        Must be serializable by ``json.dump``.
    """
    # Pin the encoding so the written file does not depend on the
    # platform's locale default.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f)

def _bpe_vocab_update(*args, **kwargs):
    """
    Placeholder for the BPE vocabulary update.

    Accepts any positional and keyword arguments and ignores them; for now
    it only announces on stdout that it was called.
    """
    message = "BPE vocab update called"
    print(message)
    # Actual implementation here

def _unigram_vocab_update(*args, **kwargs):
    """
    Placeholder for the Unigram vocabulary update.

    Accepts any positional and keyword arguments and ignores them; for now
    it only announces on stdout that it was called.
    """
    message = "Unigram vocab update called"
    print(message)
    # Actual implementation here

def _spm_vocab_update(*args, **kwargs):
    """
    Placeholder for the SPM vocabulary update.

    Accepts any positional and keyword arguments and ignores them; for now
    it only announces on stdout that it was called.
    """
    message = "SPM vocab update called"
    print(message)
    # Actual implementation here

# Define the decorator
def vocab_update(func):
    """
    Decorator: run ``func``, then dispatch to the tokenizer-specific vocab update.

    The wrapper prints and logs the positional arguments, calls ``func``
    (its return value is discarded), and then forwards the same ``*args`` /
    ``**kwargs`` to one of ``_bpe_vocab_update``, ``_unigram_vocab_update``
    or ``_spm_vocab_update`` depending on ``tokenizer_model``.

    ``tokenizer_model`` is taken from the keyword arguments when given there;
    otherwise it is resolved against the signature of ``func``, so a value
    passed positionally or left at the default declared by ``func`` is
    honoured as well.

    Args:
    - func: the function to wrap

    Returns:
    - the wrapper, whose return value is that of the selected updater

    Raises (from the wrapper):
    - ValueError: if the resolved tokenizer model is not "BPE", "UNIGRAM" or "SPM"
    """
    # Function-scope import so the decorator does not depend on the
    # module's import block.
    import inspect

    # Inspect the signature once, at decoration time. Callables that cannot
    # be introspected fall back to keyword-only lookup.
    try:
        signature = inspect.signature(func)
    except (TypeError, ValueError):
        signature = None

    @wraps(func)
    def wrapper(*args, **kwargs):
        # Print or log the *args value
        print(f"*args: {args}")
        logger.info("*args: %s", args)

        # Call the original update_vocab function
        func(*args, **kwargs)

        # Then call the appropriate vocab update function
        tokenizer_model = kwargs.get('tokenizer_model')
        if tokenizer_model is None and signature is not None:
            # Not given by keyword: recover it from a positional argument
            # or from the default declared on the wrapped function.
            bound = signature.bind(*args, **kwargs)
            bound.apply_defaults()
            tokenizer_model = bound.arguments.get('tokenizer_model')

        if tokenizer_model == 'BPE':
            return _bpe_vocab_update(*args, **kwargs)
        elif tokenizer_model == 'UNIGRAM':
            return _unigram_vocab_update(*args, **kwargs)
        elif tokenizer_model == 'SPM':
            return _spm_vocab_update(*args, **kwargs)
        else:
            raise ValueError(f"Unknown tokenizer model: {tokenizer_model}")

    return wrapper

@vocab_update
def update_vocab(
        input_dir: str,
        output_dir: str,
        tokenizer_model: str="BPE",
        tokenizer_spm_sublevel: Optional[str]="UNIGRAM",
        vocab_fname: Union[List[str], Tuple[str, ...]]=("vocab.json", "merges.txt"),
        new_special_tokens: Optional[Union[List[str], Tuple[str, ...]]]=("<BOS>", "<UNK>", "<EOS>", "<MASK>"),
        new_vocab_size: int=5009,
):
    """
    Update the vocabulary file for the model.

    This body only reports which tokenizer model was requested; the
    ``vocab_update`` decorator then dispatches to the model-specific updater.

    Args:
    - input_dir: str, the model name or path
        the directory name that contains the vocabulary files

    - output_dir: str, the output directory

    - tokenizer_model: str, the tokenizer type, default is "BPE"
        Could be "BPE" | "UNIGRAM" | "SPM"
        NOTE: For "SPM", we only consider the SPM-unigram tokenizer for now.

    - tokenizer_spm_sublevel: str, the sub-level of SPM tokenizer, default is "UNIGRAM"
        Could be "UNIGRAM" | "BPE".
        NOTE: This parameter is only used for SPM tokenizer.

    - vocab_fname: List[str] or Tuple[str, ...], the vocabulary file name,
        default is ("vocab.json", "merges.txt")
        For BPE tokenizer, the vocabulary file is "vocab.json" and "merges.txt"
        For UNIGRAM tokenizer, the vocabulary file is "unigram.json"
        For SPM tokenizer, the vocabulary file is "spm_vocab.model" and "spm_vocab.vocab"

    - new_special_tokens: List[str] or Tuple[str, ...], optional,
        default is ("<BOS>", "<UNK>", "<EOS>", "<MASK>")
        NOTE: Not read by this function body yet.

    - new_vocab_size: int, default is 5009
        NOTE: Not read by this function body yet.
    """
    # Sequence defaults are tuples rather than lists so that no mutable
    # object is shared between calls.
    print(f"Updating vocab for tokenizer type: {tokenizer_model}")
    # Additional logic can be added here if needed

# Example calls: run the update once per supported tokenizer model
for example_model in ('BPE', 'UNIGRAM', 'SPM'):
    update_vocab("input_dir_path", "output_dir_path", tokenizer_model=example_model)

*args: ('input_dir_path', 'output_dir_path')
Updating vocab for tokenizer type: BPE
BPE vocab update called
*args: ('input_dir_path', 'output_dir_path')
Updating vocab for tokenizer type: UNIGRAM
Unigram vocab update called
*args: ('input_dir_path', 'output_dir_path')
Updating vocab for tokenizer type: SPM
SPM vocab update called
