In [3]:
import regex as re
from typing import List, Dict, Tuple
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

In [5]:
def get_syllable(label: str, burmese_consonant: str, others: str) -> List[str]:
    """
    Segment a Burmese word into syllables using regex-based rules.
    
    Args:
        label (str): Input Burmese text or word.
        burmese_consonant (str): Regex range for Burmese consonants.
        others (str): Regex range for other characters (vowels, punctuation, etc.).
    
    Returns:
        List[str]: List of syllables.
    """
    # Define regex patterns for Burmese consonants and other characters
    # label = re.sub(r"(?<![္])(["+burmese_consonant+"])(?![်္|့])|(["+others+"])", r" \1\2", label).strip()
    # label = re.sub('(?<=[က-ၴ])([a-zA-Z0-9])', r' \1', label)
    # label = re.sub('([0-9၀-၉])\s+([0-9၀-၉])\s*', r'\1\2 ', label)
    # label = re.sub('([0-9၀-၉])\s+(\+)', r'\1 \2 ', label)
    # label = label.split()
    label = re.sub(r"(?<![္])([" + burmese_consonant + r"])(?![်္|့])|([" + others + r"])", r" \1\2", label).strip()
    label = re.sub(r"(?<=[က-ၴ])([a-zA-Z0-9])", r" \1", label)
    label = re.sub(r"([0-9၀-၉])\s+([0-9၀-၉])\s*", r"\1\2 ", label)
    label = re.sub(r"([0-9၀-၉])\s+(\+)", r"\1 \2 ", label)
    label = label.split()
    
    return label

In [7]:
def syllable_split(label: str) -> List[str]:
    """
    Split Burmese text into syllables, handling spaces and word boundaries.
    
    Args:
        label (str): Input Burmese text.
    
    Returns:
        List[str]: List of syllables.
    """
    burmese_consonant = 'က-အ'
    others = r"ဣဤဥဦဧဩဪဿ၌၍၏၀-၉၊။!-/:-@[-`{-~\s.,"
    
    label_syllable = [get_syllable(s, burmese_consonant, others) + [' '] for s in label.split()]
    return [s for sublist in label_syllable for s in sublist][:-1]

In [11]:
class BurmeseTokenizer:
    def __init__(self, dictionary: Dict[str, str], bpe_vocab_size: int = 10000):
        """
        Initialize the Burmese tokenizer with a root-and-particle dictionary.
        
        Args:
            dictionary (Dict[str, str]): Dictionary mapping words to 'root' or 'particle'.
            bpe_vocab_size (int): Vocabulary size for BPE training.
        """
        self.dictionary = dictionary
        self.bpe_tokenizer = None
        self.bpe_vocab_size = bpe_vocab_size
    
    def segment_syllables(self, text: str) -> List[str]:
        """
        Segment Burmese text into syllables using the provided syllable_split function.
        
        Args:
            text (str): Input Burmese text.
        
        Returns:
            List[str]: List of syllables.
        """
        return syllable_split(text)
    
    def maximum_matching(self, syllables: List[str]) -> List[str]:
        """
        Recombine syllables into root words and particles using maximum matching.
        
        Args:
            syllables (List[str]): List of syllables.
        
        Returns:
            List[str]: List of tokenized root words and particles.
        """
        tokens = []
        i = 0
        max_len = max(len(word) for word in self.dictionary)  # Max word length in dictionary
        
        while i < len(syllables):
            matched = False
            # Try matching longest possible word from current position
            for length in range(max_len, 0, -1):
                if i + length <= len(syllables):
                    candidate = ''.join(syllables[i:i + length])
                    if candidate in self.dictionary:
                        tokens.append(candidate)
                        i += length
                        matched = True
                        break
            if not matched:
                # If no match, treat as single syllable
                tokens.append(syllables[i])
                i += 1
        return tokens
    
    def train_bpe(self, texts: List[str], special_tokens: List[str] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]):
        """
        Train a BPE tokenizer with dictionary constraints on a corpus.
        
        Args:
            texts (List[str]): List of Burmese texts for training.
            special_tokens (List[str]): Special tokens for NLP frameworks.
        """
        # Initialize BPE tokenizer
        self.bpe_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
        
        # Add dictionary words as pre-tokenized units
        self.bpe_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        
        # Train BPE
        trainer = trainers.BpeTrainer(
            vocab_size=self.bpe_vocab_size,
            special_tokens=special_tokens,
            initial_alphabet=list(self.dictionary.keys())  # Ensure dictionary words are prioritized
        )
        self.bpe_tokenizer.train_from_iterator(texts, trainer)
        
        # Save tokenizer for reuse
        self.bpe_tokenizer.save("burmese_bpe_tokenizer.json")
    
    def tokenize(self, text: str, use_bpe: bool = True) -> Tuple[List[str], List[int]]:
        """
        Tokenize Burmese text into sub-word units.
        
        Args:
            text (str): Input Burmese text.
            use_bpe (bool): Whether to apply BPE after dictionary-based tokenization.
        
        Returns:
            Tuple[List[str], List[int]]: List of tokens and their corresponding IDs.
        """
        # Step 1: Segment into syllables using custom syllable_split
        syllables = self.segment_syllables(text)
        
        # Step 2: Apply maximum matching to get root words and particles
        tokens = self.maximum_matching(syllables)
        
        if use_bpe and self.bpe_tokenizer:
            # Step 3: Apply BPE for sub-word tokenization
            encoded = self.bpe_tokenizer.encode(' '.join(tokens))
            return encoded.tokens, encoded.ids
        else:
            # Return dictionary-based tokens only
            return tokens, list(range(len(tokens)))  # Dummy IDs for non-BPE case

    def decode(self, token_ids: List[int]) -> str:
        """
        Decode token IDs back to Burmese text.
        
        Args:
            token_ids (List[int]): List of token IDs.
        
        Returns:
            str: Decoded Burmese text.
        """
        if self.bpe_tokenizer:
            return self.bpe_tokenizer.decode(token_ids)
        return ''.join(self.dictionary.get(id, '[UNK]') for id in token_ids)
    
# Example Usage
if __name__ == "__main__":
    # Example dictionary (from your previous input)
    dictionary = {
        "ပညာရေး": "root",
        "ပညာ": "root",
        "ဝန်ကြီးဌာန": "root",
        "ဝန်ကြီး": "root",
        "သည်": "particle",
        "အထက်တန်း": "root",
        "အထက်": "root",
        "ကျောင်း": "root",
        "များ": "particle",
        "တွင်": "particle",
        "သင်ကြား": "root",
        "ရေး": "particle",
        "အတွက်": "particle",
        "အထောက်အကူ": "root",
        "ပြု": "root",
        "စာအုပ်": "root",
        "ကို": "particle",
        "ထုတ်ဝေ": "root",
        "ထုတ်": "root",
        "ဝေ": "root",
        "ခဲ့": "particle",
        "။": "punctuation",
        "၊": "punctuation",
    }

    # Initialize tokenizer
    tokenizer = BurmeseTokenizer(dictionary)

    # Example text
    text = "ပညာရေးဝန်ကြီးဌာနသည် အထက်တန်းကျောင်းများတွင် သင်ကြားမှုအတွက် သင်ကြားရေးအထောက်အကူပြု စာအုပ်များကို ထုတ်ဝေခဲ့သည်။"

    # Tokenize without BPE
    tokens, token_ids = tokenizer.tokenize(text, use_bpe=False)
    print("Tokens (no BPE):", tokens)
    print("Token IDs (no BPE):", token_ids)

    # Train BPE on a dummy corpus (replace with your dataset)
    corpus = [text] * 100  # Dummy corpus; use your actual dataset
    tokenizer.train_bpe(corpus)

    # Tokenize with BPE
    tokens, token_ids = tokenizer.tokenize(text, use_bpe=True)
    print("Tokens (with BPE):", tokens)
    print("Token IDs (with BPE):", token_ids)

    # Decode example
    decoded_text = tokenizer.decode(token_ids)
    print("Decoded text:", decoded_text)

Tokens (no BPE): ['ပညာရေး', 'ဝန်ကြီးဌာန', 'သည်', ' ', 'အထက်တန်း', 'ကျောင်း', 'များ', 'တွင်', ' ', 'သင်ကြား', 'မှု', 'အတွက်', ' ', 'သင်ကြား', 'ရေး', 'အထောက်အကူ', 'ပြု', ' ', 'စာအုပ်', 'များ', 'ကို', ' ', 'ထုတ်ဝေ', 'ခဲ့', 'သည်', '။']
Token IDs (no BPE): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]
Tokens (with BPE): ['ပညာ', 'ရေး', 'ဝန်', 'ကြီး', 'ဌာ', 'န', 'သည်', 'အထက်တန်', 'း', 'ကျ', 'ော', 'င်', 'း', 'များ', 'တွင်', 'သင်ကြား', 'မှ', 'ု', 'အတွ', 'က်', 'သင်ကြား', 'ရေး', 'အထော', 'က်အကူ', 'ပြု', 'စာအု', 'ပ်', 'များ', 'ကိ', 'ု', 'ထုတ်ဝေ', 'ခဲ', '့', 'သည်', '။']
Token IDs (with BPE): [66, 51, 69, 81, 59, 13, 46, 84, 28, 55, 48, 36, 28, 50, 82, 52, 67, 23, 71, 38, 52, 51, 83, 80, 97, 92, 64, 50, 53, 23, 95, 56, 27, 46, 35]
Decoded text: ပညာ ရေး ဝန် ကြီး ဌာ န သည် အထက်တန် း ကျ ော င် း များ တွင် သင်ကြား မှ ု အတွ က် သင်ကြား ရေး အထော က်အကူ ပြု စာအု ပ် များ ကိ ု ထုတ်ဝေ ခဲ ့ သည် ။


In [None]:
# langchain integration
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

hf_pipeline = pipeline("text-generation", model="facebook/mbart-large-50", tokenizer=tokenizer.bpe_tokenizer)
llm = HuggingFacePipeline(pipeline=hf_pipeline)
from langchain import PromptTemplate, LLMChain
template = "Answer in Burmese: {question}"
prompt = PromptTemplate(template=template, input_variables=["question"])
llm_chain = LLMChain(prompt=prompt, llm=llm)
response = llm_chain.run("What is the law regarding contracts in Myanmar?")
print(response)

In [None]:
def check_spelling(text: str, tokenizer: BurmeseTokenizer) -> List[Tuple[str, bool]]:
    tokens, _ = tokenizer.tokenize(text, use_bpe=False)
    return [(token, token in tokenizer.dictionary or token in {'[UNK]'}) for token in tokens]

result = check_spelling(text, tokenizer)
for token, is_valid in result:
    print(f"Token: {token}, Valid: {is_valid}")

In [None]:
# HuggingFace integration
from transformers import AutoTokenizer
tokenizer_hf = AutoTokenizer.from_pretrained("burmese_bpe_tokenizer.json")
inputs = tokenizer_hf(text, return_tensors="pt")

In [None]:
# PyTorch integration/ TensorFlow integration
from transformers import AutoTokenizer
tokenizer_hf = AutoTokenizer.from_pretrained("burmese_bpe_tokenizer.json")
inputs = tokenizer_hf(text, return_tensors="pt")

In [None]:
def test_tokenizer():
    tokenizer = BurmeseTokenizer(dictionary)
    text = "ပညာရေးဝန်ကြီးဌာနသည်"
    tokens, _ = tokenizer.tokenize(text, use_bpe=False)
    assert tokens == ['ပညာရေး', 'ဝန်ကြီးဌာန', 'သည်'], "Tokenization failed"
    print("Test passed!")

test_tokenizer()

In [None]:
# Spell check test
text = "ပညာရေးဝန်ကြီးဌာနသည်"  # Add a misspelled word for testing
result = check_spelling(text, tokenizer)
print(result)

In [None]:
# Error handling
import logging
if not matched: 
    logging.warning(f"Unmapped syllable at position {i}: {syllables[i]}")

In [None]:
# batch tokenization
def batch_tokenize(self, texts: List[str], use_bpe: bool = True) -> List[Tuple[List[str], List[int]]]:
    return [self.tokenize(text, use_bpe) for text in texts]