In [None]:
# Cell 1: Import libraries
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import (
    VisionEncoderDecoderModel,
    TrOCRProcessor,
    AutoTokenizer,
    AutoImageProcessor,
    get_linear_schedule_with_warmup,
    PreTrainedTokenizerFast,
)
from datasets import load_dataset
from PIL import Image
import io
from tqdm import tqdm
import os
from typing import Dict, List
import numpy as np

# Confirm that the import cell ran and report the PyTorch build and whether
# a CUDA device is visible, one item per output line.
print(
    "‚úÖ All libraries imported",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

In [None]:
# Cell: Train Thai tokenizer with SentencePiece (no ## prefix)
import sentencepiece as spm
from transformers import PreTrainedTokenizerFast
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers, processors
import os


In [None]:
print("üî® Training Thai tokenizer with SentencePiece...")

# Build the plain-text training corpus once and reuse it on later runs:
# when thai_corpus.txt already exists it is kept as-is and nothing is loaded.
if not os.path.exists('thai_corpus.txt'):
    print("üì¶ Loading Thai corpus...")
    from datasets import load_dataset

    # Load the train split of the Thai handwriting dataset.
    ds = load_dataset("iapp/thai_handwriting_dataset")["train"]

    # Read only the 'text' column. Iterating whole rows would also materialise
    # every other column of each example (presumably decoded images for this
    # dataset - TODO confirm), which is wasted work when only the
    # transcriptions are needed.
    # Keep transcriptions longer than 10 characters and skip missing texts,
    # which would otherwise make len() raise.
    all_texts = [text for text in ds["text"] if text and len(text) > 10]

    # Fail before writing anything: an empty corpus file would be picked up as
    # a valid cache by the os.path.exists() check on every later run.
    if not all_texts:
        raise ValueError(
            "No transcription longer than 10 characters was found; "
            "refusing to write an empty thai_corpus.txt"
        )

    print(f"‚úÖ Collected {len(all_texts)} texts")

    # Save the transcriptions separated by newlines.
    with open('thai_corpus.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(all_texts))

    print("‚úÖ Corpus saved to thai_corpus.txt")
else:
    print("‚úÖ Found existing thai_corpus.txt")

# SentencePiece training configuration, collected in one place.
# NOTE(review): the output prefix says "30000" while vocab_size is 50000 -
# confirm which one is intended. The prefix is left unchanged because the
# resulting file name 'thai_sp_30000.model' is loaded below.
sp_train_options = dict(
    input='thai_corpus.txt',
    model_prefix='thai_sp_30000',
    vocab_size=50000,
    character_coverage=0.9995,
    model_type='unigram',  # or 'bpe'
    # Ids reserved for the special pieces, and the strings used for them.
    # Any tokenizer wrapper that loads this model has to use these exact
    # piece strings for its special tokens.
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
    pad_piece='[PAD]',
    unk_piece='[UNK]',
    bos_piece='[CLS]',
    eos_piece='[SEP]',
    user_defined_symbols=['[MASK]'],
    normalization_rule_name='identity',  # do not convert case
)

# Train the SentencePiece model on the corpus file.
print("\nüî® Training SentencePiece model...")
spm.SentencePieceTrainer.train(**sp_train_options)
print("‚úÖ SentencePiece model trained")

# Load the freshly trained model and report the vocabulary size it ended up with.
sp = spm.SentencePieceProcessor()
sp.load('thai_sp_30000.model')
print(f"üìä Vocabulary size: {sp.vocab_size()}")

# Smoke-test the trained model on a short Thai sentence: split it into
# pieces, map it to ids, and decode the pieces back to text.
test_text = "‡∏™‡∏ß‡∏±‡∏™‡∏î‡∏µ‡∏Ñ‡∏£‡∏±‡∏ö ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢"
tokens = sp.encode_as_pieces(test_text)
ids = sp.encode_as_ids(test_text)
decoded = sp.decode_pieces(tokens)

print(f"\nüìù Test: {test_text}")
print(f"Tokens: {tokens}")
print(f"IDs: {ids}")
print(f"Decoded: {decoded}")

# Check the claim instead of printing it unconditionally: none of the
# produced pieces may start with the '##' continuation prefix.
hash_prefixed = [piece for piece in tokens if piece.startswith('##')]
assert not hash_prefixed, f"Unexpected '##'-prefixed pieces: {hash_prefixed}"
print("\n‚úÖ No '##' prefix in tokens!")

In [None]:
# Cell: Load your custom tokenizer correctly
import sentencepiece as spm
from transformers import PreTrainedTokenizer

# Define the class again (it must be in memory before it can be instantiated)
class SimpleSPTokenizer(PreTrainedTokenizer):
    """Minimal ``PreTrainedTokenizer`` wrapper around a SentencePiece model.

    Tokenisation, id lookup and detokenisation are all delegated to the
    loaded ``SentencePieceProcessor``.

    Args:
        sp_model_path: Path to a trained SentencePiece ``.model`` file.
        **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``. Explicit
            ``unk_token`` / ``bos_token`` / ``eos_token`` / ``pad_token``
            values take precedence over the ones read from the model.
    """

    def __init__(self, sp_model_path, **kwargs):
        # Remember where the model came from so save_vocabulary() can copy
        # the file that was actually loaded.
        self.sp_model_path = sp_model_path

        # The processor is created before the base-class constructor runs
        # (presumably the base constructor may query the vocabulary).
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(sp_model_path)

        # Read the special-token strings from the model itself rather than
        # hard-coding "<unk>", "<s>", "</s>", "<pad>": a model trained with
        # other piece names (e.g. [UNK]/[CLS]/[SEP]/[PAD]) does not contain
        # those strings, so they would not resolve to the model's own
        # special-token ids.
        special_ids = {
            "unk_token": self.sp.unk_id(),
            "bos_token": self.sp.bos_id(),
            "eos_token": self.sp.eos_id(),
            "pad_token": self.sp.pad_id(),
        }
        for name, piece_id in special_ids.items():
            # SentencePiece reports a negative id for a disabled special
            # token; leave such a token unset instead of inventing a string.
            piece = self.sp.id_to_piece(piece_id) if piece_id >= 0 else None
            kwargs.setdefault(name, piece)

        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        """Number of pieces in the SentencePiece model."""
        return self.sp.vocab_size()

    def get_vocab(self):
        """Return a ``{piece: id}`` mapping covering every model piece."""
        return {self.sp.id_to_piece(i): i for i in range(self.sp.vocab_size())}

    def _tokenize(self, text):
        """Split ``text`` into SentencePiece pieces."""
        return self.sp.encode_as_pieces(text)

    def _convert_token_to_id(self, token):
        """Map a piece string to its id in the SentencePiece model."""
        return self.sp.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Map an id to its piece string in the SentencePiece model."""
        return self.sp.id_to_piece(index)

    def convert_tokens_to_string(self, tokens):
        """Join a sequence of pieces back into plain text."""
        return self.sp.decode_pieces(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the SentencePiece model into ``save_directory``.

        Args:
            save_directory: Target directory; created if it does not exist.
            filename_prefix: Optional prefix; the file is named
                ``<prefix>-spm.model`` when given, otherwise ``spm.model``.

        Returns:
            A one-element tuple containing the path of the written file.
        """
        import os
        import shutil

        os.makedirs(save_directory, exist_ok=True)

        out_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "spm.model"
        )

        source_path = self.sp_model_path
        if os.path.isfile(source_path):
            # Copy the model this instance was built from, not a fixed file
            # name. Skip the copy when source and target are the same file,
            # because shutil.copy raises SameFileError in that case.
            if os.path.abspath(source_path) != os.path.abspath(out_file):
                shutil.copy(source_path, out_file)
        else:
            # The original file is gone: serialise the in-memory model.
            with open(out_file, "wb") as model_out:
                model_out.write(self.sp.serialized_model_proto())
        return (out_file,)

# Load the tokenizer by creating a fresh instance from the trained model file.
your_tokenizer = SimpleSPTokenizer(sp_model_path='thai_sp_30000.model')
print("‚úÖ Your tokenizer loaded")

# Round-trip check on a short Thai sentence: tokenize it, encode it without
# special tokens, decode the ids, and compare the result with the input.
test_text = "‡∏™‡∏ß‡∏±‡∏™‡∏î‡∏µ‡∏Ñ‡∏£‡∏±‡∏ö ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢"
your_tokens = your_tokenizer.tokenize(test_text)
your_encoded = your_tokenizer.encode(test_text, add_special_tokens=False)
your_decoded = your_tokenizer.decode(your_encoded)

print(
    f"Original: '{test_text}'",
    f"Tokens: {your_tokens}",
    f"Decoded: '{your_decoded}'",
    f"Match: {your_decoded.strip() == test_text}",
    sep="\n",
)

In [None]:
# Quick test
from transformers import AutoTokenizer

# Pieces of the custom tokenizer for the sample sentence. This is a
# hard-coded snapshot, not a live call to the tokenizer.
your_tokens = ['‚ñÅ', '‡∏™‡∏ß‡∏±‡∏™‡∏î‡∏µ', '‡∏Ñ‡∏£‡∏±‡∏ö', '‚ñÅ', '‡∏ó‡∏î‡∏™‡∏≠‡∏ö', '‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢']

# WangchanBERTa tokenizer applied to the same sentence.
wc_tokenizer = AutoTokenizer.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased")
wc_tokens = wc_tokenizer.tokenize("‡∏™‡∏ß‡∏±‡∏™‡∏î‡∏µ‡∏Ñ‡∏£‡∏±‡∏ö ‡∏ó‡∏î‡∏™‡∏≠‡∏ö‡∏†‡∏≤‡∏©‡∏≤‡πÑ‡∏ó‡∏¢")

print("Your tokenizer:", your_tokens)
print("WangchanBERTa:", wc_tokens)

# Pick only the winner's name with the conditional. Previously the
# conditional wrapped the whole string, so the else branch printed a bare
# "Yours" without the "Cleaner:" label. Fewer tokens counts as cleaner;
# a tie goes to "Yours".
cleaner_name = "WangchanBERTa" if len(wc_tokens) < len(your_tokens) else "Yours"
print(f"\nCleaner: {cleaner_name}")