## Demonstration of Grapheme-enabled sub-tokenization

In [1]:
#!pip install gdown

import gdown
import numpy as np
import os

### Let's load the text file (Tamil)

In [2]:
# Download the Tamil example text file from Google Drive, unless a local copy
# is already present from an earlier run.
PATH = "./tamil.txt"
if not os.path.isfile(PATH):
    downloaded = gdown.download(url="https://drive.google.com/file/d/1-065EvxQepsQWTpDbdAfgPw0kkiwfMcR/view?usp=drive_link", output=PATH, fuzzy=True)
    # Fail here with a clear message rather than carrying PATH = None into the
    # cells below, should gdown signal a failed download by returning None.
    if downloaded is None:
        raise RuntimeError("Could not download the Tamil example text file from Google Drive.")
    PATH = downloaded

# Language code passed to the mapper and reused in temporary/model file names.
LANG = "ta"

Downloading...
From: https://drive.google.com/uc?id=1-065EvxQepsQWTpDbdAfgPw0kkiwfMcR
To: /home/sudarsun/github/indic-tokenizer/tamil.txt
100%|██████████| 18.8M/18.8M [00:01<00:00, 10.0MB/s]


In [3]:
# Load the whole text file into memory. The encoding is given explicitly so
# the Tamil text is decoded the same way whatever the platform default is
# (assumes the file is UTF-8 encoded).
with open(PATH, "r", encoding="utf-8") as file:
    text = file.read()

# Length of the text in characters; the `with` block has already closed the file.
size = len(text)

In [4]:
# View a small, randomly positioned piece of the text document.
# LEN is the window length in characters; it is reused when sampling the
# encoded text further below.
LEN = 200
# Random start offset chosen so that pos + LEN stays inside the text.
pos = np.random.randint(0, size-LEN)
print(text[pos:pos+LEN])

ளுக்கு எதிரே இருக்கிறதைப் பழுதுபார்த்துக் கட்டினார்கள் .
ஷாஜகான் , அல்லி அர்ஜூனா , காதல் கிறுக்கன் படங்களுக்கு பிறகு காணாமல் போன ரிச்சா பலோட் திரும்பி வந்திருக்கிறார் .
மேலும் மிக கஷ்டப்படுபவர்களுக்கு


### Let's encode the multi-code-point Unicode sequence of each grapheme into a single Unicode code point

In [7]:
# IndicUnicodeMapper maps the sequence of Unicode code points that constitute a
# grapheme to a single code point. The encoded sample shown further below
# contains values such as U+E001 and U+E1A6, i.e. from 0xE000 upward.
from indic_unicode_mapper import IndicUnicodeMapper
mapper = IndicUnicodeMapper()

In [8]:
# Encode the graphemes of the whole text; LANG ("ta") selects the language.
encoded_text = mapper.encode(text=text, lang=LANG)

In [9]:
# The encoded text mixes plain Tamil letters with single code points from
# 0xE000 upward (see the output below).
# Show a random LEN-character window of it; pos is redrawn for the encoded text.
pos = np.random.randint(0, len(encoded_text)-LEN)
encoded_text[pos:pos+LEN]

' ஆ\ue09b\ue077 \ue1a6\ue001\ue030\ue116\ue16a த\ue153\ue001\ue15bக\ue0feக\ue0b2 த\ue181\ue172\ue0f7 \ue0f9ட\ue0f7ப\ue084ட\ue0a5 .\n\ue0e8\ue12c\ue001ர \ue19fச\ue15b\ue0e0 \ue0aa\ue028\ue004\ue0a3\ue133 இள\ue0bbல \ue0bd\ue15a\ue15d\ue10e இர\ue0b2\ue0a2\ue10eபர\ue0bd\ue15a\ue15d\ue10e \ue030வ\ue0f7\ue0ea\ue0bd\ue15a\ue15d\ue10e \ue0a3\ue12d\ue0b2த \ue103\ue16a\ue15bய ப\ue06d\ue032\ue0bd\ue15a\ue15d\ue10e \ue034\ue125ய\ue0f7ப\ue084ட \ue1a0\ue030\ue0b2\ue0a3ர\ue0b2\ue0a9ய\ue16a \ue1a5\ue161\ue115\ue116\ue12f\ue0c9த\ue0a5; அ\ue0a3\ue0e0 \ue0bbள\ue10e இ\ue12fப\ue0a5 \ue101ழ\ue10e , அ\ue0a3\ue0e0 அகல\ue101\ue10e உயர\ue101\ue10e \ue0e8\ue12c\ue001ர\ue0b2\ue0a3\ue0e0 \ue0aa\ue028\ue004\ue0a3\ue133க\ue174\ue011\ue004\ue03f ச\ue12d\ue115\ue125 ஐ\ue0c9\ue0a5 \ue101ழ\ue10e .\nஎ\ue0f7\ue0f9'

### Let's now train a BERT WordPiece tokenizer model on the encoded dataset.

In [10]:
# Set up the BERT WordPiece tokenizer that will be trained on the encoded text.
from tokenizers import BertWordPieceTokenizer

# Special tokens; this notebook uses lower-case spellings.
unk_token = "[unk]"
sep_token = "[sep]"
mask_token = "[mask]"
cls_token = "[cls]"
pad_token = "[pad]"
# Same five tokens in the same order as before; the order is kept because it
# presumably determines the ids assigned to the special tokens.
spl_tokens = [unk_token, sep_token, mask_token, cls_token, pad_token]

tokenizer = BertWordPieceTokenizer(
    clean_text=False,
    handle_chinese_chars=True,
    strip_accents=False,
    lowercase=False,
    unk_token=unk_token,
    sep_token=sep_token,
    cls_token=cls_token,
    pad_token=pad_token,
    mask_token=mask_token,
)

# Vocabulary size requested from the trainer.
VOCAB_SIZE = 3000

In [11]:
# We need a temporary folder to keep the encoded file(s), because the
# tokenizer is trained from files.
tmpdir = f"/tmp/mapped-{LANG}"
os.makedirs(tmpdir, exist_ok=True)

# Use only the file-name part of PATH: PATH may carry directory components
# (e.g. if the download step returns a different path), and appending it
# verbatim would then name a location that does not exist.
encoded_path = os.path.join(tmpdir, os.path.basename(PATH))

# Explicit UTF-8 so the encoded code points are written identically on every
# platform; the `with` block closes the file.
with open(encoded_path, "w", encoding="utf-8") as ofile:
    ofile.write(encoded_text)

# Add our encoded text file to the list of paths used for training.
files = [encoded_path]

In [12]:
# Train the WordPiece model on the encoded file(s) listed in `files`.
# VOCAB_SIZE and spl_tokens come from the setup cell above; '##' is the prefix
# that marks word-continuation pieces in the resulting vocabulary.
# NOTE(review): min_frequency=2 and limit_alphabet=512 are passed through
# unchanged; confirm their exact meaning against the tokenizers documentation.
tokenizer.train(files=files, vocab_size=VOCAB_SIZE, min_frequency=2,
                limit_alphabet=512, wordpieces_prefix='##',
                special_tokens=spl_tokens)






In [13]:
# Save the tokenization model into the current directory, using LANG as the
# file-name prefix.
# This line should create a file with the name f"{LANG}-vocab.txt".
tokenizer.save_model('.', LANG)
# Name of the vocabulary file expected from the call above; read back below.
TOKENIZER_MODEL = f"{LANG}-vocab.txt"

#### If you open the saved vocabulary file, you will see a lot of gibberish, as the tokens are in encoded form. Let's display a human-understandable version side by side.

In [14]:
# Load the vocabulary back from the saved model file, one entry per line.
# UTF-8 is requested explicitly so the encoded code points are read the same
# way on every platform; the `with` block closes the file.
with open(TOKENIZER_MODEL, "r", encoding="utf-8") as vfile:
    lines = vfile.readlines()

In [15]:
nlines = len(lines)
print(f"found {nlines} vocab items.")

# Number of vocabulary rows to display (never more than are available).
n_show = min(10, nlines)
# Draw the window from the latter half of the vocabulary, as the tokens are
# longer there, and keep the whole window inside the list so that indexing
# cannot run past the last entry.
first = min(nlines // 2, nlines - n_show)
lpos = np.random.randint(first, nlines - n_show + 1)

print("Encoded Strings".ljust(20,' '), "\tOriginal Strings".ljust(20, ' '))
print("".ljust(20,'='), "\t".ljust(20, '='))
for idx in range(lpos, lpos + n_show):
    # Left column: the raw (encoded) vocabulary entry; right column: the same
    # entry decoded back by the mapper.
    encoded = lines[idx].rstrip()
    print("%s\t%s" % (encoded.ljust(20,' '), mapper.decode(lines[idx]).rstrip().ljust(10, ' ')))

found 3000 vocab items.
Encoded Strings      	Original Strings   
த                	முடியாத   
                  	போய்      
##               	##ற்பாடு  
##க                	##யோக    
                	தேர்ந்தெ  
உ               	உள்நாட்டு 
ஐ                 	ஐந்து     
##                	##சாலை    
##கட               	##க்கட    
##ன               	##யானது   


### From v0.0.3, learning the tokenizer model is a part of the package.

The model file name is fixed as `indic-bert-tokenizer-vocab.txt`. If you also need a human-readable version, pass `human_readable=True`, and `indic-bert-tokenizer-vocab.indic.txt` is created as well.

In [None]:
# Set the tokenizers library's parallelism switch to "false" before building
# the model (presumably to avoid its fork/parallelism warning in notebooks).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from indic_bert_tokenizer import IndicBertWordPieceTokenizer
# Map and train in one step on the raw (un-encoded) text file. Per the log
# output below, the mapped copy goes to a temporary directory and the vocabulary
# is saved into model_dir; human_readable=True additionally writes the
# `.indic.txt` vocabulary file.
# NOTE: this rebinds `tokenizer`, replacing the BertWordPieceTokenizer trained above.
tokenizer = IndicBertWordPieceTokenizer.build_model(files = ['tamil.txt'],
                                                    vocab_size=VOCAB_SIZE,
                                                    model_dir='.',
                                                    human_readable=True)

[2025-06-23 12:27:17,260] [INFO] IndicBERTWPETokenizer.build_model Using temporary directory /tmp/tmpp70cbbph for mapped files.
[2025-06-23 12:27:17,261] [INFO] IndicBERTWPETokenizer.build_model Processing 1 files for vocabulary building.
[2025-06-23 12:27:17,262] [INFO] IndicBERTWPETokenizer.build_model Processing file tamil.txt -> /tmp/tmpp70cbbph/tamil.txt
[2025-06-23 12:27:18,833] [INFO] IndicBERTWPETokenizer.build_model Training tokenizer on 1 files with vocab size 3000 and min frequency 2






[2025-06-23 12:27:29,394] [INFO] IndicBERTWPETokenizer.build_model Saving tokenizer model to ./indic-bert-tokenizer-vocab.txt
[2025-06-23 12:27:29,397] [INFO] IndicBERTWPETokenizer.build_model Creating human readable vocabulary file at ./indic-bert-tokenizer-vocab.indic.txt





Let's display the model files

In [25]:
# Load the encoded vocabulary and its human-readable counterpart, stripping
# surrounding whitespace from every entry. UTF-8 is requested explicitly so
# both files are read the same way on every platform; the `with` blocks close
# the files.
with open('indic-bert-tokenizer-vocab.txt', "r", encoding="utf-8") as vfile:
    lines = [line.strip() for line in vfile]

with open('indic-bert-tokenizer-vocab.indic.txt', "r", encoding="utf-8") as vfile:
    hlines = [line.strip() for line in vfile]

In [28]:
# Pair entries 1000-1019 of the encoded vocabulary with the entries at the same
# positions in the human-readable vocabulary.
list(zip(lines[1000:1020], hlines[1000:1020]))

[('ப\ue07b', 'படை'),
 ('ம\ue011க\ue181', 'மக்கள்'),
 ('எ\ue0e0ன', 'என்ன'),
 ('அ\ue16aல\ue0a5', 'அல்லது'),
 ('\ue034\ue125\ue0a3', 'செய்தி'),
 ('அவ\ue0e0', 'அவன்'),
 ('பய', 'பய'),
 ('இ\ue12f\ue0c9த', 'இருந்த'),
 ('##\ue12f\ue011\ue002ற\ue0a5', '##ருக்கிறது'),
 ('இ\ue0c9\ue0a3', 'இந்தி'),
 ('##\ue0e0\ue143', '##ன்றா'),
 ('##\ue084\ue074\ue13c', '##ட்டார்'),
 ('##\ue0a3ப\ue0a3', '##திபதி'),
 ('\ue005\ue084ட', 'கூட்ட'),
 ('\ue004\ue18b', 'குழு'),
 ('அ\ue0a9', 'அதை'),
 ('\ue0f1\ue0a5', 'பொது'),
 ('##\ue12f\ue07bய', '##ருடைய'),
 ('##த\ue161', '##தலை'),
 ('கட\ue0c9த', 'கடந்த')]

### Now, let's tokenize the text using the grapheme enabled WPE tokenizer.

In [37]:
# Load the tokenizer model that inherently does the Unicode encoding/decoding,
# from the vocabulary file written by build_model above.
from indic_bert_tokenizer import IndicBertWordPieceTokenizer
tokenizer = IndicBertWordPieceTokenizer(model_path="./indic-bert-tokenizer-vocab.txt") # TOKENIZER_MODEL

In [38]:
# An arbitrary Tamil sentence used as the tokenizer input in the cells below.
test_text = "ஆனால் மொத்த ஊக்கப் பொதியின் சிறிய அளவு வரி வெட்டுக்கள் மற்றும் செலவின அதிகரிப்புக்கள் இணைந்தது அரசாங்கத்தின் பிற்போக்குத்தன்மையை காட்டும் நடவடிக்கை அல்ல"

In [39]:
# Encode the input text; the result is an Encoding object exposing ids,
# tokens, offsets, etc. (see the repr below).
toks = tokenizer.encode(test_text)
toks

Encoding(num_tokens=33, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [40]:
# List the token ids of the encoded input.
print(toks.ids)

[3, 889, 2450, 49, 672, 382, 214, 364, 739, 1968, 2402, 1943, 2925, 791, 711, 1395, 387, 410, 1404, 1489, 1234, 795, 1640, 206, 2173, 668, 2630, 478, 1625, 359, 901, 888, 1]


In [41]:
# List the token strings; apart from the special tokens they are still in the
# encoded form (see the output below).
print(toks.tokens)

['[cls]', 'ஆ\ue0d0\ue16a', '\ue108\ue0b2த', 'ஊ', '##\ue011க', '##\ue0f7', '\ue0f1', '##\ue0a3', '##\ue116\ue0e0', '\ue030\ue144ய', 'அள\ue1a2', 'வ\ue12d', '\ue1a4\ue084\ue077', '##\ue011க\ue181', 'ம\ue153\ue146\ue10e', '\ue034ல', '##\ue1a0', '##ன', 'அ\ue0a3க\ue12d', '##\ue0f7\ue0ea\ue011க\ue181', 'இ\ue092', '##\ue0c9த\ue0a5', 'அர\ue02f\ue028க\ue0b2\ue0a3\ue0e0', '\ue0e8', '##\ue153\ue0f9', '##\ue011\ue004', '##\ue0b2த\ue0e0\ue105', '##\ue11c', '\ue001\ue084\ue077', '##\ue10e', 'நடவ\ue075\ue011\ue008', 'அ\ue16aல', '[sep]']


In [42]:
# Decode every token string back to its readable form and show the list.
print(list(map(tokenizer.decode_string, toks.tokens)))


['[cls]', 'ஆனால்', 'மொத்த', 'ஊ', '##க்க', '##ப்', 'பொ', '##தி', '##யின்', 'சிறிய', 'அளவு', 'வரி', 'வெட்டு', '##க்கள்', 'மற்றும்', 'செல', '##வி', '##ன', 'அதிகரி', '##ப்புக்கள்', 'இணை', '##ந்தது', 'அரசாங்கத்தின்', 'பி', '##ற்போ', '##க்கு', '##த்தன்மை', '##யை', 'காட்டு', '##ம்', 'நடவடிக்கை', 'அல்ல', '[sep]']


In [43]:
# Get the decoded string back from the token ids; the output below shows the
# original sentence without the special tokens.
tokenizer.decode(toks.ids)

'ஆனால் மொத்த ஊக்கப் பொதியின் சிறிய அளவு வரி வெட்டுக்கள் மற்றும் செலவின அதிகரிப்புக்கள் இணைந்தது அரசாங்கத்தின் பிற்போக்குத்தன்மையை காட்டும் நடவடிக்கை அல்ல'

In [44]:
# Round-trip check: decoding the token ids must reproduce the input text exactly.
assert tokenizer.decode(toks.ids) == test_text