In [1]:
from tokenizers import SentencePieceBPETokenizer
dataset = "wikipedia"
with open(f"./{dataset}_strokes.txt", "r", encoding="utf-8") as f:
    text = f.readlines()

In [10]:
tokenizer = SentencePieceBPETokenizer()
vocab_sizes = [2000, 3000, 4000]
for vocab_size in vocab_sizes:
    print(vocab_size)
    tokenizer.train_from_iterator(
        text,
        vocab_size=vocab_size,
        min_frequency=5,
        show_progress=True,
        limit_alphabet=500,
    )
    tokenizer.save(f"{dataset}/SentencePiece_{vocab_size}.json")

2000
3000
4000


In [11]:
from tokenizers import Tokenizer
file = f"{dataset}/SentencePiece_2000.json"
bpe_tokenizer = Tokenizer.from_file(file)
stroke_text = 'terduto eaieeeatneaseear tn2'
tokens = bpe_tokenizer.encode(stroke_text)
tokens.tokens

['▁terduto', '▁eaieeeatn', 'easeear', '▁tn2']

In [12]:
from collections import defaultdict
# Create dict for text into strokes translation and vice versa
with open("zh2letter.txt", 'r', encoding="utf-8") as f:
    conversions = f.read()

conversions = conversions.splitlines()
zh2letter = defaultdict(str)
letter2zh = defaultdict(str)
for line in conversions:
    chinese_char, strokes = line.split()
    zh2letter[chinese_char] = strokes
    letter2zh[strokes] = chinese_char

In [16]:
letter2zh["eaieeea"]

'車'

In [31]:
stroke_text = 'gteeaeeasoooo gteeaeeasoooo gtetbat gteeeatn0 gteoetodteetn'
tokens = bpe_tokenizer.encode(stroke_text)
tokens.tokens

['▁gteeaeeasoooo',
 '▁gteeaeeasoooo',
 '▁gtet',
 'b',
 'at',
 '▁gteeeatn0',
 '▁gteo',
 'eto',
 'd',
 'teetn']

In [40]:
for token in tokens.tokens:
    if token[0] == '▁':
        token = token[1:]    
    print(token, letter2zh[token])

gteeaeeasoooo 媽
gteeaeeasoooo 媽
gtet []
b []
at []
gteeeatn0 妹
gteo []
eto []
d []
teetn []


In [42]:
letter2zh["eaeeasoooo"]

'馬'

Train from T5 tokenizer

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [21]:
new_tokenizer = tokenizer.train_new_from_iterator(text, 30000)

In [22]:
new_tokenizer.encode(text[0])

[179,
 6,
 8,
 35,
 3,
 25,
 334,
 320,
 37,
 179,
 6,
 8,
 315,
 3,
 3,
 227,
 331,
 291,
 4,
 434,
 4,
 1851,
 290,
 167,
 169,
 10,
 7,
 331,
 993,
 705,
 30,
 4,
 1851,
 76,
 4,
 495,
 1698,
 1111,
 4,
 6,
 18,
 4,
 3,
 217,
 35,
 3,
 145,
 3,
 5,
 1354,
 242,
 31,
 4,
 3,
 3,
 6,
 5,
 6,
 6,
 6,
 4,
 3,
 219,
 145,
 3,
 5,
 35,
 3,
 4,
 6,
 18,
 4,
 3,
 188,
 3,
 5,
 3,
 179,
 6,
 8,
 35,
 3,
 118,
 237,
 2253,
 1116,
 283,
 1111,
 32,
 4,
 5,
 9,
 5,
 143,
 23,
 669,
 5,
 5,
 3,
 3,
 3,
 535,
 144,
 3,
 3,
 288,
 204,
 179,
 6,
 8,
 204,
 841,
 5,
 315,
 3,
 3,
 153,
 13,
 8,
 101,
 99,
 140,
 146,
 3,
 4,
 3,
 3,
 6,
 5,
 6,
 6,
 6,
 871,
 6,
 8,
 7,
 4,
 1851,
 41,
 23,
 69,
 4,
 6,
 18,
 4,
 3,
 197,
 141,
 576,
 7,
 13,
 645,
 75,
 58,
 7,
 638,
 585,
 108,
 47,
 179,
 6,
 8,
 35,
 3,
 4,
 7,
 7,
 61,
 272,
 1717,
 638,
 585,
 28,
 23,
 487,
 331,
 993,
 705,
 30,
 4,
 434,
 60,
 4,
 3,
 219,
 1111,
 79,
 6,
 6,
 3,
 5,
 4,
 6,
 18,
 161,
 433,
 174,
 30,
 4,
 434,
 4,
 1851,

In [23]:
new_tokenizer.save_pretrained("./sentencepiece/30000")

('./sentencepiece/30000\\tokenizer_config.json',
 './sentencepiece/30000\\special_tokens_map.json',
 './sentencepiece/30000\\tokenizer.json')