In [3]:
from tokenizers import SentencePieceBPETokenizer
# Name of the corpus; it selects the input file read here and is reused by
# later cells when building tokenizer file paths.
dataset = "wikipedia"

# Read the whole corpus into memory as a list of lines (line endings are kept).
with open(f"./{dataset}_strokes.txt", mode="r", encoding="utf-8") as corpus_file:
    text = list(corpus_file)

In [10]:
import os

# Train one SentencePiece-style BPE tokenizer per target vocabulary size and
# save each one as JSON under the `<dataset>/` directory.
vocab_sizes = [2000, 3000, 4000]

# The save path below lives inside `<dataset>/`; create that directory up front
# so saving does not fail when the notebook runs in a fresh checkout.
os.makedirs(dataset, exist_ok=True)

for vocab_size in vocab_sizes:
    print(vocab_size)
    # Start from a new, untrained tokenizer so that each run does not depend on
    # state left over from the previous vocabulary size.
    tokenizer = SentencePieceBPETokenizer()
    tokenizer.train_from_iterator(
        text,
        vocab_size=vocab_size,
        min_frequency=5,
        show_progress=True,
        limit_alphabet=500,
    )
    tokenizer.save(f"{dataset}/SentencePiece_{vocab_size}.json")

2000
3000
4000


In [11]:
from tokenizers import Tokenizer
# Load the tokenizer stored as SentencePiece_2000.json for the current dataset
# and run it on a short sample of space-separated stroke strings.
file = f"{dataset}/SentencePiece_2000.json"
bpe_tokenizer = Tokenizer.from_file(file)

stroke_text = "terduto eaieeeatneaseear tn2"
tokens = bpe_tokenizer.encode(stroke_text)

# Display the list of pieces the encoder produced.
tokens.tokens

['▁terduto', '▁eaieeeatn', 'easeear', '▁tn2']

In [12]:
from collections import defaultdict
# Build lookup tables between Chinese characters and their stroke-letter
# strings, in both directions. Every non-blank line of zh2letter.txt is
# expected to hold exactly two whitespace-separated fields: the character
# followed by its stroke string.
with open("zh2letter.txt", "r", encoding="utf-8") as f:
    # Bind `conversions` once, directly to the list of lines.
    conversions = f.read().splitlines()

# defaultdict(str): indexing with an unknown key yields "" (and stores it).
zh2letter = defaultdict(str)
letter2zh = defaultdict(str)
for line in conversions:
    # A blank or whitespace-only line splits into zero fields, which would make
    # the two-name unpacking below raise ValueError; skip such lines.
    if not line.strip():
        continue
    chinese_char, strokes = line.split()
    zh2letter[chinese_char] = strokes
    # If several characters share one stroke string, the last one listed wins.
    letter2zh[strokes] = chinese_char

In [16]:
# Spot-check the reverse (stroke string -> character) table with one key.
letter2zh["eaieeea"]

'車'

In [31]:
# Encode a longer sample of five space-separated stroke strings with the
# loaded tokenizer; `tokens` is reused by the lookup loop in a later cell.
stroke_text = "gteeaeeasoooo gteeaeeasoooo gtetbat gteeeatn0 gteoetodteetn"
tokens = bpe_tokenizer.encode(stroke_text)

# Display the resulting pieces.
tokens.tokens

['▁gteeaeeasoooo',
 '▁gteeaeeasoooo',
 '▁gtet',
 'b',
 'at',
 '▁gteeeatn0',
 '▁gteo',
 'eto',
 'd',
 'teetn']

In [40]:
# Print each piece next to the character its stroke string maps to; the second
# column is empty when the piece is not a key of the reverse table.
for token in tokens.tokens:
    # Some pieces carry a leading "▁" marker; drop it before the lookup.
    # startswith() also copes with an empty piece, where token[0] would raise.
    if token.startswith("▁"):
        token = token[1:]
    # Use .get() rather than letter2zh[token]: indexing a defaultdict with an
    # unknown key inserts that key, which would fill the table with fragments.
    print(token, letter2zh.get(token, ""))

gteeaeeasoooo 媽
gteeaeeasoooo 媽
gtet []
b []
at []
gteeeatn0 妹
gteo []
eto []
d []
teetn []


In [42]:
# Look up "eaeeasoooo" in the reverse table; this string is the tail of the
# "gteeaeeasoooo" stroke string used in the sample above.
letter2zh["eaeeasoooo"]

'馬'