In [1]:
# Setup
import os

from service.util.path_util import PROJECT_ROOT
from storage.duplets_dictionary import duplets, new_duplets

BASE_DIR = PROJECT_ROOT
CORPUS_PATH = os.path.join(BASE_DIR, "storage", "pre_declension_corpus.txt")


def export_duplets_to_sp(p_duplets, output_path=CORPUS_PATH):
    """Write (inflected, base) pairs to a text file, one word form per line.

    Each pair contributes two consecutive lines: the inflected form first,
    then the base form. The file is opened in "w" mode, so any existing
    content at ``output_path`` is replaced. A one-line summary with the
    number of lines written is printed at the end.

    Args:
        p_duplets: Iterable of 2-item (inflected, base) pairs. Any iterable
            is accepted, including generators; it is consumed exactly once.
        output_path: Destination file path. Defaults to CORPUS_PATH.

    Raises:
        OSError: If ``output_path`` cannot be opened for writing.
        ValueError: If an item does not unpack into exactly two values.
    """
    # Count while writing instead of calling len(), so unsized iterables
    # (generators, iterators) are reported correctly instead of failing
    # after the file has already been written.
    lines_written = 0
    with open(output_path, "w", encoding="utf-8") as f:
        for inflected, base in p_duplets:
            f.write(f"{inflected}\n{base}\n")
            lines_written += 2
    print(f"Exported {lines_written} lines to {output_path}")


# Merge the two imported pair collections and rebind the local name `duplets`
# to the result. Assuming both are plain lists (TODO confirm), `+` builds a
# new list, so the objects held by storage.duplets_dictionary stay untouched.
duplets = duplets + new_duplets
# No output_path is passed, so the pairs are written to the default CORPUS_PATH.
export_duplets_to_sp(duplets)

Exported 712 lines to D:\WORKSPACE\Python\scraper-news\storage\pre_declension_corpus.txt


In [2]:
#  Train SentencePiece tokenizer

import sentencepiece as spm

# Training options, collected in one mapping so they are easy to scan and tweak.
sp_train_options = {
    "input": CORPUS_PATH,              # corpus file written by export_duplets_to_sp
    "model_prefix": "pre-declension",  # output name stem; "pre-declension.model" is loaded later
    "vocab_size": 300,
    "character_coverage": 1.0,         # cover every character that occurs in the corpus
    "model_type": "unigram",
}

# noinspection PyUnresolvedReferences
spm.SentencePieceTrainer.train(**sp_train_options)

In [3]:
# Tokenize for T5
from transformers import T5Tokenizer

pre_declension_model = "pre-declension.model"
tokenizer = T5Tokenizer.from_pretrained(pre_declension_model, legacy=False)

# Spot-check names that start with Ș / Ț. Each one is expected to come back
# as a single piece, i.e. ['▁' + name] (for example ['▁Ștefănescu']).
sample_names = ("Șucu", "Țiriac", "Ștefănescu", "Ștefănești", "Țepelin")
for sample_name in sample_names:
    print(tokenizer.tokenize(sample_name))



['▁Șucu']
['▁Țiriac']
['▁Ștefănescu']
['▁Ștefănești']
['▁Țepelin']
