## Build segmenter model tokenizer

- Tokenizer dictionary is built using a corpus from the Brazilian legal domain;
- The dictionary has 6000 tokens (about 20% of the size of BERTimbau's dictionary);
- No tokenization preprocessing (such as text normalization, diacritic removal, case folding) was employed;
- BERT post-processing template injection and special tokens ([CLS], [SEP], [UNK] tokens) are maintained, keeping our segmenter model compatible with other popular transformer-based models.

In [None]:
import pathlib

import transformers
import pandas as pd
import regex

# Directory (relative to the notebook's working directory) where the trained
# tokenizer files are saved at the end of the notebook.
TOKENIZER_OUTPUT_DIR = "../tokenizers"

# Create the output directory, including missing parents; do nothing if it
# already exists.
pathlib.Path(TOKENIZER_OUTPUT_DIR).mkdir(exist_ok=True, parents=True)

In [None]:
# Load the pretrained Portuguese BERT tokenizer that serves as the template for
# the new tokenizer trained later in this notebook.
# `local_files_only=True` restricts loading to files already present locally
# (here under `cache_dir`), so this cell fails instead of downloading when the
# checkpoint has not been cached beforehand.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "neuralmind/bert-base-portuguese-cased",
    local_files_only=True,
    cache_dir="../cache/tokenizers",
)

In [None]:
# Body of a regex character class (meant to be wrapped in `[...]`): accented
# uppercase letters, the A-Z range, and a handful of combining diacritical
# marks (U+0300-U+0305, U+0340-U+0343).
UPPERCASE_LETTERS = "ÀÁÂÃÇÉÊẼÓÕÔÜÚÍA-Z\u0303\u0300\u0301\u0302\u0303\u0304\u0305\u0340\u0341\u0342\u0343"


# Matches an "ANEXO" heading (optionally preceded by whitespace), tolerating
# arbitrary whitespace between its letters.
RE_ANEXO = regex.compile(r"\s*A\s*N\s*E\s*X\s*O")

# Matches a "JUSTIFICATIVA" / "JUSTIFICAÇÃO" heading (optionally preceded by
# whitespace), tolerating arbitrary whitespace between its letters.
#   - The all-uppercase spellings match wherever they occur; in the first one
#     the second "A" is optional.
#   - The capitalized spellings ("Justificativa", "Justificação") only match
#     when followed by whitespace and then a character from UPPERCASE_LETTERS
#     (checked with a lookahead, so that character is not consumed).
RE_JUSTIFICATIVA = regex.compile(
    r"\s*(?:"
    + "|".join(
        (
            r"J\s*U\s*S\s*T\s*I\s*F\s*I\s*C\s*A?\s*T\s*I\s*V\s*A",
            rf"J\s*u\s*s\s*t\s*i\s*f\s*i\s*c\s*a\s*t\s*i\s*v\s*a\s+(?=[{UPPERCASE_LETTERS}])",
            r"J\s*U\s*S\s*T\s*I\s*F\s*I\s*C\s*A\s*[CÇ]\s*[AÂÃÀÁ]\s*O",
            rf"J\s*u\s*s\s*t\s*i\s*f\s*i\s*c\s*a\s*[cç]\s*[aãâàá]\s*o\s+(?=[{UPPERCASE_LETTERS}])",
        )
    )
    + r")"
)

# Load the raw corpus. Only the "imgArquivoTeorPDF" column is read, and
# `squeeze("columns")` turns the resulting one-column frame into a Series,
# so despite its name `df` is a Series of document texts from here on.
df = pd.read_csv(
    "../data/content.txt",
    usecols=["imgArquivoTeorPDF"],
    header=0,
    index_col=None,
).squeeze("columns")

# Discard missing entries and exact duplicate documents.
df = df.dropna().drop_duplicates()

# Keep only entries that are strings with at least 10 characters.
good_inds = [i for i, text in enumerate(df) if isinstance(text, str) and len(text) >= 10]
df = df.iloc[good_inds]

# Keep only the text that precedes the first "JUSTIFICATIVA"/"JUSTIFICAÇÃO"
# heading, then only what precedes the first "ANEXO" heading.
# `maxsplit=1` stops scanning at the first match: only element [0] is used,
# so splitting the rest of the document would be wasted work.
df = df.map(lambda item: RE_JUSTIFICATIVA.split(item, maxsplit=1)[0])
df = df.map(lambda item: RE_ANEXO.split(item, maxsplit=1)[0])

In [None]:
# Train a new tokenizer on the cleaned corpus, starting from the loaded
# tokenizer, and rebind `tokenizer` to the result. The vocabulary size of 6000
# is the dictionary size stated in the notebook header.
tokenizer = tokenizer.train_new_from_iterator(df, vocab_size=6000)

In [None]:
# Write the trained tokenizer's files to the output directory created at the
# top of the notebook.
tokenizer.save_pretrained(TOKENIZER_OUTPUT_DIR)