In [1]:
import transformers
import datasets
import nltk
import tokenizers
import regex
import colorama
import random

# Seed the stdlib RNG; it is used further down to draw random document ids.
random.seed(16)

# Project-local modules.
import segmentador
import tests
# NOTE(review): the star import hides the origin of MARKER_VALID, MARKER_NOISE_START,
# MARKER_NOISE_END and SPECIAL_SYMBOLS; presumably all of them live in `config` - confirm.
from config import *
print("Marker symbol (valid):", MARKER_VALID)
print("Marker symbol (noise):", MARKER_NOISE_START, MARKER_NOISE_END)

# Load the registered test cases through the project's `tests` module.
tests.load_registered_cases()

Marker symbol (valid): ✓
Marker symbol (noise): ❌s__ ❌e__
Loaded 1 test cases from './registered_test_cases.csv'.


In [2]:
# Segmenter instance reused by the preprocessing and tokenization cells below.
# NOTE(review): `local_files_only=True` presumably prevents downloading model files - confirm in segmentador.
seg_model = segmentador.Segmenter(local_files_only=True)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model check

In [110]:
class DetectRecurrentNoise:
    """Wrap noise that repeats inside a single document in a replacement template.

    The class exposes a ``sub(subpattern, text)`` classmethod shaped like the
    ``sub`` method of a compiled pattern, so it can be used wherever a compiled
    pattern's ``sub`` is called. Two kinds of noise are handled: barcode-like
    ``*CODE*CODE`` runs and the short preamble found right before
    "Câmara dos deputados Proj..."/"... Req...". In both cases the matched
    noise is group 1 of the substitution, so ``subpattern`` may refer to it
    with ``\\1``.
    """

    # Text enclosed by two asterisks and made only of whitespace, A-Z and digits.
    RE_BARCODE = regex.compile(r"\*([\sA-Z0-9]+)\*")
    # Group 1: up to 60 leading characters, followed by optional whitespace/digits,
    # located right before "Câmara dos deputados" + "Proj"/"Req" (case-insensitive).
    RE_PREAMBLE = regex.compile(
        r"^\s*(.{,60}?)[\s0-9]*" +
        r"(?=C[aâ]mara\s*dos\s*deputados\s*(Proj|Req))",
        regex.IGNORECASE,
    )
    RE_BLANK_SPACES = regex.compile(r"\s+")

    @classmethod
    def _detect_barcode(cls, subpattern, text):
        """Replace every ``*CODE*CODE`` occurrence in ``text`` using ``subpattern``.

        ``CODE`` is taken from the ``*...*`` spans found in ``text``; whitespace
        is allowed between any two of its symbols when searching.
        """
        pseudo_patterns = cls.RE_BARCODE.findall(text)

        if not pseudo_patterns:
            return text

        # Sorted for a deterministic substitution order.
        for pseudo_pattern in sorted(set(pseudo_patterns)):
            symbols = cls.RE_BLANK_SPACES.sub("", pseudo_pattern)
            # r"\s*" before, between and after every symbol of the code. The symbols
            # are restricted to A-Z/0-9 by RE_BARCODE, so no escaping is required.
            pattern = r"\s*".join(["", *symbols, ""])

            text = regex.sub(r"(\*" + pattern + r"\*" + pattern + ")", subpattern, text)

        return text

    @classmethod
    def _detect_preamble_noise(cls, subpattern, text):
        """Replace every repetition of the document preamble using ``subpattern``.

        The preamble is group 1 of ``RE_PREAMBLE``. ``text`` is returned
        unchanged when there is no preamble or it is blank.
        """
        preamble = cls.RE_PREAMBLE.match(text)

        if not preamble or not preamble.group(1).strip():
            return text

        # The preamble is arbitrary document text: escape every piece so characters
        # such as "[", "+", "?" or "|" are matched literally instead of being
        # interpreted as regex syntax. Single spaces become r"\s*" so the preamble
        # is still found when its spacing varies.
        pieces = [regex.escape(piece) for piece in preamble.group(1).split(" ")]
        preamble_content = r"\s*".join(pieces)
        text = regex.sub(r"(\s*" + preamble_content + r"[\s\d]*)", subpattern, text)
        return text

    @classmethod
    def sub(cls, subpattern: str, text: str, *args, **kwargs):
        """Apply the barcode and preamble substitutions, in that order.

        Extra positional and keyword arguments are accepted and ignored, which
        lets callers pass the same arguments they pass to a compiled pattern's
        ``sub`` (e.g. ``concurrent=True``).
        """
        text = cls._detect_barcode(subpattern, text)
        text = cls._detect_preamble_noise(subpattern, text)
        return text


# Roman numeral: optional thousands, hundreds and tens parts followed by a
# mandatory last part (I?X, I?V or V?I{1,3}).
VALID_ROMAN_NUM = r"M{0,3}(?:C[MD]|D?C{0,3})(?:X[CL]|L?X{0,3})(?:I?X|I?V|V?I{1,3})"
# Patterns describing the beginning of a legal item. They are spliced into the
# lookaheads of RE_NOISE_BLOCKS and RE_PRE_BLOCKS.
BASE_LEGAL_ITEMS = (
    # "§" followed by a number.
    r"§\s*[0-9]+",
    # "Art"/"Artigo" (optional plural "s" and dot) followed by an identifier, "..." or "único".
    r"Art(?:igo)?s?\s*\.?\s*(?:[-–º0-9A-Z]+|\.{3}|[uú]nico)",
    # A single letter preceded by whitespace, or a 1-2 digit number, followed by ")".
    r"(?:\s[A-Za-z]|[0-9]{1,2})\s*\)",
    # "parágrafo único" / "§ único".
    r"(?:par[áa]grafo|§)\s*[úu]nico",
    # "parágrafo N:" / "§ N:" with a 1-2 digit number.
    r"(?:par[áa]grafo|§)\s*[0-9]{1,2}[\so0º]*:",
    # "seção" / "subseção".
    r"(?:sub)?se[çc][ãa]o",
    # Roman numeral (optionally after "(") followed by a dash, ")" or ".".
    r"\(?" + f"{VALID_ROMAN_NUM}" + r"\s*(?:[-–\)\.])",
    # 1-2 digit number preceded by whitespace and followed by a dash, ")" or a "."
    # that is not followed by a digit; rejected when it comes right after "Art"/"Artigo".
    r"(?<!Art(?:igo)?\s?\.?\s?)\(?\s+[0-9]{1,2}[\s0oº]*(?:[-–\)]|\.(?![0-9]))",
)
# Additional segment starts; only used by RE_PRE_BLOCKS.
EXTRA_LEGAL_ITEMS = (
    # "Sala das Sessões/comissão/Reuniões" plus at most 200 characters up to the end of the text.
    r"Sala\s*das?\s*(Sessões|comiss[aã]o|Reuni[oõ]es).{,200}$",
    # "Senado Federal," plus at most 200 characters up to the end of the text.
    r"Senado\s*Federal\s*,.{,200}$",
    # "Câmara dos Deputados," plus at most 200 characters up to the end of the text.
    r"C[aâ]mara\s*dos\s*Deputados\s*,.{,200}$",
    # Bullet character.
    r"•",
)

# Quote characters, written so they can be dropped inside a regex character class.
QUOTES = r"”“\"'"

# Patterns whose group 1 is wrapped in noise markers by regex_legal_item_anymatch.
RE_NOISE_BLOCKS = (
    # Optional "PL/PDL/PEC n ... / ..." prefix, then the word "Apresentação:" with
    # optional whitespace between its letters, then digit groups shaped like
    # "dd/dd/dddddd:dd" (whitespace allowed around the digits).
    # NOTE(review): group 1 is only the optional prefix, so a substitution that
    # re-inserts "\1" drops the "Apresentação: ..." part of the match - confirm this is intended.
    regex.compile(
        r"((PL|PDL|PEC)\s*n[\.o\sº]*[\d\s]+/[\s\d]+)?+\s*"
        r"A\s*p\s*r\s*e\s*s\s*e\s*n\s*t\s*a\s*[cç]\s*[aã]\s*o\s*:"
        r"(\s*\d\s*){2}/(\s*\d\s*){2}/(\s*\d\s*){6}:(\s*\d){2}",
        regex.IGNORECASE | regex.MULTILINE,
    ),
    # Nine or more consecutive digits.
    regex.compile(r"([0-9]{9,})"),
    # Nine or more consecutive underscores.
    regex.compile(r"(_{9,})"),
    # Run of whitespace/digits at the very beginning or very end of the text.
    regex.compile(r"(^[\s0-9]+|[\s0-9]+$)"),
    # One pattern per legal item (plus "capítulo"/"título"): digits sitting between
    # a punctuation or quote character (optionally followed by "e"/"ou") and the item.
    *[
        regex.compile(
            r"(?<=[:\?;\." + QUOTES + r"]\s*(?:e|ou)?\s*)([0-9]+)(?=\s*" + legal_item + ")",
            regex.IGNORECASE,
        )
        for legal_item in (*BASE_LEGAL_ITEMS, "cap[ií]tulo", "t[íi]tulo")
    ],
)

# Contexts that may precede a legal item: start of text, ";" (optionally followed
# by "e"/"ou"), one of ". : ?", "(NR)"/"(AC)", or a quote character.
STANDARD_PREFIXES = (
    r"(?:^|;(?:\s*e|\s*ou)?|[\.:\?]|\(\s*(?:NR|AC)\s*\)|" +
    f"[{QUOTES}])"
)
# Optional noise block (start marker, at most 300 characters, end marker) that may
# sit between the prefix and the legal item, optionally followed by a tag shaped
# like "<digits>_<UPPERCASE>" (the shape of the debug tags emitted by
# regex_legal_item_anymatch).
# Raw f-strings are used so "\s" reaches the regex engine without relying on
# Python keeping unknown escape sequences (which emits a SyntaxWarning on 3.12+).
PREFIX_EXTENSIONS = (
    rf"(?:\s*{MARKER_NOISE_START}" + r".{,300}?" + rf"{MARKER_NOISE_END}\s*(?:[0-9]+_[A-Z]+)?\s*)"
)

# Zero-width patterns: each one matches the empty position that follows a prefix
# (and optional noise block) and precedes a legal item.
RE_PRE_BLOCKS = tuple(
    regex.compile(rf"(?<={STANDARD_PREFIXES}{PREFIX_EXTENSIONS}?)(?=\s*{pattern})", regex.IGNORECASE)
    for pattern in (*BASE_LEGAL_ITEMS, *EXTRA_LEGAL_ITEMS)
)

# Author reference: optional "Sr"/"Sra"/"Senhor"/"Senhora" with "Deputado"/"Deputada"/"Dep.",
# the treatment form alone, or "mesa"/"mesa diretora".
DEPT_EXTENSION_CORE = (
    r"(?:(?:Sra?|Senhora?)?[\s\.]*(?:Deputad[oa]|Dep\.)|" +
    r"(?:Sra?|Senhora?)[\s\.]*(?:Deputad[oa]|Dep\.)?|mesa\s*(?:diretora)?)" +
    r"\s*"
)

# Parenthesised form: at most 100 non-"(" characters, then "( Do/Da <author> ... )".
# The second piece is a raw f-string so "\)" is not an invalid Python escape sequence.
DEPT_EXTENSION_A = (
    r"[^\(]{,100}\(\s*(?:D[oa])?\s*" +
    DEPT_EXTENSION_CORE +
    rf"(?:[^{QUOTES}\)]" + r"{1,100})?\)"
)

# Unparenthesised form: at most 100 characters, then "Do/Da <author>", optionally
# followed by up to 100 non-quote characters that end right before a quote.
DEPT_EXTENSION_B = (
    r".{,100}?D[oa]\s*" +
    DEPT_EXTENSION_CORE +
    f"(?:[^{QUOTES}]" + r"{1,100}" + f"?(?=[{QUOTES}]))?"
)

DEPT_EXTENSION = f"(?:{DEPT_EXTENSION_A}|{DEPT_EXTENSION_B})"
# Either "DE <digits/dots/underscores>" or "N.o ..." optionally followed by ", DE <digits/dots/underscores>".
DATE_AND_ID = r"(?:(?:DE\s*)+?[\._0-9]+|N[\.\s]*[o0º](?:[^,]*?[,\.]+\s*(?:DE\s*)+?[\._0-9]+)?)"


# Pairs of (pattern-like object, replacement builder) applied by regex_legal_item_anymatch.
# Builders taking (symb, deb) place the marker `symb` and the debug tag `deb` around
# every captured group; the last builder takes (symb_start, symb_end, deb) instead.
# The closing lookahead pieces are raw strings so "\s" and "\." are not invalid
# Python escape sequences (SyntaxWarning on Python 3.12+); the patterns are unchanged.
# NOTE(review): "[\.\s*]" is a character class matching exactly ONE of ".", whitespace
# or "*". If "[\.\s]*" was intended, texts such as "Sr. Presidente" (dot AND space)
# are currently not matched - confirm before changing, since it alters the labels.
RE_SPECIAL = (
    # "REQUERIMENTO DE INFORMAÇÃO/INFORMAÇÕES" starting within the first 250 characters:
    # group 1 = title + date/id + optional author, group 2 = up to 600 characters
    # before "(Excelentíssimo) Senhor/Sr Presidente".
    (regex.compile(
        r"(?<=^.{,250}?)(REQUERIMENTO\s*DE\s*INFORMA[cÇ](?:[oÕ]ES|[AÃ]O).{,50}?" +
        DATE_AND_ID +
        f"(?:{DEPT_EXTENSION})?" +
        r")\s*" +
        r"(.{,600}?)(?=(?:Excelent[ií]ssim[oa])?\s*(?:Senhora?|Sra?)[\.\s*]Presidente)", regex.IGNORECASE),
     lambda symb, deb: f" {symb} {deb} " + r"\1" + f" {symb} {deb} " + r"\2" + f" {symb} {deb} "),
    # Any "REQUERIMENTO" starting within the first 250 characters, with a mandatory author.
    (regex.compile(
        r"(?<=^.{,250}?)(REQUERIMENTO.{,25}?" +
        DATE_AND_ID +
        f"(?:{DEPT_EXTENSION})" +
        r")\s*" +
        r"(.{,600}?)(?=(?:Excelent[ií]ssim[oa])?\s*(?:Senhora?|Sra?)[\.\s*]Presidente)", regex.IGNORECASE),
     lambda symb, deb: f" {symb} {deb} " + r"\1" + f" {symb} {deb} " + r"\2" + f" {symb} {deb} "),
    # "REQUERIMENTO" anywhere, optional date/id, mandatory parenthesised author.
    (regex.compile(
        r"(REQUERIMENTO.{,25}?" +
        f"(?:{DATE_AND_ID})?" +
        DEPT_EXTENSION_A +
        r")\s*" +
        r"(.{,600}?)(?=(?:Excelent[ií]ssim[oa])?\s*(?:Senhora?|Sra?)[\.\s*]Presidente)", regex.IGNORECASE),
     lambda symb, deb: f" {symb} {deb} " + r"\1" + f" {symb} {deb} " + r"\2" + f" {symb} {deb} "),
    # "INDICAÇÃO" with date/id and author; the addressee may be "Presidente" or "Ministro/Ministra".
    (regex.compile(
        r"(INDICA[CÇ][AÃ]O.{,50}?" +
        DATE_AND_ID +
        f"(?:{DEPT_EXTENSION})" +
        r")\s*" +
        r"(.{,600}?)(?=(?:Excelent[ií]ssim[oa])?\s*(?:Senhora?|Sra?)[\.\s*](?:Presidente|Ministr[oa]))", regex.IGNORECASE),
     lambda symb, deb: f" {symb} {deb} " + r"\1" + f" {symb} {deb} " + r"\2" + f" {symb} {deb} "),
    # "(PROJETO DE) RESOLUÇÃO" starting within the first 250 characters; group 3 is the
    # "(A mesa d)a Câmara dos deputados ... resolve:" clause.
    (regex.compile(
        r"(?<=^.{,250}?)((?:PROJETO\s*DE\s*)?RESOLU[CÇ][AÃ]O.{,50}?" +
        DATE_AND_ID +
        f"(?:{DEPT_EXTENSION})?" +
        r")\s*" +
        r"(.{,600}?)((?:A\s*mesa\s*d)?A\s*C[âa]mara\s*dos\s*deputados[^\.]*?resolve\s*:)", regex.IGNORECASE),
     lambda symb, deb: f" {symb} {deb} " + r"\1" + f" {symb} {deb} " + r"\2" + f" {symb} {deb} " + r"\3" + f" {symb} {deb} "),
    # Recurrent noise detector; its builder is the only one taking start/end markers.
    (DetectRecurrentNoise, lambda symb_start, symb_end, deb: f" {symb_start} {deb} " + r"\1" + f" {symb_end} {deb} "),
)

# Patterns whose group 1 receives a marker both before and after it
# (see the last loop of regex_legal_item_anymatch).
RE_PRE_POST_BLOCKS = (
    # Committee name "COMISSÃO DE CIÊNCIA E TECNOLOGIA, COMUNICAÇÃO E INFORMÁTICA",
    # starting within the first 250 characters of the text.
    regex.compile(r"(?<=^.{,250}?)(COMISS[AÃ]O\s*DE\s*CI[EÊ]NCIA[\sE]*TECNOLOGIA[\s,]*COMUNICA[CÇ][AÃ]O[\sE]*INFORM[AÁ]TICA)", regex.IGNORECASE),
    # Enacting clauses.
    regex.compile(r"(O\s*Congresso\s*Nacional\s*decreta:)", regex.IGNORECASE),
    regex.compile(r"(A\s*C[aâ]mara\s*dos\s+deputados\s*decreta:)", regex.IGNORECASE),
    # "Projeto de Lei" followed by an author reference, within the first 250 characters.
    regex.compile(r"(?<=^.{,250}?)(Projeto\s*de\s*Lei\s*" + DEPT_EXTENSION + ")", regex.IGNORECASE),
    # "Projeto de Decreto Legislativo" followed by an author reference or a date/id.
    regex.compile(
        r"(?<=^.{,250}?)(Projeto\s*de\s*Decreto\s*Legislativo\s*" +
        f"(?:{DEPT_EXTENSION}|{DATE_AND_ID})" +
        ")",
        regex.IGNORECASE,
    ),
    # "Proposta de emenda constitucional / à constituição" up to an author reference.
    regex.compile(
        r"(?<=^.{,250}?)(Proposta\s*de\s*emenda\s*(?:cons?titucional|[aàá]\s*constitui[çc][ãa]o).*?" +
        f"(?:{DEPT_EXTENSION})" +
        r")",
        regex.IGNORECASE,
    ),
    # "capítulo <roman numeral>" and its heading text (letters, digits, dashes,
    # whitespace or noise markers), ending right before a valid marker or "Art".
    regex.compile(
        r"(cap[ií]tulo\s*" + f"{VALID_ROMAN_NUM}" +
        r"(?:[-–\sA-Za-zçàüáéíóúãõẽôâê0-9]|" + 
        f"{MARKER_NOISE_END}|{MARKER_NOISE_START}" +
        r")+?" +
        f"(?=(?:{MARKER_VALID}|Art)))",
        regex.IGNORECASE,
    ),
    # "título <roman numeral>" and its heading text, ending right before a valid
    # marker or "capítulo".
    regex.compile(
        r"(t[ií]tulo\s*" + f"{VALID_ROMAN_NUM}" +
        r"(?:[-–\sA-Za-zçàüáéíóúãõẽôâê0-9]|" + 
        f"{MARKER_NOISE_END}|{MARKER_NOISE_START}" +
        r")+?" +
        f"(?=(?:{MARKER_VALID}|cap[íi]tulo)))",
        regex.IGNORECASE,
    ),
)

def regex_legal_item_anymatch(text: str, debug: bool = False) -> str:
    """Insert noise and segment markers into ``text``.

    The four pattern tables are applied in a fixed order: RE_NOISE_BLOCKS,
    RE_SPECIAL, RE_PRE_BLOCKS and RE_PRE_POST_BLOCKS. Each substitution feeds
    the next one.

    Parameters
    ----------
    text : str
        Text to annotate.
    debug : bool
        When True, every inserted marker is followed by a tag made of the
        pattern index and the table name (e.g. ``0_NOISE``); otherwise the
        tag is an empty string.

    Returns
    -------
    str
        The annotated text.
    """
    first_index = 0

    def make_tag(position: int, table: str) -> str:
        # Tags are only produced in debug mode.
        return f"{position}_{table}" if debug else ""

    # 1) Wrap group 1 of every noise pattern in start/end noise markers.
    for position, noise_re in enumerate(RE_NOISE_BLOCKS, first_index):
        tag = make_tag(position, "NOISE")
        replacement = f" {MARKER_NOISE_START} {tag} " + r"\1" + f" {MARKER_NOISE_END} {tag} "
        text = noise_re.sub(replacement, text, concurrent=True)

    # 2) Special documents: every entry ships its own replacement builder.
    for position, (special_re, build_replacement) in enumerate(RE_SPECIAL, first_index):
        tag = make_tag(position, "SPECIAL")
        try:
            replacement = build_replacement(MARKER_VALID, tag)

        except TypeError:
            # Builders with three parameters expect the noise start/end markers.
            replacement = build_replacement(MARKER_NOISE_START, MARKER_NOISE_END, tag)

        text = special_re.sub(replacement, text, concurrent=True)

    # 3) Zero-width matches: drop a valid marker right before each legal item.
    for position, pre_re in enumerate(RE_PRE_BLOCKS, first_index):
        tag = make_tag(position, "PRE")
        text = pre_re.sub(f" {MARKER_VALID} {tag} ", text, concurrent=True)

    # 4) Surround group 1 with a valid marker on both sides.
    for position, wrap_re in enumerate(RE_PRE_POST_BLOCKS, first_index):
        tag = make_tag(position, "PRE_POS")
        replacement = f" {MARKER_VALID} {tag} " + r"\1" + f" {MARKER_VALID} {tag} "
        text = wrap_re.sub(replacement, text, concurrent=True)

    return text

In [111]:
# Read the first 30000 rows of the raw file as a single "text" column.
df = datasets.load_dataset(
    "csv",
    data_files=["../data/content.txt"],
    header=None,
    names=["text"],
    cache_dir="../cache/datasets",
    nrows=30000,
)

RE_JUSTIFICATIVA = regex.compile(r"\s*(?:JUSTIFICATIVA|JUSTIFICA[CÇ][AÃ]O)")
RE_ANEXO = regex.compile(r"\s*ANEXO")

# Drop non-string rows BEFORE the splits below: `split` raises TypeError on a
# non-string value, so the type check in the final filter would come too late.
df = df.filter(lambda item: isinstance(item["text"], str))
# Keep only what precedes the first "JUSTIFICATIVA"/"JUSTIFICAÇÃO" and the first "ANEXO".
df = df.map(lambda item: {"text": RE_JUSTIFICATIVA.split(item["text"])[0]})
df = df.map(lambda item: {"text": RE_ANEXO.split(item["text"])[0]})
# Keep documents whose truncated text has between 128 and 600000 characters.
df = df.filter(lambda item: isinstance(item["text"], str) and 128 <= len(item["text"]) <= 600000)

def preprocess_instance(item, ind, print_preprocessed: bool = False, debug: bool = False):
    """Turn one raw document into word tokens and per-token labels.

    The text is preprocessed by ``seg_model``, annotated by
    ``regex_legal_item_anymatch`` and tokenized with NLTK. Tokens found in
    ``SPECIAL_SYMBOLS`` are removed; the label mapped to a removed symbol is
    assigned to the next kept token (when several symbols are adjacent, the
    last one wins). Symbols with no kept token after them are discarded.

    Parameters
    ----------
    item : mapping
        Dataset row; only ``item["text"]`` is read.
    ind : int
        Row index, stored as a string under ``"id"``.
    print_preprocessed : bool
        Print the annotated text before tokenization results are processed.
    debug : bool
        Forwarded to ``regex_legal_item_anymatch``.

    Returns
    -------
    dict
        ``{"id": str, "labels": list, "tokens": list}`` with ``labels`` and
        ``tokens`` of equal length (both empty when no token is kept).
    """
    preprocessed_text = seg_model.preprocess_legal_text(item["text"])
    preprocessed_text = regex_legal_item_anymatch(preprocessed_text, debug=debug)
    raw_tokens = nltk.tokenize.word_tokenize(preprocessed_text, language="portuguese")

    if print_preprocessed:
        print(preprocessed_text)

    tokens = []
    labels = []
    pending_label = 0

    # Single pass instead of repeated list.pop(i): same result, linear time, and
    # no indexing into possibly empty lists.
    for token in raw_tokens:
        if token in SPECIAL_SYMBOLS:
            pending_label = SPECIAL_SYMBOLS[token]
            continue

        tokens.append(token)
        labels.append(pending_label)
        pending_label = 0

    # The first kept token never carries the valid-marker label.
    if labels and labels[0] == SPECIAL_SYMBOLS[MARKER_VALID]:
        labels[0] = 0

    ret = {
        "id": str(ind),
        "labels": labels,
        "tokens": tokens,
    }

    return ret

# Build "id"/"labels"/"tokens" for every row (the row index is passed as `ind`)
# using 8 worker processes, and drop the raw "text" column.
df = df.map(preprocess_instance, with_indices=True, num_proc=8, remove_columns="text")

Using custom data configuration default-46d85cc9d3ffee06
Reusing dataset csv (../cache/datasets/csv/default-46d85cc9d3ffee06/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at ../cache/datasets/csv/default-46d85cc9d3ffee06/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-c7b98ec7cdffacf2.arrow
Loading cached processed dataset at ../cache/datasets/csv/default-46d85cc9d3ffee06/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-68fb4061d3cabd0d.arrow
Loading cached processed dataset at ../cache/datasets/csv/default-46d85cc9d3ffee06/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-c2ee4974a80c385c.arrow


In [112]:
# id_ = 1261
# print(tests.TEST_CASES[id_])
# tests.print_results(df, id_, print_full_text=True)
tests.run_tests(df["train"]["labels"])
print("\n\n")

print(df.num_rows)

# 1162

# Start from a fixed document and, while it already has a registered test case,
# draw another one at random.
document_idx = 16000
while tests.test_case_exists(document_idx):
    # randrange excludes its upper bound, so the draw is always a valid row index
    # (randint(0, 1 + num_rows) could return num_rows or num_rows + 1).
    document_idx = random.randrange(df["train"].num_rows)

print("Chosen id:", document_idx)
expected_test_case_values = tests.print_results(df, document_idx, print_full_text=True)
# Manual review: only an explicit "y" registers the document as a test case.
print("Is it correct? [y/N]:", end=" ")
inp = input()
if inp == "y":
    tests.update_test_case(document_idx, expected_test_case_values)
    print("Added to test cases.")

Correct proportion: 100.00% (40 of 40)



{'train': 29934}
Chosen id: 16000
INDICAÇÃO N.° DE 2005 ( Do Sr. Carlos Nader ) “ Sugere a implantação do Programa Escola que Protege no Município de Queimados – RJ. ” Excelentíssimo Senhor Ministro da Educação : Com cordiais cumprimentos , venho por meio desta sugerir a Vossa Excelência , a implantação do Programa Escola que Protege no Município de Queimados – RJ . O Programa supra citado , tem por objetivo quebrar o ciclo da violência , atendendo crianças e adolescentes em situação de risco , oferecendo ainda apoio psico-sócio-pedagógico a pais e responsáveis , buscando quebrar este ciclo em seu cotidiano ; e a capacitação de educadores , despertando sua atenção , percepção e responsabilidade no encaminhamento dos casos de vítimas de violência . A proteção da criança e do adolescente contra qualquer forma de violência é um dever de toda sociedade . Se não agirmos em sua defesa em decorrência de um espírito de amor e solidariedade , que o faça

In [90]:
# Persist the registered test cases through the project's `tests` module.
tests.dump_registered_cases()

Wrote 39 test cases at './registered_test_cases.csv'.


In [None]:
# Peek at the labels of the first few documents only; displaying the whole column
# would dump one label list per document into the cell output.
df["train"][:5]["labels"]

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align the word labels with the sub-word pieces.

    ``examples["tokens"]`` is encoded with ``seg_model.tokenizer`` (truncated to
    512 pieces). For every example, the first piece of each word receives that
    word's label; all other pieces, and pieces not mapped to any word, receive
    -100. The aligned labels are stored under ``"labels"`` in the returned
    encoding.
    """
    # source: https://huggingface.co/docs/transformers/custom_datasets#preprocess
    encoded = seg_model.tokenizer(
        examples["tokens"],
        truncation=True,
        max_length=512,
        is_split_into_words=True,
    )

    aligned_labels = []

    for batch_idx, word_labels in enumerate(examples["labels"]):
        # Word index of every piece (None for pieces that belong to no word).
        piece_word_ids = encoded.word_ids(batch_index=batch_idx)
        piece_labels = []
        last_word_id = None

        for word_id in piece_word_ids:
            starts_new_word = word_id is not None and word_id != last_word_id
            piece_labels.append(word_labels[word_id] if starts_new_word else -100)
            last_word_id = word_id

        aligned_labels.append(piece_labels)

    encoded["labels"] = aligned_labels

    return encoded


# Tokenize in batches with 4 worker processes; the result is kept in a new variable.
df_tokenized = df.map(tokenize_and_align_labels, batched=True, num_proc=4)

In [None]:
# First split: 80% train / 20% held out, shuffled with a fixed seed.
df_tokenized_train_eval_test = df_tokenized["train"].train_test_split(test_size=0.2, shuffle=True, seed=16)
# Second split: the held-out part is halved, without shuffling, into eval and test
# (10% of the data each).
df_tokenized_test_eval = df_tokenized_train_eval_test["test"].train_test_split(test_size=0.5, shuffle=False)
df_tokenized_split = datasets.DatasetDict({
    "train": df_tokenized_train_eval_test["train"],
    "eval": df_tokenized_test_eval["train"],
    "test": df_tokenized_test_eval["test"],
})
# df_tokenized_split.save_to_disk("../data/df_tokenized_split")
# Display the resulting DatasetDict.
df_tokenized_split

In [None]:
# Show the feature schema of the training split.
df_tokenized_split["train"].features

In [None]:
# Print the label list of the document at position 49 of the training split.
print(df["train"]["labels"][49])