In [1]:
import os
import random

import transformers
import datasets
import nltk
import tokenizers
import regex
import colorama


import segmentador
import tests
from config import *


# Seed the stdlib RNG so the random document picks made later in the notebook are repeatable.
random.seed(72)
# MARKER_* are not defined in this notebook; presumably they come from `from config import *`.
print("Marker symbol (valid):", MARKER_VALID)
print("Marker symbol (noise):", MARKER_NOISE_START, MARKER_NOISE_END)

# Registered test cases live in one CSV per dataset slice; the slice bounds are part of the file name.
TESTS_DIR = "test_cases"
DATASET_ROW_START = 60001
DATASET_ROW_END = 70000
TEST_CASE_URI = os.path.join(".", TESTS_DIR, f"{DATASET_ROW_START}_{DATASET_ROW_END}_registered_test_cases.csv")

# Load the registered test cases for this slice from TEST_CASE_URI.
tests.load_registered_cases(test_cases_uri=TEST_CASE_URI)

Marker symbol (valid): ✓
Marker symbol (noise): ❌s__ ❌e__
No test cases found at './test_cases/60001_70000_registered_test_cases.csv'.


In [2]:
# Segmenter used below for text preprocessing and tokenization.
# NOTE(review): `local_files_only=True` presumably prevents downloading the checkpoint — confirm in segmentador.
seg_model = segmentador.Segmenter(local_files_only=True)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model check

In [132]:
class DetectRecurrentNoise:
    """Pattern-like helper that wraps document-specific recurring noise.

    Unlike a compiled pattern, the expressions applied here are built from the
    text itself: a barcode printed between asterisks and the short preamble
    that precedes "Câmara dos Deputados Proj.../Req..." at the start of the
    document. The class exposes ``sub`` with the same leading arguments as a
    compiled pattern's ``sub`` so it can be used where a pattern is expected.
    """

    # Content printed between two asterisks: uppercase letters, digits, blanks.
    RE_BARCODE = regex.compile(r"\*([\sA-Z0-9]+)\*")
    # Up to 60 leading characters (group 1), then optional blanks/digits, right
    # before "Câmara dos deputados Proj|Req".
    RE_PREAMBLE = regex.compile(
        r"^\s*(.{,60}?)[\s0-9]*" +
        r"(?=C[aâ]mara\s*dos\s*deputados\s*(Proj|Req))",
        regex.IGNORECASE,
    )
    RE_BLANK_SPACES = regex.compile(r"\s+")

    @classmethod
    def _detect_barcode(cls, subpattern, text):
        """Replace every "*CODE*CODE" occurrence in ``text`` using ``subpattern``.

        Each distinct code found between asterisks is turned into a pattern
        that tolerates whitespace between any two characters, and the code is
        expected twice in a row: once between asterisks and once bare.
        """
        pseudo_patterns = cls.RE_BARCODE.findall(text)

        if not pseudo_patterns:
            return text

        # Deduplicate; sorting keeps the substitution order deterministic.
        pseudo_patterns = sorted(set(pseudo_patterns))

        for pseudo_pattern in pseudo_patterns:
            # Only [A-Z0-9] characters remain after removing blanks, so no
            # escaping is needed before building the pattern.
            pattern = list(cls.RE_BLANK_SPACES.sub("", pseudo_pattern))
            pattern.append("")
            pattern.insert(0, "")
            pattern = r"\s*".join(pattern)

            text = regex.sub(r"(\*" + pattern + r"\*" + pattern + ")", subpattern, text)

        return text

    @classmethod
    def _detect_preamble_noise(cls, subpattern, text):
        """Replace every repetition of the document preamble using ``subpattern``.

        The preamble is whatever ``RE_PREAMBLE`` captures at the start of
        ``text``. Occurrences are matched with flexible whitespace between the
        space-separated fragments and any trailing blanks/digits.
        """
        preamble = cls.RE_PREAMBLE.match(text)

        if not preamble or not preamble.group(1).strip():
            return text

        # Escape each fragment *before* joining: escaping the joined string
        # would also escape the `\s*` glue and turn it into literal text, so
        # preambles with more than one word would never match.
        fragments = preamble.group(1).split(" ")
        preamble_content = r"\s*".join(regex.escape(fragment) for fragment in fragments)
        text = regex.sub(r"(\s*" + preamble_content + r"[\s\d]*)", subpattern, text)
        return text

    @classmethod
    def sub(cls, subpattern: str, text: str, *args, **kwargs):
        """Apply the barcode and preamble substitutions, in that order.

        Extra positional and keyword arguments are accepted and ignored so the
        call is interchangeable with a compiled pattern's ``sub``.
        """
        text = cls._detect_barcode(subpattern, text)
        text = cls._detect_preamble_noise(subpattern, text)
        return text


# Roman numeral; the final group always consumes at least one of I, V or X.
VALID_ROMAN_NUM = r"M{0,3}(?:C[MD]|D?C{0,3})(?:X[CL]|L?X{0,3})(?:I?X|I?V|V?I{1,3})"
# "n", "nume", "nro" or "numero"/"número", then optional dots/blanks and one of o, 0, º, °.
NRO = r"[nN](?:[uú]me)?(?:ro)?[\.\s]*[o0º°]"
# Quote characters; meant to be interpolated inside a character class.
QUOTES = r"”“”\"'‘"
# Uppercase letters, accented ones included; meant to be interpolated inside a character class.
UPPERCASE_LETTERS = r"ÀÁÂÃÇÉÊẼÓÕÔÜÚÍA-Z"
UPPERCASE_LETTERS_OR_NUM = UPPERCASE_LETTERS + r"0-9"

# Alternation of commission names, each prefixed by "COMISSÃO DE/DO/DA(S)".
# The multi-line layout is for readability only: every space and newline is
# removed from the literal right after it is written, so blanks in the text
# are matched solely through the explicit `\s*` / `[...\s]*` parts.
COMMISSIONS = (r"""
    COMISS[AÃ]O\s*(?:D[EOA]S?)\s*
    (?:
    AGRICULTURA[,\s]*PECU[AÁ]RIA[,\s]*ABASTECIMENTO[E\s]*(?:DES\.|DESENVOLVIMENTO)\s*RURAL|
    CI[EÊ]NCIA[E\s]*TECNOLOGIA[,\s]*COMUNICA[CÇ][AÃ]O[E\s]*INFORM[AÁ]TICA|
    CONSTITUI[CÇ][AÃ]O[E\s]*JUSTI[CÇ]A[E\s]*DE\s*CIDADANIA|
    CULTURA|
    DEFESA\s*DO\s*CONSUMIDOR|
    (?:DES\.|DESENVOLVIMENTO)\s*ECON[OÔ]MICO[,\s]*IND[UÚ]STRIA[,\s]*COM[EÉ]RCIO[E\s]*SERVI[CÇ]OS|
    (?:DES\.|DESENVOLVIMENTO)\s*URBANO|
    DIREITOS\s*DA\s*MULHER|
    DIREITOS\s*DA\s*PESSOA\s*IDOSA|
    DIREITOS\s*DAS\s*PESSOAS\s*COM\s*DEFICI[EÊ]NCIA|
    DIREITOS\s*HUMANOS[E\s]*MINORIAS|
    EDUCA[CÇ][AÃ]O|
    ESPORTE|
    FINAN[CÇ]AS[E\s]*TRIBUTA[CÇ][AÃ]O|
    FISCALIZA[CÇ][AÃ]O\s*FINANCEIRA[E\s]*CONTROLE|
    INTEGRA[CÇ][AÃ]O\s*NACIONAL[,\s]*(?:DES\.|DESENVOLVIMENTO)\s*REGIONAL[E\s]*AMAZ[OÔ]NIA|
    LEGISLA[CÇ][AÃ]O\s*PARTICIPATIVA|
    MEIO\s*AMBIENTE[E\s]*DESENVOLVIMENTO\s*SUSTENT[AÁ]VEL|
    MINAS[E\s]*ENERGIA|
    RELA[CÇ][OÕ]ES\s*EXTERIORES[E\s]*DE\s*DEFESA\s*NACIONAL|
    SEGURAN[CÇ]A\s*P[UÚ]BLICA[E\s]*COMBATE\s*AO\s*CRIME\s*ORGANIZADO|
    SEGURIDADE\s*SOCIAL[E\s]*FAMÍLIA|
    TRABALHO[,\s]*ADMINISTRA[CÇ][AÃ]O[E\s]*SERVI[CÇ]O\s*P[UÚ]BLICO|
    TURISMO|
    VIA[CÇ][AÃ]O[E\s]*TRANSPORTES
    )
    """.replace(" ", "").replace("\n", "")
)

# Regex alternation of government body acronyms, in this exact order.
# Written as one whitespace-separated listing and split into the individual acronyms.
MINISTRIES = "|".join(
    """
    MAPA MC MCTI MCom MinC MD MDR ME MEC MI
    MJSP MMA MME MMFDH MRE MS MTP MTur CGU SeGov
    SGPR CC GSI AGU MAER MESA MINTER MInfra MPA MPS
    SMPE SAE PR SEPPIR SNPM SRI SNPTA SAC
    """.split()
)

# Patterns for the start of a legal item. The position of each entry matters:
# it is the index shown in the "<i>_PRE" debug tags, and entry [2] is reused
# on its own elsewhere in this cell.
BASE_LEGAL_ITEMS = (
    # [0] "§" followed by a number.
    r"§\s*[0-9]+",
    # [1] "Art"/"Artigo"/"Arts" followed by an identifier, "..." or "único".
    r"Art(?:igo)?s?\s*\.?\s*(?:[-–º°0-9A-Z]+|\.{3}|[uú]nico)",
    # [2] A single letter or a 1-2 digit number closed by ")", optionally opened by "(".
    r"(?:\(\s*|\s+)(?:[A-Za-z]|[0-9]{1,2})\s*\)",
    # [3] "parágrafo único" / "§ único".
    r"(?:par[áa]grafo|§)\s*[úu]nico",
    # [4] Numbered paragraph terminated by a colon.
    r"(?:par[áa]grafo|§)\s*[0-9]{1,2}[\so0º°]*:",
    # [5] "seção" / "subseção".
    r"(?:sub)?se[çc][ãa]o",
    # [6] Roman numeral followed by a dash, ")" or ".".
    r"\(?" + f"{VALID_ROMAN_NUM}" + r"\s*(?:[-–\)\.])",
    # [7] 1-2 digit number followed by a dash, ")" or a "." not followed by a digit,
    #     unless it directly follows an "Art." prefix.
    r"(?<!Art(?:igo)?s?\s?\.?\s?)\(?\s+[0-9]{1,2}[\s0oº°]*(?:[-–\)]|\.(?![0-9]))",
    # [8] Dotted number such as "1.2", unless it directly follows an "Art." prefix.
    r"(?<!Art(?:igo)?s?\s?\.?\s?)\s+[0-9]{1,2}\s*\.\s*[0-9]+",
)

# Alternation of Portuguese month names. Every month accepts either its
# three-letter abbreviation or the full name, with an optional leading capital.
# "fev" and "mar" previously lacked the trailing `?`, so only the full names
# "fevereiro" and "março"/"marco" were recognised for those two months.
MONTHS = "|".join((
    r"[jJ]an(?:eiro)?",
    r"[fF]ev(?:ereiro)?",
    r"[mM]ar(?:[cç]o)?",
    r"[aA]br(?:il)?",
    r"[mM]ai(?:o)?",
    r"[jJ]un(?:ho)?",
    r"[jJ]ul(?:ho)?",
    r"[aA]go(?:sto)?",
    r"[sS]et(?:embro)?",
    r"[oO]ut(?:ubro)?",
    r"[nN]ov(?:embro)?",
    r"[dD]ez(?:embro)?",
))

# A date written either as "de/em <digits> ... <4-digit year>" or as
# "<day> de/em <month name> de/em <4-digit year>".
DATE = (
    r"[,\s]*(?:(?:de|em)[,0-9\s]*){1,3}[0-9]{4}|" +
    r"[,\s]*(?:de|em)?\s*[0-9]{,2}\s*(?:de|em)\s*(?:" + MONTHS + r")\s*(?:de|em)\s*[0-9]{4}"
)

# Same shape as DATE, but digits may be replaced by "." or "_" placeholders and
# the month name by a run of underscores.
DATE_OR_UNDERSCORES = (
    r"[,\s]*(?:(?:de|em)[,\._0-9\s]*){1,3}[\._0-9]{4}|" +
    r"[,\s]*(?:de|em)?\s*[\._0-9]{,2}\s*(?:de|em)\s*(?:" + MONTHS + r"|_+)\s*(?:de|em)\s*[\._0-9]{4}"
)

# At most 300 trailing characters up to the end of the text.
EOF = r".{,300}$"

# Either the short tail of the document (EOF) or a date.
EOF_OR_DATE = (
    r"(?:" +
    EOF +
    r"|" +
    DATE + 
    r")"
)

# Additional segment starters besides BASE_LEGAL_ITEMS: closing formulas
# ("Sala das sessões ...", "Senado Federal, ...", "Câmara dos Deputados, ...",
# "Brasília, <date> ..."), bullet points and the "A mesa da Câmara dos
# Deputados ...:" opening.
# Every fragment is a raw string: the former non-raw ")\s*" relied on an
# invalid escape sequence, which Python reports with a warning.
EXTRA_LEGAL_ITEMS = (
    r"Sala\s*d[ea]s?\s*(?:sess|comiss|reuni)(?:[õo]es|[ãa]o)" + EOF_OR_DATE,
    r"Senado\s*Federal\s*," + EOF_OR_DATE,
    r"C[aâ]mara\s*dos\s*Deputados\s*," + EOF_OR_DATE,
    r"Bras[ií]lia\s*,\s*(?:" + DATE_OR_UNDERSCORES + r")\s*" + EOF,
    r"•",
    r"As?\s*mesas?\s*da\s*c[aâ]mara\s*dos\s*deputados[^:]{,300}?:"
)

# Patterns whose group 1 is wrapped between the noise start/end markers.
# The lookbehind f-string is now raw: its `\s` was an invalid escape sequence
# in a non-raw literal (the resulting pattern text is unchanged).
RE_NOISE_BLOCKS = (
    # "Apresentação: dd/mm/yyyy hh:mm" stamp with arbitrary blanks between characters,
    # optionally preceded by a "PL/PDL/PEC n. <number>/<number>" identifier.
    regex.compile(
        r"((?:(?:PL|PDL|PEC)\s*n[\.o\sº]*[\d\s]+/[\s\d]+)?+\s*"
        r"A\s*p\s*r\s*e\s*s\s*e\s*n\s*t\s*a\s*[cç]\s*[aã]\s*o\s*:"
        r"(?:\s*\d\s*){2}/(?:\s*\d\s*){2}/(?:\s*\d\s*){6}:(?:\s*\d){2})",
        regex.IGNORECASE | regex.MULTILINE,
    ),
    # Runs of 9 or more digits that do not follow a "nº"-like prefix.
    regex.compile(rf"(?<!{NRO}[\s0-9]*)" + r"([0-9]{9,})"),
    # Long underscore rules.
    regex.compile(r"(_{9,}\s*)+"),
    # Blanks/digits at the very beginning or the very end of the text.
    regex.compile(r"(^[\s0-9]+|[\s0-9]+$)"),
    # A bare number sitting between a sentence boundary (optionally followed by "e"/"ou")
    # and the start of a legal item, chapter or title.
    *[
        regex.compile(
            r"(?<=[:\?;\." + QUOTES + r"]\s*(?:e|ou)?\s*)([0-9]+)(?=\s*" + legal_item + r")",
            regex.IGNORECASE,
        )
        for legal_item in (*BASE_LEGAL_ITEMS, r"cap[ií]tulo", r"t[íi]tulo")
    ],
)

# Contexts after which a legal item may start: start of text, ";" (optionally
# followed by "e"/"ou"), ".", ":", "?", "(NR)"/"(AC)" or a quote character.
STANDARD_PREFIXES = (
    r"(?:^|;(?:\s*e|\s*ou)?|[\.:\?]|\(\s*(?:NR|AC)\s*\)\s*|" +
    f"[{QUOTES}])"
)
# Optional noise span (start marker ... end marker, at most 300 characters,
# optionally followed by a "<digits>_<LETTERS>" debug tag) allowed between the
# prefix and the legal item.
PREFIX_EXTENSIONS = (
    r"(?:\s*" + MARKER_NOISE_START + r".{,300}?" + MARKER_NOISE_END + r"\s*(?:[0-9]+_[A-Z]+)?\s*)"
)

# Zero-width patterns: each one matches the empty position that follows a
# standard prefix (plus an optional noise span) and precedes a legal item.
# The f-string is raw because `\s` is not a valid escape in a non-raw literal
# (the resulting pattern text is unchanged).
RE_PRE_BLOCKS = tuple(
    regex.compile(rf"(?<={STANDARD_PREFIXES}{PREFIX_EXTENSIONS}?)(?=\s*{pattern})", regex.IGNORECASE)
    for pattern in (*BASE_LEGAL_ITEMS, *EXTRA_LEGAL_ITEMS)
)

# Optional honorifics that may follow "Deputado(a)" / "Sr(a)." (any number of them).
ADDITIONAL_TITLES = r"(?:Ju[ií]za?|M[\.\s]*Ma?[\s\.]*|Doutora?|Dra?[\s\.]*|Professora?|Profa?[\s\.]*)*"

# Who a proposition comes from: a deputy (with optional treatment and titles),
# the board ("mesa"), a numbered "MENSAGEM", one of the three powers or a commission.
DEPT_EXTENSION_CORE = (
    r"(?:(?:Sra?|Senhora?)?[\s\.]*(?:Deputad[oa]|Dep\s*\.)\s*" + ADDITIONAL_TITLES + "|" +
    r"(?:Sra?|Senhora?)[\s\.]*(?:Deputad[oa]|Dep\s*\.)?\s*" + ADDITIONAL_TITLES + "|" +
    r"mesa\s*(?:diretora)?|" +
    r"MENSAGEM\s*" + NRO + r"|" +
    r"poder\s*(?:executivo|legislativo|judici[aá]rio)|" +
    COMMISSIONS +
    r")\s*"
)

# Parenthesised form: "... (Do Sr. Deputado Fulano ...)".
# The f-string fragment is raw: `\)` is not a valid escape in a non-raw literal
# (the resulting pattern text is unchanged).
DEPT_EXTENSION_A = (
    r"[^\(]{,100}\(\s*(?:D[oa])?\s*" +
    DEPT_EXTENSION_CORE +
    rf"(?:[^{QUOTES}\)]" + r"{1,100})?\)"
)

# Unparenthesised form: "... Do Sr. Deputado Fulano ...", optionally running up to a quote.
DEPT_EXTENSION_B = (
    r".{,100}?D[oa]\s*" +
    DEPT_EXTENSION_CORE +
    f"(?:[^{QUOTES}]" + r"{1,100}" + f"?(?=[{QUOTES}]))?"
)

DEPT_EXTENSION = f"(?:{DEPT_EXTENSION_A}|{DEPT_EXTENSION_B})"
# Previous version of DATE_AND_ID, kept for reference (superseded by the definition below):
# DATE_AND_ID = (
#     r"(?:" +
#     r"(?:DE\s*)+?[\._0-9]+|" +
#     NRO +
#     r"(?:[^,]*?[,\.]+\s*(?:DE\s*)+?[\._0-9]+)?" +
#     r")"
# )
# Identifier and/or date of a proposition. Three alternatives, tried in order:
#   1. "DE <digits/placeholders>";
#   2. an optional "nº <digits>" followed by a date (placeholders allowed);
#   3. "nº <digits>" optionally followed by ", DE <digits/placeholders>".
DATE_AND_ID = (
    r"(?:" +
    r"(?:DE\s*)+?[\._0-9]+|" +
    f"(?:{NRO}" + r"[\s0-9]*)?\s*(?:" + DATE_OR_UNDERSCORES + r")|" +
    NRO + r"[\s0-9]*" +
    r"(?:[^,]*?[,\.]+\s*(?:DE\s*)+?[\._0-9]+)?" +
    r")"
)
# DATE

def fn_lambda_double(symb, deb):
    """Build a replacement template that fences groups 1 and 2 with " symb deb "."""
    fence = f" {symb} {deb} "
    return fence + r"\1" + fence + r"\2" + fence


def fn_lambda_triple(symb, deb):
    """Build a replacement template that fences groups 1, 2 and 3 with " symb deb "."""
    fence = f" {symb} {deb} "
    return fence + r"\1" + fence + r"\2" + fence + r"\3" + fence

# Opening of the body of a request: either the vocative addressed to a
# president/minister ("Excelentíssimo Senhor Presidente ...," / "Sr. Ministro ...:")
# or the verb "Requeiro", both right after a full stop.
# Written as raw strings: the former non-raw literal relied on the invalid
# escape sequences `\.` and `\s` (the resulting pattern text is unchanged).
REQUEST_PRESIDENT_OR_MINISTRY = (
    r"(?:"
    r"(?<=\.\s*)(?:Excelent[ií]ssim[oa]|Ex\.?m[ao]\s*\.?)?\s*"
    r"(?:Senhora?|Sra?)[\.\s]*"
    r"(?:Presidente|Presid|Pres|Min|Ministr[oa]).{,75}?[,:]"
    r"|(?:(?<=\.\s*)Requeiro)"
    r")"
)

# Special-case rules, applied in order. Each entry is a triple:
#   (pattern-like object with .sub, replacement builder, optional post-processing of the text).
# Builders taking (symb, deb) receive the valid marker; builders taking
# (symb_start, symb_end, deb) receive the noise start/end markers.
# The position of each entry is the index shown in the "<i>_SPECIAL" debug tags.
RE_SPECIAL = (
    # "REQUERIMENTO DE INFORMAÇÃO ..." header and its summary, up to the request opening.
    (regex.compile(
        r"(?<=^.{,250}?)(REQUERIMENTO\s*DE\s*INFORMA[CÇ](?:[OÕ]ES|[AÃ]O).{,50}?" +
        DATE_AND_ID +
        f"(?:{DEPT_EXTENSION})?" +
        r")\s*" +
        r"(.{,600}?)" +
        f"(?={REQUEST_PRESIDENT_OR_MINISTRY})", regex.IGNORECASE),
     fn_lambda_double, None),
    # Generic "REQUERIMENTO ..." header with identifier and author.
    (regex.compile(
        r"(?<=^.{,250}?)(REQUERIMENTO.{,25}?" +
        DATE_AND_ID +
        f"(?:{DEPT_EXTENSION})" +
        r")\s*" +
        r"(.{,600}?)" +
        f"(?={REQUEST_PRESIDENT_OR_MINISTRY})", regex.IGNORECASE),
     fn_lambda_double, None),
    # "REQUERIMENTO ..." with a parenthesised author and an optional identifier.
    (regex.compile(
        r"(REQUERIMENTO.{,25}?" +
        f"(?:{DATE_AND_ID})?" +
        DEPT_EXTENSION_A +
        r")\s*" +
        r"(.{,600}?)" +
        f"(?={REQUEST_PRESIDENT_OR_MINISTRY})", regex.IGNORECASE),
     fn_lambda_double, None),
    # "(REQUERIMENTO DE) INDICAÇÃO ..." header.
    (regex.compile(
        r"((?:REQUERIMENTO\s*DE\s*)?INDICA[CÇ][AÃ]O.{,50}?" +
        DATE_AND_ID +
        f"(?:{DEPT_EXTENSION})" +
        r")\s*" +
        r"(.{,600}?)" +
        f"(?={REQUEST_PRESIDENT_OR_MINISTRY})", regex.IGNORECASE),
     fn_lambda_double, None),
    # "(PROJETO DE) RESOLUÇÃO ..." header, summary and the "... resolve:" formula.
    (regex.compile(
        r"(?<=^.{,250}?)((?:PROJETO\s*DE\s*)?RESOLU[CÇ][AÃ]O.{,50}?" +
        DATE_AND_ID +
        f"(?:{DEPT_EXTENSION})?" +
        r")\s*" +
        r"(.{,600}?)((?:A\s*mesa\s*d)?A\s*C[âa]mara\s*dos\s*deputados[^\.]*?resolve\s*:)", regex.IGNORECASE),
    fn_lambda_triple, None),
    # NOTE(review): this entry is identical to the previous one. It is kept so the
    # indices of the following entries (and their debug tags) do not shift — confirm
    # whether a different pattern was intended here.
    (regex.compile(
        r"(?<=^.{,250}?)((?:PROJETO\s*DE\s*)?RESOLU[CÇ][AÃ]O.{,50}?" +
        DATE_AND_ID +
        f"(?:{DEPT_EXTENSION})?" +
        r")\s*" +
        r"(.{,600}?)((?:A\s*mesa\s*d)?A\s*C[âa]mara\s*dos\s*deputados[^\.]*?resolve\s*:)", regex.IGNORECASE),
    fn_lambda_triple, None),
    # "MEDIDA PROVISÓRIA ..." header, summary and the "... com força de lei:" formula.
    (regex.compile(
        r"(?<=^.{,250}?)(MEDIDA\s*PROVIS[ÓO]RIA.{,50}?" +
        DATE_AND_ID +
        r")\s*" +
        r"(.{,1200}?)([OA]\s*President[ea]\s*da\s*rep[úu]blica[^:]+?com\s*for[cç]a\s*de\s*lei\s*:)", regex.IGNORECASE),
    fn_lambda_triple, None),
    # Document-specific recurring noise (barcodes, repeated preamble).
    (DetectRecurrentNoise, lambda symb_start, symb_end, deb: f" {symb_start} {deb} " + r"\1" + f" {symb_end} {deb} ", None),
    # A dash right after ":", ";" or "?" starts a new segment.
    (regex.compile(r"([:;\?])(\s{,10}[-–])"), lambda symb, deb: r"\1" + f" {symb} {deb} " + r"\2", None),
    # A dash right after "." starts a new segment when an uppercase letter follows.
    (regex.compile(r"(\.)(\s{,10}[-–])(?=\s*[" + UPPERCASE_LETTERS + "])"), lambda symb, deb: r"\1" + f" {symb} {deb} " + r"\2", None),
    # ", e" / ", ou" followed by an "a)"-style item.
    (regex.compile(r"(?<=,\s*(?:e|ou)\s*)" + f"(?={BASE_LEGAL_ITEMS[2]})"), lambda symb, deb: f" {symb} {deb} ", None),
    # "EMI nº ... <ministries>" header, optionally followed by "<place>, <date>".
    (regex.compile(
        r"(EMI\s*" + DATE_AND_ID + r"\s*[0-9][0-9\s]*" + f"(?:(?:{MINISTRIES})/?)+" + r")"
        r"(\s*[^,]{,50}?,\s*(?:" + DATE + r")[\.\s]*)?"
    ),
    fn_lambda_double, None),
    # "TVR ..." header followed by the "mensagem nº .../yyyy" and "aviso nº .../yyyy" references.
    (regex.compile(
        r"(?<=^.{,250}?)(TVR\s*" + DATE_AND_ID + DEPT_EXTENSION + ")"
        r"\s*((?:mensagem|msc[\s\.]*)\s*" + NRO + r"[0-9\s]+/\s*[0-9]{4})" +
        r"\s*((?:aviso|av[\s\.]*)\s*" + NRO + r"[0-9\s]+/\s*[0-9]{4}" +
        r"(?:\s*[-–]\s*C\s*\.\s*Civil)?)", regex.IGNORECASE),
    fn_lambda_triple, None),
    # "PROJETO DE <kind> nº ..." title in uppercase near the start of the text.
    # "CONSTITUI?CIONAL" accepts the correct spelling "CONSTITUCIONAL"; the former
    # literal only matched the misspelling "CONSTITUICIONAL", which is still accepted.
    (regex.compile(
        r"(?<=^.{,100}?)(PROJETO\s*DE)(\s*" +
        r"(?:" +
        r"LEI|" +
        r"DECRETO\s*LEGISLATIVO|" +
        r"RESOLU[ÇC][AÃ]O|" +
        r"EMENDA\s*CONSTITUI?CIONAL|" +
        r"EMENDA\s*[AÁÀ]\s*CONSTITUI[CÇ][AÃ]O|" +
        r"MEDIDA\s*PROVIS[OÓ]RIA"
        r")\s*" +
        f"(?:{NRO})?" +
        r"\s*[\s" + UPPERCASE_LETTERS_OR_NUM + r"]{,150}?" +
        r"(?=(?:[OA]\s+)?[\." + UPPERCASE_LETTERS + "][a-z])" +
        r")"),
    lambda symb, deb: f" {symb} {deb} " + MARKER_INTENDED_CORRUPTION + r"\1" + MARKER_INTENDED_CORRUPTION + r"\2" + f" {symb} {deb} ", None
    ),
    # Dotted number (e.g. "1.2") right after an uppercase word of 3+ letters.
    (regex.compile(r"(?<=[" + UPPERCASE_LETTERS + r"]{3,}\s+)([0-9]{1,2}\s*\.\s*[0-9]+)"),
    lambda symb, deb: f" {symb} {deb} " + r"\1", None
    ),
    # Telephone / fax numbers are noise; afterwards every parenthesis in the text gets
    # the intended-corruption marker appended.
    (regex.compile(
        r"(?<=\s)((?:Tel(?:efone)?|Fax)\s*:" +
        r"\s*(?:\(\s*[0-9]{2}\s*\))?\s*[0-9]{4}\s*[-–\.]?\s*[0-9]{4})", regex.IGNORECASE),
    lambda symb_start, symb_end, deb: f" {symb_start} {deb} " + r"\1" + f" {symb_end} {deb}",
    lambda text: text.replace(")", ")" + MARKER_INTENDED_CORRUPTION).replace("(", "(" + MARKER_INTENDED_CORRUPTION)
    ),
    # Postal codes ("CEP: 70.160-900") are noise; same parenthesis post-processing as above.
    (regex.compile(r"(?<=\s)(CEP\s*:\s*[0-9]{2}[\s\.]*[0-9]{3}\s*[-–]\s*[0-9]{3})"),
     lambda symb_start, symb_end, deb: f" {symb_start} {deb} " + r"\1" + f" {symb_end} {deb}",
     lambda text: text.replace(")", ")" + MARKER_INTENDED_CORRUPTION).replace("(", "(" + MARKER_INTENDED_CORRUPTION)
    ),
)


# Patterns whose group 1 gets a valid marker both before and after it.
# The position of each entry is the index shown in the "<i>_PRE_POS" debug tags.
# The "Mensagem" fragment is now raw: its `\s` was an invalid escape sequence in
# a non-raw literal (the resulting pattern text is unchanged).
RE_PRE_POST_BLOCKS = (
    # "ACORDO DE ..." uppercase title, up to the first capitalised regular word.
    regex.compile(
        r"(ACORDO\s*DE[-,"
        + UPPERCASE_LETTERS_OR_NUM
        + r"\s]+)(?=(?:[OA]\s+)?["
        + UPPERCASE_LETTERS
        + r"][a-z])"
    ),
    # Commission name within the first 250 characters, before any parenthesis.
    regex.compile(r"(?<=^[^\()]{,250}?)(" + COMMISSIONS + r")", regex.IGNORECASE | regex.VERBOSE),
    regex.compile(r"(O\s*Congresso\s*Nacional\s*decreta\s*:)", regex.IGNORECASE),
    # Uppercase institution names, unless followed by "decreta".
    regex.compile(r"(C[ÂA]MARA\s*DOS\s*DEPUTADOS|CONGRESSO\s*NACIONAL)(?!\s*[dD][eE][cC][rR][eE][tT][aA])"),
    regex.compile(r"(A\s*C[aâ]mara\s*dos\s+deputados\s*decreta\s*:)", regex.IGNORECASE),
    # "Projeto de Lei" followed by its author.
    regex.compile(r"(?<=^.{,250}?)(Projeto\s*de\s*Lei\s*" + DEPT_EXTENSION + r")", regex.IGNORECASE),
    # "Projeto de Decreto Legislativo / Resolução" followed by author or identifier.
    regex.compile(
        r"(?<=^.{,250}?)(Projeto\s*de\s*(?:Decreto\s*Legislativo|Resolu[cç][aã]o)\s*" +
        f"(?:{DEPT_EXTENSION}|{DATE_AND_ID})" +
        r")",
        regex.IGNORECASE,
    ),
    # "Mensagem nº ..." before any parenthesis.
    regex.compile(
        r"(?<=^[^\(]{,250}?)(Mensagem\s*" + DATE_AND_ID + r"\s*[0-9][0-9\s]*)",
        regex.IGNORECASE,
    ),
    # "Proposta de emenda constitucional / à constituição ..." up to its author.
    regex.compile(
        r"(?<=^.{,250}?)(Proposta\s*de\s*emenda\s*(?:cons?titucional|[aàá]\s*constitui[çc][ãa]o).*?" +
        f"(?:{DEPT_EXTENSION})" +
        r")",
        regex.IGNORECASE,
    ),
    # "Capítulo <roman>" and its uppercase title, up to the next valid marker or "Art".
    regex.compile(
        r"(cap[ií]tulo\s*" + f"(?:{VALID_ROMAN_NUM})" +
        r"(?:[-–\s" + UPPERCASE_LETTERS_OR_NUM + r"]|" +
        f"{MARKER_NOISE_END}|{MARKER_NOISE_START}" +
        r")+?" +
        f"(?=(?:{MARKER_VALID}|Art)))",
        regex.IGNORECASE,
    ),
    # "Título <roman>" and its uppercase title, up to the next valid marker or "capítulo".
    regex.compile(
        r"(t[ií]tulo\s*" + f"(?:{VALID_ROMAN_NUM})" +
        r"(?:[-–\s" + UPPERCASE_LETTERS_OR_NUM + r"]|" +
        f"{MARKER_NOISE_END}|{MARKER_NOISE_START}" +
        r")+?" +
        f"(?=(?:{MARKER_VALID}|cap[íi]tulo)))",
        regex.IGNORECASE,
    ),
)

def regex_legal_item_anymatch(text: str, debug: bool = False) -> str:
    """Insert segment and noise markers into ``text`` using the rule tables.

    The four rule groups are applied in a fixed order: noise blocks, special
    rules, pre-blocks and pre/post-blocks. With ``debug`` enabled every marker
    is followed by a tag "<rule index>_<group name>".
    """
    first_index = 0

    def tag_for(index, group):
        # Debug tag appended after each marker; empty when not debugging.
        return f"{index}_{group}" if debug else ""

    # 1) Noise: group 1 is enclosed between the noise start and end markers.
    for index, noise_rule in enumerate(RE_NOISE_BLOCKS, first_index):
        tag = tag_for(index, "NOISE")
        opening = f" {MARKER_NOISE_START} {tag} "
        closing = f" {MARKER_NOISE_END} {tag} "
        text = noise_rule.sub(opening + r"\1" + closing, text, concurrent=True)

    # 2) Special rules: each builds its own replacement template. Builders that
    #    reject the two-argument call are retried with the noise marker pair.
    for index, (special_rule, build_template, post_process) in enumerate(RE_SPECIAL, first_index):
        tag = tag_for(index, "SPECIAL")
        try:
            template = build_template(MARKER_VALID, tag)

        except TypeError:
            template = build_template(MARKER_NOISE_START, MARKER_NOISE_END, tag)

        text = special_rule.sub(template, text, concurrent=True)

        if post_process is not None:
            text = post_process(text)

    # 3) Pre-blocks: zero-width matches replaced by a single valid marker.
    for index, pre_rule in enumerate(RE_PRE_BLOCKS, first_index):
        tag = tag_for(index, "PRE")
        text = pre_rule.sub(f" {MARKER_VALID} {tag} ", text, concurrent=True)

    # 4) Pre/post-blocks: group 1 gets a valid marker on both sides.
    for index, pre_post_rule in enumerate(RE_PRE_POST_BLOCKS, first_index):
        tag = tag_for(index, "PRE_POS")
        fence = f" {MARKER_VALID} {tag} "
        text = pre_post_rule.sub(fence + r"\1" + fence, text, concurrent=True)

    return text

def preprocess_instance(item, ind, print_preprocessed: bool = False, debug: bool = False):
    """Turn one document into word tokens with one integer label per token.

    The text is preprocessed by ``seg_model.preprocess_legal_text``, annotated
    with marker symbols by ``regex_legal_item_anymatch`` and word-tokenized.
    Marker tokens are then removed from the token list and their
    ``SPECIAL_SYMBOLS`` value becomes the label of the token that follows them;
    every other token is labelled 0. Inconsistent noise labels are cleaned up
    afterwards.

    Parameters
    ----------
    item : mapping
        Must provide the document under the key ``"text"``.
    ind : int
        Index of the document; returned, as a string, under ``"id"``.
    print_preprocessed : bool
        When true, print the marker-annotated text.
    debug : bool
        Forwarded to ``regex_legal_item_anymatch``.

    Returns
    -------
    dict
        ``{"id": str(ind), "labels": list[int], "tokens": list[str]}`` with
        ``labels`` and ``tokens`` of equal length (both empty for a text that
        yields no tokens).
    """
    preprocessed_text = seg_model.preprocess_legal_text(item["text"])
    preprocessed_text = regex_legal_item_anymatch(preprocessed_text, debug=debug)
    preprocessed_text = preprocessed_text.replace(MARKER_INTENDED_CORRUPTION, "")
    tokens = nltk.tokenize.word_tokenize(preprocessed_text, language="portuguese")

    if print_preprocessed:
        print(colorama.Fore.WHITE, colorama.Style.DIM, preprocessed_text, colorama.Style.RESET_ALL, sep="")

    labels = [0] * len(tokens)

    # Remove marker tokens, moving their meaning onto the label of the next token.
    # The loop stops one short of the end so that `labels[i]` is valid after a pop.
    i = 0
    while i < len(tokens) - 1:
        if tokens[i] in SPECIAL_SYMBOLS:
            cur_token = tokens.pop(i)
            cur_label = labels.pop(i)

            # A valid marker directly after a noise-start marker is dropped.
            if cur_label == SPECIAL_SYMBOLS[MARKER_NOISE_START] and cur_token == MARKER_VALID:
                continue

            # A noise-start marker directly after a noise-end marker cancels out.
            if cur_label == SPECIAL_SYMBOLS[MARKER_NOISE_END] and cur_token == MARKER_NOISE_START:
                labels[i] = 0
            else:
                labels[i] = SPECIAL_SYMBOLS[cur_token]

            continue

        i += 1

    # The first token never starts a new segment.
    if labels and labels[0] == SPECIAL_SYMBOLS[MARKER_VALID]:
        labels[0] = 0

    # A marker left as the very last token has nothing to label.
    if tokens and tokens[-1] in SPECIAL_SYMBOLS:
        labels.pop()
        tokens.pop()

    maybe_erase_pool = []
    noise_on = False

    # A noise end immediately followed by a noise start is one continuous span.
    for i in range(len(labels) - 1):
        if labels[i] == SPECIAL_SYMBOLS[MARKER_NOISE_END] and labels[i + 1] == SPECIAL_SYMBOLS[MARKER_NOISE_START]:
            labels[i] = labels[i + 1] = 0

    # Reaching a noise end clears every positive label collected since the last
    # noise start; the noise-end label itself joins the pool afterwards.
    for i in range(len(labels)):
        if labels[i] == SPECIAL_SYMBOLS[MARKER_NOISE_START]:
            maybe_erase_pool.clear()
            continue

        if labels[i] == SPECIAL_SYMBOLS[MARKER_NOISE_END]:
            while maybe_erase_pool:
                # Dedicated name: reusing `ind` here used to overwrite the
                # document index that is returned as "id".
                erase_idx = maybe_erase_pool.pop()
                labels[erase_idx] = 0

        if labels[i] > 0:
            maybe_erase_pool.append(i)

    # Drop unmatched noise labels: a start while noise is already open, or an
    # end while no noise is open. A valid label closes an open noise span.
    for i in range(len(labels)):
        if labels[i] == SPECIAL_SYMBOLS[MARKER_NOISE_START]:
            if noise_on:
                labels[i] = 0
            else:
                noise_on = True

        elif labels[i] == SPECIAL_SYMBOLS[MARKER_NOISE_END]:
            if noise_on:
                noise_on = False
            else:
                labels[i] = 0

        elif labels[i] == SPECIAL_SYMBOLS[MARKER_VALID]:
            noise_on = False

    ret = {
        "id": str(ind),
        "labels": labels,
        "tokens": tokens,
    }

    return ret

# Ad-hoc smoke check: run the preprocessing on a hand-picked excerpt with printing and
# debug tags enabled, to eyeball where the markers land.
auxaux = " Programa. ” ( NR ) Art . 11 . A Lei no 7.827 , de 27 de setembro de 1989 , passa a vigorar com as seguintes alterações : “ Art . 15 . .......................................................................... ............................................................................................... VI- exercer outras atividades inerentes à aplicação dos recursos , à recuperação dos créditos , inclusive nos termos definidos nos arts . 15-B , 15-C e 15-D , e à renegociação de dívidas , de acordo com as condições estabelecidas pelo Conselho Monetário Nacional . § 1o O Conselho Monetário Nacional , por meio de proposta do"
try:
    _=preprocess_instance({"text": auxaux}, -1, True, True)
except NameError:
    # NOTE(review): presumably tolerates running this cell before `seg_model` exists;
    # it also hides any other NameError raised inside the preprocessing.
    pass

[37m[2mPrograma. ” ( NR ) Art . 11 . A Lei no 7.827 , de 27 de setembro de 1989 , passa a vigorar com as seguintes alterações : “ ✓ 1_PRE  Art . 15 . .......................................................................... ............................................................................................... ✓ 6_PRE  VI- exercer outras atividades inerentes à aplicação dos recursos , à recuperação dos créditos , inclusive nos termos definidos nos arts . 15-B , 15-C e 15-D , e à renegociação de dívidas , de acordo com as condições estabelecidas pelo Conselho Monetário Nacional . ✓ 0_PRE  § 1o O Conselho Monetário Nacional , por meio de proposta do[0m


In [85]:
# Read the slice [DATASET_ROW_START, DATASET_ROW_END] of the raw text file as a
# header-less, single-column ("text") CSV dataset.
df = datasets.load_dataset(
    "csv",
    data_files=["../data/content.txt"],
    header=None,
    names=["text"],
    cache_dir="../cache/datasets",
    skiprows=DATASET_ROW_START,
    nrows=DATASET_ROW_END - DATASET_ROW_START + 1,
)

# "JUSTIFICATIVA" / "JUSTIFICAÇÃO" heading, with optional blanks between letters.
# The capitalised (non-uppercase) spellings only count when followed by
# whitespace and an uppercase ASCII letter.
RE_JUSTIFICATIVA = regex.compile(
    r"\s*(?:" +
    r"J\s*U\s*S\s*T\s*I\s*F\s*I\s*C\s*A\s*T\s*I\s*V\s*A|" +
    r"J\s*u\s*s\s*t\s*i\s*f\s*i\s*c\s*a\s*t\s*i\s*v\s*a\s+(?=[A-Z])|" +
    r"J\s*U\s*S\s*T\s*I\s*F\s*I\s*C\s*A\s*[CÇ]\s*[AÃ]\s*O|" +
    r"J\s*u\s*s\s*t\s*i\s*f\s*i\s*c\s*a\s*[cç]\s*[aã]\s*o\s+(?=[A-Z])" +
    r")"
)

# Uppercase "ANEXO" heading, with optional blanks between letters.
RE_ANEXO = regex.compile(r"\s*A\s*N\s*E\s*X\s*O")

# Keep only string rows whose length is within [128, 600000] characters.
df = df.filter(lambda item: isinstance(item["text"], str) and 128 <= len(item["text"]) <= 600000)
# Keep only the part of each document that precedes the first justification heading,
# then only the part that precedes the first "ANEXO" heading.
# NOTE(review): the length filter runs before these cuts, so a document may end up
# shorter than 128 characters (or empty) afterwards.
df = df.map(lambda item: {"text": RE_JUSTIFICATIVA.split(item["text"])[0]})
df = df.map(lambda item: {"text": RE_ANEXO.split(item["text"])[0]})

# Replace the "text" column by the "id"/"labels"/"tokens" produced by preprocess_instance.
df = df.map(preprocess_instance, with_indices=True, num_proc=10, remove_columns="text")

# Labels were regenerated: make the test cell re-run the registered test cases once.
rerun_tests = True

Using custom data configuration default-dcab5c59214e4eae
Reusing dataset csv (../cache/datasets/csv/default-dcab5c59214e4eae/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at ../cache/datasets/csv/default-dcab5c59214e4eae/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-3d01d6dade2f24bc.arrow
Loading cached processed dataset at ../cache/datasets/csv/default-dcab5c59214e4eae/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-43b5ead514cf3478.arrow
Loading cached processed dataset at ../cache/datasets/csv/default-dcab5c59214e4eae/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-e1f3ebb3dc770565.arrow


In [27]:
# Number of rows per split after filtering and preprocessing.
print(df.num_rows)

{'train': 9999}


In [86]:
# id_ = 6408
# print(tests.TEST_CASES[id_])
# tests.print_results(df, id_, print_full_text=True)
# Re-run the registered test cases once after the labels were regenerated.
if rerun_tests:
    rerun_tests = False
    tests.run_tests(df["train"]["labels"])
    print("\n\n")

# Start from a fixed document; if it is already registered as a test case, draw
# random documents until an unregistered one is found.
document_idx = 3051
while tests.test_case_exists(document_idx):
    # randrange excludes its upper bound, so the index is always a valid row.
    # The former randint(0, 1 + num_rows) is inclusive on both ends and could
    # return num_rows or num_rows + 1.
    document_idx = random.randrange(df["train"].num_rows)

print(colorama.Fore.YELLOW + "Chosen id:" + colorama.Fore.RESET, document_idx, end="\n\n")

# Show the segmentation of the chosen document and register it as a test case
# only on an explicit "y" answer.
expected_test_case_values = tests.print_results(df, document_idx, print_full_text=True)
print("Is it correct? [y/N]:", end=" ")
inp = input()
if inp == "y":
    tests.update_test_case(document_idx, expected_test_case_values)
    print("Added to test cases.")

Correct proportion: 100.00% (3 of 3)



[33mChosen id:[39m 3051

[37m[2mMEDIDA PROVISÓRIA Nº 581 , DE 20 DE SETEMBRO DE 2012 . Dispõe sobre o Fundo de Desenvolvimento do Centro-Oeste - FDCO ; autoriza a União a conceder subvenção econômica às instituições financeiras oficiais federais , sob a forma de equalização de taxa de juros nas operações de crédito para investimentos no âmbito do FDCO ; altera as Leis nº 7.827 , de 27 de setembro de 1989 , e nº 10.177 , de 12 de janeiro de 2001 , que tratam das operações com recursos dos Fundos Constitucionais de Financiamento do Norte , do Nordeste e do Centro-Oeste ; constitui fonte adicional de recursos para ampliação de limites operacionais da Caixa Econômica Federal e do Banco do Brasil S.A. , e dá outras providências . A PRESIDENTA DA REPÚBLICA , no uso da atribuição que lhe confere o art . 62 da Constituição , adota a seguinte Medida Provisória , com força de lei : Art . 1o O Fundo de Desenvolvimento do Centro-Oeste - FDCO terá como ag

Is it correct? [y/N]: N


In [78]:
# Write the registered test cases to TEST_CASE_URI.
tests.dump_registered_cases(test_cases_uri=TEST_CASE_URI)

Wrote 2 test cases at './test_cases/60001_70000_registered_test_cases.csv'.


In [None]:
# Display the label sequences of the whole train split (one list per document).
df["train"]["labels"]

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split documents and align word labels to sub-tokens.

    ``examples["tokens"]`` is passed to ``seg_model.tokenizer`` as already split
    words (truncated to 512 positions). For every document, the first sub-token
    of each word receives that word's label from ``examples["labels"]``; special
    tokens and the remaining sub-tokens of a word receive -100.

    Returns the tokenizer output with the aligned labels stored under ``"labels"``.
    """
    # source: https://huggingface.co/docs/transformers/custom_datasets#preprocess
    encoded_batch = seg_model.tokenizer(
        examples["tokens"],
        truncation=True,
        max_length=512,
        is_split_into_words=True,
    )

    def align(word_ids, word_labels):
        # One label per sub-token; -100 everywhere except on the first piece of a word.
        aligned = []
        last_word = None
        for word in word_ids:
            if word is None or word == last_word:
                aligned.append(-100)
            else:
                aligned.append(word_labels[word])
            last_word = word
        return aligned

    encoded_batch["labels"] = [
        # word_ids maps every sub-token of the document to its source word (None for special tokens).
        align(encoded_batch.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["labels"])
    ]

    return encoded_batch


# Tokenize every document in batches and attach the sub-token aligned labels.
df_tokenized = df.map(tokenize_and_align_labels, batched=True, num_proc=4)

In [None]:
# Hold out 20% of the documents (shuffled with a fixed seed), then cut that held-out
# part in half, without reshuffling, into the evaluation and test splits.
df_tokenized_train_eval_test = df_tokenized["train"].train_test_split(test_size=0.2, shuffle=True, seed=16)
df_tokenized_test_eval = df_tokenized_train_eval_test["test"].train_test_split(test_size=0.5, shuffle=False)
df_tokenized_split = datasets.DatasetDict({
    "train": df_tokenized_train_eval_test["train"],
    "eval": df_tokenized_test_eval["train"],
    "test": df_tokenized_test_eval["test"],
})
# df_tokenized_split.save_to_disk("../data/df_tokenized_split")
df_tokenized_split

In [None]:
# Inspect the column schema of the train split.
df_tokenized_split["train"].features

In [None]:
# Spot-check the label sequence of the document at row 49.
print(df["train"]["labels"][49])