In [160]:
import os
import random
import collections

import transformers
import datasets
import nltk
import tokenizers
import regex
import colorama

import segmentador
import tests
from config import *


# Development switch: when true, a fixed dataset window is selected and the
# registered test cases for that window are loaded.
DEV_RUN = True

# Seed Python's RNG so random draws made later in the notebook are repeatable.
random.seed(198)

# Show which symbols are used as markers in this session.
print("Marker symbol (valid):", MARKER_VALID)
print("Marker symbol (noise):", MARKER_NOISE_START, MARKER_NOISE_END)

if DEV_RUN:
    # Dataset window under inspection; it also names the test-case file.
    DATASET_ROW_START, DATASET_ROW_END = 100001, 110000
    TESTS_DIR = "test_cases"
    TEST_CASE_URI = os.path.join(
        ".",
        TESTS_DIR,
        f"{DATASET_ROW_START}_{DATASET_ROW_END}_registered_test_cases.csv",
    )

    tests.load_registered_cases(test_cases_uri=TEST_CASE_URI)
else:
    # No window restriction outside development runs.
    DATASET_ROW_START = DATASET_ROW_END = None

Marker symbol (valid): ✓
Marker symbol (noise): ❌s__ ❌e__
No test cases found at './test_cases/100001_110000_registered_test_cases.csv'.


In [3]:
# Segmenter instance used later for text preprocessing. `local_files_only`
# follows DEV_RUN, so development runs ask for locally available files only.
seg_model = segmentador.Segmenter(local_files_only=DEV_RUN)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model check

In [213]:
# Matches one debug tag of the form "<1-3 digits>_<UPPERCASE WORD>" (for example
# "12_SPECIAL"), with optional surrounding whitespace. Raw string: "\s" is not a
# valid escape sequence in a normal string literal.
DEBUG_PATTERN = r"(?:\s*[0-9]{1,3}_[A-Z]{1,30}\s*)"
# Regex alternation (one non-capturing group) over the four marker symbols.
# NOTE(review): this is a plain string, not a collection of markers. Iterating
# over it yields single characters (including "(", "?", ":", "|", ")"), so any
# code that loops over it expecting whole markers should be double-checked.
ALL_SPECIAL_MARKERS = f"(?:{MARKER_INTENDED_CORRUPTION}|{MARKER_NOISE_START}|{MARKER_NOISE_END}|{MARKER_VALID})"

# One regex per standing-commission name. Accented letters are written as
# two-way classes (e.g. [CÇ], [AÃ]) so that unaccented spellings match as well;
# the connectives between words ("E", "DE", "DA", commas, spaces) are optional.
_COMMISSION_NAMES = (
    r"AGRICULTURA[,\s]*PECU[AÁ]RIA[,\s]*ABASTECIMENTO[E\s]*(?:DES\s*\.|DESENVOLVIMENTO)\s*RURAL",
    r"CI[EÊ]NCIA[E\s]*TECNOLOGIA[,\s]*COMUNICA[CÇ][AÃ]O(?:[E\s]|DA)*INFORM[AÁ]TICA",
    r"CONSTITUI[CÇ][AÃ]O[E\s]*JUSTI[CÇ]A[E\s]*DE\s*CIDADANIA",
    r"CULTURA",
    r"DEFESA\s*(?:DO|AO)\s*CONSUMIDOR",
    r"(?:DES\s*\.|DESENVOLVIMENTO)\s*ECON[OÔ]MICO[,\s]*IND[UÚ]STRIA[,\s]*COM[EÉ]RCIO(?:[E\s]|DE)*SERVI[CÇ]OS",
    r"(?:DES\s*\.|DESENVOLVIMENTO)\s*URBANO",
    r"DIREITOS\s*DA\s*MULHER",
    r"DIREITOS\s*DA\s*PESSOA\s*IDOSA",
    r"DIREITOS\s*DAS\s*PESSOAS\s*COM\s*DEFICI[EÊ]NCIA",
    r"DIREITOS\s*HUMANOS(?:[E\s]|DAS)*MINORIAS",
    r"EDUCA[CÇ][AÃ]O",
    r"ESPORTE",
    r"FINAN[CÇ]AS[E\s]*TRIBUTA[CÇ][AÃ]O",
    r"FISCALIZA[CÇ][AÃ]O\s*FINANCEIRA[E\s]*CONTROLE",
    r"INTEGRA[CÇ][AÃ]O\s*NACIONAL[,\s]*(?:DES\s*\.|DESENVOLVIMENTO)\s*REGIONAL(?:[E\s]|DA)*AMAZ[OÔ]NIA",
    r"LEGISLA[CÇ][AÃ]O\s*PARTICIPATIVA",
    r"(?:MEIO\s*)?AMBIENTE[E\s]*DESENVOLVIMENTO\s*SUSTENT[AÁ]VEL",
    r"MINAS(?:[E\s]|DA)*ENERGIA",
    r"RELA[CÇ][OÕ]ES\s*EXTERIORES(?:[E\s]|DE)*\s*DEFESA\s*NACIONAL",
    r"SEGURAN[CÇ]A\s*P[UÚ]BLICA[E\s]*COMBATE\s*AO\s*CRIME\s*ORGANIZADO",
    # Accent made optional, consistent with every other entry in this list.
    r"SEGURIDADE\s*SOCIAL(?:[E\s]|DA)*FAM[IÍ]LIA",
    r"TRABALHO[,\s]*ADMINISTRA[CÇ][AÃ]O[E\s]*SERVI[CÇ]O\s*P[UÚ]BLICO",
    r"TURISMO",
    r"VIA[CÇ][AÃ]O[E\s]*TRANSPORTES",
    # Accent made optional, consistent with every other entry in this list.
    r"INQU[EÉ]RITO",
)

# Non-capturing alternation over all commission names. Built with join() rather
# than by stripping whitespace from a multi-line literal, so a literal space in
# one of the patterns can never be removed by accident.
COMMISSION_LIST = "(?:" + "|".join(_COMMISSION_NAMES) + ")"

# "COMISSÃO" followed by one or more commission names, each introduced by at
# least one connective ("DE", "DO", "DA(S)", "E", ",", ";", "PARLAMENTAR").
COMMISSIONS = (
    r"COMISS[AÃ]O\s*" +
    r"(?:" +
    r"(?:(?:D[EOA]S?|[,;\s]*E|[;,]\s*E?|PARLAMENTAR)\s*)+" +
    COMMISSION_LIST +
    r"\s*)+"
)


class DetectRecurrentNoise:
    """Mark noise that recurs inside a single document.

    The class is used like a compiled pattern: it exposes a ``sub`` classmethod
    taking a replacement template and the text. The template is expected to
    reference the detected noise as ``\\1``. Five detectors run in sequence:
    barcodes, preamble text, "Sala das sessões" codes, repeated
    "CÂMARA DOS DEPUTADOS" headers and repeated commission names.
    """

    # "*CODE*" barcode: uppercase letters, digits and whitespace between asterisks.
    RE_BARCODE = regex.compile(r"\*([\sA-Z0-9]+)\*")
    # Up to 60 leading characters (group 1) placed right before
    # "Câmara dos deputados Proj..." / "... Req..." at the start of the text.
    RE_PREAMBLE = regex.compile(
        r"^\s*(.{,60}?)[\s0-9]*" +
        r"(?=C[aâ]mara\s*dos\s*deputados\s*(Proj|Req))",
        regex.IGNORECASE,
    )
    # Uppercase "CÂMARA DOS DEPUTADOS" (three groups, one per word) not followed
    # by a dash, period or closing parenthesis.
    RE_CAMARA_REPEATED = regex.compile(
        r"(?:(C[AÂ]MARA\s*)(DOS\s*)(DEPUTADOS[\s0-9]*(?!\s*[-–\.\)])))",
    )
    RE_COMMISSIONS_REPEATED = regex.compile(
        f"({COMMISSIONS})"
    )
    # "<digits>_<digits>" code found up to 150 characters after "Sala das sessões".
    RE_SALA_DAS_SESSOES_CODE = regex.compile(
        r"(?<=Sala\s*das\s*sess[oõ]es\s*.{,150}?)([0-9]{1,5}\s*_\s*(?:" +
        MARKER_NOISE_START +
        r")?\s*[0-9]{1,5})",
        regex.IGNORECASE,
    )
    RE_BLANK_SPACES = regex.compile(r"\s+")

    @classmethod
    def _detect_barcode(cls, subpattern, text):
        """Mark every occurrence of each distinct ``*CODE*`` barcode found in ``text``.

        For each code a pattern is built that tolerates whitespace, a noise-start
        marker and debug tags between consecutive characters, an optional
        repetition of the code after the closing asterisk, and an optional
        "CÂMARA DOS DEPUTADOS" header before and/or after the barcode.
        """
        pseudo_patterns = cls.RE_BARCODE.findall(text)

        if not pseudo_patterns:
            return text

        # Sorted for a deterministic processing order.
        pseudo_patterns = sorted(set(pseudo_patterns))

        for pseudo_pattern in pseudo_patterns:
            # One list element per barcode character; the empty strings at both
            # ends make join() also emit the separator before the first and
            # after the last character.
            pattern = list(cls.RE_BLANK_SPACES.sub("", pseudo_pattern))
            pattern.append("")
            pattern.insert(0, "")
            pattern = (r"(?:\s*(?:" + MARKER_NOISE_START + r")?\s*" + DEBUG_PATTERN + r"*\s*)").join(pattern)
            # Groups 1-4: optional leading header (three words + page digits).
            # Group 5: the barcode itself. Groups 6-8: optional trailing header.
            pattern = (
                r"(?:(\s*C[AÂ]MARA\s*)(DOS\s*)(DEPUTADOS\s*)([0-9][\s0-9]*)?)?" +
                r"(\*" +
                pattern +
                r"\*\s*" +
                r"(?:" + MARKER_NOISE_START + r"\s*" + DEBUG_PATTERN + r"*)?\s*" +
                f"(?:{pattern})?" +
                r")" +
                r"(?:(\s*C[AÂ]MARA\s*)(DOS\s*)(DEPUTADOS))?"
            )

            # The caller's template refers to the noise as \1; here the barcode
            # is group 5, and the header words are re-emitted around it joined by
            # the intended-corruption marker.
            mod_subpattern = (
                f" {MARKER_NOISE_START} " +
                MARKER_INTENDED_CORRUPTION.join([r"\1", r"\2", r"\3", r"\4"]) +
                subpattern.replace(r"\1", r"\5") +
                MARKER_INTENDED_CORRUPTION.join([r"\6", r"\7", r"\8"])
            )

            text = regex.sub(pattern, mod_subpattern, text)

        return text

    @classmethod
    def _detect_preamble_noise(cls, subpattern, text):
        """Mark every repetition of the short preamble that opens the document.

        The preamble is whatever RE_PREAMBLE captures at the start of ``text``.
        It is searched for throughout the text with flexible whitespace between
        its space-separated chunks.
        """
        preamble = cls.RE_PREAMBLE.match(text)

        if not preamble or not preamble.group(1).strip():
            return text

        # Escape each chunk BEFORE joining: escaping the joined string would also
        # escape the "\s*" joiners and turn them into literal text, so a preamble
        # containing a space could never be found again.
        preamble_content = r"\s*".join(
            regex.escape(chunk) for chunk in preamble.group(1).split(" ")
        )
        text = regex.sub(r"(\s*" + preamble_content + r"[\s\d]*)", subpattern, text)
        return text

    @classmethod
    def _detect_repeated_camara(cls, subpattern, text):
        """Mark "CÂMARA DOS DEPUTADOS" headers when they occur more than twice."""
        occurrences = cls.RE_CAMARA_REPEATED.findall(text)

        if len(occurrences) <= 2:
            return text

        # RE_CAMARA_REPEATED captures the three words separately; re-emit them
        # joined by the intended-corruption marker where the template had \1.
        mod_subpattern = subpattern.replace(
            r"\1",
            r"\1" +
            MARKER_INTENDED_CORRUPTION +
            r"\2" +
            MARKER_INTENDED_CORRUPTION +
            r"\3"
        )

        text = cls.RE_CAMARA_REPEATED.sub(mod_subpattern, text)

        return text

    @classmethod
    def _detect_repeated_commissions(cls, subpattern, text):
        """Mark commission names that occur (after stripping) more than twice."""
        freqs = collections.Counter(map(str.strip, cls.RE_COMMISSIONS_REPEATED.findall(text)))

        for commission_name, freq in freqs.items():
            if freq <= 2:
                continue

            # Words of the name are re-joined with " <intended-corruption marker>".
            mod_subpattern = f" {MARKER_INTENDED_CORRUPTION}".join(cls.RE_BLANK_SPACES.split(commission_name))
            mod_subpattern = subpattern.replace(r"\1", mod_subpattern)

            # Literal replacement: the name is plain text here, not a regex.
            text = text.replace(commission_name, mod_subpattern)

        return text

    @classmethod
    def sub(cls, subpattern: str, text: str, *args, **kwargs):
        """Apply all detectors in order and return the marked text.

        Args:
            subpattern: replacement template that references the noise as ``\\1``.
            text: text to process.
            *args, **kwargs: accepted and ignored, so the call site can pass the
                same extra arguments it passes to compiled patterns.
        """
        text = cls._detect_barcode(subpattern, text)
        text = cls._detect_preamble_noise(subpattern, text)
        text = cls.RE_SALA_DAS_SESSOES_CODE.sub(subpattern, text)
        text = cls._detect_repeated_camara(subpattern, text)
        text = cls._detect_repeated_commissions(subpattern, text)
        return text


# Roman numeral: thousands, hundreds, tens, units. The units part has no empty
# alternative, so this pattern never matches the empty string.
VALID_ROMAN_NUM = "".join((
    r"M{0,3}",
    r"(?:C[MD]|D?C{0,3})",
    r"(?:X[CL]|L?X{0,3})",
    r"(?:I?X|I?V|V?I{1,3})",
))

# Short "nº"-like abbreviation: "n" followed by 1-3 of o/O/0/º/°/".".
NRO_SMALL = r"[nN]\s*[oO0º°\.]{1,3}"
# "número"/"nro"/"nº" spelled in several ways, or the short form when a digit follows.
NRO = "(?:" + "|".join((
    r"[nN](?:[uú]me)?(?:ro)?[\.\s]*[oO0º°]",
    NRO_SMALL + r"\s*(?=[0-9])",
)) + ")"

# Quote characters, usable inside a character class, and the ready-made class.
QUOTES = r"”“”\"'‘"
QUOTES_CLASS = "[" + QUOTES + "]"

# Character-class bodies (no brackets) for uppercase letters, with and without digits.
UPPERCASE_LETTERS = r"ÀÁÂÃÇÉÊẼÓÕÔÜÚÍA-Z"
UPPERCASE_LETTERS_OR_NUM = f"{UPPERCASE_LETTERS}0-9"

# "Anexo <roman numeral>" and "Gab."/"Gabinete" followed by 1-5 digits.
ANNEX = rf"(?:Anexo\s*(?:{VALID_ROMAN_NUM}))"
CABINET = r"(?:Gab(?:inete)?[\.\s]*[0-9]{1,5})"

# Filler tolerated between the pieces of the address line (at most 20 characters).
_ANNEX_CABINET_GAP = r"\s*.{,20}?\s*"
# "Câmara dos Deputados" followed by annex + cabinet in either order; the
# annex/cabinet part is group 1.
NOISE_ANNEX_CABINET = (
    r"C[âa]mara\s*dos\s*Deputados\s*.{,20}?\s*"
    + "("
    + "|".join((
        ANNEX + _ANNEX_CABINET_GAP + CABINET,
        CABINET + _ANNEX_CABINET_GAP + ANNEX,
    ))
    + ")"
)


class AgreementList:
    """Mark the items of a "Label: value" list of agreement/contract fields.

    Like a compiled pattern, the class offers ``sub(template, text)``. Only the
    first list found in the text is processed.
    """

    # Group 1: one known field label with optional surrounding whitespace.
    ITEMS = (
        r"(\s*(?:"
        + "|".join((
            r"(?:[ÓO]rg[aã]o\s*)?(?:Superior|[cC]oncedente|[cC]onve(?:nente|niada))",
            NRO + r"\s*(?:SIAFI|Original|Do\s*conv[eê]nio)",
            r"Valor\s*(?:do\s*conv[eê]nio)?",
            r"(?:In[ií]cio|Fim)\s*(?:d[ea]\s*vig[eê]ncia)?",
            r"Objeto",
            r"Conv[eê]nio",
            r"Processo",
            r"Total\s*de\s*itens\s*Licitados",
            r"Fundamento\s*legal",
            r"Contratada",
            r"Questionamentos",
            r"Justificativa",
        ))
        + r")\s*)"
    )
    # A label (group 1) followed by its colon (group 2).
    RE_ITEMS = regex.compile(ITEMS + r"(\s*:)", regex.IGNORECASE)
    # A run of at least three "label: value" items, each value being at most 150
    # characters that are not a colon.
    REG_GET_LIST = regex.compile(
        "".join((
            r"(",
            r"(?:", ITEMS, r":\s*[^:]{,150}?){2,10}",
            ITEMS, r":\s*[^:]{,150}",
            r")",
        )),
        regex.IGNORECASE,
    )

    @classmethod
    def sub(cls, subpattern, text: str, *args, **kwargs):
        """Apply ``subpattern`` to every label inside the first detected list.

        The template's ``\\1`` is expanded so that the label and its colon are
        emitted with the intended-corruption marker between them. Text outside
        the detected list is returned unchanged. Extra arguments are ignored.
        """
        found = cls.REG_GET_LIST.search(text)
        if found is None:
            return text

        begin, end = found.span()
        item_template = subpattern.replace(r"\1", r"\1" + f" {MARKER_INTENDED_CORRUPTION} " + r"\2")
        marked_slice = cls.RE_ITEMS.sub(item_template, text[begin:end])
        return f"{text[:begin]}{marked_slice}{text[end:]}"




# Alternation over document-type abbreviations. "PL" stays last so that the
# three-letter abbreviations starting with "PL" ("PLP", "PLV") are tried first.
# The duplicated "LDO" entry was removed; it had no effect on matching.
DOC_ABBVR = r"(?:" + "|".join((
    "COM", "DCR", "DEN", "DTQ", "DVS", "DVT", "EMC", "EMD", "EML", "LDO", "EMO",
    "EMP", "EMR", "ERD", "ESB", "EXP", "INA", "INC", "MPV", "MSC", "PAR", "PDC",
    "PEC", "PET", "PFC", "PLP", "PLV", "PRC", "PRF", "PRN", "PRO", "RCP", "REC",
    "REL", "REM", "REP", "REQ", "RIC", "RPR", "SBE", "SBT", "SDL", "SIT",
    "TCU", "SOA", "STF", "SUG", "SUM", "CCJ", "TER", "TVR", "VTS", "PL",
)) + r")"


# Acronyms of ministries and other bodies accepted after an "EMI" identifier.
_MINISTRY_ACRONYMS = (
    "MAPA",
    "MC",
    "MCTI",
    "MCom",
    "MinC",
    "MD",
    "MDR",
    "ME",
    "MEC",
    "MI",
    "MJSP",
    "MMA",
    "MME",
    "MMFDH",
    "MRE",
    "MS",
    "MTP",
    "MTur",
    "CGU",
    "SeGov",
    "SGPR",
    "CC",
    "GSI",
    "AGU",
    "MAER",
    "MESA",
    "MINTER",
    "MInfra",
    "MPA",
    "MPS",
    "SMPE",
    "SAE",
    "PR",
    "SEPPIR",
    "SNPM",
    "SRI",
    "SNPTA",
    "SAC",
)

# Regex alternation (unwrapped; embed it inside a group). Alternatives are
# ordered longest first because regex alternation takes the first alternative
# that matches: with "MC" listed before "MCTI", an unanchored match on "MCTI"
# would stop after "MC". sorted() is stable, so equally long acronyms keep
# their relative order from the tuple above.
MINISTRIES = "|".join(sorted(_MINISTRY_ACRONYMS, key=len, reverse=True))

# Text that, when it comes right before a number, means the number is a
# reference and not a list marker: "Art."/"Artigo(s)", a "nº"-like abbreviation,
# "$", or "pág.". Used inside negative look-behinds below, so it is left as an
# unwrapped alternation.
RAW_NUMBER_PREFIXES = (
    r"Art(?:igo)?s?\s?\.?\s?|" + NRO_SMALL + r"|\$|p[aá]g\s*\."
)

# Patterns for the opening of a legal item. The order matters: other definitions
# in this notebook address entries by position ([1] and [2]).
BASE_LEGAL_ITEMS = (
    # [0] "§ <number>"
    r"§\s*[0-9]+",
    # [1] "Art."/"Artigo(s)" followed by a number, one or two letters, "..." or "único"
    r"Art(?:igo)?s?\s*\.?\s*(?:(?:[-–º°0-9]+|[A-Z]{1,2})|\.{3}|[uú]nico)",
    # [2] a single letter or a 1-2 digit number closed by ")", preceded by "(",
    #     whitespace or a quote
    r"(?:\(\s*|\s+|" + QUOTES_CLASS + r")(?:[A-Za-z]|[0-9]{1,2})\s*\)",
    # [3] "parágrafo único" / "§ único"
    r"(?:par[áa]grafo|§)\s*[úu]nico",
    # [4] "parágrafo <n>:" / "§ <n>:" with an optional ordinal sign
    r"(?:par[áa]grafo|§)\s*[0-9]{1,2}[\so0º°]*:",
    # [5] "seção" / "subseção"
    r"(?:sub)?se[çc][ãa]o",
    # [6] roman numeral followed by a dash, ")" or "."
    r"\(?" + f"{VALID_ROMAN_NUM}" + r"\s*(?:[-–\)\.])",
    # [7] 1-2 digit number followed by a dash, ")" or a "." that does not continue
    #     into more digits; rejected after any RAW_NUMBER_PREFIXES text
    r"(?<!" + RAW_NUMBER_PREFIXES + r")\(?\s+[0-9]{1,2}[\s0oOº°]*(?:[-–\)]|\.(?![0-9]))",
    # [8] dotted numbering such as "1.2" or "1.2.3", not followed by more
    #     digits/dots ending in a comma; rejected after any RAW_NUMBER_PREFIXES text
    r"(?<!" + RAW_NUMBER_PREFIXES + r")\s+[0-9]{1,2}\s*(?:\.[0-9]+){1,2}(?![\.0-9]*,)",
)

# Alternation over month names, each accepted in its three-letter or full form.
# The joined pattern is upper-cased as a whole, so the resulting regex contains
# uppercase names only; lowercase months match only where the final regex is
# compiled with IGNORECASE.
# "fev" and "mar" now take the same optional long-form suffix as the other
# months (the "?" was missing, so only "FEVEREIRO" and "MARÇO" matched).
MONTHS = "|".join((
    r"[jJ]an(?:eiro)?",
    r"[fF]ev(?:ereiro)?",
    r"[mM]ar(?:[cç]o)?",
    r"[aA]br(?:il)?",
    r"[mM]ai(?:o)?",
    r"[jJ]un(?:ho)?",
    r"[jJ]ul(?:ho)?",
    r"[aA]go(?:sto)?",
    r"[sS]et(?:embro)?",
    r"[oO]ut(?:ubro)?",
    r"[nN]ov(?:embro)?",
    r"[dD]ez(?:embro)?",
)).upper()

# Date expression. Two alternatives:
#   1. one to three "de"/"em" connectives interleaved with digits, commas and
#      spaces, ending in a 4-digit year;
#   2. "<day> de <MONTH> de <year>", with <MONTH> taken from MONTHS.
# The alternation is NOT wrapped in a group: wrap it in (?:...) when embedding.
DATE = (
    r"[,\s]*(?:(?:de|em)[,0-9\s]*){1,3}[0-9]{4}|" +
    r"[,\s]*(?:de|em)?\s*[0-9]{,2}\s*(?:de|em)\s*(?:" + MONTHS + r")\s*(?:de|em)\s*[0-9]{4}"
)

# Same structure as DATE, but day, month and year may each be a run of
# underscores and/or periods (a blank left to be filled in).
# Also an unwrapped alternation.
DATE_OR_UNDERSCORES = (
    r"[,\s]*(?:(?:de|em)[,\.0-9\s]*){1,3}(?:[0-9]{4}|[\._]+)|" +
    r"[,\s]*(?:de|em)?\s*(?:[0-9]{,2}|[\._]+)\s*(?:de|em)\s*(?:" + MONTHS +
    r"|_+)\s*(?:de|em)\s*(?:[0-9]{4}|[\._]+)"
)

# Variant with uppercase connectives, produced by plain text replacement on the
# pattern source: every "em"/"de" substring of the pattern text is rewritten,
# not only the connectives.
UPPERCASE_DATE_OR_UNDERSCORES = DATE_OR_UNDERSCORES.replace("em", "EM").replace("de", "DE")

# At most 300 characters running up to the end of the text.
EOF = r".{,300}$"

# Either the tail of the text or a (possibly blank) date, as one group.
EOF_OR_DATE = (
    r"(?:" +
    EOF +
    r"|" +
    DATE_OR_UNDERSCORES + 
    r")"
)

# Tokens accepted before the "/" of an internal document code.
# Note that "(da[-–])?" is a capturing group.
_DOC_CODE_PREFIX_ALTERNATIVES = (
    r"030", r"Daniel", r"[eE]ss", r"Jaa", r"ac[fgp]", r"afpa", r"cmrv", r"(da[-–])?conle", r"[Cc]rps",
    r"dennn?er", r"dpsl?", r"drb", r"epo", r"faa", r"‘?[Gg]ab", r"gsl", r"jaa", r"jbs", r"kvp", r"lgl", r"mlcl?",
    r"mm", r"pnf", r"rpb", r"tksa", r"[Vv][Pp][Ll][cf]?", r"wgl",
)
RE_DOC_CODE_PREFIX = "(?:" + "|".join(_DOC_CODE_PREFIX_ALTERNATIVES) + ")"

# Document-type token that follows the "/".
RE_DOC_CODE_CORE = "(?:" + "|".join((
    "pls", "mpv", "plc", "pec", "pds", "plv", "prn", "plp", "pdl", "tema",
)) + ")"

# Zero or more trailing qualifiers, each optionally followed by whitespace.
_DOC_CODE_SUFFIX_ALTERNATIVES = (
    r"c(?:ompleme?ntar)?",
    r"eme(?:nda)?s?",
    r"rev(?:is)?",
    r"sub(?:st\.?(?:itutivo)?)?",
    r"sust",
    r"tt?",
)
RE_DOC_CODE_SUFFIX = "(?:(?:" + "|".join(_DOC_CODE_SUFFIX_ALTERNATIVES) + r")\s*?)*"

# Whole code as one capturing group: "<prefix>/<core><digits and dashes><suffixes>",
# required to start right after a whitespace character.
RE_DOC_CODE_FULL = "".join((
    r"(",
    r"(?<=\s)",
    RE_DOC_CODE_PREFIX,
    "/",
    RE_DOC_CODE_CORE,
    "(?:[-–0-9]+)",
    RE_DOC_CODE_SUFFIX,
    r")",
))

# Additional segment openers beyond the numbered legal items: closing formulas
# with place and date, bullet characters, and fixed opening or closing phrases.
EXTRA_LEGAL_ITEMS = (
    # "Sala das sessões / da comissão / de reuniões" + end of text or date
    r"Sala\s*d[ea]s?\s*(?:sess|comiss|reuni)(?:[õo]es|[ãa]o)" + EOF_OR_DATE,
    r"Senado\s*Federal\s*," + EOF_OR_DATE,
    r"C[aâ]mara\s*dos\s*Deputados\s*," + EOF_OR_DATE,
    # Raw string: "\s" is not a valid escape in a normal string literal.
    r"Bras[ií]lia\s*,\s*(?:" + DATE_OR_UNDERSCORES + r")\s*",
    r"•",
    # Private-use code point U+F0B7. Deliberately NOT a raw string: the escape
    # must be interpreted by Python.
    "\uF0B7",
    r"As?\s*mesas?\s*da\s*c[aâ]mara\s*dos\s*deputados[^:]{,300}?:",
    r"Atenciosamente\s*,",
)

# Patterns whose group 1 is wrapped between the noise-start and noise-end
# markers. The index in each comment is the entry's position in the tuple.
# (A commented-out "Documento eletrônico ... Apresentação" pattern used to open
# this tuple; a pattern of the same shape is present in RE_SPECIAL, so the dead
# copy was dropped.)
RE_NOISE_BLOCKS = (
    # 0: annex/cabinet address after "Câmara dos Deputados"
    regex.compile(f"({NOISE_ANNEX_CABINET})", regex.IGNORECASE),
    # 1: runs of 11+ digits that do not follow a "nº"-like abbreviation
    regex.compile(rf"(?<!{NRO}[_\s\.0-9]*)" + r"([0-9]{11,})"),
    # 2: long underscore rules (9+ underscores), possibly repeated
    regex.compile(r"(_{9,}\s*)+"),
    # 3: digits/whitespace at the very start or the very end of the text
    regex.compile(r"(^[\s0-9]+|(?:[0-9]+_+)?[\s0-9]+$)"),
    # 4: runs of leading/trailing characters that are not whitespace, letters,
    #    digits or marker characters (at the end, ".", ")" and "?" are kept too).
    # NOTE(review): ALL_SPECIAL_MARKERS is a string, so the generator below
    # yields every character of the pattern text (including "(", "?", ":", "|",
    # ")" and "_"), not the first character of each marker. Behaviour is left
    # unchanged here; confirm the intent before touching it.
    regex.compile(
        r"(^(?:\s*[^\s" + "".join(m[0] for m in ALL_SPECIAL_MARKERS) + UPPERCASE_LETTERS_OR_NUM + r"]\s*)+|" +
        r"(?:\s*[^\s\.\)\?" + "".join(m[0] for m in ALL_SPECIAL_MARKERS) + UPPERCASE_LETTERS_OR_NUM + r"]\s*)+$)",
        regex.IGNORECASE,
    ),
    # 5: e-mail addresses, with an optional "E-mail:" / "Endereço eletrônico:" label
    regex.compile(
        r"((?:(?:E[-–\s]*mails?|Endere[cç]os?\s*eletr[oô]nicos?)[\s:]*)?" +
        r"[-–a-zA-Z0-9\._]{,40}\s*@\s*(?:[a-zA-Z]{1,15}\.?){1,3})",
        regex.IGNORECASE,
    ),
    # 6: "INC nº <number>/<year>" identifiers
    regex.compile(r"(IN\s*C\s*" + NRO + r"\s*[\s0-9]{3,8}/\s*[\s0-9]{4,8})", regex.IGNORECASE),
    # 7-17: a stray number sitting between a sentence delimiter (optionally
    #       followed by "e"/"ou") and the opening of a legal item
    *[
        regex.compile(
            r"(?<=[:\?;\." + QUOTES + r"]\s*(?:e|ou)?\s*)([0-9]+)(?=\s*" + legal_item + r")",
            regex.IGNORECASE,
        )
        for legal_item in (*BASE_LEGAL_ITEMS, r"cap[ií]tulo", r"t[íi]tulo")
    ],
    # 18: CPI running header right after "CÂMARA DOS DEPUTADOS"
    regex.compile(
        r"((?<=C[AÂ]MARA\s*DOS\s*DEPUTADOS\s*)CPI\s*(?:da\s*Petrobr[áa]s)?\s*[-–]\s*" +
        r"(LEI\s*ROUANET|Relat[oó]rio\s*Final|EXPLORA[CÇ][AÃ]O\s*SEXUAL\s*DE\s*CRIAN[CÇ]AS\s*E\s*ADOLESCENTES))",
        regex.IGNORECASE,
    )
)

# Contexts after which a legal item may start: start of text, ";" (optionally
# followed by "e"/"ou"), ".", ":", "?", a "(NR)" / "(AC)" / "(...)" tag, or a quote.
STANDARD_PREFIXES = (
    r"(?:^|;(?:\s*e|\s*ou)?|[\.:\?]|\(\s*(?:NR|AC|\.{3,})\s*\)\s*|" +
    f"[{QUOTES}])"
)
# An already-marked noise span (noise-start marker ... noise-end marker, at most
# 300 characters in between, debug tags tolerated) that may sit between the
# prefix and the item.
# Raw f-strings: "\s" is not a valid escape in a normal string literal.
PREFIX_EXTENSIONS = (
    rf"(?:[\s{MARKER_INTENDED_CORRUPTION}]*" +
    MARKER_NOISE_START + r"\s*" + DEBUG_PATTERN +
    r"*.{,300}?" +
    MARKER_NOISE_END + r"\s*" + DEBUG_PATTERN +
    rf"*[\s{MARKER_INTENDED_CORRUPTION}]*)"
)

# Zero-width patterns: each matches the position between an accepted prefix and
# the item opening, so a marker can be inserted there.
RE_PRE_BLOCKS = tuple(
    regex.compile(rf"(?<={STANDARD_PREFIXES}{PREFIX_EXTENSIONS}?)(?=\s*{pattern})", regex.IGNORECASE)
    for pattern in (
        *BASE_LEGAL_ITEMS,
        *EXTRA_LEGAL_ITEMS,
        r"D[eê][-–]se\s*ao\s*Projeto\s*a\s*seguinte\s*reda[cç][aã]o\s*:",
    )
)

# Zero or more professional titles (judge, doctor, professor, lawyer), long or
# abbreviated.
ADDITIONAL_TITLES = (
    r"(?:" 
    r"Ju[ií]z[ea]?s?|M[\.\s]*M[aª]?[\s\.]*|" +
    r"Doutor[ea]?s?|D\.?r[aª]?s?[\s\.]*|" +
    r"Professor[ea]?s?|Prof[aª]?s?[\s\.]*|" +
    r"Advogad[ao]s?|Adv[\s\.]*" +
    r")*"
)

# Abbreviations of "Excelentíssimo(a)", "Excelência" and "Senhor(a)".
ABBVR_EXMO = r"Ex\.?m[aªoº]s?\s*\.?"
ABBVR_EX = r"Ex\.?[aªoº]?s?\s*\.\s*[ºªᵉ]?"
ABBVR_SR = r"S\.?r\.?[aªeᵉ]?s?"

# Who a document comes from. The first two alternatives cover
# "[Sr.] Deputado ..." and "Sr. [Deputado] ...", so at least one of the two
# words is required; the rest are institutional authors.
DEPT_EXTENSION_CORE = (
    r"(?:(?:" + ABBVR_SR + r"|Senhor[ea]?s?)?[\s\.]*(?:Deputad[oa]s?|Dep\s*\.)\s*" + ADDITIONAL_TITLES + "|" +
    r"(?:" + ABBVR_SR + r"|Senhor[ea]?s?)[\s\.]*(?:Deputad[oa]s?|Dep\s*\.)?\s*" + ADDITIONAL_TITLES + "|" +
    r"mesa\s*(?:diretora)?|" +
    # Raw string: "\s" is not a valid escape in a normal string literal.
    r"(?:MENSAGEM|" + DOC_ABBVR + r")\s*" + NRO + r"|" +
    r"poder\s*(?:executivo|legislativo|judici[aá]rio)|" +
    r"CPI|" +
    COMMISSIONS +
    r")\s*"
)

# Author given in parentheses: "... (Do Sr. Deputado X)".
DEPT_EXTENSION_A = (
    r"[^\(]{,100}\(\s*(?:D[oa]s?)?\s*" +
    DEPT_EXTENSION_CORE +
    f"(?:[^{QUOTES}\)]" + r"{1,100})?\)"
)

# Author given without parentheses: "... Do Sr. Deputado X", optionally running
# up to the next quote character.
DEPT_EXTENSION_B = (
    r".{,100}?D[oa]s?\s*" +
    DEPT_EXTENSION_CORE +
    f"(?:[^{QUOTES}]" + r"{1,100}" + f"?(?=[{QUOTES}]))?"
)

DEPT_EXTENSION = f"(?:{DEPT_EXTENSION_A}|{DEPT_EXTENSION_B})"

# Identification that follows a document title: "DE <digits>", an optional
# number followed by an uppercase date, or a number optionally followed by
# ", DE <digits>".
DATE_AND_ID = (
    r"(?:" +
    r"(?:DE\s*)+?[\._0-9]+|" +
    f"(?:{NRO}" + r"[_\s\.0-9]*)?\s*(?:" + UPPERCASE_DATE_OR_UNDERSCORES + r")|" +
    NRO + r"[_\s\.0-9]*" +
    r"(?:[^,]{,30}?[,\.]+\s*(?:DE\s*)+?[\._0-9]+)?" +
    r")"
)
# DATE

def _surround_groups(symb, deb, group_count):
    """Build a replacement template that fences groups 1..group_count.

    The result is " <symb> <deb> " before the first group, between consecutive
    groups and after the last one.
    """
    fence = f" {symb} {deb} "
    return fence + "".join(f"\\{number}" + fence for number in range(1, group_count + 1))


def fn_lambda_single(symb, deb):
    """Replacement template fencing group 1 with ``symb`` and the debug tag."""
    return _surround_groups(symb, deb, 1)


def fn_lambda_double(symb, deb):
    """Replacement template fencing groups 1 and 2."""
    return _surround_groups(symb, deb, 2)


def fn_lambda_triple(symb, deb):
    """Replacement template fencing groups 1 to 3."""
    return _surround_groups(symb, deb, 3)


def fn_lambda_quad(symb, deb):
    """Replacement template fencing groups 1 to 4."""
    return _surround_groups(symb, deb, 4)

# Salutation that opens the body of a request: optional honorific
# ("Excelentíssimo", "Exmo.", "Vossa Excelência", "V. Exa."), then
# "Senhor(a)"/"Sr.", optional rank words, then an office (president, minister,
# attorney general, secretary), up to the next "," ":" ";" or ".".
# Fix: "Vossa"/"V." and "Excelência" are now separated by optional whitespace
# ("\s*"). The previous "\*" required a literal asterisk between the two words,
# so that alternative could never match real text.
REQUEST_PRESIDENT_OR_MINISTRY = (
    r"(?:Excelent[ií]ssim[oa]s?|" + ABBVR_EXMO + r"|(?:Vossa|V\s*\.)\s*(?:excel[eê]ncias?|" + ABBVR_EX + r"))?" +
    r"\s*(?:Senhor[ae]?s?|" + ABBVR_SR + r")[\.\s]*" +
    r"\s*(?:Primeir[oa]s?|Vices?|[-–\s])*" +
    r"(?:Pres(?:id(?:ent[ae])?)?s?|Min(?:istr[oa])?s?|Advogad[ao]s?\s*Geral\s*da\s*Uni[aã]o|Secret[aá]ri[oa]s?)" +
    r"[^,:;\.]{,75}?[,:;\.]"
)

# The salutation is accepted only shortly (at most 10 characters) after a
# period that does not belong to an honorific abbreviation, or after a ")".
# Alternatively, the verbs "Requeiro" (after a period) or "Solicito" are accepted.
# Raw string for the look-behind tail: "\s", "\." and "\)" are not valid escapes
# in a normal string literal.
REQUEST_PRESIDENT_OR_MINISTRY_AFFIXED = (
    r"(?:" +
    r"(?<=(?<!" + f"{ABBVR_EXMO}|{ABBVR_EX}|{ABBVR_SR}" + r")\s*\..{,10}?|\).{,10}?)" +
    REQUEST_PRESIDENT_OR_MINISTRY + "|" +
    r"(?:(?<=\.\s*)Requeiro|Solicito)" +
    r")"
)

# Special-case rules, applied in order. Each entry is a triple
# (matcher, template_factory, post_hook):
#   * matcher: a compiled pattern, or a class exposing a compatible `sub`;
#   * template_factory: called as factory(symbol, debug_tag) for segment markers
#     or factory(start_symbol, end_symbol, debug_tag) for noise markers, and
#     returning the replacement template;
#   * post_hook: optional callable applied to the text afterwards (unused: None).
# The number in each comment is the entry's position in the tuple.
RE_SPECIAL = (
    # 0: information request title, its summary, up to the salutation
    (regex.compile(
        r"((?:REQUERIMENTO|SOLICITA[CÇ][AÃ]O)\s*DE\s*INFORMA[CÇ](?:[OÕ]ES|[AÃ]O).{,15}?" +
        DATE_AND_ID +
        f"(?:{DEPT_EXTENSION})?" +
        r")\s*" +
        r"(.{,600}?)" +
        f"(?={REQUEST_PRESIDENT_OR_MINISTRY_AFFIXED})", regex.IGNORECASE),
     fn_lambda_double, None),
    # 1: generic request title with mandatory author
    (regex.compile(
        r"((?:REQUERIMENTO|SOLICITA[CÇ][AÃ]O).{,25}?" +
        DATE_AND_ID +
        f"(?:{DEPT_EXTENSION})" +
        r")\s*" +
        r"(.{,600}?)" +
        f"(?={REQUEST_PRESIDENT_OR_MINISTRY_AFFIXED})", regex.IGNORECASE),
     fn_lambda_double, None),
    # 2: generic request title with optional id and parenthesised author
    (regex.compile(
        r"((?:REQUERIMENTO|SOLICITA[CÇ][AÃ]O).{,25}?" +
        f"(?:{DATE_AND_ID})?" +
        DEPT_EXTENSION_A +
        r")\s*" +
        r"(.{,600}?)" +
        f"(?={REQUEST_PRESIDENT_OR_MINISTRY_AFFIXED})", regex.IGNORECASE),
     fn_lambda_double, None),
    # 3: "INDICAÇÃO" title
    (regex.compile(
        r"((?:(?:REQUERIMENTO|SOLICITA[CÇ][AÃ]O)\s*DE\s*)?INDICA[CÇ][AÃ]O[^\.]{,20}?" +
        DATE_AND_ID +
        f"(?:{DEPT_EXTENSION})" +
        r")\s*" +
        r"(.{,600}?)" +
        f"(?={REQUEST_PRESIDENT_OR_MINISTRY_AFFIXED})", regex.IGNORECASE),
     fn_lambda_double, None),
    # 4: resolution title, summary and enacting clause ("... resolve:")
    (regex.compile(
        r"((?:(?:SUBSTITUTIVO\s*AO\s*)?PROJETO\s*DE\s*)?RESOLU[CÇ][AÃ]O.{,50}?" + 
        DATE_AND_ID +
        f"(?:{DEPT_EXTENSION})?" +
        r")\s*" +
        r"(.{,600}?)((?:A\s*mesa\s*d)?A\s*C[âa]mara\s*dos\s*deputados[^\.]*?resolve\s*:)", regex.IGNORECASE),
    fn_lambda_triple, None),
    # 5: NOTE(review): identical to entry 4. Kept so that the positions (and
    #    debug tags) of the following entries do not shift; confirm whether a
    #    different pattern was intended here.
    (regex.compile(
        r"((?:(?:SUBSTITUTIVO\s*AO\s*)?PROJETO\s*DE\s*)?RESOLU[CÇ][AÃ]O.{,50}?" + 
        DATE_AND_ID +
        f"(?:{DEPT_EXTENSION})?" +
        r")\s*" +
        r"(.{,600}?)((?:A\s*mesa\s*d)?A\s*C[âa]mara\s*dos\s*deputados[^\.]*?resolve\s*:)", regex.IGNORECASE),
    fn_lambda_triple, None),
    # 6: provisional measure title, summary and enacting clause
    (regex.compile(
        r"(MEDIDA\s*PROVIS[ÓO]RIA.{,50}?" + 
        DATE_AND_ID +
        r")\s*" +
        r"(.{,1200}?)([OA]\s*President[ea]\s*da\s*rep[úu]blica[^:]+?com\s*for[cç]a\s*de\s*lei\s*:)", regex.IGNORECASE),
    fn_lambda_triple, None),
    # 7 (noise): "Documento eletrônico ..." / "Apresentação: dd/mm/yyyy hh:mm"
    #    stamp, letters possibly separated by whitespace, optional "- Mesa"
    (regex.compile(
        r"(" +
        r"\s*".join(["", *"Documentoeletr", r"[oô]", *"nico", ""]) +
        r"\s*.{,400}?" +
        r")?" +
        r"(\s*" +
        r"(?:" + DOC_ABBVR + r"\s*" + f"(?:{NRO})*" + r"\s*[\d\s]+/[\s\d]+)?+\s*" +
        r"\s*".join(["", *"Apresenta", "[çc]", "[aã]", *"o:", ""]) +
        r"\s*(?:[0-9]\s*){2}" + r"\s*/\s*" +
        r"\s*(?:[0-9]\s*){2}" + r"\s*/\s*" +
        r"\s*(?:[0-9]\s*){4}" + r"\s*" +
        r"\s*(?:[0-9]\s*){2}" + r"\s*:\s*" +
        r"\s*(?:" +
        rf"\s*(?:{MARKER_NOISE_START}\s*{DEBUG_PATTERN}*)?\s*" +
        r"[0-9]" +
        rf"\s*(?:{MARKER_NOISE_END}\s*{DEBUG_PATTERN}*)?\s*" +
        r"\s*){2}" +
        r"\s*)" +
        r"([-–]*)(" + r"\s*".join(["", *"Mesa", ""]) + r")?", regex.IGNORECASE | regex.MULTILINE),
    lambda symb_start, symb_end, deb: f" {symb_start} {deb} " + r"\1\2" + MARKER_INTENDED_CORRUPTION + r"\3" + MARKER_INTENDED_CORRUPTION + r"\4" + f" {symb_end} {deb} ", None),
    # 8 (noise): noise recurring within the document
    (DetectRecurrentNoise,
     lambda symb_start, symb_end, deb: f" {symb_start} {deb} " + r"\1" + f" {symb_end} {deb} ", None),
    # 9: dash that follows ":", ";", "?" or a quote
    (regex.compile(
        r"([:;" + QUOTES + r"\?])(\s{,10}[-–])"),
     lambda symb, deb: r"\1" + f" {symb} {deb} " + r"\2", None),
    # 10: dash that follows a period (not the one in "S.A.") and precedes an
    #     uppercase letter
    (regex.compile(
        r"((?<!S\s*\.\s*A\s*)\.)(\s{,10}[-–])(?=\s*[" + UPPERCASE_LETTERS + "])"),
     lambda symb, deb: r"\1" + f" {symb} {deb} " + r"\2", None),
    # 11: position after ", e" / ", ou" right before an "a)"-style item
    (regex.compile(
        r"(?<=,\s*(?:e|ou)\s*)" + f"(?={BASE_LEGAL_ITEMS[2]})"),
     lambda symb, deb: f" {symb} {deb} ", None),
    # 12: "EMI <id> <number> <ministries>" header with optional place and date
    (regex.compile(
        r"(EMI\s*" + DATE_AND_ID + r"\s*[0-9][0-9\s]*" + f"(?:(?:{MINISTRIES})/?)+" + r")"
        r"(\s*[^,]{,50}?,\s*(?:" + DATE + r")[\.\s]*)?"
    ),
    fn_lambda_double, None),
    # 13: TVR / broadcasting concession header, "mensagem nº", "aviso nº"
    (regex.compile(
        r"((?:TVR|(?:Ato\s*de\s*)?Concess[aã]o(?:e|\s)*Renova[cç][ãa]o(?:de|\s)*Concess[aã]o(?:de|\s)*Emissora(?:de|\s)*Rádio(?:e|de|\s)*Televisão)\s*" + DATE_AND_ID + DEPT_EXTENSION + ")"
        r"\s*((?:mensagem|msc[\s\.]*)\s*" + NRO + r"[_\.0-9\s]+/\s*[0-9]{4})" +
        r"\s*((?:aviso|av[\s\.]*)\s*" + NRO + r"[_\.0-9\s]+/\s*[0-9]{4}" +
        r"(?:\s*[-–]\s*C\s*\.\s*Civil)?)", regex.IGNORECASE),
    fn_lambda_triple, None),
    # 14: uppercase bill title without author, up to the start of the summary.
    #     "CONSTITUI?CIONAL" accepts the correct spelling "CONSTITUCIONAL" as
    #     well as the misspelling "CONSTITUICIONAL" that was the only form
    #     matched before.
    (regex.compile(
        r"((?:SUBSTITUTIVO\s*AO\s*)?PROJETO\s*DE)(\s*" +
        r"(?:" +
        r"LEI(?:\s*COMPLEMENTAR)?|" +
        r"DECRETO\s*LEGISLATIVO|" +
        r"RESOLU[ÇC][AÃ]O|" +
        r"EMENDA\s*CONSTITUI?CIONAL|" +
        r"EMENDA\s*[AÁÀ]\s*CONSTITUI[CÇ][AÃ]O|" +
        r"MEDIDA\s*PROVIS[OÓ]RIA"
        r")\s*" +
        rf"(?:{NRO}[_\s\.0-9]*)?" +
        f"(?!{DEPT_EXTENSION})"
        r"\s*[\s" + UPPERCASE_LETTERS_OR_NUM + r"]{,150}?" +
        r"(?=(?:[OA]\s+)?[\." + UPPERCASE_LETTERS + "][a-z])" +
        r")"),
    lambda symb, deb: f" {symb} {deb} " + MARKER_INTENDED_CORRUPTION + r"\1" + MARKER_INTENDED_CORRUPTION + r"\2" + f" {symb} {deb} ", None
    ),
    # 15: uppercase bill title with number and date (same spelling fix as 14)
    (regex.compile(
        r"((?:SUBSTITUTIVO\s*AO\s*)?PROJETO\s*DE)(\s*" +
        r"(?:" +
        r"LEI(?:\s*COMPLEMENTAR\s*|\s*DA\s*C[AÂ]MARA\s*)*|" +
        r"DECRETO\s*LEGISLATIVO|" +
        r"RESOLU[ÇC][AÃ]O|" +
        r"EMENDA\s*CONSTITUI?CIONAL|" +
        r"EMENDA\s*[AÁÀ]\s*CONSTITUI[CÇ][AÃ]O|" +
        r"MEDIDA\s*PROVIS[OÓ]RIA"
        r")\s*" +
        rf"(?:{NRO}[_\s\.0-9]*)" +
        f"(?:{UPPERCASE_DATE_OR_UNDERSCORES})"
        r"\s*[\s" + UPPERCASE_LETTERS_OR_NUM + r"]{,150}?" +
        r"(?=(?:[OA]\s+)?[\." + UPPERCASE_LETTERS + "][a-z])" +
        r")"),
    lambda symb, deb: f" {symb} {deb} " + MARKER_INTENDED_CORRUPTION + r"\1" + MARKER_INTENDED_CORRUPTION + r"\2" + f" {symb} {deb} ", None
    ),
    # 16: "<n>. <m>" numbering that follows an uppercase word
    (regex.compile(
        r"(?<=[" + UPPERCASE_LETTERS + r"]{3,}\s+)([0-9]{1,2}\s*\.\s+[0-9]+)"),
    lambda symb, deb: f" {symb} {deb} " + r"\1", None
    ),
    # 17 (noise): phone/fax numbers introduced by a label, "ou" or ","
    (regex.compile(
        r"(?<=\s|^)(\s*(?:(?:Tel(?:efone)?s?|Fones?|Fax(?:es)?)[\.\s:]*|ou|,)\s*)" +
        r"(?:(\()(\s*[0-9]{2,}\s*)(\)))?(\s*[0-9]{4,}\s*[-–\.\s]?\s*[0-9]{4,}(?:\s*/\s*[0-9]{4})?)",
        regex.IGNORECASE),
    lambda symb_start, symb_end, deb: f" {symb_start} {deb} " + r"\1\2" + MARKER_INTENDED_CORRUPTION + r"\3\4" + MARKER_INTENDED_CORRUPTION + r"\5" + f" {symb_end} {deb} ", None),
    # 18 (noise): postal code, optionally with "Brasília-DF" before or after
    (regex.compile(
        r"(?<=\s|^)(Bras[ií]lia[-–/\s]*DF.{,10}?)?(CEP[\.\s:]*[0-9]{2}[\s\.]*[0-9]{3}\s*[-–]\s*[0-9]{3})(.{,10}?Bras[ií]lia[-–/\s]*DF)?"),
    lambda symb_start, symb_end, deb: f" {symb_start} {deb} " + r"\1\2\3" + f" {symb_end} {deb} ",
    None,
    ),
    # 19: "PROPOSTA DE FISCALIZAÇÃO E CONTROLE" title
    (regex.compile(
        r"(PROPOSTA\s*DE\s*FISCALIZA[CÇ][AÃ]O\s*E\s*CONTROLE[^\.]{,20}?" +
        rf"\s*(?:{DATE_AND_ID})?\s*" +
        rf"\s*(?:{DEPT_EXTENSION})\s*" +
        r")\s*" +
        r"(.{,600}?)" +
        f"(?={REQUEST_PRESIDENT_OR_MINISTRY_AFFIXED})", regex.IGNORECASE),
     fn_lambda_double, None),
    # 20: "OFÍCIO nº" header, place/date, addressee block, subject line
    (regex.compile(
        r"(OF[IÍ]CIO\s*" + NRO + r".{,110}?\s*)" +
        r"((?:Bras[ií]lia|Senado\s*Federal)?[,\s]*(?:" + DATE_OR_UNDERSCORES + r")[\.\s]*)" +
        r"(.{,300}?\s*)" +
        r"(Assunto\s*:\s*.{,300}?)" +
        f"(?={REQUEST_PRESIDENT_OR_MINISTRY_AFFIXED})", regex.IGNORECASE
    ),
    fn_lambda_quad, None),
    # 21: "Atenciosamente," followed (within 250 chars) by an internal document code
    (regex.compile(
        r"(Atenciosamente\s*)," +
        r"(\s*.{,250}?" +
        RE_DOC_CODE_FULL +
        r")", regex.IGNORECASE),
    lambda symb, deb: f" {symb} {deb} " + MARKER_INTENDED_CORRUPTION + r"\1" + MARKER_INTENDED_CORRUPTION + r",\2" + f" {symb} {deb} ", None),
    # 22: information request addressed "À sua excelência ..." before "Requeiro"/"Solicito"
    (regex.compile(
        r"((?:REQUERIMENTO|SOLICITA[CÇ][AÃ]O)\s*DE\s*INFORMA[CÇ](?:[OÕ]ES|[AÃ]O).{,10}?" +
        DATE_AND_ID +
        f"(?:{DEPT_EXTENSION})?" +
        r"\s*)" +
        r"(.{,600}?)" +
        r"([ÀÁA]\s*sua\s*excel[eê]ncia.{,100}?)" +
        r"(?=(?:" + REQUEST_PRESIDENT_OR_MINISTRY + r"[,\s]*)?(?:Requeiro|Solicito))", regex.IGNORECASE),
     fn_lambda_triple, None),
    # 23 (noise): a lone non-letter character (possibly repeated) stranded
    #     between two noise spans
    (regex.compile(
        rf"(?<={MARKER_NOISE_END}\s*{DEBUG_PATTERN}*)" +
        r"(\s*)([^\s" + UPPERCASE_LETTERS + r"])((?:\s|\2)*)(\s*)" +
        rf"(?={MARKER_NOISE_START}\s*{DEBUG_PATTERN}*)",
        regex.IGNORECASE),
     lambda symb_start, symb_end, deb: f" {symb_start} {deb} " + r"\1\2\3\4" + f" {symb_end} {deb} ", None),
    # 24: "Autor(a): ..." up to "Relator(a):"
    (regex.compile(
        r"(Autora?\s*:\s*.{,200}?)(\s*Relatora?\s*:)", regex.IGNORECASE),
     lambda symb, deb: f" {symb} {deb} " + r"\1" + f" {symb} {deb} " + r"\2", None),
    # 25: "<roman> - RELATÓRIO" heading after the author/rapporteur line
    (regex.compile(
        r"(?<=(?:Relatora?|Autora?)\s*:.{,200}?\s+)(" + VALID_ROMAN_NUM + r"[-–\s]+RELAT[OÓ]RIO\s+)", regex.IGNORECASE),
    lambda symb, deb: f" {symb} {deb} " + r"\1", None),
    # 26: items of a "Label: value" agreement list
    (AgreementList,
    lambda symb, deb: f" {symb} {deb} " + r"\1", None),
    # 27: position right before the closing courtesy formula
    (regex.compile(r"(?=Reiterando\s*os\s*votos\s*de\s*apre[cç]o\s*e\s*considera[cç][aã]o)", regex.IGNORECASE),
    lambda symb, deb: f" {symb} {deb} ", None),
    # 28 (noise): phone numbers with a mandatory "(area code)", label optional
    (regex.compile(
        r"(?<=\s|^)(\s*(?:(?:Tel(?:efone)?s?|Fones?|Fax(?:es)?)[\.\s:]*)\s*)?" +
        r"(?:(\()(\s*[0-9]{2,}\s*)(\)))(\s*[0-9]{4,}\s*[-–\.\s]?\s*[0-9]{4,}(?:\s*/\s*[0-9]{4})?)",
        regex.IGNORECASE),
    lambda symb_start, symb_end, deb: f" {symb_start} {deb} " + r"\1\2" + MARKER_INTENDED_CORRUPTION + r"\3\4" + MARKER_INTENDED_CORRUPTION + r"\5" + f" {symb_end} {deb} ", None),
)

# Heading keywords for the structural divisions of a legal text, in order. The
# order matters: the heading patterns built from this tuple in
# RE_PRE_POST_BLOCKS end each heading where any LATER entry begins.
# The last entry is the article opener (BASE_LEGAL_ITEMS[1]) required to be
# followed by a character that is neither an uppercase letter nor a digit.
LARGER_BLOCKS_HIERARCHY = (
    "LIVRO",
    "T[IÍ]TULO",
    "CAP[IÍ]TULO",
    "(?:Sub)?[sS]e[cç][aã]o",
    BASE_LEGAL_ITEMS[1] + r"(?=\s*[^" + UPPERCASE_LETTERS_OR_NUM + r"])",
)

# Patterns whose group 1 receives a segment marker both before and after it.
# The number in each comment is the entry's position in the tuple.
RE_PRE_POST_BLOCKS = (
    # 0: uppercase "ACORDO DE ..." title up to the start of regular-case text
    regex.compile(
        r"(ACORDO\s*DE\s*[-,"
        + UPPERCASE_LETTERS_OR_NUM
        + r"\s]+)(?=(?:[OA]\s+)?["
        + UPPERCASE_LETTERS
        + r"][a-z])"
    ),
    # 1: commission names not preceded (within 50 characters) by "("
    regex.compile(r"(?<!\(.{,50}?)(" + COMMISSIONS + ")"),
    # 2: "O Congresso Nacional decreta ...:"
    regex.compile(r"(O\s*Congresso\s*Nacional\s*decreta\s*.{,40}?\s*:)", regex.IGNORECASE),
    # 3: uppercase institution names not followed by "decreta"
    regex.compile(r"(C[ÂA]MARA\s*DOS\s*DEPUTADOS|CONGRESSO\s*NACIONAL)(?!\s*[dD][eE][cC][rR][eE][tT][aA])"),
    # 4: "A Câmara dos deputados decreta ...:"
    regex.compile(r"(A\s*C[aâ]mara\s*dos\s+deputados\s*decreta\s*.{,40}?\s*:)", regex.IGNORECASE),
    # 5: "Projeto de Lei [Complementar | da Câmara] <id> <author>"
    regex.compile(
        r"((?:SUBSTITUTIVO\s*AO\s*)?Projeto\s*de\s*Lei\s*(?:\s*COMPLEMENTAR\s*|\s*DA\s*C[AÂ]MARA\s*)*\s*" +
        f"(?:{DATE_AND_ID})?" + r"\s*"+ DEPT_EXTENSION + r")",
        regex.IGNORECASE,
    ),
    # 6: "Projeto de Decreto Legislativo / Resolução" followed by author or id
    regex.compile(
        r"((?:SUBSTITUTIVO\s*AO\s*)?Projeto\s*de\s*(?:Decreto\s*Legislativo|Resolu[cç][aã]o)\s*" +
        f"(?:{DEPT_EXTENSION}|{DATE_AND_ID})" +
        r")",
        regex.IGNORECASE,
    ),
    # 7: "Mensagem <id> <number>" within the first 500 characters, before any "(".
    #    Raw string for the tail: "\s" is not a valid escape in a normal literal.
    regex.compile(
        r"(?<=^[^\(]{,500}?)(Mensagem\s*" + DATE_AND_ID + r"\s*[0-9][0-9\s]*)",
        regex.IGNORECASE,
    ),
    # 8: constitutional amendment proposal title with author
    regex.compile(
        r"((?:SUBSTITUTIV[AO]\s*[ÁÀA]\s*)?" +
        r"Proposta\s*de\s*emenda\s*(?:cons?titucional|[aàá]\s*constitui[çc][ãa]o).*?" +
        f"(?:{DEPT_EXTENSION})" +
        r")",
        regex.IGNORECASE,
    ),
    # 9-12: one pattern per hierarchy level except the last: heading keyword,
    #       roman numeral and heading text, ending right before a segment marker
    #       or a heading of any later level
    *[
        regex.compile(
            f"({LARGER_BLOCKS_HIERARCHY[i]}" + r"\s*" + f"(?:{VALID_ROMAN_NUM})" +
            r"(?:[-–\.\s" + UPPERCASE_LETTERS_OR_NUM + r"]|" + 
            f"{MARKER_NOISE_END}|{MARKER_NOISE_START}" +
            r")+?" +
            f"(?={MARKER_VALID}|" + r"|".join(LARGER_BLOCKS_HIERARCHY[i + 1:]) + r"))",
            regex.IGNORECASE,
        )
        for i in range(len(LARGER_BLOCKS_HIERARCHY) - 1)
    ],
    # 13: entry-into-force clause ("Esta lei entra em vigor na data de sua publicação").
    #     NOTE(review): "(?:data\s*de\s*)" has no quantifier, so "data de" is
    #     mandatory; confirm whether it was meant to be optional.
    regex.compile(
        r"(Art.{,10}?Esta\s*" +
        r"(?:lei|EC|Emenda\s*(?:Constitucional|[àaá\s]*constitui[cç][aã]o))\s*" +
        r"entr[ea]\s*em\s*vigor\s*na\s*(?:data\s*de\s*)sua\s*publica[cç][aã]o\s*.{,50}?(?:\.|$))",
        regex.IGNORECASE,
    ),
)

# Look-behind patterns after which a segment marker is inserted. No pattern is
# registered at present, so the tuple is empty.
RE_POST_BLOCKS = tuple([
    regex.compile(f"(?<={pattern})", regex.IGNORECASE)
    for pattern in ()
])

def regex_legal_item_anymatch(text: str, debug: bool = False) -> str:
    """Insert segment and noise markers into ``text`` using every rule family.

    Rule families run in this order: noise blocks, special rules, "pre"
    boundaries, "post" boundaries, and pre+post blocks.

    Args:
        text: text to annotate.
        debug: when true, every inserted marker is followed by a tag of the form
            "<rule index>_<FAMILY>"; otherwise the tag is an empty string.

    Returns:
        The annotated text.
    """
    first_index = 0

    def tag(index: int, family: str) -> str:
        # Debug tag for a rule, or "" when debugging is off.
        return f"{index}_{family}" if debug else ""

    # 1) Noise: group 1 is wrapped between the noise-start and noise-end markers.
    for index, noise_re in enumerate(RE_NOISE_BLOCKS, first_index):
        dbg = tag(index, "NOISE")
        template = f" {MARKER_NOISE_START} {dbg} " + r"\1" + f" {MARKER_NOISE_END} {dbg} "
        text = noise_re.sub(template, text, concurrent=True)

    # 2) Special rules: each brings its own template factory. Factories taking
    #    (symbol, tag) build segment templates; a TypeError on that call means
    #    the factory expects (start, end, tag) and builds a noise template.
    for index, (matcher, make_template, post_hook) in enumerate(RE_SPECIAL, first_index):
        dbg = tag(index, "SPECIAL")
        try:
            template = make_template(MARKER_VALID, dbg)

        except TypeError:
            template = make_template(MARKER_NOISE_START, MARKER_NOISE_END, dbg)

        text = matcher.sub(template, text, concurrent=True)

        if post_hook is not None:
            text = post_hook(text)

    # 3) and 4) Zero-width boundaries: a segment marker is inserted at each match.
    for family, boundary_patterns in (("PRE", RE_PRE_BLOCKS), ("POS", RE_POST_BLOCKS)):
        for index, boundary_re in enumerate(boundary_patterns, first_index):
            dbg = tag(index, family)
            text = boundary_re.sub(f" {MARKER_VALID} {dbg} ", text, concurrent=True)

    # 5) Blocks that get a segment marker on both sides of group 1.
    for index, block_re in enumerate(RE_PRE_POST_BLOCKS, first_index):
        dbg = tag(index, "PRE_POS")
        template = f" {MARKER_VALID} {dbg} " + r"\1" + f" {MARKER_VALID} {dbg} "
        text = block_re.sub(template, text, concurrent=True)

    return text


def preprocess_instance(item, ind, print_preprocessed: bool = False, debug: bool = False):
    """Turn one document into word-level tokens with one integer label per token.

    The text is preprocessed by `seg_model`, annotated with markers by
    `regex_legal_item_anymatch` and tokenized with NLTK. Marker tokens are then
    removed from the token list and converted into labels (taken from
    SPECIAL_SYMBOLS) attached to the token that followed each marker; every
    other token is labelled 0.

    Args:
        item: Mapping holding the document under the "text" key.
        ind: Identifier of the instance; returned, stringified, as "id".
        print_preprocessed: Print the annotated text before tokenization.
        debug: Forwarded to `regex_legal_item_anymatch`; also makes intended
            corruption markers visible as "@" instead of being removed.

    Returns:
        Dict with keys "id" (str), "labels" (list of int) and "tokens" (list of
        str); both lists have the same length.
    """
    preprocessed_text = seg_model.preprocess_legal_text(item["text"])
    preprocessed_text = regex_legal_item_anymatch(preprocessed_text, debug=debug)
    preprocessed_text = preprocessed_text.replace(MARKER_INTENDED_CORRUPTION, "@" if debug else "")
    tokens = nltk.tokenize.word_tokenize(preprocessed_text, language="portuguese")

    if print_preprocessed:
        print(colorama.Fore.WHITE, colorama.Style.DIM, preprocessed_text, colorama.Style.RESET_ALL, sep="")

    labels = [0] * len(tokens)

    label_valid = SPECIAL_SYMBOLS[MARKER_VALID]
    label_noise_start = SPECIAL_SYMBOLS[MARKER_NOISE_START]
    label_noise_end = SPECIAL_SYMBOLS[MARKER_NOISE_END]

    # Remove every marker token and move its meaning onto the token that takes
    # its place. The last token is never visited by this loop (a trailing
    # marker is stripped further below).
    i = 0
    while i < len(tokens) - 1:
        if tokens[i] in SPECIAL_SYMBOLS:
            cur_token = tokens.pop(i)
            cur_label = labels.pop(i)

            # A noise start that was itself labelled valid: the next token keeps
            # the valid label and the noise start moves one token further,
            # unless that token is the noise end marker.
            if cur_label == label_valid and cur_token == MARKER_NOISE_START:
                labels[i] = label_valid
                # Bounds check: after the pop, `i` may be the last position.
                if i + 1 < len(tokens) and tokens[i + 1] != MARKER_NOISE_END:
                    labels[i + 1] = label_noise_start
                continue

            # A noise end that was itself labelled valid: the valid label wins.
            if cur_label == label_valid and cur_token == MARKER_NOISE_END:
                labels[i] = label_valid
                continue

            # A noise start sitting on a noise end label: the two cancel out.
            if cur_label == label_noise_end and cur_token == MARKER_NOISE_START:
                labels[i] = 0
            else:
                labels[i] = SPECIAL_SYMBOLS[cur_token]

            # Do not advance: the token now at position `i` may be a marker too.
            continue

        i += 1

    if labels:
        maybe_erase_pool = []
        noise_on = False

        # Pass 1: a noise end immediately followed by a noise start is dropped.
        for i in range(len(labels) - 1):
            if labels[i] == label_noise_end and labels[i + 1] == label_noise_start:
                labels[i] = labels[i + 1] = 0

        # Pass 2: when a noise end is reached, erase every positive label
        # collected since the previous noise start (or since the beginning).
        for i in range(len(labels)):
            if labels[i] == label_noise_start:
                maybe_erase_pool.clear()
                continue

            if labels[i] == label_noise_end:
                while maybe_erase_pool:
                    # A dedicated name is used here so the `ind` parameter is
                    # not overwritten before it is returned as the "id".
                    erase_idx = maybe_erase_pool.pop()
                    labels[erase_idx] = 0

            if labels[i] > 0:
                maybe_erase_pool.append(i)

        # Pass 3: drop a noise start found while noise is already open and a
        # noise end found while it is not; a valid label closes open noise.
        for i in range(len(labels)):
            if labels[i] == label_noise_start:
                if noise_on:
                    labels[i] = 0
                else:
                    noise_on = True

            elif labels[i] == label_noise_end:
                if noise_on:
                    noise_on = False
                else:
                    labels[i] = 0

            elif labels[i] == label_valid:
                noise_on = False

        # Strip marker tokens left at either end of the sequence.
        while tokens and tokens[0] in SPECIAL_SYMBOLS:
            labels.pop(0)
            tokens.pop(0)

        while tokens and tokens[-1] in SPECIAL_SYMBOLS:
            labels.pop()
            tokens.pop()

        # The first token never carries a valid label. The stripping above may
        # have emptied the lists, hence the emptiness check.
        if labels and labels[0] == label_valid:
            labels[0] = 0

    ret = {
        "id": str(ind),
        "labels": labels,
        "tokens": tokens,
    }

    return ret

if DEV_RUN:
    # Manual smoke test: a request text whose extracted form contains spaced-out
    # signature residue. The annotated text is printed with debug tags so the
    # inserted markers can be inspected by eye.
    auxaux = """
Senhor Presidente,  
 Nos termos do art. 113, inciso I e parágrafo 1º, do Regimento Interno da Câmara 
dos Deputados, requeiro o envio de Indicação ao Poder Executivo, por meio do Ministro 
da Economia, Paulo Guedes, sugerindo a isenção do Imposto de Importação (II), 
incidente sobre a comercialização de dispositivos protéticos necessários à reabilitação 
de pessoas amputadas.  *C D2 02 64 89 71 10 0* Do cu m en to e le tr ôn ic o as sin ad o po r W ol ne y Q ue iro z (P DT /P E) , a tr av és d o po nt o SD R_ 56 16 4, na fo rm a do a rt . 1 02 , § 1 º, d o RI CD c /c o a rt . 2 º, d o At o da M es a n. 8 0 de 2 01 6. PL n .4 92 3/ 20 20 Ap re se nt aç ão : 1 4/ 10 /2 02 0 16 :5 7 - M es a
 
Sala das Sessões, em ___ de ____________ de 2020. 
""".replace(" , ", ", ").replace(" . ", ". ")

    # Kept inside the DEV_RUN guard: `auxaux` only exists in a dev run, so calling
    # this unconditionally would raise NameError otherwise.
    _ = preprocess_instance({"text": auxaux}, -1, print_preprocessed=True, debug=True)

[37m[2mSenhor Presidente, Nos termos do art. 113, inciso I e parágrafo 1º, do Regimento Interno da Câmara dos Deputados, requeiro o envio de Indicação ao Poder Executivo, por meio do Ministro da Economia, Paulo Guedes, sugerindo a isenção do Imposto de Importação (II), incidente sobre a comercialização de dispositivos protéticos necessários à reabilitação de pessoas amputadas.  ❌s__ @@@ ❌s__ 8_SPECIAL *C D2 02 64 89 71 10 0* ❌s__ 7_SPECIAL   ❌e__ 8_SPECIAL @@Do cu m en to e le tr ôn ic o as sin ad o po r W ol ne y Q ue iro z (P DT /P E), a tr av és d o po nt o SD R_ 56 16 4, na fo rm a do a rt. 1 02, § 1 º, d o RI CD c /c o a rt. 2 º, d o At o da M es a n. 8 0 de 2 01 6. PL n .4 92 3/ 20 20 Ap re se nt aç ão : 1 4/ 10 /2 02 0 16 : ❌s__ 14_NOISE 5 ❌e__ 14_NOISE  7 @-@ M es a  ❌e__ 7_SPECIAL ✓ 9_PRE   ✓ 9_PRE Sala das Sessões, em ___ de  ❌s__ 2_NOISE ____________  ❌e__ 2_NOISE de 2020.[0m


In [209]:
# Load the raw corpus as a headerless CSV with a single column named "text".
# In a DEV_RUN only a window of the file is read: DATASET_ROW_START rows are
# skipped and DATASET_ROW_END - DATASET_ROW_START + 1 rows are loaded.
df = datasets.load_dataset(
    "csv",
    data_files=["../data/content.txt"],
    header=None,
    names=["text"],
    cache_dir="../cache/datasets",
    skiprows=DATASET_ROW_START if DEV_RUN else None,
    nrows=(DATASET_ROW_END - DATASET_ROW_START + 1) if DEV_RUN else None,
)

# Heading of the justification section ("JUSTIFICATIVA" / "JUSTIFICAÇÃO"), with
# optional whitespace between letters. The match is case-sensitive: the
# capitalized spellings only match when followed by whitespace and an uppercase
# letter (lookahead on UPPERCASE_LETTERS).
RE_JUSTIFICATIVA = regex.compile(
    r"\s*(?:" +
    r"J\s*U\s*S\s*T\s*I\s*F\s*I\s*C\s*A\s*T\s*I\s*V\s*A|" +
    r"J\s*u\s*s\s*t\s*i\s*f\s*i\s*c\s*a\s*t\s*i\s*v\s*a\s+(?=[" + UPPERCASE_LETTERS + r"])|" +
    r"J\s*U\s*S\s*T\s*I\s*F\s*I\s*C\s*A\s*[CÇ]\s*[AÂÃÀÁ]\s*O|" +
    r"J\s*u\s*s\s*t\s*i\s*f\s*i\s*c\s*a\s*[cç]\s*[aãâàá]\s*o\s+(?=[" + UPPERCASE_LETTERS + r"])" +
    r")"
)

# Uppercase "ANEXO", with optional whitespace between letters (case-sensitive).
RE_ANEXO = regex.compile(r"\s*A\s*N\s*E\s*X\s*O")

# Keep only rows whose text is a string of 128 to 600000 characters. The length
# is checked before the truncations below, so a kept text may end up shorter.
df = df.filter(lambda item: isinstance(item["text"], str) and 128 <= len(item["text"]) <= 600000)
# Keep only what precedes the first justification heading, then the first "ANEXO".
df = df.map(lambda item: {"text": RE_JUSTIFICATIVA.split(item["text"])[0]})
df = df.map(lambda item: {"text": RE_ANEXO.split(item["text"])[0]})

# Build the "id", "labels" and "tokens" columns. `with_indices=True` supplies the
# row index as the `ind` argument; the "text" column is kept only in a DEV_RUN.
df = df.map(preprocess_instance, with_indices=True, num_proc=10, remove_columns=None if DEV_RUN else "text")

# Ask the test cell to rerun the registered test cases against the rebuilt labels.
rerun_tests = True

Using custom data configuration default-46b3e822caba5a99
Reusing dataset csv (../cache/datasets/csv/default-46b3e822caba5a99/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at ../cache/datasets/csv/default-46b3e822caba5a99/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-8aa3bb54f5af798d.arrow
Loading cached processed dataset at ../cache/datasets/csv/default-46b3e822caba5a99/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-40495513c975b77e.arrow
Loading cached processed dataset at ../cache/datasets/csv/default-46b3e822caba5a99/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-9599182709e5e523.arrow


In [210]:
# Sanity check: number of rows per split left after filtering and preprocessing.
print(df.num_rows)

{'train': 10000}


In [214]:
if DEV_RUN:
    # Set to a document index to inspect that registered test case; None skips this.
    id_ = None

    if id_ is not None:
        print(tests.TEST_CASES[id_])
        tests.print_results(df, id_, print_full_text=True)
#         tests.update_test_case(id_, (10, 0))

    # Rerun the registered test cases once per dataset rebuild. On failure the
    # error is re-raised (chained), so `rerun_tests` stays True and the rest of
    # the cell does not run.
    if rerun_tests:
        try:
            tests.run_tests(df["train"]["labels"])
            rerun_tests = False

        except AssertionError as e:
            raise AssertionError from e

        print("\n\n")

    # Start from the last row (capped at 9999) and draw random rows until one
    # without a registered test case is found.
    document_idx = min(9999, df["train"].num_rows - 1)
    while tests.test_case_exists(document_idx):
        # randrange excludes its upper bound, so the index is always a valid row
        # (randint(0, num_rows) could return num_rows, one past the last row).
        document_idx = random.randrange(df["train"].num_rows)

    print(colorama.Fore.YELLOW + "Chosen id:" + colorama.Fore.RESET, document_idx, end="\n\n")

    # Show the segmentation and register it as a test case only on an explicit "y".
    expected_test_case_values = tests.print_results(df, document_idx, print_full_text=True)
    print("Is it correct? [y/N]:", end=" ")
    inp = input()
    if inp == "y":
        tests.update_test_case(document_idx, expected_test_case_values)
        print("Added to test cases.")

[33mChosen id:[39m 4145

[37m[2m 
PROJETO DE LEI Nº          , DE 2019 
(Da Sra. ANGELA AMIN) 
Altera o art. 473 da Consolidação das 
Leis do Trabalho (CLT), para permitir que o 
empregado deixe de comparecer ao serviço, 
sem prejuízo da remuneração, quando 
estiver participando do programa oferecido 
pela Justiça da Infância e da Juventude aos 
postulantes à adoção. 
O Congresso Nacional decreta: 
Art. 1º O art. 473 da Consolidação das Leis do Trabalho (CLT), 
aprovada pelo Decreto-lei nº 5.452, de 1º de maio de 1943, passa a vigorar 
com a seguinte alteração: 
“Art. 473........................................................................................ 
...................................................................................................... 
XIII – pelo tempo que se fizer necessário, quando estiver 
participando, na condição de postulante à adoção, do programa 
oferecido pela Justiça da Infância e da Juventude nos termos 
do § 1º do art. 197-C da Lei nº 8.069, d

In [215]:
# Write the registered test cases to TEST_CASE_URI (only defined in a DEV_RUN).
if DEV_RUN:
    tests.dump_registered_cases(test_cases_uri=TEST_CASE_URI)

Wrote 17 test cases at './test_cases/100001_110000_registered_test_cases.csv'.


In [None]:
# NOTE(review): this displays the labels of every train row; consider slicing
# before displaying to keep the notebook output small.
df["train"]["labels"]

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split words and align the word labels to the sub-tokens.

    Adapted from https://huggingface.co/docs/transformers/custom_datasets#preprocess

    Args:
        examples: Batch with a "tokens" column (one list of words per row) and a
            "labels" column (one label per word).

    Returns:
        The tokenizer output for the batch, with "labels" set to one list per
        row: the word label on the first sub-token of each word and -100 on
        every other position (special tokens and remaining sub-tokens).
    """
    encoded = seg_model.tokenizer(
        examples["tokens"],
        truncation=True,
        max_length=512,
        is_split_into_words=True,
    )

    aligned_labels = []

    for row, word_labels in enumerate(examples["labels"]):
        # Word index of every sub-token; None where a position maps to no word.
        word_ids = list(encoded.word_ids(batch_index=row))
        # Word index of the preceding position (None before the first one).
        predecessors = [None] + word_ids[:-1]

        aligned_labels.append([
            word_labels[current] if current is not None and current != previous else -100
            for current, previous in zip(word_ids, predecessors)
        ])

    encoded["labels"] = aligned_labels

    return encoded


# Tokenize and align labels for the whole dataset, in batches, using 4 processes.
df_tokenized = df.map(tokenize_and_align_labels, batched=True, num_proc=4)

In [None]:
# Hold out 20% of the rows (shuffled, fixed seed), then cut the held-out part in
# half without shuffling: its "train" half becomes the eval split and its "test"
# half the test split, i.e. an 80/10/10 train/eval/test partition.
df_tokenized_train_eval_test = df_tokenized["train"].train_test_split(test_size=0.2, shuffle=True, seed=16)
df_tokenized_test_eval = df_tokenized_train_eval_test["test"].train_test_split(test_size=0.5, shuffle=False)
df_tokenized_split = datasets.DatasetDict({
    "train": df_tokenized_train_eval_test["train"],
    "eval": df_tokenized_test_eval["train"],
    "test": df_tokenized_test_eval["test"],
})
# df_tokenized_split.save_to_disk("../data/df_tokenized_split")
df_tokenized_split

In [None]:
# Inspect the column schema of the tokenized train split.
df_tokenized_split["train"].features

In [None]:
# Spot check: word-level labels of train row 49 (presumably an arbitrary row).
print(df["train"]["labels"][49])