In [131]:
import transformers
import datasets
import nltk
import tokenizers
import regex

import segmentador

In [2]:
# Instantiate the project's segmenter. The log captured below shows it building a
# BertForTokenClassification from the neuralmind/bert-base-portuguese-cased checkpoint.
# NOTE(review): local_files_only=True presumably prevents any download attempt, so the
# weights must already be available locally -- confirm against the segmentador package.
seg_model = segmentador.Segmenter(local_files_only=True)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model check

In [329]:
# Patterns for spans that regex_legal_item_anymatch wraps between the noise-start and
# noise-end markers. Their pattern text is also reused inside STANDARD_PREFIXES.
RE_NOISE_BLOCKS = (
    # Optional bill reference ("PL"/"PDL"/"PEC", "n", number "/" number) followed by
    # the word "Apresentação:" (letters may be separated by whitespace, accents optional)
    # and digits laid out as dd/dd/dddddd:dd, again tolerating whitespace between digits.
    # Note that capture group 1 is only the optional bill reference, not the whole match.
    regex.compile(
        r"((PL|PDL|PEC)\s*n[\.o\sº]*[\d\s]+/[\s\d]+)?+\s*"
        r"A\s*p\s*r\s*e\s*s\s*e\s*n\s*t\s*a\s*[cç]\s*[aã]\s*o\s*:"
        r"(\s*\d\s*){2}/(\s*\d\s*){2}/(\s*\d\s*){6}:(\s*\d){2}",
        regex.IGNORECASE | regex.MULTILINE,
    ),
    # Any run of nine or more consecutive ASCII digits.
    regex.compile(r"([0-9]{9,})"),
)

# Quote characters (curly and straight) spliced into regex character classes below.
# Being a raw string, the value also contains the backslash that precedes the double quote.
QUOTES = r"”“\"'"

# Look-behind fragment describing what may sit immediately before a segment opener:
# "^", a ";" optionally followed by "e"/"ou", one of ". : ?", the tag "(NR)",
# a quote character, or any of the RE_NOISE_BLOCKS patterns.
STANDARD_PREFIXES = (
    r"(?<=^|;(?:\s*e|\s*ou)?|[\.:\?]|\(\s*NR\s*\)|" +
    f"[{QUOTES}]|" + 
    "|".join(reg.pattern for reg in RE_NOISE_BLOCKS) +
    ")"
)

# One zero-width pattern per kind of segment opener. Each compiled regex matches the
# position that is preceded by a STANDARD_PREFIXES context and followed (after optional
# whitespace) by the opener; regex_legal_item_anymatch inserts the valid marker there.
RE_PRE_BLOCKS = tuple(
    # Raw f-string on purpose: in a non-raw literal "\s" is an invalid escape sequence,
    # which Python only tolerates with a warning. The resulting pattern text is identical.
    regex.compile(rf"{STANDARD_PREFIXES}(?=\s*{pattern})", regex.IGNORECASE)
    for pattern in [
        # "§" followed by a number.
        r"§\s*[0-9]+",
        # "Art", "Arts", "Artigo", "Artigos" + optional "." + identifier, "..." or "único".
        r"Art(?:igo)?s?\s*\.?\s*(?:[-–0-9A-Z]+|\.{3}|[uú]nico)",
        # A space plus a single letter, or a 1-2 digit number, followed by ")".
        r"(?: [A-Za-z]|[0-9]{1,2})\s*\)",
        # "parágrafo único" (accents optional).
        r"par[áa]grafo\s*[úu]nico",
        # "capítulo" (accent optional).
        r"cap[ií]tulo",
        # "seção" / "subseção" (accents optional).
        r"(?:sub)?se[çc][ãa]o",
        # Roman numeral, optionally preceded by "(", followed by "–", "-" or ")".
        r"\(?M{0,3}(?:C[MD]|D?C{0,3})(?:X[CL]|L?X{0,3})(?:I?X|I?V|V?I{1,3})\s*(?:–|-|\))",
        # Whitespace + 1-2 digit number (optional "º") followed by "-", "–", ")" or ".".
        r"\(?\s+[0-9]{1,2}[\sº]*[-–\)\.]",
    ]
)

# Honorific and/or office of a member of parliament: either an optional
# "Sr"/"Sra"/"Senhor"/"Senhora" followed by "Deputado"/"Deputada"/"Dep.", or the
# honorific alone with the office optional.
# NOTE(review): the "." in "Dep." is unescaped, so it matches any character
# (e.g. "Depa"); confirm whether a literal dot was intended before tightening it.
DEPT_EXTENSION_CORE = (
    r"(?:(?:Sra?|Senhora?)?[\s\.]*(?:Deputad[oa]|Dep.)|" +
    r"(?:Sra?|Senhora?)[\s\.]*(?:Deputad[oa]|Dep.)?)\s*"
)

# Parenthesised author form: "... ( Do/Da <core> <name> )". The name and the closing
# parenthesis are optional as a unit.
# Raw f-string on purpose: "\)" is an invalid escape sequence in a non-raw literal and
# triggers a warning; the resulting pattern text is identical.
DEPT_EXTENSION_A = (
    r"[^\(]*?\(\s*D[oa]\s*" +
    DEPT_EXTENSION_CORE +
    rf"(?:[^{QUOTES}\)]+?\))?"
)

# Unparenthesised author form: "... Do/Da <core> <name>", where the optional name
# runs up to (but does not include) the next quote character.
DEPT_EXTENSION_B = (
    r".*?D[oa]\s*" +
    DEPT_EXTENSION_CORE +
    f"(?:[^{QUOTES}]+?(?=[{QUOTES}]))?"
)

# Either author form; the parenthesised one is tried first.
DEPT_EXTENSION = f"(?:{DEPT_EXTENSION_A}|{DEPT_EXTENSION_B})"

# Pairs of (compiled pattern, replacement factory). The factory receives the marker
# symbol and returns the replacement template handed to `sub`.
#
# Only entry: "REQUERIMENTO DE INFORMAÇÕES ..." headers.
#   group 1 - the header itself (up to "DE <number>" or "Nº ..., DE <number>"), plus the
#             optional author extension;
#   group 2 - the text that follows, lazily, up to the salutation
#             "[Excelentíssimo] Senhor(a)/Sr(a) Presidente" (checked with a look-ahead,
#             so the salutation itself is not consumed).
# The replacement surrounds both groups with the marker.
RE_SPECIAL = (
    (regex.compile(
        r"(REQUERIMENTO\s*DE\s*INFORMA[cÇ][oÕ]ES.*?(?:DE\s*[\.0-9]+|N[\.\s]*[oº](?:[^,]*?,\s*DE\s*[\.0-9]+)?)" +
        f"(?:{DEPT_EXTENSION})?" +
        r")\s*" +
        # Two fixes relative to the previous literal:
        #  * it is now a raw string ("\s" and "\." are invalid escapes otherwise);
        #  * "[\.\s*]" had the quantifier inside the character class, so it matched
        #    exactly one of ".", whitespace or "*" and missed "Sr. Presidente";
        #    "[\.\s]*" accepts any run of dots/whitespace, like "[\s\.]*" elsewhere.
        r"(.+?)(?=(Excelent[ií]ssim[oa])?\s*(?:Senhora?|Sra?)[\.\s]*Presidente)", regex.IGNORECASE),
     lambda symb: f" {symb} " + r"\1" + f" {symb} " + r"\2" + f" {symb} "),
)

# Patterns whose whole match (capture group 1) gets a valid marker both before and
# after it in regex_legal_item_anymatch.
RE_PRE_POST_BLOCKS = (
    # The enacting formula "O Congresso Nacional decreta:".
    regex.compile(r"(O\s+Congresso\s+Nacional\s+decreta:)", regex.IGNORECASE),
    # "Projeto de Lei" followed by an author extension (see DEPT_EXTENSION).
    regex.compile(r"(Projeto\s*de\s*Lei" + DEPT_EXTENSION + ")", regex.IGNORECASE),
)

# Marker strings that regex_legal_item_anymatch splices into the text.
MARKER_VALID = "\u2713"
MARKER_NOISE_START, MARKER_NOISE_END = "\u274Cs__", "\u274Ce__"

# Marker string -> integer label; labels are numbered from 1 in the order listed here.
SPECIAL_SYMBOLS = {
    marker: label_id
    for label_id, marker in enumerate(
        (MARKER_VALID, MARKER_NOISE_START, MARKER_NOISE_END),
        start=1,
    )
}

# Echo the markers so the notebook output shows which glyphs are in use.
print("Marker symbol (valid):", MARKER_VALID)
print("Marker symbol (noise):", MARKER_NOISE_START, MARKER_NOISE_END)

def regex_legal_item_anymatch(text: str) -> str:
    """Insert segmentation markers into ``text`` using the module-level patterns.

    The substitutions run in this order:

    1. ``RE_PRE_BLOCKS``: a valid marker is inserted at every matched position.
    2. ``RE_NOISE_BLOCKS``: every match is wrapped between the noise-start and
       noise-end markers.
    3. ``RE_PRE_POST_BLOCKS``: group 1 of every match is wrapped between two
       valid markers.
    4. ``RE_SPECIAL``: each pattern is replaced by the template its factory
       builds for the valid marker.

    Parameters
    ----------
    text : str
        Text to annotate.

    Returns
    -------
    str
        ``text`` with marker symbols (padded by single spaces) inserted.
    """
    valid_marker = f" {MARKER_VALID} "

    for reg in RE_PRE_BLOCKS:
        text = reg.sub(valid_marker, text, concurrent=True)

    # Wrap the WHOLE match (\g<0>). Using \1 here was wrong for noise patterns whose
    # first group is only an optional sub-part (the bill-reference prefix of the
    # "Apresentação" stamp): the rest of the match was dropped from the text and the
    # two noise markers ended up adjacent, enclosing nothing.
    noise_template = f" {MARKER_NOISE_START} " + r"\g<0>" + f" {MARKER_NOISE_END} "

    for reg in RE_NOISE_BLOCKS:
        text = reg.sub(noise_template, text, concurrent=True)

    for reg in RE_PRE_POST_BLOCKS:
        text = reg.sub(valid_marker + r"\1" + valid_marker, text, concurrent=True)

    for reg, fun in RE_SPECIAL:
        text = reg.sub(fun(MARKER_VALID), text, concurrent=True)

    return text

Marker symbol (valid): ✓
Marker symbol (noise): ❌s__ ❌e__


In [336]:
RE_JUSTIFICATIVA = regex.compile(r"\s*JUSTIFICATIVA")


def _is_long_text(item):
    """Tell whether the row's "text" is a string with at least 128 characters."""
    text = item["text"]
    return isinstance(text, str) and len(text) >= 128


def _mentions_justificativa(item):
    """Tell whether the row's text contains the literal word "JUSTIFICATIVA"."""
    return "JUSTIFICATIVA" in item["text"]


def _keep_text_before_justificativa(item):
    """Keep only what precedes the first "JUSTIFICATIVA" heading (and its leading whitespace)."""
    head = RE_JUSTIFICATIVA.split(item["text"])[0]
    return {"text": head}


# Read the corpus as a header-less CSV with a single column named "text",
# limited to the first 10010 rows.
df = datasets.load_dataset(
    "csv",
    data_files=["../data/content.txt"],
    header=None,
    names=["text"],
    cache_dir="../cache/datasets",
    nrows=10010,
)

df = df.filter(_is_long_text)
df = df.filter(_mentions_justificativa)
df = df.map(_keep_text_before_justificativa)

# df = df.filter(lambda item: RE_SPECIAL[0][0].search(seg_model.preprocess_legal_text(item["text"])) is not None)

# tokenizers.pre_tokenizers.Sequence([
#     tokenizers.pre_tokenizers.Whitespace(),
#     tokenizers.pre_tokenizers.Punctuation(),
# ])

def preprocess_instance(item, ind):
    """Turn one raw row into word tokens plus one integer label per token.

    The text is preprocessed with ``seg_model.preprocess_legal_text``, annotated
    with marker symbols by ``regex_legal_item_anymatch`` and split into words with
    NLTK's Portuguese ``word_tokenize``. Marker tokens (the keys of
    ``SPECIAL_SYMBOLS``) are then removed, and the label of each marker is given
    to the first real token that follows it; every other token is labelled 0.

    When several markers precede the same token, the last one wins. A marker with
    no token after it (i.e. at the very end of the text) is dropped. The previous
    implementation never inspected the last token, so such a marker leaked into
    ``tokens`` as if it were a word.

    Parameters
    ----------
    item : mapping
        Row with a ``"text"`` entry holding the raw document text.
    ind : int
        Row index, stored (as a string) under ``"id"``.

    Returns
    -------
    dict
        ``{"id": str, "labels": list[int], "tokens": list[str]}`` with
        ``len(labels) == len(tokens)``.
    """
    preprocessed_text = seg_model.preprocess_legal_text(item["text"])
    preprocessed_text = regex_legal_item_anymatch(preprocessed_text)
    raw_tokens = nltk.tokenize.word_tokenize(preprocessed_text, language="portuguese")

    tokens = []
    labels = []
    pending_label = 0  # label waiting for the next real token (0 = none)

    # Single pass instead of repeated list.pop(i), which was quadratic.
    for token in raw_tokens:
        if token in SPECIAL_SYMBOLS:
            pending_label = SPECIAL_SYMBOLS[token]
            continue

        tokens.append(token)
        labels.append(pending_label)
        pending_label = 0

    ret = {
        "id": str(ind),
        "labels": labels,
        "tokens": tokens,
    }

    return ret

# Run preprocess_instance over every row (the row index is passed as second argument),
# in a single process, and drop the raw "text" column from the result.
df = df.map(preprocess_instance, with_indices=True, num_proc=1, remove_columns="text")

Using custom data configuration default-998d1132630bd9b4
Reusing dataset csv (../cache/datasets/csv/default-998d1132630bd9b4/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at ../cache/datasets/csv/default-998d1132630bd9b4/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-e128a25163b7a735.arrow
Loading cached processed dataset at ../cache/datasets/csv/default-998d1132630bd9b4/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-7f360c383a5b8cd9.arrow
Loading cached processed dataset at ../cache/datasets/csv/default-998d1132630bd9b4/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e/cache-4fbd80db7c82a144.arrow


0ex [00:00, ?ex/s]

In [338]:
def print_results(df, id_):
    """Print one training row, first as plain text and then split into segments.

    The first line printed is every token joined by spaces. After a blank line,
    the tokens are printed again, starting a new paragraph in front of each token
    labelled with the valid marker; tokens labelled as noise start or noise end
    are surrounded by ``@@``.

    Parameters
    ----------
    df : mapping
        Dataset container indexed as ``df["train"][id_]``; the row must expose
        ``"tokens"`` and ``"labels"``.
    id_ : int
        Position of the row inside the ``"train"`` split.
    """
    row = df["train"][id_]
    words, word_labels = row["tokens"], row["labels"]

    print(" ".join(words))
    print()

    segment_start = SPECIAL_SYMBOLS[MARKER_VALID]
    noise_labels = (
        SPECIAL_SYMBOLS[MARKER_NOISE_START],
        SPECIAL_SYMBOLS[MARKER_NOISE_END],
    )

    chunk = []

    for word, word_label in zip(words, word_labels):
        # A valid marker closes the running segment before the current word is added.
        if word_label == segment_start:
            print(" ".join(chunk), end="\n\n")
            chunk = []

        if word_label in noise_labels:
            chunk.append(f"@@{word}@@")
        else:
            chunk.append(word)

    # Flush whatever is left after the last segment boundary.
    print(" ".join(chunk), end="\n\n")
            
# Spot-check the generated labels on row 49 of the "train" split.
print_results(df, 49)

PROJETO DE LEI COMPLEMENTAR Nº , DE 2002 ( Da Sra. Tânia Soares ) Altera a Lei Complementar nº 87 , de 13 de setembro de 1996 , que dispõe sobre o imposto dos Estados e do Distrito Federal sobre operações relativas à circulação de mercadorias e sobre prestações de serviços de transporte interestadual e intermunicipal e de comunicação . O Congresso Nacional decreta : Art . 1º O parágrafo 5º do artigo 8º da Lei Complementar nº 87 , de 13 de setembro de 1996 passa a vigorar com a seguinte redação , incluindo nele , ainda , um novo parágrafo 6º : “ § 5º O imposto a ser pago por substituição tributária , na hipótese do inciso II do caput , corresponderá à diferença entre o valor resultante da aplicação da alíquota prevista para as operações ou prestações internas do Estado de destino sobre a respectiva base de cálculo e o valor do imposto devido pela operação ou prestação própria do substituto , deduzindo-se dessa diferença um desconto igual à taxa de encargos de atualização aplicável ao pa

In [4]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split words and align word labels to sub-tokens.

    Adapted from https://huggingface.co/docs/transformers/custom_datasets#preprocess

    Each entry of ``examples["tokens"]`` is encoded with ``seg_model.tokenizer``
    (truncated to 512 positions). For every encoded position the label is:

    * ``-100`` when the position maps to no word (``word_ids`` yields ``None``);
    * the word's label when the position is the first piece of that word;
    * ``-100`` for the remaining pieces of the same word.

    Parameters
    ----------
    examples : mapping
        Batch with ``"tokens"`` (list of word lists) and ``"labels"`` (list of
        per-word label lists).

    Returns
    -------
    The tokenizer output, with the aligned labels stored under ``"labels"``.
    """
    encoded = seg_model.tokenizer(
        examples["tokens"],
        truncation=True,
        max_length=512,
        is_split_into_words=True,
    )

    aligned_labels = []

    for batch_pos, word_labels in enumerate(examples["labels"]):
        word_ids = list(encoded.word_ids(batch_index=batch_pos))
        # Pair every position with the word id of the position before it
        # (None for the very first position).
        previous_ids = [None] + word_ids[:-1]

        aligned_labels.append([
            word_labels[current] if current is not None and current != previous else -100
            for current, previous in zip(word_ids, previous_ids)
        ])

    encoded["labels"] = aligned_labels

    return encoded


# Tokenize and align labels batch-wise, using 4 worker processes.
df_tokenized = df.map(tokenize_and_align_labels, batched=True, num_proc=4)

In [5]:
# First split: shuffle with a fixed seed (16) and hold out 20% of the rows.
df_tokenized_train_eval_test = df_tokenized["train"].train_test_split(test_size=0.2, shuffle=True, seed=16)
# Second split: cut the held-out part in half, without reshuffling, into eval and test.
df_tokenized_test_eval = df_tokenized_train_eval_test["test"].train_test_split(test_size=0.5, shuffle=False)
# Final layout: 80% train, 10% eval, 10% test.
df_tokenized_split = datasets.DatasetDict({
    "train": df_tokenized_train_eval_test["train"],
    "eval": df_tokenized_test_eval["train"],
    "test": df_tokenized_test_eval["test"],
})
# df_tokenized_split.save_to_disk("../data/df_tokenized_split")
df_tokenized_split

DatasetDict({
    train: Dataset({
        features: ['id', 'labels', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15993
    })
    eval: Dataset({
        features: ['id', 'labels', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1999
    })
    test: Dataset({
        features: ['id', 'labels', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [7]:
# Display the column schema (names and dtypes) of the training split.
df_tokenized_split["train"].features

{'id': Value(dtype='string', id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}