In [1]:
from pathlib import Path
import json
from datetime import datetime
import re
import pandas as pd

config_path = (
    Path.home()
    / "Documents"
    / "jurimetria_aved_stf"
    / "config"
    / "project_config.json"
)
config = json.loads(config_path.read_text(encoding="utf-8"))

src_dir  = Path(config["paths"]["pdf_txt_dir"]).expanduser()
dst_dir  = Path(config["paths"]["corpus_clean_dir"]).expanduser()
logs_dir = Path(config["paths"]["logs_dir"]).expanduser()

dst_dir.mkdir(parents=True, exist_ok=True)
logs_dir.mkdir(parents=True, exist_ok=True)

BOILERPLATE_PREFIXES = [
    "documento assinado digitalmente",
    "este documento foi assinado digitalmente",
    "conferir autenticidade no endereço",
    "para verificar a autenticidade",
    "pode ser conferida no site",
    "acessado em",
]

BOILERPLATE_EQUALS = {
    "supremo tribunal federal",
    "inteiro teor",
}

def is_boilerplate_line(line: str) -> bool:
    l = line.strip().lower()
    if not l:
        return False
    if l in BOILERPLATE_EQUALS:
        return True
    for pref in BOILERPLATE_PREFIXES:
        if l.startswith(pref):
            return True
    return False

MULTIWORDS = {
    # 8/1 (variações comuns)
    r"8 de janeiro de 2023": "8_de_janeiro_de_2023",
    r"08 de janeiro de 2023": "8_de_janeiro_de_2023",
    r"8 de janeiro": "8_de_janeiro",
    r"08 de janeiro": "8_de_janeiro",
    r"atos antidemocráticos": "atos_antidemocraticos",
    r"ato antidemocrático": "ato_antidemocratico",
    r"invasão de brasília": "invasao_de_brasilia",
    r"invasao de brasilia": "invasao_de_brasilia",
    r"quebra da ordem democrática": "quebra_da_ordem_democratica",
    r"quebra da ordem democratica": "quebra_da_ordem_democratica",
    # estruturas político-institucionais
    r"supremo tribunal federal": "supremo_tribunal_federal",
    r"congresso nacional": "congresso_nacional",
    r"palácio do planalto": "palacio_do_planalto",
    r"palacio do planalto": "palacio_do_planalto",
    r"praça dos três poderes": "praca_dos_tres_poderes",
    r"praca dos tres poderes": "praca_dos_tres_poderes",
    # núcleo de desinformação
    r"fake news": "fake_news",
    r"conteúdo falso": "conteudo_falso",
    r"conteudos falsos": "conteudo_falso",
    r"conteúdo enganoso": "conteudo_enganoso",
    # redes
    r"milícias digitais": "milicias_digitais",
    r"milicias digitais": "milicias_digitais",
    r"rede social": "rede_social",
    r"redes sociais": "redes_sociais",
    r"gabinete do ódio": "gabinete_do_odio",
    r"gabinete do odio": "gabinete_do_odio",
}

def apply_multiwords(text: str) -> str:
    txt = text
    for pat, repl in sorted(MULTIWORDS.items(), key=lambda x: len(x[0]), reverse=True):
        txt = re.sub(pat, repl, txt, flags=re.IGNORECASE)
    return txt

def token_is_heavily_repeated(tok: str) -> bool:
    """
    Considera "ruim" só se:
    - tiver 3 ou mais da mesma letra seguidas
    - e pelo menos 50% dos caracteres fizerem parte dessas repetições
    Assim, 'sessao' passa; 'cooonvocaccao' cai.
    """
    if len(tok) < 6:
        return False
    m = re.findall(r"(.)\1{2,}", tok)
    if not m:
        return False
    repeated_len = 0
    for rep in re.finditer(r"(.)\1{2,}", tok):
        start, end = rep.span()
        repeated_len += (end - start)
    ratio = repeated_len / len(tok)
    return ratio >= 0.5

def normalize_token(tok: str) -> str:
    if token_is_heavily_repeated(tok):
        return re.sub(r"(.)\1{2,}", r"\1\1", tok)
    return tok

def clean_text(raw_text: str) -> str:
    raw_text = raw_text.replace("\r\n", "\n").replace("\r", "\n")
    lines = raw_text.split("\n")

    kept_lines = []
    for ln in lines:
        ln_strip = ln.strip()
        if not ln_strip:
            continue
        if is_boilerplate_line(ln_strip):
            continue
        kept_lines.append(ln_strip)

    if not kept_lines:
        return ""

    txt = " ".join(kept_lines)

    txt = txt.lower()

    txt = apply_multiwords(txt)

    toks = re.split(r"\s+", txt)
    norm_toks = [normalize_token(t) for t in toks if t]

    final_txt = " ".join(norm_toks)
    final_txt = re.sub(r"\s+", " ", final_txt).strip()
    return final_txt

records = []
txt_files = sorted(src_dir.glob("*.txt"))

for txt_path in txt_files:
    raw = txt_path.read_text(encoding="utf-8", errors="ignore")
    original_len = len(raw)

    cleaned = clean_text(raw)
    clean_len = len(cleaned)

    out_path = dst_dir / txt_path.name
    out_path.write_text(cleaned, encoding="utf-8")

    records.append(
        {
            "file_name": txt_path.name,
            "source_path": str(txt_path),
            "clean_path": str(out_path),
            "original_len": original_len,
            "clean_len": clean_len,
            "timestamp": datetime.utcnow().isoformat(),
        }
    )

df = pd.DataFrame(records).sort_values("file_name")
df.to_csv(logs_dir / "cleaning_log.csv", index=False)

df

Unnamed: 0,file_name,source_path,clean_path,original_len,clean_len,timestamp
0,a_002.txt,/Users/cibelealexandreu/Documents/jurimetria_a...,/Users/cibelealexandreu/Documents/jurimetria_a...,237261,221609,2025-11-01T02:23:13.496174
1,a_003.txt,/Users/cibelealexandreu/Documents/jurimetria_a...,/Users/cibelealexandreu/Documents/jurimetria_a...,186037,173382,2025-11-01T02:23:13.541014
2,a_004.txt,/Users/cibelealexandreu/Documents/jurimetria_a...,/Users/cibelealexandreu/Documents/jurimetria_a...,244245,228123,2025-11-01T02:23:13.600361
3,a_005.txt,/Users/cibelealexandreu/Documents/jurimetria_a...,/Users/cibelealexandreu/Documents/jurimetria_a...,239829,224028,2025-11-01T02:23:13.659313
4,a_006.txt,/Users/cibelealexandreu/Documents/jurimetria_a...,/Users/cibelealexandreu/Documents/jurimetria_a...,293097,273902,2025-11-01T02:23:13.732022
...,...,...,...,...,...,...
118,a_125.txt,/Users/cibelealexandreu/Documents/jurimetria_a...,/Users/cibelealexandreu/Documents/jurimetria_a...,234604,214666,2025-11-01T02:23:21.221050
119,a_126.txt,/Users/cibelealexandreu/Documents/jurimetria_a...,/Users/cibelealexandreu/Documents/jurimetria_a...,384783,354473,2025-11-01T02:23:21.313885
120,a_127.txt,/Users/cibelealexandreu/Documents/jurimetria_a...,/Users/cibelealexandreu/Documents/jurimetria_a...,3853414,3568540,2025-11-01T02:23:22.265486
121,a_128.txt,/Users/cibelealexandreu/Documents/jurimetria_a...,/Users/cibelealexandreu/Documents/jurimetria_a...,253882,233074,2025-11-01T02:23:22.326710


In [2]:
from pathlib import Path

txt_raw = Path.home() / "Documents" / "jurimetria_aved_stf" / "data" / "01_pdf_txt"
txt_clean = Path.home() / "Documents" / "jurimetria_aved_stf" / "data" / "02_corpus_clean"

len(list(txt_raw.glob("*.txt"))), len(list(txt_clean.glob("*.txt")))

(123, 123)